# Initial Settings

In [None]:
%%capture
!pip install surprise

In [None]:
import numpy as np
import pandas as pd

# system and settings
from google.colab import drive
from tqdm import tqdm
import os

# ML
from surprise import Dataset, Reader
from surprise.prediction_algorithms.knns import KNNBasic

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the
# dataset folders stored there can be reached through regular file paths.
drive.mount("/content/drive/")

Mounted at /content/drive/


## Datasets

In [None]:
# Locate a folder by name somewhere below a starting directory.
def find_folder(root, target):
    """Return the path of the first directory named ``target`` found under ``root``.

    The tree is traversed with ``os.walk`` and the search stops at the first
    visited directory whose immediate sub-directories include ``target``.

    Parameters
    ----------
    root : str
        Directory where the search starts.
    target : str
        Name of the folder to look for.

    Returns
    -------
    str or None
        ``os.path.join(parent, target)`` for the first match, or ``None`` when
        no directory named ``target`` is found.
    """
    matches = (
        os.path.join(current_dir, target)
        for current_dir, subdirs, _files in os.walk(root)
        if target in subdirs
    )
    # The generator is lazy, so the walk is abandoned as soon as one match is produced.
    return next(matches, None)

# The search starts at the filesystem root, so directories reachable from '/'
# (including the mounted Drive) are walked until a folder named `target` is found.
root = '/'
target = 'datasets_imdb'

# PATH is None when no such folder exists; later cells build file paths from it.
PATH = find_folder(root, target)
print(f"path to dataset's folders --> {PATH}")

path to dataset's folders --> /content/drive/MyDrive/ic_recsys/imdb_recommender/datasets_imdb


In [None]:
# Column names applied to the refined CSVs. Because `names=` is passed without
# `header=`, pandas treats the first line of each of those files as data.
headers = ["userId", "movieId", "rating"]

# Full interaction log and the MovieLens movie metadata (the latter is read
# with its own header row and provides the `movieId` / `genres` columns used below).
all_interactions = pd.read_csv(f"{PATH}/refined/all_interactions.csv", names=headers)
all_movies_genres = pd.read_csv(f"{PATH}/raw/movielens_20m_datasets/movie.csv")

# Train / test splits of the interactions.
train = pd.read_csv(f"{PATH}/refined/train.csv", names=headers)
test = pd.read_csv(f"{PATH}/refined/test.csv", names=headers)

In [None]:
# Cast the user and movie identifiers of every interaction frame to strings,
# so ids are compared and looked up as text throughout the notebook.
# Ratings are left on their original scale (a 0.1 rescaling is not applied).
for interactions_df in (train, test, all_interactions):
    for id_column in ("userId", "movieId"):
        interactions_df[id_column] = interactions_df[id_column].astype(str)

In [None]:
# Map each movie id (as a string) to the list of its genres.
# Every value is a list: the functions below iterate over `genres_map[item]`,
# so a bare string fallback would be consumed character by character.
genres_map = {}
for raw_movie_id, raw_genres in zip(all_movies_genres['movieId'], all_movies_genres['genres']):
    movie_id = str(raw_movie_id)
    if movie_id in genres_map:
        continue  # keep the first occurrence of a movie id

    # Missing genres can arrive as an empty string or as NaN (a float, which is
    # truthy and has no .split); both fall back to the placeholder genre.
    if isinstance(raw_genres, str) and raw_genres:
        genres_map[movie_id] = raw_genres.split("|")
    else:
        genres_map[movie_id] = ["Não informado"]

In [None]:
# Distinct movie ids found in the full interaction log, as a plain Python list.
unique_movies = list(all_interactions["movieId"].unique())

In [None]:
# Number of training interactions per movie, most-interacted movies first.
interactions_per_movie = (
    train.groupby('movieId')
    .count()[['rating']]
    .sort_values('rating', ascending=False)
)
interactions_per_movie.columns = ['interactions']

# Movie id -> position in that ordering. Positions start at 1 rather than 0;
# the original note says this avoids a bug in the calibration algorithm's log computation.
map_id = {
    movie_id: position
    for position, movie_id in enumerate(interactions_per_movie.index, start=1)
}

# Functions

In [None]:
def calculate_calibration_sum(profile_dist, temporary_list_with_score, user, genres_map, alpha=0.001):
    """KL divergence between a user's genre profile and a candidate list's genre mix.

    Parameters
    ----------
    profile_dist : dict
        Genre -> proportion for the user's profile (the ``p`` distribution).
    temporary_list_with_score : list of (item, score)
        Candidate recommendation list whose genre distribution (``q``) is compared.
    user :
        Not used by the computation; kept in the signature for callers.
    genres_map : dict
        Item -> iterable of genres.
    alpha : float, default 0.001
        Smoothing weight: ``q`` is replaced by ``(1 - alpha) * q + alpha * p``.

    Returns
    -------
    float
        Sum over profile genres of ``p * log2(p / smoothed_q)``; genres where
        ``p`` or the smoothed ``q`` is zero contribute nothing.
    """
    rec_dist = get_user_recommendation_distribution(temporary_list_with_score, genres_map)

    divergence = 0.0
    for genre, p in profile_dist.items():
        smoothed_q = (1 - alpha) * rec_dist.get(genre, 0.0) + alpha * p
        # Only genres with non-zero mass on both sides add to the divergence.
        if p != 0.0 and smoothed_q != 0.0:
            divergence = divergence + (p * np.log2(p / smoothed_q))
    return divergence

In [None]:
def rerank_recommendation(profile_dist, list_recomended_items, user, N, tradeoff):
    """Greedily pick up to ``N`` items balancing predicted rating and calibration.

    At each step every not-yet-selected candidate is tentatively appended to the
    current selection and scored with
    ``(1 - tradeoff) * sum_of_ratings - tradeoff * calibration_divergence``;
    the candidate with the strictly highest score is kept (the earliest
    candidate wins ties).

    Note: the genre lookup is read from the notebook-level ``genres_map``
    variable, not from an argument.

    Parameters
    ----------
    profile_dist : dict
        Genre -> proportion for the user's profile.
    list_recomended_items : iterable of (item, rating)
        Scored candidates to choose from.
    user :
        Forwarded to ``calculate_calibration_sum``.
    N : int
        Maximum number of items to select.
    tradeoff : float
        Weight of the calibration term against the rating term.

    Returns
    -------
    tuple (list, list)
        The selected items, and the same selection as ``(item, rating)`` pairs.
    """
    selected_items = []
    selected_with_score = []

    for _ in range(N):
        best_objective, best_item, best_rating = -np.inf, None, None

        for candidate, rating in list_recomended_items:
            if candidate in selected_items:
                continue

            # Selection as it would look with this candidate appended.
            trial_with_score = selected_with_score + [(candidate, rating)]

            rating_total = sum(score for _item, score in trial_with_score)
            divergence = calculate_calibration_sum(
                profile_dist,
                trial_with_score,
                user,
                genres_map=genres_map,
            )
            objective = (1 - tradeoff) * rating_total - tradeoff * divergence

            if objective > best_objective:
                best_objective, best_item, best_rating = objective, candidate, rating

        if best_item is None:
            # Nothing could be picked; the selection would not change in any
            # later round either, so stop here.
            break

        selected_items.append(best_item)
        selected_with_score.append((best_item, best_rating))

    return selected_items, selected_with_score

In [None]:
def get_user_profile_distribution(df, user, genre_map):
    """Genre distribution of the movies a user interacted with.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with at least ``userId`` and ``movieId`` columns.
    user :
        Value matched against ``df['userId']``.
    genre_map : dict
        Movie id -> iterable of genres; every movie of the user must be a key.

    Returns
    -------
    dict
        Genre -> share of all (movie, genre) occurrences of the user, ordered
        by ascending count. Empty when the user has no rows in ``df``.
    """
    user_movies = df.loc[df['userId'] == user, 'movieId'].values

    genre_counts = {}
    for movie in user_movies:
        for genre in genre_map[movie]:
            genre_counts[genre] = genre_counts.get(genre, 0) + 1

    total = sum(genre_counts.values())
    ascending = sorted(genre_counts.items(), key=lambda pair: pair[1])
    return {genre: count / total for genre, count in ascending}

In [None]:
def get_user_recommendation_distribution(prediction_user_map, genres_map):
    """Genre distribution of a list of recommended items.

    Parameters
    ----------
    prediction_user_map : iterable of (item, score)
        Recommended items; the score is ignored.
    genres_map : dict
        Item -> iterable of genres; every recommended item must be a key.

    Returns
    -------
    dict
        Genre -> share of all (item, genre) occurrences in the list, ordered by
        ascending count. Empty when the list is empty.
    """
    genre_counts = {}
    for item, _score in prediction_user_map:
        for genre in genres_map[item]:
            genre_counts[genre] = genre_counts.get(genre, 0) + 1

    total = sum(genre_counts.values())
    ascending = sorted(genre_counts.items(), key=lambda pair: pair[1])
    return {genre: count / total for genre, count in ascending}

# Calibrating - User KNN

In [None]:
# Ratings are declared to Surprise as lying in the [0, 5] interval.
reader = Reader(rating_scale=(0, 5))

# Surprise reads the frame positionally as (user, item, rating), so the column
# order is made explicit here.
trainset = Dataset.load_from_df(train[["userId", "movieId", "rating"]], reader=reader).build_full_trainset()

# User-based KNN with cosine similarity over 10 neighbours.
# Surprise spells the neighbourhood size as lowercase `k` and selects the
# similarity through the "name" key of `sim_options`. Other spellings (such as
# `K` or "similarities") are silently ignored, leaving the defaults (k=40, MSD)
# in effect.
userknn = KNNBasic(k=10, sim_options={"name": "cosine", "user_based": True})
userknn.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f9309a41840>

In [None]:
tradeoff = 0.9  # weight of the calibration term against the rating term when re-ranking
N = 10          # number of items kept per user after re-ranking
N_EVAL_USERS = 4000  # only the first users (in sorted id order) of the test set are processed
N_CANDIDATES = 100   # top predictions handed to the re-ranker

prediction_user_map_after_calibration = {}

# The candidate catalogue is identical for every user, so it is built once.
# `unique()` already removes duplicates and returns a stable order; wrapping it
# in `set()` would make the order (and therefore the tie-breaking between
# equally scored movies) depend on string hashing and change between runs.
candidate_movies = all_interactions["movieId"].unique()

for user in tqdm(sorted(test['userId'].unique())[:N_EVAL_USERS]):

    user_profile_distribution = get_user_profile_distribution(train, user, genres_map)

    # Score every candidate for this user. `predict` takes raw ids and is what
    # `test` calls for each row, so no throwaway dataset has to be built.
    # The sort is stable: equal estimates keep the catalogue order.
    predictions = sorted(
        ((movie_id, userknn.predict(user, movie_id).est) for movie_id in candidate_movies),
        key=lambda x: x[1],
        reverse=True,
    )

    # rerank_recommendation returns (items, items_with_score); only the scored list is stored.
    _, reranked_with_score = rerank_recommendation(
        user_profile_distribution,
        predictions[:N_CANDIDATES],
        user,
        N,
        tradeoff
    )

    prediction_user_map_after_calibration[user] = reranked_with_score

100%|██████████| 4000/4000 [4:58:54<00:00,  4.48s/it]


# Evaluation

## MAP

In [None]:
# Auxiliary function, used in calculate_MAP and calculate_MRR

def item_is_relevant(user_id, item_id, df):
    """Tell whether ``df`` contains an interaction between a user and an item.

    Parameters
    ----------
    user_id :
        Value matched against ``df["userId"]``.
    item_id :
        Value looked up among that user's ``movieId`` entries.
    df : pandas.DataFrame
        Interactions with ``userId`` and ``movieId`` columns.

    Returns
    -------
    bool
        ``True`` when the user has a row for the item, ``False`` otherwise.
    """
    # The frame is filtered on every call, so the cost grows with len(df).
    user_rows = df[df["userId"] == user_id]
    return item_id in list(user_rows['movieId'])

In [None]:
def calculate_MAP(prediction_user_map, movies):
    """Mean Average Precision over the recommendation lists of all users.

    For each user the precision at every position holding a relevant item is
    summed and divided by the number of relevant items found in the list
    (users with none contribute 0). The result is the mean over all users.

    Parameters
    ----------
    prediction_user_map : dict
        User id -> ordered list of ``(item, score)`` recommendations.
    movies : pandas.DataFrame
        Interactions used by ``item_is_relevant`` to decide relevance.

    Returns
    -------
    float
        Sum of the per-user average precisions divided by the number of users.
    """
    ap_total = 0

    for user_id in tqdm(prediction_user_map, desc='Progress', position=0):
        hits = 0
        precision_sum = 0
        for rank, (item, _) in enumerate(prediction_user_map[user_id], start=1):
            if item_is_relevant(user_id, item, movies):
                hits += 1
                precision_sum += hits / rank  # precision at this position

        if hits != 0:
            ap_total += precision_sum / hits

    return ap_total / len(prediction_user_map)

## MRR

In [None]:
def calculate_MRR(map_recommendations, movies):
    """Mean Reciprocal Rank over the recommendation lists of all users.

    Each user contributes ``1 / position`` of the first relevant item in their
    list (1-based), or nothing when the list holds no relevant item.

    Parameters
    ----------
    map_recommendations : dict
        User id -> ordered list of ``(item, score)`` recommendations.
    movies : pandas.DataFrame
        Interactions used by ``item_is_relevant`` to decide relevance.

    Returns
    -------
    float
        Sum of the reciprocal ranks divided by the number of users.
    """
    reciprocal_rank_total = 0

    for user_id in tqdm(map_recommendations, desc='Progress', position=0):
        for rank, (item, _score) in enumerate(map_recommendations[user_id], start=1):
            if item_is_relevant(user_id, item, movies):
                reciprocal_rank_total += (1 / rank)
                break  # only the first relevant item counts

    return reciprocal_rank_total / len(map_recommendations)

## MRMC

In [None]:
# NOTE: this re-defines the function of the same name declared earlier in the
# notebook (same body, second parameter named `genre_map` here).
def get_user_recommendation_distribution(prediction_user_map, genre_map):
    """Genre distribution of a list of recommended items.

    Parameters
    ----------
    prediction_user_map : iterable of (item, score)
        Recommended items; the score is ignored.
    genre_map : dict
        Item -> iterable of genres; every recommended item must be a key.

    Returns
    -------
    dict
        Genre -> share of all (item, genre) occurrences in the list, ordered by
        ascending count. Empty when the list is empty.
    """
    occurrences = {}
    total = 0
    for item, _score in prediction_user_map:
        item_genres = genre_map[item]
        for genre in item_genres:
            occurrences.setdefault(genre, 0)
            occurrences[genre] += 1
            total += 1

    by_count = sorted(occurrences.items(), key=lambda entry: entry[1])
    return {genre: count / total for genre, count in by_count}

In [None]:
def user_rank_miscalibration(user_profile_dist, rec_profile_dist, alpha=0.001):
    """KL divergence of a recommendation genre distribution from a user profile.

    Parameters
    ----------
    user_profile_dist : dict
        Genre -> proportion in the user's profile (``p``).
    rec_profile_dist : dict
        Genre -> proportion in the recommendation list (``q``); genres missing
        here count as 0.
    alpha : float, default 0.001
        Smoothing weight: ``q`` is replaced by ``(1 - alpha) * q + alpha * p``.

    Returns
    -------
    number
        Sum over profile genres of ``p * log2(p / smoothed_q)``, skipping genres
        where ``p`` or the smoothed ``q`` is zero; ``0`` when nothing is added.
    """
    divergence = 0
    for genre, p in user_profile_dist.items():
        smoothed_q = (1 - alpha) * rec_profile_dist.get(genre, 0.0) + alpha * p
        if smoothed_q == 0 or p == 0:
            continue  # zero mass on either side adds nothing
        divergence += p * np.log2(p / smoothed_q)
    return divergence

In [None]:
def calculate_MRMC(predictions, df, genre_map):
    """Mean Rank MisCalibration over the recommendation lists of all users.

    For each user, the miscalibration of every prefix of the list (top-1 up to
    the full list) is divided by the miscalibration of an empty recommendation
    distribution, and those normalised values are averaged over the list
    length. The result is the sum of the per-user values divided by the total
    number of users in ``predictions``.

    Parameters
    ----------
    predictions : dict
        User id -> ordered list of ``(item, score)`` recommendations.
    df : pandas.DataFrame
        Interactions used to build each user's genre profile.
    genre_map : dict
        Item -> iterable of genres.

    Returns
    -------
    float
        The mean rank miscalibration; ``0.0`` when ``predictions`` is empty.
        Users with an empty profile or an empty list add nothing to the sum
        but are still counted in the denominator.
    """
    if not predictions:
        return 0.0

    rmc_total = 0

    # Iterating through tqdm advances the bar for skipped users as well.
    for user in tqdm(predictions, desc='Progress', position=0):
        user_profile_dist = get_user_profile_distribution(df, user, genre_map)
        list_length = len(predictions[user])
        if user_profile_dist == {} or list_length == 0:
            continue

        # Miscalibration of an empty recommendation distribution, used to normalise.
        void = user_rank_miscalibration(user_profile_dist, {})

        rmc = 0
        # Prefix lengths 1..list_length inclusive, so the full list is scored too
        # and the number of terms matches the divisor below.
        for cutoff in range(1, list_length + 1):
            user_rec_dist = get_user_recommendation_distribution(predictions[user][:cutoff], genre_map)
            kl = user_rank_miscalibration(user_profile_dist, user_rec_dist)
            rmc += kl / void

        rmc_total += rmc / list_length

    return rmc_total / len(predictions)

## Computing Metrics

In [None]:
# MAP of the calibrated lists; relevance is looked up in `all_interactions`.
calculate_MAP(prediction_user_map_after_calibration, all_interactions)

Progress: 100%|██████████| 4000/4000 [3:13:15<00:00,  2.90s/it]


0.2036748025321242

In [None]:
# MRR of the calibrated lists; relevance is looked up in `all_interactions`.
calculate_MRR(prediction_user_map_after_calibration, all_interactions)

Progress: 100%|██████████| 4000/4000 [2:07:52<00:00,  1.92s/it]


0.22628978174603212

In [None]:
# MRMC of the calibrated lists; user genre profiles are built from `all_interactions`.
calculate_MRMC(prediction_user_map_after_calibration, all_interactions, genres_map)

Progress: 100%|██████████| 4000/4000 [19:33<00:00,  3.41it/s]


0.09579093892999073

# Saving Recs

In [None]:
# Flatten {user: [(movieId, rating), ...]} into one row per recommendation.
recs = pd.DataFrame(
    [
        (user_id, movie_id, score)
        for user_id, user_recs in prediction_user_map_after_calibration.items()
        for movie_id, score in user_recs
    ],
    columns=['userId', 'movieId', 'rating'],
)

recs.head(5)

Unnamed: 0,userId,movieId,rating
0,1,27781,5.0
1,1,5952,5.0
2,1,112809,5.0
3,1,97390,5.0
4,1,45194,5.0


In [None]:
# Persist the recommendations. With its default arguments `to_csv` writes a
# header row and the DataFrame index as an unnamed first column.
recs.to_csv(f"{PATH}/refined/recs_knn_fairness.csv")