In [1]:
#### Installing CaseRecommender ####
# I've changed the library to add
# a progress bar using tqdm

from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [2]:
%%capture
!pip install /content/drive/MyDrive/ic_recsys/CaseRecommender

# Initial Settings

In [3]:
# ML
from caserec.recommenders.item_recommendation.bprmf import BprMF
import numpy as np

# data processing
import pandas as pd

# System and Settings
import os
from tqdm import tqdm

In [4]:
# finding folder path
def find_folder(root, target):
    for root, dirs, _ in os.walk(root):
        if target in dirs:
            return os.path.join(root, target)
    return None

In [5]:
root = '/'
target = 'datasets_imdb'

PATH = find_folder(root, target)
print(f"path to dataset's folders --> {PATH}")

path to dataset's folders --> /content/drive/MyDrive/ic_recsys/imdb_recommender/datasets_imdb


# Model

In [6]:
model = BprMF(train_file = f"{PATH}/refined/train.csv",
        test_file = f"{PATH}/refined/test.csv",
        output_file=f"{PATH}/refined/recs_bprmf/bprmf_recs_100.csv",
        sep=",",
        epochs=30,
        random_seed=42,
        rank_length=4000)  # it will return 100 recs per user, them the algorithm will be applied

In [7]:
model.compute()

[Case Recommender: Item Recommendation > BPRMF]

train data:: 27221 users and 18617 items (3200042 interactions) | sparsity:: 99.37%
test data:: 27194 users and 14222 items (800011 interactions) | sparsity:: 99.79%



Training Progress: 100%|██████████| 30/30 [1:05:00<00:00, 130.01s/it]


training_time:: 3900.413327 sec


Prediction Progress: 100%|██████████| 27221/27221 [10:18<00:00, 44.03it/s]


prediction_time:: 618.345321 sec


Eval:: PREC@1: 0.297455 PREC@3: 0.2712 PREC@5: 0.254166 PREC@10: 0.226002 RECALL@1: 0.016878 RECALL@3: 0.044771 RECALL@5: 0.068993 RECALL@10: 0.119652 MAP@1: 0.297455 MAP@3: 0.385701 MAP@5: 0.394807 MAP@10: 0.377546 NDCG@1: 0.297455 NDCG@3: 0.463778 NDCG@5: 0.485236 NDCG@10: 0.489469 


# Evaluation

Ref:  [papper](https://medium.com/swlh/rank-aware-recsys-evaluation-metrics-5191bba16832)

## Functions

In [8]:
def item_is_relevant(user_id, item_id, df):
    aux = df[df["userId"] == user_id]
    if item_id in list(aux['movieId']):
        return True
    return False

### MAP

In [9]:
def calculate_MAP(prediction_user_map, movies):

  progress_bar = tqdm(total=len(prediction_user_map), desc='Progress', position=0)
  MAP = 0

  for user_id in prediction_user_map:
    map_user, user_relevants = 0, 0
    for index, (item, _) in enumerate(prediction_user_map[user_id]):
      relevance = item_is_relevant(user_id, item, movies)
      user_relevants += int(relevance)
      map_user += user_relevants / (index + 1) if relevance else 0

    MAP += map_user / user_relevants if user_relevants != 0 else 0

    progress_bar.update(1)
  progress_bar.close()

  return MAP / len(prediction_user_map)

### MRR

In [10]:
def calculate_MRR(map_recommendations, movies):

    progress_bar = tqdm(total=len(map_recommendations), desc='Progress', position=0)
    MRR = 0

    for user_id in map_recommendations:
        user_find_corerect_item = False
        for index, (item, score) in enumerate(map_recommendations[user_id]):
            if user_find_corerect_item is False:
                if item_is_relevant(user_id, item, movies):
                    MRR += (1/(index+1))
                    user_find_corerect_item = True
        progress_bar.update(1)

    progress_bar.close()

    return MRR/len(map_recommendations)

### MRMC

In [11]:
def get_user_recommendation_distribution(prediction_user_map, genre_map):
    user_rec_distribution = {}
    n = 0
    for (item, score) in prediction_user_map:
        for genre in genre_map[item]:
            if genre not in user_rec_distribution:
                user_rec_distribution[genre] = 0
            n += 1
            user_rec_distribution[genre] += 1

    user_rec_distribution = {k: v/n for k, v in sorted(user_rec_distribution.items(), key=lambda item: item[1])}
    return user_rec_distribution

In [12]:
def get_user_profile_distribution(df, user, genre_map):
    user_profile_distribution = {}
    n = 0
    for item in df[df['userId'] == user]['movieId'].values:

        for genre in genre_map[item]:
            if genre not in user_profile_distribution:
                user_profile_distribution[genre] = 0
            n += 1
            user_profile_distribution[genre] += 1

    user_profile_distribution = {k: v/n for k, v in sorted(user_profile_distribution.items(), key=lambda item: item[1])}
    return user_profile_distribution

In [13]:
def user_rank_miscalibration(user_profile_dist, rec_profile_dist, alpha=0.001):
    p_g_u = user_profile_dist
    q_g_u = rec_profile_dist

    Ckl = 0
    for genre, p in p_g_u.items():
        q = q_g_u.get(genre, 0.0)
        til_q = (1 - alpha) * q + alpha * p

        if til_q == 0 or p_g_u.get(genre, 0) == 0:
            Ckl = Ckl
        else:
            Ckl += p * np.log2(p / til_q)
    return Ckl

In [14]:
def calculate_MRMC(predictions, df, genre_map):

    progress_bar = tqdm(total=len(predictions), desc='Progress', position=0)
    MRMC = 0

    for user in predictions:
        RMC = 0
        user_profile_dist = get_user_profile_distribution(df, user, genre_map)
        if user_profile_dist == {}:
            continue

        void = user_rank_miscalibration(user_profile_dist, {})
        N = len(predictions[user])
        for i in range(1, N):
            user_rec_dist = get_user_recommendation_distribution(predictions[user][:i], genre_map)
            kl = user_rank_miscalibration(user_profile_dist, user_rec_dist)
            RMC += kl/void

        MRMC += RMC/N
        progress_bar.update(1)

    progress_bar.close()
    return MRMC/len(predictions)

#Calibration - Popularity

## Reading Files & Creating Objects

In [15]:
headers = ["userId", "movieId", "rating"]

In [16]:
recs_bprmf = pd.read_csv(f"{PATH}/refined/recs_bprmf/bprmf_recs_100.csv",
                         names=headers)

recs_bprmf.head(5)

Unnamed: 0,userId,movieId,rating
0,1,2640,9.54992
1,1,2529,9.256491
2,1,2571,9.202463
3,1,2858,9.029198
4,1,1270,8.984052


In [17]:
recs_map = recs_bprmf.groupby('userId').apply(lambda group: list(zip(group['movieId'], group['rating']))).to_dict()

In [18]:
train = pd.read_csv(f"{PATH}/refined/train.csv", names=["userId", "movieId", "rating"])
test = pd.read_csv(f"{PATH}/refined/test.csv", names=["userId", "movieId", "rating"])

all_interactions = pd.read_csv(f"{PATH}/refined/all_interactions.csv", names=["userId", "movieId", "rating"])
all_movies_genres = pd.read_csv(f"{PATH}/raw/movielens_20m_datasets/movie.csv")

In [19]:
genres_map = {}
for _, row in all_movies_genres.iterrows():
    movie_id = row['movieId']
    genres = row['genres'].split("|") if row["genres"] else "Não informado"

    if movie_id not in genres_map:
        genres_map[movie_id] = genres

In [20]:
interactions_per_movie = train.groupby('movieId').count()[['rating']].sort_values('rating', ascending=False)
interactions_per_movie.columns = ['interactions']

map_id = dict(list(enumerate(interactions_per_movie.index)))
map_id = {v:k+1 for k,v in map_id.items()}  # k+1 --> fix bug when calling calibration algorithm (log method)

## Calibrating

In [21]:
new_predictions_after = []
data_p2_after = []
prediction_user_map_after = {}

alpha = 0.1

for user in tqdm(test['userId'].unique()[0:4000]):
    data = {"movieId": list(set(all_interactions["movieId"].unique()))}
    user_testset_df = pd.DataFrame(data)
    user_testset_df["rating"] = 0.0
    user_testset_df["userId"] = user

    predictions = sorted(
        [(item, (1-alpha)*rating + (alpha/np.log(map_id.get(item, 1)+1)) )for item, rating in recs_map[user]],
        key=lambda x: x[1],reverse=True
    )

    new_predictions_after.append(predictions[:10])
    prediction_user_map_after[user] = predictions[:10]

100%|██████████| 4000/4000 [08:22<00:00,  7.97it/s]


# Computing Metrics

In [22]:
calculate_MAP(prediction_user_map_after, all_interactions)

Progress: 100%|██████████| 4000/4000 [02:35<00:00, 25.69it/s]


0.5582819938114136

In [23]:
calculate_MRR(prediction_user_map_after, all_interactions)

Progress: 100%|██████████| 4000/4000 [00:48<00:00, 82.39it/s]


0.6355369047619049

In [24]:
calculate_MRMC(prediction_user_map_after, all_interactions, genres_map)

Progress: 100%|██████████| 4000/4000 [00:19<00:00, 205.67it/s]


0.2913035207986491

# Saving Recs

In [25]:
recs = pd.DataFrame([(user, itemId, rating) for user, ratings in prediction_user_map_after.items() for itemId, rating in ratings],
                          columns=['userId', 'itemId', 'rating'])

recs.head(5)

Unnamed: 0,userId,itemId,rating
0,23841,76093,8.555879
1,23841,81847,8.064285
2,23841,7153,7.886363
3,23841,66934,7.865359
4,23841,2959,7.781638


In [26]:
recs.to_csv(f"{PATH}/refined/recs_bprmf/recs_bprmf_popularity.csv")