This notebook contains experiments with several similarity-based collaborative filtering approaches:
- similarity functions: cosine similarity and a similarity based on Pearson correlation;
- approaches: both item-item and user-user filtering are investigated.

For the evaluation, RMSE, MAPE, and Precision@K metrics are applied.

The results are shown at the end of the notebook.

The analysis and interpretation of the results are in the final report.

In [1]:
import sys
from pathlib import Path

# The project root is the parent of the notebook's working directory; it has to be on
# sys.path so that the `src` package imported further down can be resolved.
PROJECT_ROOT = Path.cwd().parent
# Guard the append so that re-running this cell does not pile up duplicate sys.path entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# Suppress every Python warning for the rest of the kernel session to keep cell outputs compact.
# NOTE(review): this blanket filter also hides deprecation and numerical warnings raised by
# pandas / scikit-learn — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

In [3]:
from src.data_reading import read_ratings_file
from src.evaluation import temporal_split, evaluate_rmse, evaluate_mape, evaluate_precision_at_k
from src.models.similarity_based_cf import predict_rating_cf_user_based, predict_rating_cf_item_based, recommend_k

# Data preparation

In [4]:
# Similarity-based CF relies only on the movie ratings file; movie metadata and user features
# are not needed.
# NOTE(review): judging by the shapes printed after the split, the table has 4 columns
# (presumably user_id, movie_id, rating and a timestamp — confirm in src.data_reading).

ratings = read_ratings_file() 

In [5]:
# Split into train and test sets by date: test_ratio=0.1 requests roughly 10% of the rows for
# the test set, and the timeframes printed by this cell show that the test set holds the most
# recent ratings.

train, test = temporal_split(ratings, test_ratio=0.1)

Train set size is: (900188, 4) 
Test set size is: (100021, 4)
Train set timeframes are: 2000-04-25 23:05:32 - 2000-12-29 23:42:47 
Test set timeframes are 2000-12-29 23:43:34 - 2003-02-28 17:49:50


In [6]:
# Build the user x movie rating matrix from the train set: rows are user_id, columns are
# movie_id, and each cell holds the rating (pivot_table averages repeated pairs; unrated
# pairs stay NaN).
train_prep = pd.pivot_table(train, values='rating', index='user_id', columns='movie_id')

# Zero-filled copy of the same matrix for computations that cannot work with NaN.
train_prep_ = train_prep.fillna(value=0)

In [7]:
# Empty table that collects the experiment results:
# one row per (similarity, approach) combination, one column per metric.
results = pd.DataFrame(
    index=[
        f'{similarity}_sim_{approach}_based'
        for similarity in ('cos', 'pearson')
        for approach in ('item', 'user')
    ],
    columns=['rmse', 'mape', 'precision@k'],
)

# Experiments with cosine similarity

In [8]:
# Cosine similarity on the zero-filled rating matrix.

# User-user: every row of the matrix is one user's rating vector.
user_sim = pd.DataFrame(data=cosine_similarity(train_prep_), index=train_prep.index, columns=train_prep.index)

# Item-item: transpose first so that every row is one movie's rating vector.
item_sim = pd.DataFrame(data=cosine_similarity(train_prep_.transpose()), index=train_prep.columns, columns=train_prep.columns)

In [9]:
# Similarity-based collaborative filtering cannot handle cold start, so keep only the test
# ratings whose user and movie both occur in the train set.

test_users = np.intersect1d(test['user_id'].unique(), train['user_id'].unique())
test_movies = np.intersect1d(test['movie_id'].unique(), train['movie_id'].unique())

test = test.loc[test['user_id'].isin(test_users) & test['movie_id'].isin(test_movies)]
print(f'New test set shape is: {test.shape}')

New test set shape is: (95723, 4)


In [10]:
# Evaluate RMSE for the item-item approach with cosine similarity.
# The item-based predictor is scored on the test set using the train user x movie matrix and
# the cosine item-item similarities; the value is stored in the results table.

results.loc['cos_sim_item_based', 'rmse'] = evaluate_rmse(
    test=test, 
    train_prep=train_prep, 
    sim_df=item_sim, 
    predict_fn=predict_rating_cf_item_based
)

In [11]:
# Evaluate MAPE for the item-item approach with cosine similarity.
# Same inputs as the RMSE evaluation of this model; only the metric function differs.

results.loc['cos_sim_item_based', 'mape'] = evaluate_mape(
    test=test, 
    train_prep=train_prep, 
    sim_df=item_sim, 
    predict_fn=predict_rating_cf_item_based
)

In [12]:
# Limit the number of users so that precision@k is calculated faster.
# A dedicated, seeded generator keeps the sample identical across "Restart & Run All" runs,
# so the precision@k values reported for the experiments are reproducible.
PRECISION_SAMPLE_SIZE = 100
PRECISION_SAMPLE_SEED = 42

unique_test_users = test.user_id.unique()
test_users = np.random.default_rng(PRECISION_SAMPLE_SEED).choice(
    unique_test_users,
    # Never request more users than exist: sampling without replacement would raise otherwise.
    size=min(PRECISION_SAMPLE_SIZE, len(unique_test_users)),
    replace=False,
)
test_ = test[test.user_id.isin(test_users)]

In [13]:
# Evaluate precision@k for the item-item approach with cosine similarity.
# Runs on the user subsample `test_` (not the full test set) to keep the runtime manageable.
# NOTE(review): k is presumably the cut-off of the recommendation list; the exact meaning of
# n is defined in src.evaluation / recommend_k — confirm there.

results.loc['cos_sim_item_based', 'precision@k'] = evaluate_precision_at_k(
    test=test_, 
    recommend_k_fn=recommend_k, 
    predict_fn=predict_rating_cf_item_based, 
    train_prep=train_prep, 
    sim_df=item_sim, 
    n=10, 
    k=10
)

In [14]:
# Evaluate RMSE for the user-user approach with cosine similarity.
# The user-based predictor is scored on the test set using the train user x movie matrix and
# the cosine user-user similarities; the value is stored in the results table.

results.loc['cos_sim_user_based', 'rmse'] = evaluate_rmse(
    test=test, 
    train_prep=train_prep, 
    sim_df=user_sim, 
    predict_fn=predict_rating_cf_user_based
)

In [15]:
# Evaluate MAPE for the user-user approach with cosine similarity.
# Same inputs as the RMSE evaluation of this model; only the metric function differs.

results.loc['cos_sim_user_based', 'mape'] = evaluate_mape(
    test=test, 
    train_prep=train_prep, 
    sim_df=user_sim, 
    predict_fn=predict_rating_cf_user_based
)

In [16]:
# Evaluate precision@k for the user-user approach with cosine similarity.
# Runs on the user subsample `test_` (not the full test set) to keep the runtime manageable.
# NOTE(review): k is presumably the cut-off of the recommendation list; the exact meaning of
# n is defined in src.evaluation / recommend_k — confirm there.

results.loc['cos_sim_user_based', 'precision@k'] = evaluate_precision_at_k(
    test=test_, 
    recommend_k_fn=recommend_k, 
    predict_fn=predict_rating_cf_user_based, 
    train_prep=train_prep, 
    sim_df=user_sim, 
    n=10, 
    k=10
)

# Experiments with Pearson similarity

In [17]:
# Pearson-style similarity: centre the ratings on the mean, fill the missing ratings with 0
# (i.e. treat them as equal to that mean) and take the cosine similarity of the result.
# Note that the similarity is therefore computed over all entries, not only co-rated ones.

# User-user: subtract each user's mean rating from that user's row.
user_sim_pearson = pd.DataFrame(
    data=cosine_similarity(
        train_prep.sub(train_prep.mean(axis='columns'), axis='index').fillna(0).to_numpy()
    ),
    index=train_prep.index,
    columns=train_prep.index,
)

# Item-item: subtract each movie's mean rating from that movie's column, then transpose so
# that every row passed to cosine_similarity is one movie.
item_sim_pearson = pd.DataFrame(
    data=cosine_similarity(
        (train_prep - train_prep.mean(axis='index')).fillna(0).T.to_numpy()
    ),
    index=train_prep.columns,
    columns=train_prep.columns,
)

In [18]:
# Evaluate RMSE for the item-item approach with Pearson similarity.
# The item-based predictor is scored on the test set using the train user x movie matrix and
# the Pearson item-item similarities; the value is stored in the results table.

results.loc['pearson_sim_item_based', 'rmse'] = evaluate_rmse(
    test=test, 
    train_prep=train_prep, 
    sim_df=item_sim_pearson, 
    predict_fn=predict_rating_cf_item_based
)

In [19]:
# Evaluate MAPE for the item-item approach with Pearson similarity.
# Same inputs as the RMSE evaluation of this model; only the metric function differs.

results.loc['pearson_sim_item_based', 'mape'] = evaluate_mape(
    test=test, 
    train_prep=train_prep, 
    sim_df=item_sim_pearson, 
    predict_fn=predict_rating_cf_item_based
)

In [20]:
# Evaluate precision@k for the item-item approach with Pearson similarity.
# Runs on the user subsample `test_` (not the full test set) to keep the runtime manageable.
# NOTE(review): k is presumably the cut-off of the recommendation list; the exact meaning of
# n is defined in src.evaluation / recommend_k — confirm there.

results.loc['pearson_sim_item_based', 'precision@k'] = evaluate_precision_at_k(
    test=test_, 
    recommend_k_fn=recommend_k, 
    predict_fn=predict_rating_cf_item_based, 
    train_prep=train_prep, 
    sim_df=item_sim_pearson, 
    n=10, 
    k=10
)

In [21]:
# Evaluate RMSE for the user-user approach with Pearson similarity.
# The user-based predictor is scored on the test set using the train user x movie matrix and
# the Pearson user-user similarities; the value is stored in the results table.

results.loc['pearson_sim_user_based', 'rmse'] = evaluate_rmse(
    test=test, 
    train_prep=train_prep, 
    sim_df=user_sim_pearson, 
    predict_fn=predict_rating_cf_user_based
)

In [22]:
# Evaluate MAPE for the user-user approach with Pearson similarity.
# Same inputs as the RMSE evaluation of this model; only the metric function differs.

results.loc['pearson_sim_user_based', 'mape'] = evaluate_mape(
    test=test, 
    train_prep=train_prep, 
    sim_df=user_sim_pearson, 
    predict_fn=predict_rating_cf_user_based
)

In [23]:
# Evaluate precision@k for the user-user approach with Pearson similarity.
# Runs on the user subsample `test_` (not the full test set) to keep the runtime manageable.
# NOTE(review): k is presumably the cut-off of the recommendation list; the exact meaning of
# n is defined in src.evaluation / recommend_k — confirm there.

results.loc['pearson_sim_user_based', 'precision@k'] = evaluate_precision_at_k(
    test=test_, 
    recommend_k_fn=recommend_k, 
    predict_fn=predict_rating_cf_user_based, 
    train_prep=train_prep, 
    sim_df=user_sim_pearson, 
    n=10, 
    k=10
)

# Summary

In [24]:
# The experiment results: one row per similarity / approach combination, one column per metric
# (RMSE, MAPE and precision@k as filled in by the evaluation cells).

results

Unnamed: 0,rmse,mape,precision@k
cos_sim_item_based,1.049117,35.172036,0.069
cos_sim_user_based,0.974856,30.233909,0.13
pearson_sim_item_based,2.817651,32.694159,0.081
pearson_sim_user_based,0.974641,29.686667,0.083
