In [1]:
import sys
from pathlib import Path

# Make the project root importable so the `src.*` imports below resolve.
# Assumes the kernel's working directory sits one level below the project root.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:  # guard: re-running this cell must not append duplicates
    sys.path.append(str(PROJECT_ROOT))

In [2]:
import numpy as np
import pandas as pd

In [3]:
from src.data_reading import read_ratings_file
from src.evaluation import temporal_split, evaluate_rmse, evaluate_precision_at_k
from src.models.similarity_based_cf import predict_rating_cf_user_based, predict_rating_cf_item_based, recommend

# Data preparation

In [4]:
# Load the ratings with the project's reader, called with its default arguments.
ratings = read_ratings_file() 

In [5]:
# Chronological hold-out with test_ratio=0.1 (roughly a 90/10 split per the sizes printed below).
# The printed time frames show every test rating is later than the train set.
train, test = temporal_split(ratings, test_ratio=0.1)

Train set size is: (900188, 4) 
Test set size is: (100021, 4)
Train set timeframes are: 2000-04-25 23:05:32 - 2000-12-29 23:42:47 
Test set timeframes are 2000-12-29 23:43:34 - 2003-02-28 17:49:50


In [6]:
# User x movie rating matrix built from the training ratings.
# (user, movie) pairs without a rating stay NaN here; duplicate pairs, if any,
# are averaged because that is pivot_table's default aggregation.
train_prep = pd.pivot_table(
    train,
    values='rating',
    index='user_id',
    columns='movie_id',
)
# Zero-filled copy of the matrix, with missing ratings replaced by 0.
train_prep_ = train_prep.fillna(value=0)

# Experiments with cosine similarity

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# User-user cosine similarity on the zero-filled matrix, labelled with the
# pivot's row index on both axes.
user_sim = pd.DataFrame(
    data=cosine_similarity(train_prep_),
    index=train_prep.index,
    columns=train_prep.index,
)

# Item-item cosine similarity: the same computation on the transposed matrix,
# labelled with the pivot's column index on both axes.
item_sim = pd.DataFrame(
    data=cosine_similarity(train_prep_.transpose()),
    index=train_prep.columns,
    columns=train_prep.columns,
)

In [8]:
# Similarity-based collaborative filtering cannot handle cold start, so keep
# only the test rows whose user AND movie both appear in the train set.

test_users = np.intersect1d(test['user_id'].unique(), train['user_id'].unique())
test_movies = np.intersect1d(test['movie_id'].unique(), train['movie_id'].unique())

test = test.loc[test['user_id'].isin(test_users) & test['movie_id'].isin(test_movies)]
print(f'New test set shape is: {test.shape}')

New test set shape is: (95723, 4)


In [9]:
# RMSE of the item-based predictor on the filtered test set.
evaluate_rmse(
    test=test,
    train_prep=train_prep,
    sim_df=item_sim,
    predict_fn=predict_rating_cf_item_based,
)

np.float64(1.0491174596850277)

In [10]:
# RMSE of the user-based predictor on the filtered test set.
evaluate_rmse(
    test=test,
    train_prep=train_prep,
    sim_df=user_sim,
    predict_fn=predict_rating_cf_user_based,
)

np.float64(0.9748556632091601)

In [11]:
# Evaluate ranking metrics on a small random sample of test users
# (presumably to keep the precision@k runs short -- TODO confirm).
# The sample is drawn from a seeded generator so that the precision figures
# below are reproducible after a kernel restart.
SEED = 42
N_EVAL_USERS = 10

rng = np.random.default_rng(SEED)
candidate_users = test.user_id.unique()
test_users = rng.choice(
    candidate_users,
    size=min(N_EVAL_USERS, len(candidate_users)),  # never ask for more users than exist
    replace=False,
)
test_ = test[test.user_id.isin(test_users)]

In [12]:
# Precision@k of the item-based predictor on the sampled users' test rows.
evaluate_precision_at_k(
    test=test_,
    recommend_k_fn=recommend,
    predict_fn=predict_rating_cf_item_based,
    train_prep=train_prep,
    sim_df=item_sim,
    n=10,
    k=10,
)

  return np.dot(top_n_similarity.values, top_n_ratings.values) / np.sum(top_n_similarity.values)


0.21000000000000002

In [13]:
# Precision@k of the user-based predictor on the sampled users' test rows.
evaluate_precision_at_k(
    test=test_,
    recommend_k_fn=recommend,
    predict_fn=predict_rating_cf_user_based,
    train_prep=train_prep,
    sim_df=user_sim,
    n=10,
    k=10,
)

  return np.dot(neighbors.values, neighbor_ratings.values) / np.sum(neighbors.values)


0.25

# Experiments with Pearson similarity

Known issues with this approach:
- Cold start: users or movies that are missing from the train set cannot be scored.
- Calculations take too long when the number of items is very large.

In [None]:
def predict_rating_test(
    test: pd.DataFrame, 
    train_prep: pd.DataFrame, 
    sim_df: pd.DataFrame, 
    predict_fn      
) -> tuple[list, list]:
    """Predict a rating for every row of ``test`` and pair it with the true rating.

    Rows for which ``predict_fn`` yields a missing value (NaN or ``None``) are
    skipped, so the two returned lists are always the same length and aligned.

    Args:
        test: Frame with ``user_id``, ``movie_id`` and ``rating`` columns.
        train_prep: Passed through unchanged to ``predict_fn``.
        sim_df: Passed through unchanged to ``predict_fn``.
        predict_fn: Callable invoked as
            ``predict_fn(user_id=..., movie_id=..., train_prep=..., sim_df=...)``
            that returns a single predicted rating.

    Returns:
        ``(preds, actuals)``: the predictions that were not missing and the
        matching true ratings, in the row order of ``test``.
    """
    preds, actuals = [], []

    # Only the columns are read, so the index is left out of the row tuples.
    for row in test.itertuples(index=False):
        pred = predict_fn(
            user_id=row.user_id,
            movie_id=row.movie_id,
            train_prep=train_prep,
            sim_df=sim_df,
        )

        # pd.isna treats None like NaN; np.isnan would raise TypeError on None.
        if pd.isna(pred):
            continue

        preds.append(pred)
        actuals.append(row.rating)

    return preds, actuals

def evaluate_rmse_(
    preds: list[float], 
    actuals: list[int]
) -> float:
    """Root-mean-square error between predicted and actual ratings.

    Args:
        preds: Predicted ratings.
        actuals: True ratings, aligned element-by-element with ``preds``.

    Returns:
        ``sqrt(mean((preds - actuals) ** 2))``; NaN when both inputs are empty.

    Raises:
        ValueError: If ``preds`` and ``actuals`` differ in shape.
    """
    pred_arr = np.asarray(preds, dtype=float)
    actual_arr = np.asarray(actuals, dtype=float)

    # Without this check a length-1 input would silently broadcast against a
    # longer one and yield a meaningless score.
    if pred_arr.shape != actual_arr.shape:
        raise ValueError(
            f'preds and actuals must have the same shape, got {pred_arr.shape} and {actual_arr.shape}'
        )

    # No data to score: return NaN directly rather than letting np.mean warn
    # about an empty slice.
    if pred_arr.size == 0:
        return np.float64(np.nan)

    return np.sqrt(np.mean((pred_arr - actual_arr) ** 2))