# Collaborative filtering

In [1]:
from typing import Optional, Tuple
%load_ext autoreload
%autoreload 2

In [3]:
import os
import surprise
from surprise import Dataset, Reader
import pandas as pd
from scipy.sparse import csr_matrix

from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

## Load MovieLens dataset

In [4]:
# Load the built-in MovieLens-1M dataset through Surprise without the
# interactive prompt; this Dataset object feeds the Surprise cross-validation
# further down.
movielens = Dataset.load_builtin('ml-1m', prompt=False)

# Read the same raw ratings file into pandas for the scikit-learn experiment.
# The multi-character "::" separator is why the python parsing engine is used.
ratings_file = f"{surprise.get_dataset_dir()}/ml-1m/ml-1m/ratings.dat"
ratings_df = pd.read_csv(
    ratings_file,
    names=["UserID", "MovieID", "Rating", "Timestamp"],
    sep="::",
    engine='python',
)

In [75]:
# Largest raw user and movie IDs found in the ratings file, one per line.
print(
    f'Users: {ratings_df["UserID"].max()}',
    f'Movies: {ratings_df["MovieID"].max()}',
    sep='\n',
)

Users: 6040
Movies: 3952


## Prepare train/test data

In [76]:
def convert_to_sparse(df: pd.DataFrame,
                      shape: Optional[Tuple[int, int]] = None) -> csr_matrix:
    """Turn a three-column (row index, column index, value) frame into a CSR matrix.

    Columns are read by position: the first holds row indices, the second
    column indices and the third the stored values. Each column is read on its
    own so that its dtype is preserved (a float value column no longer turns
    the index columns into floats).

    Args:
        df: Frame with exactly three columns of zero-based indices and values.
        shape: Optional ``(n_rows, n_cols)`` of the result. When omitted the
            shape is inferred from the largest indices present in ``df``, so
            two frames covering different index ranges yield matrices of
            different shapes; pass ``shape`` to keep them aligned.

    Returns:
        The sparse matrix. Repeated (row, column) pairs are summed by scipy.

    Raises:
        ValueError: If ``df`` does not have exactly three columns, or if it is
            empty and no ``shape`` is given (nothing to infer the shape from).
    """
    if df.shape[1] != 3:
        raise ValueError(
            f"expected 3 columns (row, column, value), got {df.shape[1]}")

    rows = df.iloc[:, 0].to_numpy()
    cols = df.iloc[:, 1].to_numpy()
    vals = df.iloc[:, 2].to_numpy()
    return csr_matrix((vals, (rows, cols)), shape=shape)

In [77]:
# Build a time-ordered, zero-based copy instead of editing `ratings_df` in
# place: re-running this cell can then never shift the IDs a second time
# (which would produce negative matrix indices).
ratings_sorted = ratings_df.sort_values(by='Timestamp').assign(
    UserID=lambda frame: frame['UserID'] - 1,
    MovieID=lambda frame: frame['MovieID'] - 1,
)

# Chronological hold-out: with shuffle=False the last 10% of the sorted rows
# (the most recent ratings) form the test set. Only the first three columns
# (UserID, MovieID, Rating) are kept.
train_df, test_df = train_test_split(
    ratings_sorted.iloc[:, :3], shuffle=False, test_size=0.1)

train_sparse = convert_to_sparse(train_df)
test_sparse = convert_to_sparse(test_df)

# Each matrix may have had its shape inferred from the largest index in its
# own split, so pin both to the full user x movie grid to keep them aligned.
n_users = int(ratings_sorted['UserID'].max()) + 1
n_movies = int(ratings_sorted['MovieID'].max()) + 1
train_sparse.resize((n_users, n_movies))
test_sparse.resize((n_users, n_movies))

print(train_sparse.shape)
print(test_sparse.shape)

# Dense copies: unrated (user, movie) pairs are stored as 0.
train = train_sparse.toarray()
test = test_sparse.toarray()

(6040, 3952)
(6040, 3952)


## Use Non-negative Matrix Factorization to predict users' ratings

In [99]:
def non_zero_rmse(pred, actual):
    """Root-mean-squared error over the non-zero entries of ``actual``.

    Entries where ``actual`` is zero are skipped, so only positions holding a
    value in ``actual`` contribute to the error.

    Args:
        pred: Array of predictions, indexable with the positions of ``actual``.
        actual: Array of ground-truth values; zeros mark entries to skip.

    Returns:
        The RMSE between ``pred`` and ``actual`` on the non-zero positions.

    Raises:
        ValueError: Raised by scikit-learn when ``actual`` has no non-zero
            entry (there is nothing to score).
    """
    observed = actual.nonzero()   # positions to keep; computed once
    y_true = actual[observed]     # indexing with the tuple already gives 1-D
    y_pred = pred[observed]
    # scikit-learn expects (y_true, y_pred). The root is taken here because the
    # `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
    return mean_squared_error(y_true, y_pred) ** 0.5

In [95]:
# NOTE(review): `train` is the dense matrix, so unrated (user, movie) pairs are
# zeros and are fitted like any other entry - confirm this is intended.
# NOTE(review): recent scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H`; it is kept as-is to match the version this notebook
# was run with.
# random_state pins the initialisation so a re-run yields the same factors.
nmf = NMF(n_components=15, alpha=0.001, l1_ratio=0.0, random_state=0)
res = nmf.fit_transform(train)   # per-user factors (one row per row of `train`)
preds = res @ nmf.components_    # reconstructed user x movie matrix



In [100]:
# Score the reconstruction on the held-out ratings only (zeros in `test` are skipped).
print(non_zero_rmse(pred=preds, actual=test))


3.189504614173558


## Test the ready-to-use NMF implementation from the Surprise recommender-system library

In [1]:
from surprise.prediction_algorithms.matrix_factorization import NMF
from surprise.model_selection import cross_validate

In [2]:
# Name the Surprise class explicitly: the bare name `NMF` is bound to
# scikit-learn's class by the imports at the top of the notebook and rebound to
# Surprise's by the import above, so what it means depends on cell run order.
alg = surprise.NMF()

In [5]:
# Cross-validate the Surprise model on MovieLens, reporting RMSE per fold.
# The returned dict of per-fold scores and timings is the cell's output.
cross_validate(
    algo=alg,
    data=movielens,
    measures=['rmse'],
    verbose=True,
)

Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9161  0.9171  0.9188  0.9148  0.9159  0.9165  0.0013  
Fit time          65.64   56.90   61.89   76.31   71.97   66.54   6.93    
Test time         1.85    2.73    2.67    2.40    2.39    2.41    0.31    


{'test_rmse': array([0.91611858, 0.91713986, 0.91878369, 0.91478603, 0.91588292]),
 'fit_time': (65.64167737960815,
  56.90305018424988,
  61.89145374298096,
  76.30525422096252,
  71.96712946891785),
 'test_time': (1.8510870933532715,
  2.7301337718963623,
  2.6708340644836426,
  2.3965368270874023,
  2.3918817043304443)}