In [1]:
#!conda install -c conda-forge scikit-surprise

#import package 
from surprise import SVD, NMF, Dataset, Reader, SVDpp, BaselineOnly, KNNBaseline, SlopeOne, accuracy
from surprise.model_selection import cross_validate, GridSearchCV,train_test_split, KFold, GridSearchCV
from sklearn.model_selection import KFold as skFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, Ridge
from project_helpers import *
from math import *
import pandas as pd 

#seed
random.seed(404)
np.random.seed(404)

In [2]:
#we import data_train and we convert it in a surprise format
train = pd.read_csv(r'data_train.csv')
train = df_to_surprise(train)

In [3]:
#Split data into models training_set and blender_set 80% for train 20% for blending 
#traing_set was used to find the best hyperparameters using GS 
#the blender_set is used as a validation set for each model we compute on the traing_set but it also 
#use to compute a ridge regression and find the weight we will apply on each model. 
traing_set = train.sample(frac = 0.8, random_state = 200)
blender_set = train.drop(traing_set.index)

In [4]:
#we compute the global mean but also the mean by user and by movie
mean = global_mean(traing_set)
users = user_mean(traing_set)
movies = movie_mean(traing_set)

In [5]:
#Change both dataset into the surprise format
#setup the rating scale between 1 and 5
reader = Reader(rating_scale = (1, 5))
#surprise configuration
traing_set_surp = Dataset.load_from_df(traing_set, reader)
#load the traing_set as a full surprise trainset
traing_set_surp_train = traing_set_surp.build_full_trainset()
#surprise configuration
blend_surp = Dataset.load_from_df(blender_set, reader)
#load the blend as a full surprise trainset
blend_surp_train = blend_surp.build_full_trainset()

#Load blend train set as a testset for models performance evaluation
blend_surp_test = blend_surp_train.build_testset()

In [6]:
#fit on train set with parameters we found using grid search then we compute the prediction on the blending set
bsl_options = {'method': 'sgd','reg': 10**-11}
bsl_options_knnu = {'method': 'als','n_epochs': 50,}
sim_options_knnu = {'name': 'pearson_baseline', 'user_based' : True}
bsl_options_knni = {'method': 'als','n_epochs': 50,}
sim_options_knni = {'name': 'pearson_baseline', 'user_based' : False}

algo_baseline = BaselineOnly(bsl_options = bsl_options).fit(traing_set_surp_train)
algo_slope_one = SlopeOne().fit(traing_set_surp_train)
#KNN
algo_knn_user = KNNBaseline(k = 400, sim_options = sim_options_knnu, bsl_options = bsl_options_knnu).fit(traing_set_surp_train)
algo_knn_movie = KNNBaseline(k = 200, sim_options = sim_options_knni, bsl_options = bsl_options_knni).fit(traing_set_surp_train)
#SVD
#algo_SVD = SVD(reg_all = 0.01, biased = False, n_factors = 1, lr_all = 0.0015, n_epochs = 500, random_state = 200).fit(traing_set_surp_train)
#algo_SVDpp = SVDpp(random_state = 200).fit(traing_set_surp_train)

Estimating biases using sgd...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [8]:
#SVDb
algo_SVDb = SVD(n_factors = 400, lr_all = 0.0015, biased = True, reg_all = 0.1, n_epochs = 400, random_state = 200).fit(traing_set_surp_train)

In [9]:
#SVDb pred
predictions_SVDb = algo_SVDb.test(blend_surp_test)

In [25]:
algo_NMFb = NMF(n_epochs = 400, biased = True).fit(traing_set_surp_train)

In [26]:
#NMFb pred
predictions_NMFb = algo_NMFb.test(blend_surp_test)

In [19]:
#we compute the prediction on the blending set
predictions_baseline = algo_baseline.test(blend_surp_test)
#predictions_SVDb = algo_SVDb.test(blend_surp_test)
#predictions_SVD = algo_SVD.test(blend_surp_test)
#predictions_SVDpp = algo_SVDpp.test(blend_surp_test)
predictions_slope_one = algo_slope_one.test(blend_surp_test)
predictions_knn_user = algo_knn_user.test(blend_surp_test)
predictions_knn_movie = algo_knn_movie.test(blend_surp_test)

In [27]:
#Recover ids and estimations for each algorithm
#extract user_id (uids) movie_id (mids) 
uids = [pred.uid for pred in predictions_baseline]
mids = [pred.iid for pred in predictions_baseline]
#extract the real grade
ruis = [pred.r_ui for pred in predictions_baseline]

est_baseline = [pred.est for pred in predictions_baseline]
est_SVDb = [pred.est for pred in predictions_SVDb]
#est_SVD = [pred.est for pred in predictions_SVD]
#est_SVDpp = [pred.est for pred in predictions_SVDpp]
est_slope_one = [pred.est for pred in predictions_slope_one]
est_knn_user = [pred.est for pred in predictions_knn_user]
est_knn_movie = [pred.est for pred in predictions_knn_movie]
est_global = [mean for i in range(0,len(ruis))]
est_user_mean = [predict_user(u, users, mean) for u in uids]
est_movie_mean = [predict_movie(m, movies, mean) for m in mids]
est_NMFb = [pred.est for pred in predictions_NMFb]

#compute rmse score for the mean methods
global_rmse = math.sqrt(sum([(x-y)**2 for (x,y) in zip(ruis, est_global)])/len(ruis))
user_rmse = math.sqrt(sum([(x-y)**2 for (x,y) in zip(ruis, est_user_mean)])/len(ruis))
movie_rmse = math.sqrt(sum([(x-y)**2 for (x,y) in zip(ruis, est_movie_mean)])/len(ruis))

#we compute a matrix with all the 10 methods we use
X = np.column_stack((est_global, est_user_mean, est_movie_mean, est_baseline, 
                     est_knn_movie, est_knn_user, est_slope_one,est_SVDb, est_NMFb))#, est_SVD, est_SVDpp))

y = np.array(ruis)
#we split the blending test in a train and test set 25% for the test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 200)

#Cross validation and grid search for the ridge regression we fit for our blending
cv_ridge = skFold(n_splits = 3, random_state = 200)
gs_ridge = RidgeCV(alphas = [10**-i for i in range(-5,10)], fit_intercept = False, scoring = "neg_mean_squared_error", cv = cv_ridge)

In [31]:
#We fit finds the best hyperparameter for the ridge regression then we obtain the weights we will use
gs_ridge.fit(X_train, y_train)
print('Best lambda: ', gs_ridge.alpha_)
#we compute the rmse we obtain on the blend test set using the blending method
preds_blend = gs_ridge.predict(X_test)
#compute rmse for the blending
blend_rmse = np.sqrt(np.mean((y_test-preds_blend)**2))
print('Model blending RMSE on validation set: ', blend_rmse) 

Best lambda:  10.0
Model blending RMSE on validation set:  0.9768643073647673


In [30]:
#we compute the RMSE for each method
print('RMSE on baseline: ', accuracy.rmse(predictions_baseline, verbose = False))
print('RMSE SVDb: ', accuracy.rmse(predictions_SVDb, verbose = False))
#print('RMSE SVD: ', accuracy.rmse(predictions_SVD, verbose = False))
#print('RMSE SVDpp: ', accuracy.rmse(predictions_SVDpp, verbose = False))
print('RMSE Slope one: ', accuracy.rmse(predictions_slope_one, verbose = False))
print('RMSE KNN user: ', accuracy.rmse(predictions_knn_user, verbose = False))
print('RMSE KNN movie: ', accuracy.rmse(predictions_knn_movie, verbose = False))
print('Global mean RMSE on validation set: ', global_rmse)
print('User mean RMSE on validation set: ', user_rmse)
print('Movie mean RMSE on validation set: ', movie_rmse)
print('RMSE on NMFb: ', accuracy.rmse(predictions_NMFb, verbose = False))

RMSE on baseline:  1.00329862204689
RMSE SVDb:  0.9850157000056244
RMSE Slope one:  0.999303955746321
RMSE KNN user:  0.9919923488704668
RMSE KNN movie:  0.9897453934926961
Global mean RMSE on validation set:  1.119984536592249
User mean RMSE on validation set:  1.095988679007401
Movie mean RMSE on validation set:  1.0300326000124342
RMSE on NMFb:  1.0149692594017745
