In [None]:
# `random` and `np` are needed by the seeding cell; they are imported explicitly here and
# placed ahead of the star imports so those can still rebind the names if they export them.
import random
import numpy as np
# Order below is intentional: later imports shadow earlier ones (e.g. train_test_split).
from surprise import SVD, NMF, Dataset, Reader, SVDpp, BaselineOnly, KNNBaseline, SlopeOne, accuracy
from surprise.model_selection import cross_validate, GridSearchCV,train_test_split, KFold, GridSearchCV
from sklearn.model_selection import KFold as skFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
import pandas as pd 
from project_helpers import *
from math import *

# Fix the seeds of the `random` and `np.random` generators (both to 404) so that
# stochastic steps further down give the same results from run to run.
random.seed(404)
np.random.seed(404)

In [None]:
# Read the raw ratings CSV and hand it straight to the project helper `df_to_surprise`.
# NOTE(review): presumably the helper reshapes the frame into the column layout that
# Surprise's Dataset.load_from_df expects (user, item, rating) — confirm in project_helpers.
train = df_to_surprise(pd.read_csv(r'data_train.csv'))


In [None]:
# Two-way split of the ratings: a reproducible 80% sample (random_state=200) is used to
# train the individual models; every row whose index label was not sampled is kept aside
# as the blender set.
traing_set = train.sample(frac=0.8, random_state=200)
blender_set = train.drop(index=traing_set.index)

In [None]:
# Rating statistics computed from the model-training split only (blender_set is not used here).
# NOTE(review): global_mean / user_mean / movie_mean are project helpers not visible in this
# notebook; presumably they return the overall mean rating and the per-user / per-movie
# means respectively — confirm their return types in project_helpers.
mean = global_mean(traing_set)
users = user_mean(traing_set)
movies = movie_mean(traing_set)

In [None]:
# Convert both splits into Surprise objects.
# The reader declares that ratings lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# traing_set_surp (Dataset) is what the grid searches cross-validate on;
# traing_set_surp_train (full Trainset built from it) is what the final models are fitted on.
traing_set_surp = Dataset.load_from_df(traing_set, reader)
traing_set_surp_train = traing_set_surp.build_full_trainset()
# Same two-step conversion for the held-out blender split.
blend_surp = Dataset.load_from_df(blender_set, reader)
blend_surp_train = blend_surp.build_full_trainset()

# Turn the blender trainset into a testset so fitted models can be evaluated
# (predictions vs. known ratings) on data they were not trained on.
blend_surp_test = blend_surp_train.build_testset()

In [None]:
# Baseline-only predictor: grid-search the SGD regularisation strength with 3-fold CV
# (scored by RMSE), then fit the best configuration on the full model-training trainset.
grid_baseline = {
    'bsl_options': {
        'method': ['sgd'],
        # powers of ten from 10**3 down to 10**-11
        'reg': [10 ** exponent for exponent in range(3, -12, -1)],
    },
    'verbose': [False],
}
gs_baseline = GridSearchCV(
    BaselineOnly,
    grid_baseline,
    measures=['rmse'],
    # shuffle=False: ratings are not shuffled before being split into folds
    # (per the Surprise KFold docs, random_state only matters when shuffle=True).
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_baseline.fit(traing_set_surp)
print('Best Hyperparameters: ', gs_baseline.best_params['rmse'])

# Take the estimator configured with the best RMSE parameters and fit it on all training ratings.
algo_baseline = gs_baseline.best_estimator['rmse']
algo_baseline.fit(traing_set_surp_train)

In [None]:
# SVD with bias terms (biased=True): grid-search regularisation and number of factors with
# 3-fold CV (scored by RMSE); epochs and learning rate are held fixed.
grid_SVDb = {
    # powers of ten from 10**3 down to 10**-11
    'reg_all': [10 ** exponent for exponent in range(3, -12, -1)],
    'biased': [True],
    'n_factors': [20, 50, 100, 200, 300, 400],
    'n_epochs': [500],
    'lr_all': [0.0015],
}
gs_SVDb = GridSearchCV(
    SVD,
    grid_SVDb,
    measures=['rmse'],
    # shuffle=False: ratings are not shuffled before being split into folds
    # (per the Surprise KFold docs, random_state only matters when shuffle=True).
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_SVDb.fit(traing_set_surp)
print('Best Hyperparameters: ', gs_SVDb.best_params['rmse'])

# Take the estimator configured with the best RMSE parameters and fit it on all training ratings.
algo_SVDb = gs_SVDb.best_estimator['rmse']
algo_SVDb.fit(traing_set_surp_train)

In [None]:
# SVD without bias terms (biased=False): same search protocol as the biased variant,
# but over a different set of factor counts.
grid_SVD = {
    # powers of ten from 10**3 down to 10**-11
    'reg_all': [10 ** exponent for exponent in range(3, -12, -1)],
    'biased': [False],
    'n_factors': [1, 5, 10, 100, 200],
    'n_epochs': [500],
    'lr_all': [0.0015],
}
gs_SVD = GridSearchCV(
    SVD,
    grid_SVD,
    measures=['rmse'],
    # shuffle=False: ratings are not shuffled before being split into folds
    # (per the Surprise KFold docs, random_state only matters when shuffle=True).
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_SVD.fit(traing_set_surp)
print('Best Hyperparameters: ', gs_SVD.best_params['rmse'])

# Take the estimator configured with the best RMSE parameters and fit it on all training ratings.
algo_SVD = gs_SVD.best_estimator['rmse']
algo_SVD.fit(traing_set_surp_train)

In [None]:
# SVD++: the parameter grid is left empty, so no hyperparameter values are listed for the
# search. NOTE(review): presumably this cross-validates SVDpp with its default settings only
# and best_params comes back empty — confirm that this is intended.
grid_SVDpp = dict()
gs_SVDpp = GridSearchCV(
    SVDpp,
    grid_SVDpp,
    measures=['rmse'],
    # shuffle=False: ratings are not shuffled before being split into folds
    # (per the Surprise KFold docs, random_state only matters when shuffle=True).
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_SVDpp.fit(traing_set_surp)
print('Best Hyperparameters: ', gs_SVDpp.best_params['rmse'])

# Take the estimator selected by RMSE and fit it on all training ratings.
algo_SVDpp = gs_SVDpp.best_estimator['rmse']
algo_SVDpp.fit(traing_set_surp_train)

In [None]:
# Slope One: no hyperparameters are searched in this cell, so the model is created with
# its defaults and fitted directly on the full model-training trainset.
algo_slope_one = SlopeOne()
algo_slope_one.fit(traing_set_surp_train)

In [None]:
# User-based KNN with baseline (user_based=True): grid-search the neighbourhood size k with
# 3-fold CV (scored by RMSE). Baselines use ALS with 50 epochs; similarity is pearson_baseline.
grid_knn_user = {
    'bsl_options': {
        'method': ['als'],
        'n_epochs': [50],
    },
    'k': [150, 200, 250, 300, 350, 400, 450, 500, 550],
    'sim_options': {
        'name': ['pearson_baseline'],
        'min_support': [1],
        'user_based': [True],
    },
}
gs_knn_user = GridSearchCV(
    KNNBaseline,
    grid_knn_user,
    measures=['rmse'],
    # shuffle=False: ratings are not shuffled before being split into folds
    # (per the Surprise KFold docs, random_state only matters when shuffle=True).
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_knn_user.fit(traing_set_surp)
print('Best Hyperparameters: ', gs_knn_user.best_params['rmse'])

# Take the estimator configured with the best RMSE parameters and fit it on all training ratings.
algo_knn_user = gs_knn_user.best_estimator['rmse']
algo_knn_user.fit(traing_set_surp_train)

In [None]:
# Item-based KNN with baseline (user_based=False, i.e. similarities between movies):
# grid-search the neighbourhood size k with 3-fold CV (scored by RMSE).
# Baselines use ALS with 50 epochs; similarity is pearson_baseline.
grid_knn_movie = {
    'bsl_options': {
        'method': ['als'],
        'n_epochs': [50],
    },
    'k': [150, 200, 250, 300, 350, 400, 450, 500, 550],
    'sim_options': {
        'name': ['pearson_baseline'],
        'min_support': [1],
        'user_based': [False],
    },
}
gs_knn_movie = GridSearchCV(
    KNNBaseline,
    grid_knn_movie,
    measures=['rmse'],
    # shuffle=False: ratings are not shuffled before being split into folds
    # (per the Surprise KFold docs, random_state only matters when shuffle=True).
    cv=KFold(n_splits=3, random_state=200, shuffle=False),
)
gs_knn_movie.fit(traing_set_surp)
print('Best Hyperparameters: ', gs_knn_movie.best_params['rmse'])

# Take the estimator configured with the best RMSE parameters and fit it on all training ratings.
algo_knn_movie = gs_knn_movie.best_estimator['rmse']
algo_knn_movie.fit(traing_set_surp_train)