# **CrowdAI submission**

In [1]:
#Imports
import numpy as np
import pandas as pd
import random
import math
import plots
from matplotlib import pyplot as plt
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import BaselineOnly
from surprise import KNNBaseline
from surprise import SlopeOne
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from sklearn.model_selection import KFold as skFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from Vince_helpers import *

#Reproducibility: one shared seed for every global random number generator
my_seed = 200
np.random.seed(my_seed)  #NumPy's legacy global RNG
random.seed(my_seed)     #Python's built-in random module

In [2]:
#Read the training ratings from disk and pass them straight through the
#project's to_surprise helper, so `train` is only ever bound to the converted frame
train = to_surprise(pd.read_csv('data_train.csv'))

In [3]:
#Hand the training frame over to Surprise; ratings are declared on a 1-5 scale
reader = Reader(rating_scale=(1, 5))
#Every available rating is used for fitting (nothing is held out here)
train_surp = Dataset.load_from_df(train, reader).build_full_trainset()
#The same ratings again, in the form produced by build_testset()
train_surp_test = train_surp.build_testset()

Fit each model with preselected hyperparameters

In [4]:
#Reference statistics computed from the training frame by project helpers;
#they are reused below for the global / user-mean / movie-mean predictors.
#NOTE(review): judging by the helper names, `mean` is the overall rating mean
#and `users` / `movies` hold per-user and per-movie means -- confirm in Vince_helpers.
mean = global_mean(train)
users = user_mean(train)
movies = movie_mean(train)

In [None]:
#Hyperparameters selected in earlier runs, one dict per Surprise option group

#BaselineOnly: baselines estimated with SGD
bsl_options = {'method': 'sgd', 'reg': 10**-8}

#KNNBaseline, user-user variant
bsl_options_knnu = {'method': 'als', 'n_epochs': 50}
sim_options_knnu = {'name': 'pearson_baseline', 'user_based': True}

#KNNBaseline, movie-movie variant
bsl_options_knni = {'method': 'als', 'n_epochs': 50}
sim_options_knni = {'name': 'pearson_baseline', 'user_based': False}

In [None]:
#Fit every algorithm on the whole training set with the hyperparameters
#selected beforehand.
#The seeded models take their random_state from my_seed (defined in the
#imports cell) so the seed is written down in exactly one place.
algo_baseline = BaselineOnly(bsl_options=bsl_options).fit(train_surp)
#Biased SVD
algo_SVDb = SVD(n_factors=400, lr_all=0.0015, biased=True, reg_all=0.1, n_epochs=500, random_state=my_seed).fit(train_surp)
#Unbiased SVD with a single factor
algo_SVD = SVD(reg_all=0.01, biased=False, n_factors=1, lr_all=0.0015, n_epochs=500, random_state=my_seed).fit(train_surp)
#SVD++ with the library's default hyperparameters
algo_SVDpp = SVDpp(random_state=my_seed).fit(train_surp)
algo_slope_one = SlopeOne().fit(train_surp)
#Neighbourhood models: user-user, then movie-movie
algo_knn_user = KNNBaseline(k=250, sim_options=sim_options_knnu, bsl_options=bsl_options_knnu).fit(train_surp)
algo_knn_movie = KNNBaseline(k=250, sim_options=sim_options_knni, bsl_options=bsl_options_knni).fit(train_surp)

Moving on to test set predictions

Load user/movie pairs to predict

In [12]:
#The sample submission file supplies the user/movie pairs that must be predicted
test_original = pd.read_csv('examples_sample_submission.csv')

In [13]:
#Convert the pairs to predict into a Surprise testset.
#The pipeline works on a copy of test_original, runs it through to_surprise,
#loads it with the shared reader and round-trips it through a full trainset,
#so `test` is bound once, to the final testset only.
test = (
    Dataset.load_from_df(to_surprise(test_original.copy()), reader)
    .build_full_trainset()
    .build_testset()
)

In [14]:
#Score the same testset with each fitted model.
#The names on the left and the models on the right are listed in the same order.
(predictions_baseline,
 predictions_SVDb,
 predictions_SVD,
 predictions_SVDpp,
 predictions_slope_one,
 predictions_knn_user,
 predictions_knn_movie) = [
    model.test(test)
    for model in (algo_baseline,
                  algo_SVDb,
                  algo_SVD,
                  algo_SVDpp,
                  algo_slope_one,
                  algo_knn_user,
                  algo_knn_movie)
]

In [15]:
#Extract estimated ratings
def _field(predictions, attr):
    """Collect attribute `attr` from every prediction, keeping their order."""
    return [getattr(prediction, attr) for prediction in predictions]

#Ids and stored ratings are read once, from the baseline predictions
#(presumably every model returns its predictions in the testset's order -- confirm)
uids = _field(predictions_baseline, 'uid')
mids = _field(predictions_baseline, 'iid')
ruis = _field(predictions_baseline, 'r_ui')

#Estimated rating of each fitted model
est_baseline = _field(predictions_baseline, 'est')
est_SVDb = _field(predictions_SVDb, 'est')
est_SVD = _field(predictions_SVD, 'est')
est_SVDpp = _field(predictions_SVDpp, 'est')
est_slope_one = _field(predictions_slope_one, 'est')
est_knn_user = _field(predictions_knn_user, 'est')
est_knn_movie = _field(predictions_knn_movie, 'est')

#Mean-based predictors: the same global value for every pair, then the
#per-user and per-movie helpers (both also receive the global mean)
est_global = [mean] * len(ruis)
est_user_mean = [predict_user(uid, users, mean) for uid in uids]
est_movie_mean = [predict_movie(mid, movies, mean) for mid in mids]

In [16]:
#Build the matrix of rating predictions: one row per pair to predict and one
#column per predictor.
#Each list of estimates is first rebound to a NumPy array; the order used here
#is also the column order of X.
(est_global, est_user_mean, est_movie_mean, est_baseline,
 est_knn_movie, est_knn_user, est_slope_one,
 est_SVDb, est_SVD, est_SVDpp) = [
    np.array(estimates)
    for estimates in (est_global, est_user_mean, est_movie_mean, est_baseline,
                      est_knn_movie, est_knn_user, est_slope_one,
                      est_SVDb, est_SVD, est_SVDpp)
]

X = np.column_stack((
    est_global, est_user_mean, est_movie_mean, est_baseline,
    est_knn_movie, est_knn_user, est_slope_one,
    est_SVDb, est_SVD, est_SVDpp,
))

In [23]:
#Linear combination of predictions
#Weights previously computed: one weight per column of X, in column order
weights = np.array([0.12650389, -0.24258255, -0.11721048, -0.0233713, 
                    0.15383295,  0.16873745,  0.16240169,  1.03262748, 
                    -0.39005349,  0.12686713])
assert X.shape[1] == weights.shape[0], 'X must have exactly one column per blending weight'
preds = X.dot(weights)
#A NaN would survive clipping and cannot be cast to an integer, so fail loudly
assert not np.isnan(preds).any(), 'blended predictions contain NaN'
#Clip to the 1-5 rating scale and round to the nearest integer.
#np.around returns floats, so cast explicitly: the submission then holds
#3 rather than 3.0.
preds = np.clip(preds, 1, 5)
preds = np.around(preds).astype(int)

In [24]:
#Rebuild the submission ids in the 'r<user>_c<movie>' layout, one per predicted pair
ids = np.array([
    ''.join(('r', str(user), '_c', str(movie)))
    for user, movie in zip(uids, mids)
])

Create submission file

In [25]:
#Two-column submission table: the rebuilt pair id and its blended prediction
sub = pd.DataFrame({'Id':ids, 'Prediction':preds})

In [26]:
#Write the submission as CSV, leaving out the DataFrame index column.
#NOTE(review): the output name has no '.csv' extension -- confirm this is intended.
sub.to_csv('subVince', index=False)