# Modeling - Clean

In [None]:
import io
import surprise
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from surprise import Reader, Dataset
from surprise.prediction_algorithms import SVD, SVDpp, BaselineOnly, KNNWithZScore
from surprise.prediction_algorithms import NMF, SlopeOne, NormalPredictor
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

In [None]:
# Load the filtered/cleaned ratings file and discard its 'Unnamed: 0' column
# (presumably an index that was written out when the CSV was saved -- TODO confirm).
full = pd.read_csv('../Data/filtered-cleaned')
full = full.drop(columns='Unnamed: 0')

# Keep only the three columns that are handed to Dataset.load_from_df.
min_cols = full[['userId', 'movieId', 'rating']]

# Down-sample to keep run times manageable. The fixed seed makes every run of
# the notebook evaluate the models on the same rows, and the min() guards stop
# `sample` from raising ValueError when fewer rows than requested are available.
min_cols = min_cols.sample(n=min(500000, len(min_cols)), random_state=42)
smaller = min_cols.sample(n=min(50000, len(min_cols)), random_state=42)

In [None]:
# Reader built with Surprise's default settings.
# NOTE(review): the default rating scale is (1, 5); confirm that it matches the
# actual range of the 'rating' column, since estimates are clipped to the scale.
reader = Reader()

# Surprise datasets for the large sample and for the small sample.
data = Dataset.load_from_df(min_cols, reader)
datasmall = Dataset.load_from_df(smaller, reader)

# Full (unsplit) trainset of the small sample. It is not used in the visible
# cells; kept because other cells may rely on it.
kdata = datasmall.build_full_trainset()

# Hold out 10% of the large sample for the final evaluation. The split is
# seeded so that the reported hold-out metrics are reproducible between runs.
trainset, testset = train_test_split(data, test_size=0.10, random_state=42)

In [None]:
# Cross-validate BaselineOnly on the small sample, with fold printing disabled.
BaselineOnly_results = cross_validate(
    algo=BaselineOnly(),
    data=datasmall,
    verbose=False,
)

In [None]:
# Cross-validate NMF (default hyperparameters) on the small sample.
nmf_results = cross_validate(algo=NMF(), data=datasmall)

In [None]:
# Cross-validate NormalPredictor on the small sample.
NormalPredictor_results = cross_validate(
    algo=NormalPredictor(),
    data=datasmall,
)

In [None]:
# Cross-validate SlopeOne on the small sample.
SlopeOne_results = cross_validate(algo=SlopeOne(), data=datasmall)

In [None]:
# Cross-validate SVD++ (default hyperparameters) on the small sample.
SVDpp_results = cross_validate(algo=SVDpp(), data=datasmall)

In [None]:
# Cross-validate SVD (default hyperparameters) on the small sample.
SVD_results = cross_validate(algo=SVD(), data=datasmall)

In [None]:
# Gather the per-model cross-validation results in one fixed order; the label
# series and the plot legends list the models in this same order.
results_list = [
    SVD_results,
    SVDpp_results,
    SlopeOne_results,
    NormalPredictor_results,
    nmf_results,
    BaselineOnly_results,
]

In [None]:
def get_metrics(lst_dicts, key1, key2):
    """Build a two-column DataFrame from two entries of each dict.

    Args:
        lst_dicts: Iterable of mappings; each must contain ``key1`` and ``key2``.
        key1: Key whose values fill the first column (also used as its name).
        key2: Key whose values fill the second column (also used as its name).

    Returns:
        A ``pandas.DataFrame`` with one row per dict, in input order, and
        columns ``[key1, key2]``.

    Raises:
        KeyError: If any dict lacks ``key1`` or ``key2``.
    """
    rows = [[entry[key1], entry[key2]] for entry in lst_dicts]
    return pd.DataFrame(rows, columns=[key1, key2])

In [None]:
# One row per model: its 'test_mae' and 'test_rmse' cross-validation entries.
metrics_df = get_metrics(
    results_list,
    key1='test_mae',
    key2='test_rmse',
)

In [None]:
# Row labels for the metrics table, listed in the same order as `results_list`.
result_names = pd.Series(
    [
        'SVD_results',
        'SVDpp_results',
        'SlopeOne_results',
        'NormalPredictor_results',
        'nmf_results',
        'BaselineOnly_results',
    ]
)

In [None]:
# Attach the model labels as a 'models' column, matching rows by index.
sum_df = pd.merge(
    metrics_df,
    result_names.rename('models'),
    left_index=True,
    right_index=True,
)

In [None]:
# Bare expression: as the last line of a notebook cell it displays the table.
sum_df

In [None]:
# Line chart of the 'test_mae' values stored in rows 0-5 of `sum_df`,
# one line per model.
plt.figure(figsize=(12, 7))
for row_label in range(6):
    # Rows are plotted in index order so that they line up with the legend.
    plt.plot(sum_df.loc[row_label, 'test_mae'])
plt.title('Baseline CV MAE')
plt.xlabel('CV Folds')
plt.ylabel('M A E')
plt.legend(
    ['SVD', 'SVDpp', 'SlopeOne', 'NormalPredictor', 'NMF', 'BaselineOnly'],
    loc='best',
)
plt.show()

In [None]:
# Line chart of the 'test_rmse' values stored in rows 0-5 of `sum_df`,
# one line per model.
plt.figure(figsize=(12, 7))
for row_label in range(6):
    # Rows are plotted in index order so that they line up with the legend.
    plt.plot(sum_df.loc[row_label, 'test_rmse'])
plt.title('Baseline CV RMSE')
plt.xlabel('CV Folds')
plt.ylabel('R M S E')
plt.legend(
    ['SVD', 'SVDpp', 'SlopeOne', 'NormalPredictor', 'NMF', 'BaselineOnly'],
    loc='best',
)
plt.show()

In [None]:
# SVD with hand-set hyperparameters (presumably the outcome of a grid search
# that is not shown in these cells -- TODO confirm).
svd_best = SVD(
    n_factors=75,
    n_epochs=50,
    reg_all=0.2,
)
# Train on the training split; kept as the cell's last expression so that the
# notebook still shows the value returned by fit().
svd_best.fit(trainset)

In [None]:
# Mean absolute error of the fitted SVD on the held-out test split.
test_mae = surprise.accuracy.mae(
    predictions=svd_best.test(testset=testset),
)
# Bare expression so the notebook cell displays the value.
test_mae

In [None]:
# Predictions of the fitted SVD for every entry of the held-out test split.
predictions = svd_best.test(testset=testset)

In [None]:
# Element 2 of every testset entry (presumably the true rating -- verify
# against Surprise's testset layout).
test_list = [entry[2] for entry in testset]
# Element 3 of every prediction (presumably the estimated rating -- verify
# against Surprise's Prediction layout).
prediction_list = [pred[3] for pred in predictions]

# Pearson correlation between the two lists; the bare expression on the last
# line makes the notebook cell display the result.
correlation = pearsonr(prediction_list, test_list)
correlation