# Import Packages

In [2]:
from surprise import NormalPredictor
from surprise import KNNWithMeans
from surprise import NMF,SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate
import pandas as pd
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader
import matplotlib.pyplot as plt

# Import and Inspect Data

In [3]:
# Load the filtered ratings. 'Unnamed: 0' is presumably an index column that
# was saved along with the CSV; it carries no rating information, so drop it.
r = pd.read_csv('ratings_filtered_data.csv').drop(columns='Unnamed: 0')

# Count the distinct users and books to get a feel for the rating matrix.
n_users = r['user_id'].nunique()
n_books = r['book_id'].nunique()
print('number of users', n_users)
print('number of books', n_books)


number of users 52541
number of books 152


As the number of users is much greater than the number of books (52,541 vs. 152), sparsity should not be a problem for modeling with this dataset.

# Selection of Datasize for Modeling

As the first step for modeling, let's investigate whether a subset of the available rating dataset is enough to perform all the modeling. This can be explored by increasing the number of observations considered in small increments, and then estimating how modeling accuracy changes as the size of the dataset grows. We will use a simple memory-based algorithm (available as `KNNWithMeans` in the scikit-surprise package) for this step.

In [None]:
# A reader is still needed, but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

rmse = []  # RMSE on the held-out split, one entry per iteration
size_considered = []  # number of observations used, one entry per iteration

for i in range(1, 25):
    # Use the first i * 10,000 ratings (10,000 ... 240,000).
    n_obs = i * 10000
    r_selected = r.iloc[:n_obs]
    size_considered.append(n_obs)

    # Prepare a dataset object for processing with the surprise package.
    # NOTE(review): load_from_df expects the columns in (user, item, rating)
    # order -- confirm that `r` is laid out that way.
    data_set = Dataset.load_from_df(r_selected, reader)

    # Split into train and test. The seed is fixed so that differences between
    # iterations come from the dataset size rather than from a different
    # random split, and so that the curve is reproducible on re-run.
    train_set, test_set = train_test_split(data_set, test_size=0.2, random_state=42)

    # User-based collaborative filtering.
    knn = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
    knn.fit(train_set)
    knn_pred = knn.test(test_set)  # predict ratings for the test set
    rmse.append(accuracy.rmse(knn_pred))  # prints the RMSE and returns it


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9093
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8992
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8962
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8769
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8836
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8872
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8916
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done co

In [None]:
# RMSE of each iteration against the number of observations it was trained on.
fig, ax = plt.subplots()
ax.scatter(size_considered, rmse, color='g')
ax.set_xlabel('Number of Observations Considered in the Iteration')
ax.set_ylabel('RMSE Score')
plt.xticks(rotation=45)
ax.set_title('Effect of Datasize on Modeling Accuracy (RMSE)')

## Observation

Since there is no significant improvement in RMSE as the dataset size increases, we can extrapolate that model performance will be similar when predicting on larger amounts of data, and consider only the first 100,000 observations for modeling. Let's redefine the dataset with 100,000 observations for all the next steps in modeling.

In [None]:
# Dataset for Rest of the Modeling Steps

In [None]:
# Surprise needs a Reader when loading from a DataFrame; only the rating
# scale has to be specified.
reader = Reader(rating_scale=(1, 5))

# Work with the first 100,000 observations from here on.
r_select = r[:100000]

# Wrap the selection in a surprise Dataset.
data_set = Dataset.load_from_df(r_select, reader)

# Hold out 20% of the ratings as a test set.
train_set, test_set = train_test_split(
    data_set,
    test_size=0.2,
)


# Let's Redo KNNWithMeans (this time with the 100,000-observation dataset and cross-validation)

In [None]:
# Same user-based KNNWithMeans configuration as in the dataset-size experiment.
knn = KNNWithMeans(
    k=50,
    sim_options=dict(name='pearson_baseline', user_based=True),
)

# 3-fold cross-validation; verbose=True prints the per-fold results.
knn_cv = cross_validate(
    knn,
    data_set,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

In [None]:
# Running summary of every model tried; one list entry per model in each
# column. It is turned into a DataFrame at the end of the notebook.
Model_Summary = {
    'Model_Name': ['KNNWithMeans'],
    'GridSearch (Y/N)': ['N'],
    'Paramters': ['k=50, name: pearson_baseline, user_based: True,min_support = 1}'],
    'RMSE': [knn_cv['test_rmse'].mean()],  # mean test RMSE over the CV folds
}

# How Does KNN compare to Baseline ML Model ?

In [None]:
# Baseline model from the surprise package, used as a reference point.
nd = NormalPredictor()

# 3-fold cross-validation; verbose=True prints the per-fold results.
nd_cv = cross_validate(
    nd,
    data_set,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)


In [None]:
# Add the baseline's row to the running summary (one value per column).
for column, entry in [
    ('Model_Name', 'Normal Predictor'),
    ('GridSearch (Y/N)', 'N'),
    ('Paramters', '-'),
    ('RMSE', nd_cv['test_rmse'].mean()),
]:
    Model_Summary[column].append(entry)

## Grid Search for KNN

In [None]:
# Candidate values for the neighbourhood size and the similarity settings.
k = [30, 40, 50]
sim_options = dict(
    name=["msd", "cosine"],
    min_support=[3, 4, 5],
    user_based=[False, True],
)

param_grid = dict(k=k, sim_options=sim_options)

# Exhaustive search over the grid, scored with 3-fold cross-validation.
gs_knn = GridSearchCV(
    KNNWithMeans,
    param_grid,
    measures=["rmse", "mae"],
    cv=3,
)
gs_knn.fit(data_set)

# Best RMSE, then the parameter combination that achieved it.
print(gs_knn.best_score["rmse"], gs_knn.best_params["rmse"], sep="\n")

In [None]:
# Add the grid-searched KNN's row to the running summary.
for column, entry in [
    ('Model_Name', 'KNNWithMeans'),
    ('GridSearch (Y/N)', 'Y'),
    ('Paramters', gs_knn.best_params["rmse"]),
    ('RMSE', gs_knn.best_score["rmse"]),
]:
    Model_Summary[column].append(entry)

# SVD for Matrix Factorization

In [None]:
%%time
# simple SVD model
svd = SVD(n_factors=20, n_epochs = 30, biased=False) # initiate a SVD algorithm object

# Run 3-fold cross-validation and print results.
svd_cv = cross_validate(svd, data_set, measures=['RMSE', 'MAE'], cv=3)

In [None]:
# Add the un-tuned SVD's row to the running summary.
for column, entry in [
    ('Model_Name', 'SVD'),
    ('GridSearch (Y/N)', 'N'),
    ('Paramters', 'n_factors=20, n_epochs = 30, biased=False'),
    ('RMSE', svd_cv['test_rmse'].mean()),
]:
    Model_Summary[column].append(entry)

## Grid Search for SVD

In [None]:
# Hyper-parameter search for biased SVD: only the number of factors and the
# number of epochs vary; learning rate and regularisation are held fixed.
param_grid = dict(
    n_factors=[10, 20, 30, 50],
    n_epochs=[50, 100, 200],
    lr_all=[0.005],
    reg_all=[0.05],
    biased=[True],
)
gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

# The search is fitted on the full Dataset object and scored by 3-fold CV.
gs_svd.fit(data_set)

# Best score, then the parameter combination that gave it
# (both printed as dicts keyed by measure).
print(gs_svd.best_score, gs_svd.best_params, sep='\n')

In [None]:
# Add the grid-searched SVD's row to the running summary.
# best_params is keyed by measure, so store only the 'rmse' entry -- the same
# way the KNN and NMF grid-search rows are stored -- instead of the whole
# {'rmse': {...}} wrapper dict.
Model_Summary['Model_Name'].append('SVD')
Model_Summary['GridSearch (Y/N)'].append('Y')
Model_Summary['Paramters'].append(gs_svd.best_params['rmse'])
Model_Summary['RMSE'].append(gs_svd.best_score['rmse'])

In [None]:
# Display the best score found by the SVD grid search; the result is keyed by
# measure (only 'rmse' was requested for this search).
gs_svd.best_score

# NMF

In [None]:
%%time
# simple SVD model
nmf = NMF(n_factors=20, n_epochs = 30, biased = True) # initiate a SVD algorithm object
# Run 3-fold cross-validation and print results.
nmf_cv = cross_validate(nmf, data_set, measures=['RMSE', 'MAE'], cv=3)

In [None]:
# Add the un-tuned NMF's row to the running summary.
# The model name is written without a leading space so that it matches the
# 'NMF' label used for the grid-searched row.
Model_Summary['Model_Name'].append('NMF')
Model_Summary['GridSearch (Y/N)'].append('N')
Model_Summary['Paramters'].append('n_factors=20, n_epochs = 30, biased = True')
Model_Summary['RMSE'].append(nmf_cv['test_rmse'].mean())

# Grid Search with NMF

In [None]:
# Hyper-parameter search for NMF (NMF is already imported in the first cell).
# Both the biased and the unbiased variant are part of the grid.
param_grid = {
    'n_factors': [10, 20, 30, 50],
    'n_epochs': [20, 30, 40, 50],
    'biased': [False, True],
}
gs_nmf = GridSearchCV(NMF, param_grid, measures=['rmse'], cv=3)

# GridSearchCV must be given a Dataset object (not a trainset); each
# combination is scored with 3-fold cross-validation.
gs_nmf.fit(data_set)

# best RMSE score
print(gs_nmf.best_score)
# combination of parameters that gave the best RMSE score
print(gs_nmf.best_params)

In [None]:
# Add the grid-searched NMF's row to the running summary.
for column, entry in [
    ('Model_Name', 'NMF'),
    ('GridSearch (Y/N)', 'Y'),
    ('Paramters', gs_nmf.best_params["rmse"]),
    ('RMSE', gs_nmf.best_score["rmse"]),
]:
    Model_Summary[column].append(entry)

In [None]:
# Display cell contents untruncated (the parameter column holds long
# strings and dicts). None is the supported way to disable truncation; the
# previously used value -1 has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# Show the modeling summary, best (lowest) RMSE first.
CF_Modeling_Summary = pd.DataFrame.from_dict(Model_Summary)
CF_Modeling_Summary.sort_values(by='RMSE')

# Let's apply the best performing model on unseen data and compare results

In [None]:
# Evaluate the best-performing model on ratings that were not part of the
# 100,000 observations used for model selection.
hold_data = r[100000:200000]

# A reader is needed, but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# Keep the hold-out data under its own name: rebinding `data_set` here would
# make the earlier cross-validation / grid-search cells silently run on the
# hold-out data if they were re-executed afterwards.
hold_set = Dataset.load_from_df(hold_data, reader)

# Split the hold-out data into train and test halves. The seed is fixed so
# that the reported RMSE and the exported predictions are reproducible.
train, test = train_test_split(hold_set, test_size=0.5, random_state=42)

# SVD with the tuned hyper-parameters.
# NOTE(review): these values are hard-coded, presumably copied from
# gs_svd.best_params['rmse'] of an earlier run -- confirm they still match.
svd = SVD(
    n_factors=30,
    n_epochs=50,
    lr_all=0.005,
    reg_all=0.05,
    biased=True,
    random_state=42,  # fixed seed so training is reproducible as well
)
svd.fit(train)
test_pred = svd.test(test)
accuracy.rmse(test_pred)

# Round up Predictions and compare  Histograms

In [None]:
# Actual ratings from the test set and the corresponding predicted ratings.
actual_rating = [rating for uid, iid, rating in test]
comparison = [pred.est for pred in test_pred]

dataset = pd.DataFrame({
    'actual_rating': actual_rating,
    'predicted_rating': comparison,
})

# Bin edges centred on the integer ratings 1..5. With integer edges
# [0, 1, ..., 5] the last bin [4, 5] is closed on both sides, so ratings of 4
# and 5 were counted together; half-integer edges give every rating its own
# bin and assign each predicted value to its nearest integer rating.
rating_bins = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]

ax = dataset[['actual_rating', 'predicted_rating']].plot(kind='hist', bins=rating_bins, alpha=0.5)
ax.set_xlabel('Rating')
ax.set_title('Comparison of Histograms: Actual Rating vs Predicted Rating')
plt.show()

In [None]:
# Peek at the first few raw prediction records; the full list is far too long
# to display in the notebook.
test_pred[:5]

# Convert the Prediction Results to a DataFrame 

In [None]:
# Flatten the prediction records into columns: who, which book, and the
# estimated rating.
prediction = {
    'user_id': [pred.uid for pred in test_pred],
    'book_id': [pred.iid for pred in test_pred],
    'Predicted Rating': [pred.est for pred in test_pred],
}
prediction_dataframe = pd.DataFrame.from_dict(prediction)
prediction_dataframe

In [None]:
# Write the predictions to disk as UTF-8 CSV.
# Note: the DataFrame index is written as well (pandas default).
prediction_dataframe.to_csv(
    'Rating_Prediction.csv',
    encoding='utf-8',
)