# Imports

In [91]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise import SVD, SVDpp, NormalPredictor
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler, StandardScaler, MinMaxScaler

# Data loading

In [55]:
# Read the four cleaned CSV exports in one pass; the order of the paths
# below matches the order of the names on the left-hand side.
df, df_book, df_rating, df_user = [
    pd.read_csv(csv_path, encoding='UTF-8')
    for csv_path in (
        'Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv',
        'Data/Cleaned_Data/book_cleaned.csv',
        'Data/Cleaned_Data/rating_cleaned.csv',
        'Data/Cleaned_Data/user_cleaned.csv',
    )
]

# 

# Data preprocessing

## Drop unnecessary features

In [56]:
# Remove the columns that are not used further down: 'index', 'Image_URL'
# and 'ISBN' from df, plus any column whose name starts with 'Unnamed'
# from all four frames.
#
# errors='ignore' makes this cell safe to re-run: df is overwritten in
# place, so on a second execution the named columns are already gone and
# a plain drop() would raise KeyError.
df = df.drop(columns=['index', 'Image_URL', 'ISBN'], errors='ignore')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df_book = df_book.loc[:, ~df_book.columns.str.contains('^Unnamed')]
df_rating = df_rating.loc[:, ~df_rating.columns.str.contains('^Unnamed')]
df_user = df_user.loc[:, ~df_user.columns.str.contains('^Unnamed')]

In [57]:
# Preview the first rows of df to check which columns remain after the drops.
df.head()

Unnamed: 0,User_ID,Book_Rating,Book_Title,Book_Author,Publication_year,Publisher,Unique_ISBN,Age,Country,Age_Range
0,276747,9,LITTLE ALTARS EVERYWHERE,REBECCA WELLS,2003.0,HARPERTORCH,60517794,25.0,USA,18-36
1,276747,9,WAITING TO EXHALE,TERRY MCMILLAN,1995.0,POCKET,671537458,25.0,USA,18-36
2,276747,8,BIRDSONG: A NOVEL OF LOVE AND WAR,SEBASTIAN FAULKS,1997.0,VINTAGE BOOKS USA,679776818,25.0,USA,18-36
3,276747,7,HOW TO DEAL WITH DIFFICULT PEOPLE,RICK BRINKMAN,1995.0,CAREERTRACK INC.,943066433,25.0,USA,18-36
4,276747,7,THE GOLDEN RULE OF SCHMOOZING,AYE JAYE,1998.0,LISTEN &AMP; LIVE AUDIO,1885408226,25.0,USA,18-36


# Feature Engineering (Encoding)

In [58]:
# Notebook-level encoder instances.
# labelenc is the encoder that label_encoding() fits and applies.
labelenc = LabelEncoder()
# onehot and ordinal are created here but not referenced anywhere in the
# visible part of this notebook.
onehot = OneHotEncoder()
ordinal = OrdinalEncoder()

In [61]:
def label_encoding(df, param):
    """Replace column ``param`` of ``df`` with integer label codes, in place.

    The column's values are run through the notebook-level ``labelenc``
    encoder, which is re-fitted on every call, and the resulting codes are
    written back to ``df[param]`` with the ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated directly; nothing is returned.
    param : str
        Name of the column to encode.

    Notes
    -----
    Because ``labelenc`` is re-fitted each time, after several calls it
    only holds the fit for the column encoded most recently.
    """
    codes = labelenc.fit_transform(df[param].values)
    df[param] = pd.Series(codes, index=df.index).astype('category')

In [64]:
# Integer-encode both identifier columns of df in place.
for id_column in ('User_ID', 'Unique_ISBN'):
    label_encoding(df, id_column)

In [87]:
# Keep only the user, item and rating columns; this is the frame that is
# handed to Dataset.load_from_df further down.
enc_data = df[['User_ID','Unique_ISBN','Book_Rating']]

# SVD

## Baseline model

In [89]:
# Baseline model: Surprise's SVD constructed with no arguments, i.e. every
# hyperparameter left at the library default.
svd = SVD()

In [90]:
# Declare the rating scale as 1 to 10 for the Surprise reader.
reader = Reader(rating_scale=(1, 10))
# Build the Surprise dataset from the user, item and rating columns,
# passed in that order.
custom_data = Dataset.load_from_df(enc_data[['User_ID', 'Unique_ISBN', 'Book_Rating']], reader)

In [92]:
# Seed Python's and NumPy's global RNGs before evaluating. Neither the SVD
# instance nor the integer cv argument carries its own random_state, so
# without this the fold shuffling and the factor initialisation change on
# every run and the RMSE/MAE figures cannot be reproduced.
cv_seed = 0
random.seed(cv_seed)
np.random.seed(cv_seed)

# 5-fold cross-validation of the baseline SVD, reporting RMSE and MAE per fold.
cross_validate(svd, custom_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5977  1.5878  1.5843  1.5926  1.5918  1.5909  0.0046  
MAE (testset)     1.2268  1.2209  1.2160  1.2256  1.2226  1.2224  0.0038  
Fit time          13.90   14.20   14.29   14.54   15.65   14.52   0.60    
Test time         0.51    0.57    0.36    0.37    0.56    0.47    0.09    


{'test_rmse': array([1.59773155, 1.58780048, 1.58427185, 1.59262894, 1.59181922]),
 'test_mae': array([1.2267656 , 1.22089419, 1.21601679, 1.22560505, 1.22264048]),
 'fit_time': (13.899455070495605,
  14.200200319290161,
  14.288547039031982,
  14.543802976608276,
  15.645019054412842),
 'test_time': (0.5059082508087158,
  0.5714590549468994,
  0.36257362365722656,
  0.36962008476257324,
  0.5601580142974854)}

## Hyperparameter tuning

In [None]:
# Search grid for SVD: 5 epoch counts x 5 learning rates x 3 regularisation
# strengths = 75 parameter combinations.
param_grid = {
    'n_epochs': [20, 40, 60, 80, 100],
    'lr_all': [0.000001, 0.00002, 0.0003, 0.004, 0.05],
    'reg_all': [0.2, 0.4, 0.6],
}

# Each combination is scored with 10-fold cross-validation, i.e. 750 model
# fits in total. n_jobs=-1 lets joblib spread those fits over all available
# CPU cores instead of running them one after another.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10, n_jobs=-1)
gs.fit(custom_data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# Full per-combination results as a table (one row per parameter combination).
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df

# SVD++

# TruncatedSVD

# KNN