## Model Selection - SVD
### Add Cross validation + GridSearch CV + evaluation

In [3]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

import movieLens_util

In [27]:
df_data, df_movie_meta, df_catalog, df_user_meta = movieLens_util.load_movieLens_dataset('ml-100k')

In [28]:
## since Grid search in surprise need to hardcode rating columns as name "raw_ratings"

df_data_rename = df_data.rename(columns={'rating': 'raw_ratings'})

reader = Reader(rating_scale=(1,5))
algo = SVD()
data_input = Dataset.load_from_df(df_data_rename[['user_id','item_id','raw_ratings']], reader)


### Adding Grid Search

### Grid search cannot handle Trainset object

#### First fine-tunning training need around 1 hours

In [36]:
from surprise.model_selection import GridSearchCV


param_grid = {'n_epochs': [100], 'reg_all': [0.01, 0.03, 0.05, 0.1], 
              'lr_all': [0.001, 0.005, 0.01]}

grid_svd = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
grid_svd.fit(data_input)

In [None]:
### param_grid
1. regulation params: 
    reg_all: [0.01, 0.03, 0.05, 0.1]
    lr_all: [0.001, 0.005, 0.01]
2. iteration
    n_epochs: [100 , 150, 200]
3. top k optimal latent factors
    n_factors: [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]

In [38]:
grid_svd.best_score

{'rmse': 0.9093913347010686, 'mae': 0.718270024311075}

In [39]:
grid_svd.best_params

{'rmse': {'n_epochs': 100, 'reg_all': 0.1, 'lr_all': 0.005},
 'mae': {'n_epochs': 100, 'reg_all': 0.1, 'lr_all': 0.005}}

#### Second fine-tuning training need around 4 hours

Best Params: n_epochs: 100, reg_all: 0.1, lr_all: 0.005, n_factors: 130 (mae), 190(rsme)

In [44]:
param_grid2= {'n_epochs': [100 , 150, 200], 'reg_all': [0.1], 
              'lr_all': [0.005] , 
              'n_factors': [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200]}


grid_svd2 = GridSearchCV(SVD, param_grid2, measures=['rmse', 'mae'], cv=5)
grid_svd2.fit(data_input)

In [46]:
grid_svd2.best_score

{'rmse': 0.9083925353530186, 'mae': 0.7173197514854535}

## Grid Search Visualization