# COEN 241 Yelp Recommender System

Using Scikit.Surprise Python Recommender System Library

In [1]:
import surprise
from surprise import Dataset, Reader, SVD, KNNWithMeans, KNNBaseline, accuracy, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBasic, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
import pandas as pd
import pprint as pp

In [2]:
# Import the dataset & prepare it

file_path = 'yelp_reviews.csv'
reader = Reader(line_format='user item rating', sep=',', skip_lines=1, rating_scale=(1, 5))
data = Dataset.load_from_file(file_path=file_path, reader=reader)

trainset, testset = train_test_split(data, test_size=.25)

In [3]:
algo = SVD()

algo.fit(trainset)
predictions = algo.test(testset)

In [4]:
accuracy.rmse(predictions)

RMSE: 1.2880


1.2880144812801735

In [4]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, n_jobs=6, measures=['rmse', 'mae'], cv=2)
gs.fit(data)

In [5]:
# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

print(gs.best_estimator['rmse'])
print(gs.best_estimator['mae'])

results_df = pd.DataFrame.from_dict(gs.cv_results)
print(results_df)

1.3230490586612695
1.0969245181091778
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x0000017F840FA7C0>
<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x0000017F840FAC10>
   split0_test_rmse  split1_test_rmse  mean_test_rmse  std_test_rmse  \
0          1.385253          1.386163        1.385708       0.000455   
1          1.388404          1.389312        1.388858       0.000454   
2          1.347388          1.348346        1.347867       0.000479   
3          1.352663          1.353605        1.353134       0.000471   
4          1.356911          1.357747        1.357329       0.000418   
5          1.361742          1.362538        1.362140       0.000398   
6          1.322608          1.323490        1.323049       0.000441   
7          1.329482          1.330438        1.329960       0.000478   

   rank_test_rmse  split0_test_mae  spl

In [14]:
results = cross_validate(algo=algo, data=data, measures=['RMSE', 'MAE'], n_jobs=6, cv=5, verbose=True)
print(results['test_rmse'].mean())

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2847  1.2847  1.2858  1.2855  1.2858  1.2853  0.0005  
MAE (testset)     1.0298  1.0297  1.0306  1.0305  1.0302  1.0302  0.0004  
Fit time          267.01  265.74  265.28  264.69  263.86  265.32  1.05    
Test time         12.45   11.88   11.92   11.96   7.87    11.22   1.69    
1.285315539707498


In [4]:
algo_progress_dir = '.\\algo_checkpoints\\'

In [5]:
# SlopeOne()  # Requires "MemoryError: Unable to allocate 167. GiB for an array with shape (149655, 149655) and data type float64", unable to run on my computer
# KNNBaseline()   # Requires "MemoryError: Unable to allocate 12.5 TiB for an array with shape (1312387, 1312387) and data type float64", unable to run on my computer
# KNNBasic()  # MemoryError: Unable to allocate 12.5 TiB for an array with shape (1311208, 1311208) and data type float64
# KNNWithMeans()  # MemoryError: Unable to allocate 12.5 TiB for an array with shape (1311850, 1311850) and data type float64
# KNNWithZScore() # MemoryError: Unable to allocate 12.5 TiB for an array with shape (1311609, 1311609) and data type float64

# Iterate over all algorithms
benchmark = []
algo_done = 6
for algo_index, algorithm in enumerate([SVD(), SVDpp(), NMF(), NormalPredictor(), BaselineOnly(), CoClustering()]):
    print(f"Algorithm {algo_index}:\t{algorithm.__class__.__name__}")
    if (algo_index < algo_done):
        print("\tAlready done, skipping")
        tmp = pd.read_csv(f"{algo_progress_dir}algo_{algo_index}_{algorithm.__class__.__name__}.csv")
        benchmark.append(tmp)
        continue
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, n_jobs=3, pre_dispatch='n_jobs', verbose=True)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    pd.DataFrame.to_csv(tmp, f"{algo_progress_dir}algo_{algo_index}_{str(algorithm.__class__.__name__)}.csv")
    benchmark.append(tmp)
    
    algo_done += 1
    print(f"algo_done: {algo_done}\n")
    
# aggregate_result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

Algorithm 0:	SVD
	Already done, skipping
Algorithm 1:	SVDpp
	Already done, skipping
Algorithm 2:	NMF
	Already done, skipping
Algorithm 3:	NormalPredictor
	Already done, skipping
Algorithm 4:	BaselineOnly
	Already done, skipping
Algorithm 5:	CoClustering
	Already done, skipping


In [25]:
cols = ['Algorithms', 'test_rmse', 'fit_time', 'test_time']
df = pd.DataFrame(columns=cols)
for row in benchmark:
    row = row.to_dict()['0']
    df = pd.concat([df, pd.DataFrame([[row[3], row[0], row[1], row[2]]], columns=cols)], ignore_index=True)
print(df)
pd.DataFrame.to_csv(df, f"{algo_progress_dir}benchmark.csv")

        Algorithms           test_rmse            fit_time           test_time
0              SVD  1.3025332424607239  141.97499787807465   22.80950176715851
1            SVDpp  1.3101656108009496   993.4315007925034   77.74049925804138
2              NMF  1.4911652176123789  218.62451767921448   20.68850016593933
3  NormalPredictor  1.9015793378418226   2.453497886657715  19.583502888679504
4     BaselineOnly  1.3044595390621168  3.8379987478256226   18.20549964904785
5     CoClustering   1.437949887236462  113.82850813865662  17.987001180648804


In [5]:
# param_grid = {'bsl_options':[{'method': 'als'}, {'method': 'sgd'}]}
param_grid = {
    'bsl_options': 
        {
            'method': ['als', 'sgd'],
            'reg': [0.02, 0.5, 1, 2], 
            'learning_rate': [0.001, 0.005, 0.01], 
            'n_epochs': [5, 10, 20]
        }
    }

gs = GridSearchCV(BaselineOnly, param_grid, n_jobs=4, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

In [6]:
# best RMSE score
print(gs.best_score['rmse'])
print(gs.best_score['mae'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
print(gs.best_params['mae'])

# print(gs.best_estimator['rmse'])
# print(gs.best_estimator['mae'])

# results_df = pd.DataFrame.from_dict(gs.cv_results)
# print(results_df)

1.2750307867761717
1.0085386976123376
{'bsl_options': {'method': 'sgd', 'reg': 0.02, 'learning_rate': 0.01, 'n_epochs': 20}}
{'bsl_options': {'method': 'sgd', 'reg': 0.02, 'learning_rate': 0.01, 'n_epochs': 20}}
