# Surprise for Collaborative Filtering
## 1. User-User CF (KNNWithMeans)
## 2. Item-Item CF (KNNWithMeans)
## 3-1. Matrix Factorization (SVD - HyperParameter Tuning with GridSearchCV)
## 3-2. Matrix Factorization (SVD - HyperParameter Tuning with RandomizedSearchCV)
### reference: https://kerpanic.wordpress.com/2018/03/26/a-gentle-guide-to-recommender-systems-with-surprise/

In [7]:
file_path = '/Users/morulabs/dev/source/git_hub/recsys/data/ml-latest-small/ratings.csv'

## 1. User-User CF (KNNWithMeans)

In [9]:
from surprise import KNNWithMeans, Dataset, accuracy
from surprise.model_selection import train_test_split
from surprise.reader import Reader

reader = Reader(line_format='user item,rating,timestamp', sep=',')
data = Dataset.load_from_file(file_path, reader=reader)

train_set, test_set = train_test_split(data, test_size=0.2)

algo_user = KNNWithMeans(k=4, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo_user.fit(train_set)


uid = str(196)
for movie_id in range(1, 11):
    iid = str(movie_id)
    pred = algo_user.predict(uid, iid, r_ui=4, verbose=True)

test_pred = algo_user.test(test_set)

# get RMSE
print('User-based Model: Test set')
accuracy.rmse(test_pred, verbose=True)

# evaluate on the trianset
print('User-based Model: Training Set')
train_pred = algo_user.test(train_set.build_testset())
accuracy.rmse(train_pred)


ValueError: line_format parameter is incorrect.

## 2. KNNWithMeans (Item-Item CF)

In [7]:
from surprise import KNNWithMeans, Dataset, accuracy
from surprise.model_selection import train_test_split
from surprise.reader import Reader

reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
train_set, test_set = train_test_split(data, test_size=0.2)

algo_item = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo_item.fit(train_set)

test_pred = algo_item.test(test_set)

print('Item-based Model: Test Set')
accuracy.rmse(test_pred, verbose=True)

print('Item-based Model: Training Set')
train_pred = algo_item.test(train_set.build_testset())
accuracy.rmse(train_pred)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Item-based Model: Test Set
RMSE: 0.9182
Item-based Model: Training Set
RMSE: 0.4380


0.4379996865385731

## 3-1. Matrix Factorization (SVD - HyperParameter Tuning with GridSearchCV)
### GridSearchCV.fit 실행시 싱글코어로 도는 현상. 시간이 오래 걸림 

In [8]:
from surprise import SVDpp, SVD, Dataset, accuracy
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from surprise.reader import Reader

reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
train_set, test_set = train_test_split(data, test_size=0.2)

param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 
              'lr_all': [0.001, 0.003], 'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)

algo_grid = gs.best_estimator['rmse']
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
cross_validate(algo_grid, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# insert parameters from the previous process
algo_svd = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)
algo_svd.fit(train_set)
test_pred = algo_svd.test(test_set)

print('SVD: Test Set')
accuracy.rmse(test_pred, verbose=True)



KeyboardInterrupt: 

## 3-2. Matrix Factorization (SVD - HyperParameter Tuning with RandomizedSearchCV)
### RandomizedSearchCV.fit 실행시 가동 가능한 모든 코어 가동. GridSearchCV 대비 빨리 돔

In [9]:

from surprise import SVDpp, SVD, Dataset, accuracy
from surprise.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from surprise.reader import Reader

reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
train_set, test_set = train_test_split(data, test_size=0.2)

param_dists = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 
              'lr_all': [0.001, 0.003], 'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15]}

rsc = RandomizedSearchCV(SVD, param_dists, n_iter = 10, measures=['rmse', 'mae'], 
                        cv=5, joblib_verbose = 1000, n_jobs = -1)
rsc.fit(data)

algo_grid = rsc.best_estimator['rmse']
print(rsc.best_score['rmse'])
print(rsc.best_params['rmse'])
cross_validate(algo_grid, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# insert parameters from the previous process
algo_svd = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)
algo_svd.fit(train_set)
test_pred = algo_svd.test(test_set)

print('SVD: Test Set')
accuracy.rmse(test_pred, verbose=True)




[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  

KeyboardInterrupt: 