In [135]:
import pandas as pd
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [171]:
df = pd.read_csv('final_whiskey_data.csv')
df.shape

(41018, 7)

In [172]:
df.head()

Unnamed: 0.1,Unnamed: 0,rating,url,whiskey,user_ID,price(euro),price(dollar)
0,0,88,https://www.whiskybase.com/whiskies/whisky/128...,1770-glasgow-single-malt,852,50.88,56.99
1,3,87,https://www.whiskybase.com/whiskies/whisky/128...,1770-glasgow-single-malt,459,50.88,56.99
2,4,86,https://www.whiskybase.com/whiskies/whisky/128...,1770-glasgow-single-malt,1205,50.88,56.99
3,5,85,https://www.whiskybase.com/whiskies/whisky/128...,1770-glasgow-single-malt,547,50.88,56.99
4,7,84,https://www.whiskybase.com/whiskies/whisky/128...,1770-glasgow-single-malt,562,50.88,56.99


In [173]:
rec_df = df.drop(columns = ['Unnamed: 0', 'url', 'price(euro)', 'price(dollar)'])
rec_df = rec_df[['user_ID', 'whiskey', 'rating']]

In [174]:
rec_df.head()

Unnamed: 0,user_ID,whiskey,rating
0,852,1770-glasgow-single-malt,88
1,459,1770-glasgow-single-malt,87
2,1205,1770-glasgow-single-malt,86
3,547,1770-glasgow-single-malt,85
4,562,1770-glasgow-single-malt,84


In [175]:
print(df.rating.mean())
print(df.rating.std())

92.90055585352772
3.2778365698077585


In [176]:
from surprise import Reader, Dataset
reader = Reader(rating_scale = (1,100))
data = Dataset.load_from_df(rec_df,reader)


In [177]:
from surprise.model_selection import train_test_split
from surprise import accuracy

trainset, testset = train_test_split(data, test_size=0.25)

In [178]:
svd = SVD()
svd.rating_scale = (1,100)
svd.fit(trainset)
preds = svd.test(testset)

In [179]:
accuracy.rmse(preds)

RMSE: 2.3120


2.3119532634626783

In [180]:
uid = str(459)
iid = '1770-glasgow-single-malt'

In [181]:
pred = svd.predict(uid, iid, verbose=True)

user: 459        item: 1770-glasgow-single-malt r_ui = None   est = 90.44   {'was_impossible': False}


In [38]:
dataset = data.build_full_trainset()
print('Number of users: ',dataset.n_users,'\n')
print('Number of items: ',dataset.n_items)

Number of users:  1223 

Number of items:  1092


In [40]:
## Perform a gridsearch with SVD
params = {'n_factors' :[20,50,100],
         'reg_all':[0.02,0.05,0.1]}
g_s_svd = GridSearchCV(SVD,param_grid=params,n_jobs=-1)
g_s_svd.fit(data)

In [41]:
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 87.96164513918447, 'mae': 87.90070237173458}
{'rmse': {'n_factors': 20, 'reg_all': 0.02}, 'mae': {'n_factors': 20, 'reg_all': 0.02}}


<h3>Cross validate with KNN Basic

In [42]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options={'name':'pearson','user_based':True})
cv_knn_basic= cross_validate(knn_basic,data,n_jobs=-1)

In [43]:
for i in cv_knn_basic.items():
    print(i)
print('-----------------------')
print(np.mean(cv_knn_basic['test_rmse']))

('test_rmse', array([87.93006729, 87.96218881, 88.01188641, 87.96345991, 87.94061698]))
('test_mae', array([87.86884447, 87.90102389, 87.95514383, 87.89845179, 87.88004389]))
('fit_time', (1.3787519931793213, 1.9605121612548828, 2.4493520259857178, 2.3677592277526855, 2.0128660202026367))
('test_time', (3.269965171813965, 3.477729082107544, 2.9920778274536133, 2.4421842098236084, 1.8250818252563477))
-----------------------
87.96164388046019


<h3>Cross validate with KNN Baseline

In [44]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options={'name':'pearson','user_based':True})
cv_knn_baseline = cross_validate(knn_baseline,data)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [45]:
for i in cv_knn_baseline.items():
    print(i)

np.mean(cv_knn_baseline['test_rmse'])

('test_rmse', array([88.00971479, 87.98190345, 87.95583155, 87.95244634, 87.90830824]))
('test_mae', array([87.95587518, 87.91162847, 87.89700146, 87.8932098 , 87.84578813]))
('fit_time', (1.4386487007141113, 1.3328099250793457, 1.3431859016418457, 1.3554019927978516, 1.3769490718841553))
('test_time', (2.1821348667144775, 2.051344871520996, 2.101573944091797, 2.245371103286743, 2.297333240509033))


87.96164087244901

<h3>Make Recommendations

In [51]:

svd = SVD(n_factors= 20, reg_all=0.02)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1166349b0>

In [54]:
svd.predict(4,5)

Prediction(uid=4, iid=5, r_ui=None, est=5, details={'was_impossible': False})