In [None]:
import pandas as pd
import numpy as np
from surprise import SVD, NMF
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
from surprise.model_selection import cross_validate, GridSearchCV
import matplotlib.pyplot as plt

### Load Dataset (Preprocessed)

In [None]:
# Load the preprocessed ratings table from a local pickle file.
# compression='infer' lets pandas pick the decompression from the file name.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
cleaned_data = pd.read_pickle('cleaned_data.infer', compression = 'infer')

In [None]:
# Preview the first five rows of the loaded table.
cleaned_data.head(n=5)

Unnamed: 0,userID,beerName,review
224266,0110x011,15th Anniversary Wood Aged,3.5
662379,0110x011,Chez Monieux,4.5
57926,0110x011,Trade Winds Tripel,4.0
1325182,0110x011,Wachusett IPA (India Pale Ale),3.5
931834,0110x011,Bell's Hopslam Ale,4.5


### Prepare Dataset for Training

In [None]:
# Encode user and item identifiers as integer codes; the fitted encoders are
# kept so the codes can be mapped back to the original labels later.
user_encoder, beer_encoder = LabelEncoder(), LabelEncoder()

user_ids = user_encoder.fit_transform(cleaned_data['userID'])
beer_ids = beer_encoder.fit_transform(cleaned_data['beerName'])

# Raw review scores as an array (not yet normalized at this point).
ratings = cleaned_data['review'].values

In [None]:
# compute the number of users and items
# (ids come from LabelEncoder, so the count is the largest code plus one)
num_users = user_ids.max() + 1
num_beers = beer_ids.max() + 1

# ratings normalization
# Build a new array from the source column instead of dividing in place:
# `ratings /= 5` wrote through to the array backing `cleaned_data.review`,
# divided again every time the cell was re-run, and fails on read-only or
# integer arrays. This form is idempotent and leaves `cleaned_data` untouched.
ratings = cleaned_data.review.to_numpy() / 5

In [None]:
# Assemble the training frame; column order is user, item, rating.
data_df = pd.DataFrame(
    dict(
        userId=user_ids,
        beerId=beer_ids,
        review=ratings,
    )
)

In [None]:
# Preview the first seven encoded rows.
data_df.head(n=7)

Unnamed: 0,userId,beerId,review
1415391,10706,8096,0.7
1415392,10706,7059,0.6
1415393,10706,3207,0.2
1415394,10706,1346,0.6
1415395,10706,6168,0.9


In [None]:
# Ratings were divided by 5 in the normalization cell, so the rating scale is
# declared as 0..1 here (assumes the raw reviews lie within 0-5 - TODO confirm).
reader = Reader(rating_scale=(0,1))

In [None]:
# Wrap the frame as a Surprise Dataset. load_from_df reads the three columns
# positionally as (user id, item id, rating), which matches data_df's order.
data = Dataset.load_from_df(data_df, reader)

In [None]:
# Hold out 20% of the ratings for evaluation; random_state is fixed so the
# split is the same on every run.
trainset, testset = train_test_split(data, test_size=.2, random_state=0)

### Perform Training

In [None]:
# Train a matrix-factorization model on the training split.
# The factor initialization is random, so seed it (same seed as the
# train/test split) to make the learned factors and the reported metrics
# reproducible under Restart & Run All.
svd = SVD(random_state=0)
svd.fit(trainset)

### Prediction and Evaluation

In [None]:
# Score the held-out ratings with the trained model.
predictions = svd.test(testset)

# Each accuracy helper prints its formatted metric and returns the value.
# Wrapping the calls in print() showed every metric twice, so keep the
# returned numbers in variables instead.
mse = accuracy.mse(predictions)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

MSE: 0.0149
0.01488165462066341
RMSE: 0.1220
0.12199038741090795
MAE:  0.0918
0.09183066044750644


In [None]:
# Show only a small sample: displaying the whole list prints one entry per
# test rating and bloats the notebook.
predictions[:10]

[Prediction(uid=2230, iid=9743, r_ui=0.6, est=0.8889590630846814, details={'was_impossible': False}),
 Prediction(uid=6130, iid=1589, r_ui=0.8, est=0.8299958429724436, details={'was_impossible': False}),
 Prediction(uid=3174, iid=2357, r_ui=0.8, est=0.7732969360140785, details={'was_impossible': False}),
 Prediction(uid=4589, iid=4415, r_ui=0.4, est=0.6423384419122726, details={'was_impossible': False}),
 Prediction(uid=4126, iid=4540, r_ui=0.9, est=0.8512591159828296, details={'was_impossible': False}),
 Prediction(uid=5052, iid=9568, r_ui=0.9, est=0.7719668126216968, details={'was_impossible': False}),
 Prediction(uid=1357, iid=10951, r_ui=0.9, est=0.843290679933941, details={'was_impossible': False}),
 Prediction(uid=9325, iid=4947, r_ui=0.7, est=0.8970135922921257, details={'was_impossible': False}),
 Prediction(uid=5706, iid=12618, r_ui=0.7, est=0.7319506228444412, details={'was_impossible': False}),
 Prediction(uid=1387, iid=11393, r_ui=0.9, est=0.869810664078496, details={'was_i

In [None]:
# Shapes of the learned factor matrices:
#   svd.pu -> (n_user, n_factors)
#   svd.qi -> (n_items, n_factors)
for factor_matrix in (svd.pu, svd.qi):
    print(factor_matrix.shape)

(2102, 100)
(14228, 100)


In [None]:
def get_not_tried_surprise(ratings, total_beer, userId):
    """Return the beers in ``total_beer`` that ``userId`` has not rated.

    Parameters
    ----------
    ratings : DataFrame with 'userId' and 'beerId' columns.
    total_beer : iterable of candidate beer ids.
    userId : value matched against the 'userId' column.

    Returns
    -------
    list
        Candidates with no rating row for ``userId``, in the iteration
        order of ``total_beer``.
    """
    rows_of_user = ratings['userId'] == userId
    already_rated = set(ratings.loc[rows_of_user, 'beerId'].tolist())

    remaining = []
    for candidate in total_beer:
        if candidate in already_rated:
            continue
        remaining.append(candidate)
    return remaining

In [None]:
def recomm_beer_by_surprise(algo, userId, not_tried_beer, top_n=10):
    """Rank candidate beers for a user by predicted rating.

    Parameters
    ----------
    algo : object exposing ``predict(userId, beerId)`` whose result has
        ``est`` and ``iid`` attributes.
    userId : id passed through to ``algo.predict``.
    not_tried_beer : iterable of candidate beer ids.
    top_n : number of leading entries to keep (default 10).

    Returns
    -------
    list of int
        Ids of the highest-estimated candidates, best first; ties keep
        the order of ``not_tried_beer``.
    """
    scored = []
    for candidate in not_tried_beer:
        scored.append(algo.predict(userId, candidate))

    # sorted() is stable, so equal estimates stay in input order.
    ranked = sorted(scored, key=lambda p: p.est, reverse=True)
    return [int(p.iid) for p in ranked[:top_n]]

In [None]:
# Candidate pool: every distinct (label-encoded) beer id in the ratings frame.
total_beer = set(data_df['beerId'].to_list())
user_Id = 9  # label-encoded user id, not the original userID string
top = 5
not_tried_beer = get_not_tried_surprise(data_df, total_beer, user_Id)
top_beer_ids = recomm_beer_by_surprise(svd, user_Id, not_tried_beer, top_n=top)

# The ids are LabelEncoder codes and mean nothing to a reader, so map them
# back to beer names with the encoder that produced them.
top_beer_names = beer_encoder.inverse_transform(top_beer_ids)

print(f'##### Top-{top} beer list for user {user_Id} #####\n')
for top_beer, beer_name in zip(top_beer_ids, top_beer_names):
    print(f'{top_beer}\t{beer_name}')