In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import math
from urllib.request import urlopen

### Load Dataset (Preprocessed)

In [None]:
# Load the preprocessed review table from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
cleaned_data_path = "cleaned_data.infer"
cleaned_data = pd.read_pickle(cleaned_data_path, compression="infer")

### Prepare Dataset for Training

In [None]:
# Map raw user ids and beer names onto integer codes so they can be used as
# row/column labels of the ratings matrix built further down.
user_encoder, beer_encoder = LabelEncoder(), LabelEncoder()

user_ids = user_encoder.fit_transform(cleaned_data["userID"])
beer_ids = beer_encoder.fit_transform(cleaned_data["beerName"])

# Review scores, one per row of the cleaned table.
ratings = cleaned_data["review"].values

In [None]:
# compute the number of users and items
# (LabelEncoder codes start at 0, so the largest code + 1 is the count)
num_users = user_ids.max() + 1
num_beers = beer_ids.max() + 1

# ratings normalization
# Rebuild the scaled array from the source column rather than using `ratings /= 5`:
# the in-place form modifies the array returned by `.values` (which may share
# memory with `cleaned_data`), divides again each time this cell is re-run, and
# raises on read-only or integer arrays.
RATING_SCALE = 5  # presumably the maximum review score — TODO confirm
ratings = cleaned_data.review.values / RATING_SCALE

In [None]:
# Long-format table: one row per review, keyed by the encoded user and beer ids.
data_df = pd.DataFrame(
    {
        'userId': user_ids,
        'beerId': beer_ids,
        'review': ratings,
    }
)

In [None]:
# Show the last rows to sanity-check the encoded ids and the review column.
data_df.tail()

Unnamed: 0,userId,beerId,review
1415391,10706,8096,0.7
1415392,10706,7059,0.6
1415393,10706,3207,0.2
1415394,10706,1346,0.6
1415395,10706,6168,0.9


In [None]:
# Reshape to a user x beer matrix. Pairs without a review become NaN; repeated
# (user, beer) pairs are averaged, which is pivot_table's default aggregation.
ratings_matrix = data_df.pivot_table(index='userId', columns='beerId', values='review')

In [None]:
# Represent "no review" as 0 so the matrix contains no NaN for the similarity step.
ratings_matrix = ratings_matrix.fillna(value=0)

In [None]:
# Preview the first three rows (users) of the user x beer matrix.
ratings_matrix.head(3)

beerId,0,1,2,3,4,5,6,7,8,9,...,13549,13550,13551,13552,13553,13554,13555,13556,13557,13558
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Training: Compute Similarity between Beer Items

In [None]:
# Flip to beer x user so that each row holds one beer's ratings across all users.
ratings_matrix_T = ratings_matrix.T
ratings_matrix_T.head(3)

userId,0,1,2,3,4,5,6,7,8,9,...,10697,10698,10699,10700,10701,10702,10703,10704,10705,10706
beerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# (number of beers, number of users)
ratings_matrix_T.shape

(13559, 10707)

In [None]:
# Pairwise cosine similarity between the beer rows, giving a beer x beer matrix.
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

# Label both axes with the encoded beer ids so similarities can be looked up by id.
item_sim_df = pd.DataFrame(
    item_sim,
    index=ratings_matrix.columns,
    columns=ratings_matrix.columns,
)
item_sim_df.head(3)

beerId,0,1,2,3,4,5,6,7,8,9,...,13549,13550,13551,13552,13553,13554,13555,13556,13557,13558
beerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.050453,0.089484,0.046096,0.071882,0.0,...,0.038182,0.0,0.0,0.053007,0.0,0.0,0.078383,0.0,0.07066,0.0
1,0.0,1.0,0.050101,0.03571,0.045957,0.04141,0.088981,0.06726,0.059702,0.0,...,0.026861,0.071688,0.069778,0.108831,0.044738,0.0,0.058253,0.0,0.146185,0.0
2,0.0,0.050101,1.0,0.0,0.062578,0.00635,0.070751,0.068813,0.085006,0.0,...,0.135358,0.054507,0.029523,0.096871,0.0,0.0,0.168149,0.122656,0.066762,0.0


In [None]:
# Row labels of the transposed matrix, i.e. the encoded beer ids.
ratings_matrix_T.index

Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            13549, 13550, 13551, 13552, 13553, 13554, 13555, 13556, 13557,
            13558],
           dtype='int64', name='beerId', length=13559)

In [None]:
# Six largest similarity scores in the column for beer id 5
# (the beer's similarity to itself is part of that column).
item_sim_df[5].sort_values(ascending=False).head(6)

beerId
5        1.000000
5267     0.261209
6511     0.251257
9469     0.250383
994      0.231531
11119    0.225942
Name: 5, dtype: float64

### Prediction and Evaluation: Memory-Based Filtering (Item-Based)

In [None]:
def predict_rating(ratings_arr, item_sim_arr):
    """Predict user/item scores as a similarity-weighted average of ratings.

    For user ``u`` and item ``j`` the prediction is::

        sum_i ratings_arr[u, i] * item_sim_arr[i, j] / sum_i |item_sim_arr[i, j]|

    Args:
        ratings_arr: NumPy array of ratings with items along the last axis
            (users x items in this notebook).
        item_sim_arr: Square NumPy array of item-to-item similarities.

    Returns:
        NumPy array of predicted scores, one column per item. Items whose
        similarity column is entirely zero get a prediction of 0 rather
        than NaN.

    Note:
        Every entry of ``ratings_arr`` takes part in the weighted sum, so
        zeros contribute to the average and pull predictions toward 0.
    """
    weighted_sum = ratings_arr.dot(item_sim_arr)

    # Column j of `weighted_sum` is built from item_sim_arr[:, j], so it has to
    # be normalised by that column's total (axis=0). Summing along axis=1 only
    # gives the same result when the similarity matrix is symmetric.
    sim_totals = np.array([np.abs(item_sim_arr).sum(axis=0)])

    # A zero total means the whole column is zero, hence the numerator is zero
    # as well; dividing by 1 yields 0 instead of 0/0 = NaN.
    safe_totals = np.where(sim_totals == 0, 1, sim_totals)

    ratings_pred = weighted_sum / safe_totals
    return ratings_pred

In [None]:
# Score every (user, beer) pair, then wrap the raw array in a DataFrame that
# carries the same row/column labels as the input ratings matrix.
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(
    ratings_pred,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
print(ratings_pred_matrix.shape)
ratings_pred_matrix.head(5)

(10707, 13559)


beerId,0,1,2,3,4,5,6,7,8,9,...,13549,13550,13551,13552,13553,13554,13555,13556,13557,13558
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.012801,0.021844,0.015875,0.01657,0.012754,0.016169,0.019685,0.0165,0.018419,0.0113,...,0.019773,0.018761,0.019717,0.023987,0.024268,0.014839,0.014774,0.017257,0.019189,0.029777
1,0.002228,0.001932,0.001772,0.002511,0.002077,0.002159,0.00225,0.001699,0.003067,0.004528,...,0.001689,0.001618,0.002013,0.002193,0.001919,0.001623,0.002231,0.001812,0.002228,0.001999
2,0.012824,0.015852,0.011756,0.01777,0.012694,0.017467,0.016493,0.01305,0.017682,0.012955,...,0.013684,0.014911,0.016498,0.018456,0.020088,0.014031,0.013206,0.011449,0.016502,0.020481
3,0.005779,0.005539,0.005094,0.006417,0.005165,0.004993,0.0054,0.004617,0.007585,0.00268,...,0.00422,0.004374,0.004481,0.00535,0.003375,0.004298,0.006301,0.003823,0.005814,0.005926
4,0.002561,0.002469,0.002447,0.002034,0.001927,0.002686,0.002802,0.002368,0.002391,0.001361,...,0.002904,0.002717,0.002866,0.00267,0.002552,0.00136,0.002206,0.002698,0.00253,0.003539


In [None]:
def get_mse(pred, actual):
  """Mean squared error over the positions where `actual` is nonzero.

  Entries equal to 0 in `actual` are excluded from the comparison (the
  notebook fills user/beer pairs without a review with 0).

  Args:
    pred: NumPy array of predicted ratings, same shape as `actual`.
    actual: NumPy array of observed ratings.

  Returns:
    The mean squared error between the selected entries.
  """
  # Keep only the nonzero entries of `actual`; compute the index tuple once
  # and reuse it for both arrays.
  rated = actual.nonzero()
  # scikit-learn's argument order is (y_true, y_pred).
  return mean_squared_error(actual[rated], pred[rated])

In [None]:
def get_rmse(pred, actual):
  """Root mean squared error over the positions where `actual` is nonzero.

  Entries equal to 0 in `actual` are excluded from the comparison (the
  notebook fills user/beer pairs without a review with 0).

  Args:
    pred: NumPy array of predicted ratings, same shape as `actual`.
    actual: NumPy array of observed ratings.

  Returns:
    The square root of the mean squared error between the selected entries.
  """
  # Keep only the nonzero entries of `actual`; compute the index tuple once
  # and reuse it for both arrays.
  rated = actual.nonzero()
  # scikit-learn's argument order is (y_true, y_pred).
  return math.sqrt(mean_squared_error(actual[rated], pred[rated]))

In [None]:
def get_mae(pred, actual):
  """Mean absolute error over the positions where `actual` is nonzero.

  Entries equal to 0 in `actual` are excluded from the comparison (the
  notebook fills user/beer pairs without a review with 0).

  Args:
    pred: NumPy array of predicted ratings, same shape as `actual`.
    actual: NumPy array of observed ratings.

  Returns:
    The mean absolute error between the selected entries.
  """
  # Keep only the nonzero entries of `actual`; compute the index tuple once
  # and reuse it for both arrays.
  rated = actual.nonzero()
  # scikit-learn's argument order is (y_true, y_pred).
  return mean_absolute_error(actual[rated], pred[rated])

In [None]:
# Compare the predictions with the observed ratings using each error metric in turn.
actual_ratings = ratings_matrix.values
for metric_label, metric_fn in (('RMSE: ', get_rmse), ('MSE: ', get_mse), ('MAE: ', get_mae)):
    print(metric_label, metric_fn(ratings_pred, actual_ratings))

[0.02554794 0.02383771 0.02207227 ... 0.01731815 0.01569091 0.01545595]
[0.7 0.7 0.9 ... 1.  0.7 1. ]
RMSE:  0.7015482832344644
[0.02554794 0.02383771 0.02207227 ... 0.01731815 0.01569091 0.01545595]
[0.7 0.7 0.9 ... 1.  0.7 1. ]
MSE:  0.4921699937092243
[0.02554794 0.02383771 0.02207227 ... 0.01731815 0.01569091 0.01545595]
[0.7 0.7 0.9 ... 1.  0.7 1. ]
MAE:  0.6825286520504341
