In [1]:
import gzip
import json
import random
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Activation, Add, Dropout, Multiply
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt


### Load Dataset

In [2]:
# definition to parse a path consisting .gz file
def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield eval(l)

In [3]:
# load the dataset
data_dir = ''
dataset = list(parse(data_dir + 'beeradvocate.json.gz'))

In [4]:
# view the dataset
print('%s\n' % dataset[0])
print('Length of dataset: %s\n' % len(dataset))

{'beer/name': 'Sausa Weizen', 'beer/beerId': '47986', 'beer/brewerId': '10325', 'beer/ABV': '5.00', 'beer/style': 'Hefeweizen', 'review/appearance': '2.5', 'review/aroma': '2', 'review/palate': '1.5', 'review/taste': '1.5', 'review/overall': '1.5', 'review/time': '1234817823', 'review/profileName': 'stcules', 'review/text': 'A lot of foam. But a lot.\tIn the smell some banana, and then lactic and tart. Not a good start.\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\tAgain tending to lactic sourness.\tSame for the taste. With some yeast and banana.'}

Length of dataset: 1586615



### Pre-process Dataset

In [5]:
# pre-process the data to get rid of empty 'beer/brewerId'
featured_dataset = []
for data in dataset:
    if 'beer/brewerId' not in data.keys():
        continue
    else:
        featured_dataset.append([data['review/profileName'], data['beer/name'], data['review/overall']])

In [6]:
# pre-process the data to get rid of beers <10 and users <10 and sort the data by userID
ratings = pd.DataFrame(featured_dataset, columns=['userID','beerName','review'])
ratings = ratings.astype({'userID': 'string',  'review': 'float'})

def preprocessing(data, n):
    min_id = data['userID'].value_counts() >= n
    min_id = min_id[min_id].index.to_list()
    data = data[data['userID'].isin(min_id)]

    min_beer = data['beerName'].value_counts() >= n
    min_beer = min_beer[min_beer].index.to_list()
    data = data[data['beerName'].isin(min_beer)]

    return data

ratings = preprocessing(ratings,10) 
sorted_ratings = ratings.sort_values('userID')
print('%s\n' % sorted_ratings)

              userID                          beerName  review
200362                             2° Below Winter Ale     2.5
1128554                                    Miller Lite     4.0
741908                           Steelhead Extra Stout     4.5
974101                                  Leffe Radieuse     3.5
915743                              Red Duck Amber Ale     3.0
...              ...                               ...     ...
731594   zymurgy4all           Murphy's Irish Red Beer     3.5
975579   zymurgy4all                      Leffe Blonde     3.0
1074665  zymurgy4all  Crazy Ed's Cave Creek Chili Beer     1.0
13336    zymurgy4all                Benchwarmer Porter     3.0
1348028  zymurgy4all                    India Pale Ale     4.5

[1415715 rows x 3 columns]



In [7]:
# prepare cleaned data by dropping empty userId(s)
sorted_ratings['userID'].replace('', 'dropthis', inplace = True)
cleaned_data = sorted_ratings[sorted_ratings['userID'].str.contains('dropthis') == False]
print('%s\n' % cleaned_data)

              userID                          beerName  review
224266      0110x011        15th Anniversary Wood Aged     3.5
662379      0110x011                      Chez Monieux     4.5
57926       0110x011                Trade Winds Tripel     4.0
1325182     0110x011    Wachusett IPA (India Pale Ale)     3.5
931834      0110x011                Bell's Hopslam Ale     4.5
...              ...                               ...     ...
731594   zymurgy4all           Murphy's Irish Red Beer     3.5
975579   zymurgy4all                      Leffe Blonde     3.0
1074665  zymurgy4all  Crazy Ed's Cave Creek Chili Beer     1.0
13336    zymurgy4all                Benchwarmer Porter     3.0
1348028  zymurgy4all                    India Pale Ale     4.5

[1415396 rows x 3 columns]



### Checkpoint - Dump the Cleaned Dataset

In [8]:
# checkpoint
cleaned_data.to_pickle('cleaned_data.infer')

In [9]:
# loading in data
cleaned_data = pd.read_pickle('cleaned_data.infer', compression = 'infer')
print('%s\n' % cleaned_data)
print('%s\n' % np.std(cleaned_data.review))

              userID                          beerName  review
224266      0110x011        15th Anniversary Wood Aged     3.5
662379      0110x011                      Chez Monieux     4.5
57926       0110x011                Trade Winds Tripel     4.0
1325182     0110x011    Wachusett IPA (India Pale Ale)     3.5
931834      0110x011                Bell's Hopslam Ale     4.5
...              ...                               ...     ...
731594   zymurgy4all           Murphy's Irish Red Beer     3.5
975579   zymurgy4all                      Leffe Blonde     3.0
1074665  zymurgy4all  Crazy Ed's Cave Creek Chili Beer     1.0
13336    zymurgy4all                Benchwarmer Porter     3.0
1348028  zymurgy4all                    India Pale Ale     4.5

[1415396 rows x 3 columns]

0.7066823977837944



### Prepare Dataset for Training

In [10]:
# encode user and item ids
user_encoder = LabelEncoder()
beer_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(cleaned_data.userID)
beer_ids = beer_encoder.fit_transform(cleaned_data.beerName)

In [11]:
# train - validation split
num_train = int(len(user_ids) * 0.8)
train_user_ids = user_ids[:num_train]
train_beer_ids = beer_ids[:num_train]
train_ratings = cleaned_data.review.values[:num_train]
val_user_ids = user_ids[num_train:]
val_beer_ids = beer_ids[num_train:]
val_ratings = cleaned_data.review.values[num_train:]

In [12]:
# compute the number of users and items
num_users = user_ids.max() + 1
num_beers = beer_ids.max() + 1

# ratings normalization
train_ratings /= 5
val_ratings /= 5

print('%s\n' % cleaned_data)

              userID                          beerName  review
224266      0110x011        15th Anniversary Wood Aged     0.7
662379      0110x011                      Chez Monieux     0.9
57926       0110x011                Trade Winds Tripel     0.8
1325182     0110x011    Wachusett IPA (India Pale Ale)     0.7
931834      0110x011                Bell's Hopslam Ale     0.9
...              ...                               ...     ...
731594   zymurgy4all           Murphy's Irish Red Beer     0.7
975579   zymurgy4all                      Leffe Blonde     0.6
1074665  zymurgy4all  Crazy Ed's Cave Creek Chili Beer     0.2
13336    zymurgy4all                Benchwarmer Porter     0.6
1348028  zymurgy4all                    India Pale Ale     0.9

[1415396 rows x 3 columns]



### Perform Training

In [13]:
# define the neural collaborative filtering model
def define_ncf_model():
    user_input_layer = Input((1,))
    user_hidden_layer = Embedding(input_dim = num_users, output_dim = 64)(user_input_layer)
    user_hidden_layer = Flatten()(user_hidden_layer)
    
    item_input_layer = Input((1,))
    item_hidden_layer = Embedding(input_dim = num_beers, output_dim = 64)(item_input_layer)
    item_hidden_layer = Flatten()(item_hidden_layer)

    mf_output = Multiply()([user_hidden_layer, item_hidden_layer]) # perform element-wise multiplication
    
    hidden_layer = concatenate([user_hidden_layer, item_hidden_layer])
    hidden_layer = Dense(128, activation='relu')(hidden_layer)
    hidden_layer = Dropout(0.2)(hidden_layer)
    mlp_output = Dense(64, activation='relu')(hidden_layer)    

    
    output_layer = concatenate([mf_output, mlp_output])
    output_layer = Dense(1, activation='sigmoid')(output_layer)
    
    model = Model(inputs = [user_input_layer, item_input_layer], outputs = output_layer)
    model.compile(loss='mse', optimizer='adam')
    return model

In [14]:
# instantiate view the defined model
model = define_ncf_model()
model.summary()

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-06-09 08:54:41.559399: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-06-09 08:54:41.563599: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 64)        685248      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 64)        867776      ['input_2[0][0]']                
                                                                                              

In [15]:
# implement early stopping: waiting for 1 epoch
callbacks = [EarlyStopping(patience =  1)]

# execute training (50 epochs)
model.fit([train_user_ids, train_beer_ids], train_ratings, validation_data = ([val_user_ids, val_beer_ids], val_ratings), epochs = 50, batch_size = 128, callbacks = callbacks)

Epoch 1/50


2022-06-09 08:54:42.278203: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-06-09 08:54:42.773221: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-06-09 08:57:07.371763: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50


<keras.callbacks.History at 0x3b7607e80>

### Prediction and Evaluation

In [16]:
# make predictions
predictions = model.predict([val_user_ids, val_beer_ids])

2022-06-09 09:03:57.657170: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [17]:
# evaluate the results (for scale 0-1)
mse = mean_squared_error(predictions, val_ratings, squared = False)
mae = mean_absolute_error(predictions, val_ratings)
rmse = sqrt(mse)
print('MSE: %s\n' % mse)
print('MAE: %s\n' % mae)
print('RMSE: %s\n' % rmse)

MSE: 0.12393512069358081

MAE: 0.09476706534904145

RMSE: 0.3520442027552518



In [18]:
# evaluate the results (for scale 1-5)
mse = mean_squared_error(predictions * 5, val_ratings * 5, squared = False)
mae = mean_absolute_error(predictions * 5, val_ratings * 5)
rmse = sqrt(mse)
print('MSE: %s\n' % mse)
print('MAE: %s\n' % mae)
print('RMSE: %s\n' % rmse)

MSE: 0.619675603132297

MAE: 0.4738353265524416

RMSE: 0.7871947682322952



### Define and Evaluate Baseline

In [19]:
# define baseline: ratings have already been normalized
mean_rating = cleaned_data['review'].mean()
print('Mean rating: %s\n' % mean_rating)


Mean rating: 0.764637529002484



In [20]:
# evaluate predictions using baseline
predictions_baseline = np.full_like(val_ratings, mean_rating)

# evaluate the results (for scale 0-1)
mse = mean_squared_error(predictions_baseline, val_ratings, squared = False)
mae = mean_absolute_error(predictions_baseline, val_ratings)
rmse = sqrt(mse)
print('MSE: %s\n' % mse)
print('MAE: %s\n' % mae)
print('RMSE: %s\n' % rmse)

MSE: 0.1413255616535251

MAE: 0.10798819656749262

RMSE: 0.3759329217473845

