In [1]:
from math import sqrt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" #skip this line if you want to use gpu
from keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from keras.models import Model
from keras.callbacks import Callback, ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
import tensorflow
import random

Using TensorFlow backend.


### Set random seed

In [3]:
# Seed the Python, NumPy and TensorFlow random number generators (in that
# order) with the same fixed value.
for _seed_rng in (random.seed, np.random.seed, tensorflow.random.set_seed):
    _seed_rng(2020)
del _seed_rng  # do not leave the loop helper behind in the notebook namespace

### Root Mean Squared Error (RMSE) is used to evaluate the performance of a recommendation algorithm, so we need to define the following utility function to compute the RMSE given the predicted ratings and the ground truth ratings. 

In [4]:
def rmse(pred, actual):
    '''
    Compute the root mean squared error over the rated entries only.

    Entries whose ground truth rating is zero are treated as "no rating" and
    are left out of the error.

    params:
        -pred: an array-like containing all predicted ratings
        -actual: an array-like containing all ground truth ratings; it must
                 hold as many elements as pred (the shapes themselves may
                 differ, e.g. pred of shape (N, 1) against actual of shape (N,))

    return:
        a scalar whose value is the rmse

    raises:
        ValueError if pred and actual differ in size, or if actual holds no
        nonzero rating to evaluate against

    '''
    # Flatten both sides so that column-shaped model outputs line up
    # element by element with flat target arrays (and plain lists work too).
    pred = np.asarray(pred, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if pred.size != actual.size:
        raise ValueError(
            "pred and actual must hold the same number of ratings, "
            "got {} and {}".format(pred.size, actual.size))

    # Ignore ratings with value zero.
    rated = actual != 0
    if not rated.any():
        raise ValueError("actual contains no nonzero rating to evaluate against")

    residual = pred[rated] - actual[rated]
    return sqrt(np.mean(residual ** 2))

# Implement Neural Collaborative Filtering (NCF) model

### Here we implement two instantiations of NCF model. 
### The first instantiation computes the recommendation score (e.g., ratings) between a pair of user and item using dot product of their embeddings, which is equivalent to matrix factorization model for recommendation.
### The second instantiation concatenates the user's and item's embeddings, then feeds the concatenated vector into an MLP to calculate the recommendation score. Using an MLP gives the model the flexibility and non-linearity needed to effectively learn the interaction between user and item latent features.

In [43]:
def build_ncf_model(n_users, n_items, embed_size, output_layer='dot',
                    hidden_units=(128, 64), dropout_rate=0):
    '''
    Build a Neural Collaborative Filtering (NCF) model that scores a
    (user index, item index) pair.

    params:
        -n_users: number of user embedding vectors
        -n_items: number of item embedding vectors
        -embed_size: dimension of each embedding vector
        -output_layer: which instantiation of NCF to use ('dot' or 'mlp')
            'dot': the output is the dot product of the user and item embeddings
            'mlp': the two embeddings are concatenated and passed through
                   fully-connected ReLU layers followed by a single linear unit
        -hidden_units: sizes of the fully-connected hidden layers of the 'mlp'
                       variant, in order (ignored for 'dot')
        -dropout_rate: rate of the Dropout layer placed after every hidden
                       layer of the 'mlp' variant (ignored for 'dot')

    return:
        a keras Model object for the constructed ncf model; it is not compiled

    raises:
        NotImplementedError if output_layer is neither 'dot' nor 'mlp'
    '''
    # Reject an unknown variant before any layer is created
    if output_layer not in ('dot', 'mlp'):
        raise NotImplementedError(
            "output_layer must be 'dot' or 'mlp', got {!r}".format(output_layer))

    # Get the users and items input (one integer index per sample)
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Get the embeddings of users and items; the Reshape drops the length-1
    # sequence axis so that each sample is a flat vector of size embed_size
    user_emb = Embedding(output_dim=embed_size, input_dim=n_users, input_length=1)(user_input)
    user_emb = Reshape((embed_size,))(user_emb)
    item_emb = Embedding(output_dim=embed_size, input_dim=n_items, input_length=1)(item_input)
    item_emb = Reshape((embed_size,))(item_emb)

    if output_layer == 'dot':
        # Compute the dot product of users' and items' embeddings as the model output
        model_output = Dot(axes=1)([user_emb, item_emb])
    else:
        # Concatenate the users' and items' embeddings as the input of MLP
        hidden = Concatenate()([user_emb, item_emb])

        # Fully-connected hidden layers, each followed by dropout
        for units in hidden_units:
            hidden = Dense(units, activation='relu')(hidden)
            hidden = Dropout(dropout_rate)(hidden)

        # Final fully-connected layer to compute model output
        model_output = Dense(1)(hidden)

    model = Model(inputs=[user_input, item_input],
                  outputs=model_output)
    return model
    
    

# Rating Prediction

### Load train and validation rating table

In [6]:
# Read the train and validation rating tables from their CSV files
tr_df, val_df = map(pd.read_csv, ("data/train.csv", "data/valid.csv"))

### Build two dictionaries mapping original user and item ids to corresponding indices in respective embedding matrices

In [7]:
# Distinct user ids and business ids of the train set, in order of first
# appearance. Indices are assigned from this order instead of from set
# iteration order: iterating a set of strings can give a different order in
# every Python session, which would change the id -> embedding row mapping
# between runs even with fixed random seeds.
user_ids = tr_df.user_id.unique()
business_ids = tr_df.business_id.unique()

# Get the set of all user ids and set of all business ids in train set
user_set = set(user_ids)
business_set = set(business_ids)

# Build user vocabulary: known ids get indices 1..len(user_ids)
user_vocab = {user_id: index for index, user_id in enumerate(user_ids, start=1)}

# reserve the first row of the embedding matrix for users unseen in the training set
user_vocab['unk'] = 0
# Sized from the id count rather than len(user_vocab) so the embedding matrix
# still covers the largest index if a real id happens to be the string 'unk'
n_users = len(user_ids) + 1

# Build business vocabulary: known ids get indices 1..len(business_ids)
business_vocab = {business_id: index for index, business_id in enumerate(business_ids, start=1)}
# reserve the first row of the embedding matrix for businesses unseen in the training set
business_vocab['unk'] = 0
n_items = len(business_ids) + 1


### Replace the original user and item ids in the train and validation sets with indices in the embedding matrices

In [8]:
# Train ids are looked up directly in the vocabularies (a missing id raises KeyError)
tr_users = tr_df["user_id"].map(lambda uid: user_vocab[uid]).values
tr_items = tr_df["business_id"].map(lambda bid: business_vocab[bid]).values

# Validation ids that are absent from the vocabularies fall back to index 0
val_users = val_df["user_id"].map(lambda uid: user_vocab.get(uid, 0)).values
val_items = val_df["business_id"].map(lambda bid: business_vocab.get(bid, 0)).values


### Get ratings in train and validation set

In [9]:
# Pull the raw values of the "stars" column out of the train and validation tables
tr_ratings, val_ratings = (frame["stars"].values for frame in (tr_df, val_df))

### Build the NCF model defined above

In [40]:
# Instantiate the 'mlp' variant of the NCF model with 100-dimensional embeddings
model = build_ncf_model(n_users, n_items, output_layer='mlp', embed_size=100)

### Train the model using the Adagrad optimizer and mean squared error loss

In [41]:
# Reset the Python, NumPy and TensorFlow seeds right before training
for _seed_rng in (random.seed, np.random.seed, tensorflow.random.set_seed):
    _seed_rng(2020)
del _seed_rng  # do not leave the loop helper behind in the notebook namespace

# Mean squared error objective, optimised with Adagrad
model.compile(loss='mse', optimizer='adagrad')

# One epoch over the training pairs, with a checkpoint callback targeting 'model.h5'
history = model.fit(
    x=[tr_users, tr_items],
    y=tr_ratings,
    callbacks=[ModelCheckpoint('model.h5')],
    epochs=1,
    verbose=1,
)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


### Evaluate the model on train and validation sets using RMSE

In [12]:
# Report RMSE on the train split, then on the validation split.
# y_pred is left holding the validation predictions afterwards.
for _label, _inputs, _targets in (
        ("TRAIN RMSE: ", [tr_users, tr_items], tr_ratings),
        ("VALID RMSE: ", [val_users, val_items], val_ratings)):
    y_pred = model.predict(_inputs)
    print(_label, rmse(y_pred, _targets))
del _label, _inputs, _targets  # keep the notebook namespace free of loop helpers

TRAIN RMSE:  0.9930638128415068
VALID RMSE:  1.0836642741354767


In [35]:
# Settings noted for this run:
#   embedding = 100
#   dropout = 1.0
#   optimizer = adagrad
#
# Report RMSE on the train split, then on the validation split.
# y_pred is left holding the validation predictions afterwards.
for _label, _inputs, _targets in (
        ("TRAIN RMSE: ", [tr_users, tr_items], tr_ratings),
        ("VALID RMSE: ", [val_users, val_items], val_ratings)):
    y_pred = model.predict(_inputs)
    print(_label, rmse(y_pred, _targets))
del _label, _inputs, _targets  # keep the notebook namespace free of loop helpers

TRAIN RMSE:  0.902204873281488
VALID RMSE:  1.0655312708196742


In [42]:
# Settings noted for this run:
#   embedding = 100
#   dropout = 1.0
#   optimizer = adagrad
#
# Report RMSE on the train split, then on the validation split.
# y_pred is left holding the validation predictions afterwards.
for _label, _inputs, _targets in (
        ("TRAIN RMSE: ", [tr_users, tr_items], tr_ratings),
        ("VALID RMSE: ", [val_users, val_items], val_ratings)):
    y_pred = model.predict(_inputs)
    print(_label, rmse(y_pred, _targets))
del _label, _inputs, _targets  # keep the notebook namespace free of loop helpers

TRAIN RMSE:  0.9527370087846538
VALID RMSE:  1.0672388630674958
