The collaborative filter approach focuses on finding users who have given similar ratings to the same restaurant, thus creating a link between users, to whom will be suggested restaurants that were reviewed in a positive way. In this way, we look for associations between users, not between restaurants.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [2]:
resto_rating = pd.read_csv('final_ratings.csv', sep=',', on_bad_lines="skip", encoding="latin-1")
#cols = ['rating', 'food_rating', 'service_rating']
#resto_rating.drop(cols, axis=1, inplace=True)
resto_rating.head()

Unnamed: 0,userID,restoID,rating
0,U1001,R0001,4
1,U1002,R0001,5
2,U1003,R0001,4
3,U1004,R0001,3
4,U1005,R0001,5


In [3]:
resto_rating.head(3)

Unnamed: 0,userID,restoID,rating
0,U1001,R0001,4
1,U1002,R0001,5
2,U1003,R0001,4


In [4]:
rating_count = (resto_rating.
     groupby(by = ['restoID'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'RatingCount_resto'})
     [['restoID', 'RatingCount_resto']]
    )
rating_count.head()

Unnamed: 0,restoID,RatingCount_resto
0,R0001,29
1,R0002,50
2,R0003,24
3,R0004,50
4,R0005,20


In [5]:
threshold = 0
rating_count = rating_count.query('RatingCount_resto >= @threshold')
rating_count.head()

Unnamed: 0,restoID,RatingCount_resto
0,R0001,29
1,R0002,50
2,R0003,24
3,R0004,50
4,R0005,20


In [6]:
resto_rating.head(3)

Unnamed: 0,userID,restoID,rating
0,U1001,R0001,4
1,U1002,R0001,5
2,U1003,R0001,4


In [7]:
user_rating = pd.merge(rating_count, resto_rating, left_on='restoID', right_on='restoID', how='left')

In [8]:
user_rating.head(3)

Unnamed: 0,restoID,RatingCount_resto,userID,rating
0,R0001,29,U1001,4
1,R0001,29,U1002,5
2,R0001,29,U1003,4


In [9]:
user_count = (user_rating.
     groupby(by = ['userID'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'RatingCount_user'})
     [['userID', 'RatingCount_user']]
    )
user_count.head()

Unnamed: 0,userID,RatingCount_user
0,U1001,15
1,U1002,15
2,U1003,12
3,U1004,12
4,U1005,12


In [10]:
threshold = 0
user_count = user_count.query('RatingCount_user >= @threshold')
user_count.head(200)

Unnamed: 0,userID,RatingCount_user
0,U1001,15
1,U1002,15
2,U1003,12
3,U1004,12
4,U1005,12
...,...,...
131,U1132,1
132,U1133,1
133,U1134,1
134,U1135,1


In [11]:
combined = user_rating.merge(user_count, left_on = 'userID', right_on = 'userID', how = 'inner')

In [12]:
combined.head(3)

Unnamed: 0,restoID,RatingCount_resto,userID,rating,RatingCount_user
0,R0001,29,U1001,4,15
1,R0002,50,U1001,5,15
2,R0003,24,U1001,5,15


In [13]:
combined.shape

(765, 5)

In [14]:
print('Number of unique resto: ', combined['restoID'].nunique()) #harusnya ada 18 resto
print('Number of unique users: ', combined['userID'].nunique()) #ini 136 harusnya user karena ada 2 user yang tidak kasih rating sama sekali

Number of unique resto:  18
Number of unique users:  136


Normalize the ratings.

In [15]:
scaler = MinMaxScaler()
combined['rating'] = combined['rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(combined['rating'].values.reshape(-1,1)))
combined['rating'] = rating_scaled

Abd build the user resto matrix.

In [16]:
combined = combined.drop_duplicates(['userID', 'restoID'])
user_resto_matrix = combined.pivot(index='userID', columns='restoID', values='rating')
user_resto_matrix.fillna(0, inplace=True)

users = user_resto_matrix.index.tolist()
resto = user_resto_matrix.columns.tolist()

user_resto_matrix = user_resto_matrix.values

tf.placeholder only available in v1, so we have to work around. 

In [17]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


We will initialize the TensorFlow placeholder. Then, weights and biases are randomly initialized, the following code are taken from the book: Python Machine Learning Cook Book - Second Edition

In [18]:
num_input = combined['restoID'].nunique()
num_hidden_1 = 64
num_hidden_2 = 32

X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

Now, we can build the encoder and decoder model, as follows:

In [19]:
def encoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

def decoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

We will construct the model and the predictions

In [20]:
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

y_pred = decoder_op

y_true = X

define loss function and optimizer, and minimize the squared error, and define the evaluation metrics

In [21]:
loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Initialize the variables. Because TensorFlow uses computational graphs for its operations, placeholders and variables must be initialized.

In [22]:
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()
pred_data = pd.DataFrame()

We can finally start to train our model.

We split training data into batches, and we feed the network with them.

We train our model with vectors of user ratings, each vector represents a user and each column a resto, and entries are ratings that the user gave to restaurants. 

After a few trials, I discovered that training model for 5 epochs with a batch size of 10 would be consum enough memory. This means that the entire training set will feed our neural network 20 times, every time using 50 users.

In [23]:
with tf.Session() as session:
    epochs = 100
    batch_size = 5

    session.run(init)
    session.run(local_init)

    num_batches = int(user_resto_matrix.shape[0] / batch_size)
    user_resto_matrix = np.array_split(user_resto_matrix, num_batches)
    
    for i in range(epochs):

        avg_cost = 0
        for batch in user_resto_matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    user_resto_matrix = np.concatenate(user_resto_matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: user_resto_matrix})

    pred_data = pred_data.append(pd.DataFrame(preds))

    pred_data = pred_data.stack().reset_index(name='rating')
    pred_data.columns = ['userID', 'restoID', 'rating']
    pred_data['userID'] = pred_data['userID'].map(lambda value: users[value])
    pred_data['restoID'] = pred_data['restoID'].map(lambda value: resto[value])
    
    keys = ['userID', 'restoID']
    index_1 = pred_data.set_index(keys).index
    index_2 = combined.set_index(keys).index

    top_ten_ranked = pred_data[~index_1.isin(index_2)]
    top_ten_ranked = top_ten_ranked.sort_values(['userID', 'rating'], ascending=[True, False])
    top_ten_ranked = top_ten_ranked.groupby('userID').head(10)

epoch: 1 Loss: 0.4274621473418342
epoch: 2 Loss: 0.38511708709928727
epoch: 3 Loss: 0.3437917701624058
epoch: 4 Loss: 0.325143873139664
epoch: 5 Loss: 0.2629288561918117
epoch: 6 Loss: 0.18270599372960902
epoch: 7 Loss: 0.09489756227574414
epoch: 8 Loss: 0.08796459853787113
epoch: 9 Loss: 0.07117271393158094
epoch: 10 Loss: 0.055728391795936555
epoch: 11 Loss: 0.045540491188013996
epoch: 12 Loss: 0.03622641944524797
epoch: 13 Loss: 0.034208345671046386
epoch: 14 Loss: 0.0346320042240054
epoch: 15 Loss: 0.033488396236121105
epoch: 16 Loss: 0.025196002789824787
epoch: 17 Loss: 0.02839418174698949
epoch: 18 Loss: 0.021914447362413974
epoch: 19 Loss: 0.022096616121685925
epoch: 20 Loss: 0.020276792218080826
epoch: 21 Loss: 0.02184932517034381
epoch: 22 Loss: 0.015597330597099952
epoch: 23 Loss: 0.017497537187645556
epoch: 24 Loss: 0.018994322409896455
epoch: 25 Loss: 0.014574329627470838
epoch: 26 Loss: 0.012835004417596522
epoch: 27 Loss: 0.01615034205153481
epoch: 28 Loss: 0.016617472143

  pred_data = pred_data.append(pd.DataFrame(preds))


In [24]:
top_ten_ranked.loc[top_ten_ranked['userID'] == "U1001"]

Unnamed: 0,userID,restoID,rating
4,U1001,R0005,0.0007015399
6,U1001,R0007,0.0001392611
13,U1001,R0014,7.023169e-08


In [25]:
resto_rating.loc[resto_rating['userID'] == "U1001"].sort_values(by=['rating'], ascending=False)

Unnamed: 0,userID,restoID,rating
29,U1001,R0002,5
79,U1001,R0003,5
423,U1001,R0011,5
473,U1001,R0012,5
616,U1001,R0015,5
666,U1001,R0016,5
715,U1001,R0017,5
739,U1001,R0018,5
0,U1001,R0001,4
173,U1001,R0006,4
