The collaborative filter approach focuses on finding users who have given similar ratings to the same restaurant, thus creating a link between users, to whom will be suggested restaurants that were reviewed in a positive way. In this way, we look for associations between users, not between restaurants.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [32]:
resto_rating = pd.read_csv('data/rating_final.csv', sep=',', error_bad_lines=False, encoding="latin-1")
cols = ['rating', 'food_rating', 'service_rating']
resto_rating.drop(cols, axis=1, inplace=True)
resto_rating.head()



  resto_rating = pd.read_csv('data/rating_final.csv', sep=',', error_bad_lines=False, encoding="latin-1")


Unnamed: 0,userID,placeID,total_rating
0,1077,135085,6
1,1077,135038,5
2,1077,132825,6
3,1077,135060,5
4,1068,135104,4


In [33]:
resto_rating.head(3)

Unnamed: 0,userID,placeID,total_rating
0,1077,135085,6
1,1077,135038,5
2,1077,132825,6


In [36]:
rating_count = (resto_rating.
     groupby(by = ['placeID'])['total_rating'].
     count().
     reset_index().
     rename(columns = {'total_rating': 'RatingCount_resto'})
     [['placeID', 'RatingCount_resto']]
    )
rating_count.head()

Unnamed: 0,placeID,RatingCount_resto
0,132560,4
1,132561,4
2,132564,4
3,132572,15
4,132583,4


In [38]:
threshold = 0
rating_count = rating_count.query('RatingCount_resto >= @threshold')
rating_count.head()

Unnamed: 0,placeID,RatingCount_resto
0,132560,4
1,132561,4
2,132564,4
3,132572,15
4,132583,4


In [22]:
resto_rating.head(3)

Unnamed: 0,userID,placeID,total_rating
0,1077,135085,6
1,1077,135038,5
2,1077,132825,6


In [39]:
user_rating = pd.merge(rating_count, resto_rating, left_on='placeID', right_on='placeID', how='left')

In [43]:
user_rating.head(3)

Unnamed: 0,placeID,RatingCount_resto,userID,total_rating
0,132560,4,1067,1
1,132560,4,1082,0
2,132560,4,1050,2


In [45]:
user_count = (user_rating.
     groupby(by = ['userID'])['total_rating'].
     count().
     reset_index().
     rename(columns = {'total_rating': 'RatingCount_user'})
     [['userID', 'RatingCount_user']]
    )
user_count.head()

Unnamed: 0,userID,RatingCount_user
0,1001,9
1,1002,10
2,1003,13
3,1004,8
4,1005,9


In [46]:
threshold = 0
user_count = user_count.query('RatingCount_user >= @threshold')
user_count.head()

Unnamed: 0,userID,RatingCount_user
0,1001,9
1,1002,10
2,1003,13
3,1004,8
4,1005,9
...,...,...
133,1134,16
134,1135,14
135,1136,10
136,1137,14


In [47]:
combined = user_rating.merge(user_count, left_on = 'userID', right_on = 'userID', how = 'inner')

In [50]:
combined.head(3)

Unnamed: 0,placeID,RatingCount_resto,userID,total_rating,RatingCount_user
0,132560,4,1067,1,6
1,132584,6,1067,6,6
2,132630,6,1067,2,6


In [51]:
combined.shape

(1161, 5)

In [52]:
print('Number of unique resto: ', combined['placeID'].nunique()) #harusnya ada 130 resto
print('Number of unique users: ', combined['userID'].nunique()) #ini 138 harusnya user

Number of unique resto:  130
Number of unique users:  138


Normalize the ratings.

In [53]:
scaler = MinMaxScaler()
combined['total_rating'] = combined['total_rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(combined['total_rating'].values.reshape(-1,1)))
combined['total_rating'] = rating_scaled

Abd build the user resto matrix.

In [56]:
combined = combined.drop_duplicates(['userID', 'placeID'])
user_resto_matrix = combined.pivot(index='userID', columns='placeID', values='total_rating')
user_resto_matrix.fillna(0, inplace=True)

users = user_resto_matrix.index.tolist()
resto = user_resto_matrix.columns.tolist()

user_resto_matrix = user_resto_matrix.values

tf.placeholder only available in v1, so we have to work around. 

In [57]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


We will initialize the TensorFlow placeholder. Then, weights and biases are randomly initialized, the following code are taken from the book: Python Machine Learning Cook Book - Second Edition

In [58]:
num_input = combined['placeID'].nunique()
num_hidden_1 = 10
num_hidden_2 = 5

X = tf.placeholder(tf.float64, [None, num_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

Now, we can build the encoder and decoder model, as follows:

In [59]:
def encoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

def decoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

We will construct the model and the predictions

In [60]:
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

y_pred = decoder_op

y_true = X

define loss function and optimizer, and minimize the squared error, and define the evaluation metrics

In [61]:
loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Initialize the variables. Because TensorFlow uses computational graphs for its operations, placeholders and variables must be initialized.

In [62]:
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()
pred_data = pd.DataFrame()

We can finally start to train our model.

We split training data into batches, and we feed the network with them.

We train our model with vectors of user ratings, each vector represents a user and each column a resto, and entries are ratings that the user gave to restaurants. 

After a few trials, I discovered that training model for 5 epochs with a batch size of 10 would be consum enough memory. This means that the entire training set will feed our neural network 20 times, every time using 50 users.

In [63]:
with tf.Session() as session:
    epochs = 100
    batch_size = 35

    session.run(init)
    session.run(local_init)

    num_batches = int(user_resto_matrix.shape[0] / batch_size)
    user_resto_matrix = np.array_split(user_resto_matrix, num_batches)
    
    for i in range(epochs):

        avg_cost = 0
        for batch in user_resto_matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    user_resto_matrix = np.concatenate(user_resto_matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: user_resto_matrix})

    pred_data = pred_data.append(pd.DataFrame(preds))

    pred_data = pred_data.stack().reset_index(name='total_rating')
    pred_data.columns = ['userID', 'placeID', 'total_rating']
    pred_data['userID'] = pred_data['userID'].map(lambda value: users[value])
    pred_data['placeID'] = pred_data['placeID'].map(lambda value: resto[value])
    
    keys = ['userID', 'placeID']
    index_1 = pred_data.set_index(keys).index
    index_2 = combined.set_index(keys).index

    top_ten_ranked = pred_data[~index_1.isin(index_2)]
    top_ten_ranked = top_ten_ranked.sort_values(['userID', 'total_rating'], ascending=[True, False])
    top_ten_ranked = top_ten_ranked.groupby('userID').head(10)

epoch: 1 Loss: 0.30553404490152997
epoch: 2 Loss: 0.3054317732652028
epoch: 3 Loss: 0.305312047402064
epoch: 4 Loss: 0.3051718572775523
epoch: 5 Loss: 0.30500778555870056
epoch: 6 Loss: 0.30481570959091187
epoch: 7 Loss: 0.3045908709367116
epoch: 8 Loss: 0.30432777603467304
epoch: 9 Loss: 0.3040199279785156
epoch: 10 Loss: 0.30365971724192303
epoch: 11 Loss: 0.3032383620738983
epoch: 12 Loss: 0.30274562040964764
epoch: 13 Loss: 0.30216949184735614
epoch: 14 Loss: 0.30149611830711365
epoch: 15 Loss: 0.3007093568642934
epoch: 16 Loss: 0.29979052146275836
epoch: 17 Loss: 0.2987179756164551
epoch: 18 Loss: 0.2974668045838674
epoch: 19 Loss: 0.2960083285967509
epoch: 20 Loss: 0.29430969556172687
epoch: 21 Loss: 0.2923334538936615
epoch: 22 Loss: 0.2900370756785075
epoch: 23 Loss: 0.28737273812294006
epoch: 24 Loss: 0.2842869957288106
epoch: 25 Loss: 0.28072089950243634
epoch: 26 Loss: 0.27660999695460003
epoch: 27 Loss: 0.27188528577486676
epoch: 28 Loss: 0.2664741377035777
epoch: 29 Loss: 

  pred_data = pred_data.append(pd.DataFrame(preds))


In [64]:
top_ten_ranked.loc[top_ten_ranked['userID'] == 1067]

Unnamed: 0,userID,placeID,total_rating
8642,1067,134986,0.136525
8609,1067,132768,0.103456
8703,1067,135085,0.081977
8645,1067,134996,0.073723
8683,1067,135058,0.071007
8656,1067,135026,0.062692
8708,1067,135108,0.05786
8611,1067,132825,0.055632
8601,1067,132723,0.055408
8654,1067,135021,0.054872


In [67]:
resto_rating.loc[resto_rating['userID'] == 1067].sort_values(by=['total_rating'], ascending=False)

Unnamed: 0,userID,placeID,total_rating
9,1067,132584,6
11,1067,132732,5
10,1067,132733,3
12,1067,132630,2
14,1067,132560,1
13,1067,135104,0
