In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

In [2]:
# Load the tab-separated ratings file, then reshape it into a dense
# user x item table. Pairs with no rating become 0 after the pivot.
ratings_df = pd.read_csv('u.data', sep='\t')
ratings_pt = (
    ratings_df
    .pivot(index='user_id', columns='item_id', values='rating')
    .fillna(0)
)
ratings_pt.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Positional split: rows [0, 80000) are used for training and
# rows [80000, 100000) for testing.
train_ratings_df = ratings_df[:80000]
test_ratings_df = ratings_df[80000:100000]

# Subtract 1 from every id so it can be used as a 0-based row/column
# position in the prediction matrix. The results stay plain Python lists.
user_indicies_train = list(train_ratings_df.user_id.values - 1)
item_indicies_train = list(train_ratings_df.item_id.values - 1)
R_known_train = train_ratings_df.rating.values

user_indicies_test = list(test_ratings_df.user_id.values - 1)
item_indicies_test = list(test_ratings_df.item_id.values - 1)
R_known_test = test_ratings_df.rating.values

In [4]:
# Model dimensions and initial factor matrices.
R = ratings_pt   # alias of the dense user x item rating table
N = 943          # number of users
M = 1682         # number of items
K = 10           # number of hidden features

# Draw the initial factors from a seeded generator so that a fresh
# "Restart & Run All" reproduces the same starting point (and therefore
# the same training trajectory) instead of depending on global RNG state.
SEED = 42
rng = np.random.RandomState(SEED)
p = rng.rand(N, K)   # user factors, uniform in [0, 1)
q = rng.rand(K, M)   # item factors, uniform in [0, 1)

## Prediction: $\hat{r} = pq$

In [5]:
# Trainable factor matrices, initialised from the arrays p and q.
P = tf.Variable(p, dtype=tf.float32, name='P')
Q = tf.Variable(q, dtype=tf.float32, name='Q')

# Full prediction matrix: one row per user, one column per item.
R_hat = tf.matmul(P, Q)

# To pick out the predictions that have a known training rating, flatten
# R_hat row by row and address entry (user, item) as user * n_cols + item.
R_hat_flatten = tf.reshape(R_hat, [-1])
n_cols = tf.shape(R_hat)[1]
flat_index_train = user_indicies_train * n_cols + item_indicies_train
R_hat_train_known = tf.gather(R_hat_flatten, flat_index_train,
                              name='extracting_user_rate')

## Cost: $\sum \left| \hat{r} - r \right|$ (summed over the known training ratings)

In [6]:
# Base cost: sum of absolute differences between the predicted and the
# known training ratings.
diff_ratings = tf.subtract(
    x=R_hat_train_known,
    y=R_known_train,
    name='diff_ratings',
)
diff_ratings_abs = tf.abs(x=diff_ratings, name="diff_ratings_abs")
base_cost = tf.reduce_sum(input_tensor=diff_ratings_abs, name="sum_abs_error")

## Regularization: $\lambda \left( \left\| p \right\|_1 + \left\| q \right\|_1 \right)$

In [7]:
# Penalty term: lambda times the summed absolute values of all entries
# of P and Q.
lda = tf.constant(.001, name='lambda')
user_l1 = tf.reduce_sum(tf.abs(P, name='user_abs'), name='user_norm')
item_l1 = tf.reduce_sum(tf.abs(Q, name='item_abs'), name='item_norm')
norm_sums = tf.add(user_l1, item_l1)
regularizer = tf.multiply(norm_sums, lda, 'regularizer')

## Total Cost: $\sum \left| \hat{r} - r \right| + \lambda \left( \left\| p \right\|_1 + \left\| q \right\|_1 \right)$

In [8]:
# Objective to minimise: data-fit term plus the regularisation penalty.
cost = tf.add(x=base_cost, y=regularizer)

In [9]:
# Plain gradient descent on the total cost with a step size of 0.001.
# `train` is the op that applies one update to the trainable variables.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
train = optimizer.minimize(loss=cost)

In [10]:
# Open a session and initialise the variables. The session is left open
# on purpose because later cells evaluate tensors with it.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Run 500 gradient-descent updates.
for step in range(500):
    sess.run(train)

In [11]:
# Test-set error: compare the predictions with the held-out known ratings.
# Note: despite its name, `accuracy_test` is the mean absolute error
# (sum of |prediction - rating| divided by the number of test ratings),
# so lower is better.
flat_index_test = user_indicies_test * tf.shape(R_hat)[1] + item_indicies_test
R_hat_test_known = tf.gather(
    R_hat_flatten,
    flat_index_test,
    name='extracting_user_rate_test',
)
diff_ratings_test = tf.subtract(
    x=R_hat_test_known,
    y=R_known_test,
    name='diff_ratings_test',
)
diff_ratings_abs_test = tf.abs(x=diff_ratings_test, name="diff_ratings_abs_test")
sum_diff_ratings_abs_test = tf.reduce_sum(
    input_tensor=diff_ratings_abs_test,
    name="sum_diff_ratings_abs_test",
)
accuracy_test = tf.div(
    sum_diff_ratings_abs_test,
    R_known_test.size,
    name="accuracy_test",
)
print(sess.run(accuracy_test))

0.8136718


In [12]:
# Show the first five rows of the known rating table (0 = no rating stored).
R.head(5)

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Evaluate the full prediction matrix and show its first five rows.
# The resulting frame uses default 0-based row/column labels.
pred = sess.run(R_hat)
pred_df = pd.DataFrame(data=pred)
pred_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,4.4051,2.887877,3.101701,4.149094,3.111232,4.981924,5.407735,4.876051,4.895416,4.046372,...,2.894594,2.432946,3.292226,2.893279,3.631972,1.478077,3.490481,3.396599,3.583007,4.190441
1,4.280867,3.132992,2.983157,3.943537,3.836057,4.536208,3.511249,4.208818,4.486645,4.203749,...,2.878471,2.727131,3.321235,2.921521,4.004354,1.818719,3.041725,2.82223,2.853581,4.736969
2,2.860859,2.43903,2.491947,2.493153,2.496724,2.57728,3.329211,3.015415,3.893726,2.731525,...,2.283432,1.317578,2.784636,2.902761,2.500135,1.178581,1.788507,1.871505,2.256375,3.423628
3,5.180293,3.979222,4.240703,4.953258,4.277489,4.882944,5.136844,4.846506,5.466421,4.888123,...,3.96685,2.905998,4.291851,3.619139,4.456864,1.964099,3.305583,3.824111,3.754071,5.394825
4,3.088748,1.835549,3.175317,3.350162,2.426601,3.331636,4.405009,2.967184,4.424548,2.03319,...,2.296248,1.920253,3.121054,2.480732,2.661177,0.100165,1.937275,2.729859,2.786322,4.029554
