In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# Preparing the data
# Import the data
movie_ratings_raw = pd.read_csv('movie_smallset.csv')

user_indices = [i-1 for i in movie_ratings_raw.user_id.values]
item_indices = [i-1 for i in movie_ratings_raw.item_id.values]
R_known = movie_ratings_raw.rating.values

In [3]:
# Pivot table to unstack data
movie_ratings_pt = movie_ratings_raw.pivot(index = 'user_id', columns ='item_id', values = 'rating').fillna(0)

#Unpivot table to name columns and rows
index = movie_ratings_pt.index.union(movie_ratings_pt.columns)
movie_ratings = movie_ratings_pt.reindex(index=index, columns=index, fill_value=0)
movie_ratings.columns = ['Toy Story','Terminator','Terminator 2','Lion King','Despicable Me','Despicable Me 2',
                                       'Die Hard', 'Die Hard 2','Toy Story 2','Die Hard 3']
movie_ratings.index = ['Justin','Mike','Stef','Jim','Claire','Joe','Amie','Charles','Katie','Chuck']
movie_ratings

Unnamed: 0,Toy Story,Terminator,Terminator 2,Lion King,Despicable Me,Despicable Me 2,Die Hard,Die Hard 2,Toy Story 2,Die Hard 3
Justin,1.0,5.0,0.0,1.0,2.0,1.0,5.0,5.0,0.0,4.0
Mike,2.0,4.0,5.0,0.0,1.0,1.0,5.0,0.0,1.0,5.0
Stef,0.0,5.0,4.0,1.0,0.0,1.0,5.0,4.0,1.0,5.0
Jim,1.0,5.0,0.0,1.0,1.0,1.0,5.0,0.0,1.0,5.0
Claire,1.0,0.0,5.0,2.0,1.0,2.0,4.0,5.0,0.0,5.0
Joe,5.0,1.0,1.0,0.0,5.0,5.0,1.0,1.0,5.0,1.0
Amie,5.0,0.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,0.0
Charles,0.0,1.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,1.0
Katie,5.0,1.0,1.0,0.0,5.0,5.0,1.0,1.0,5.0,1.0
Chuck,5.0,1.0,0.0,5.0,5.0,5.0,1.0,1.0,5.0,1.0


In [4]:
R = np.array(movie_ratings)
R_known = movie_ratings_raw.rating.values
N = 10    # number of users
M = 10    # number of items
K = 2     # number of hidden features
p = np.random.rand(N,K)
q = np.random.rand(K,M)

## Prediction: $\hat{r} = pq$

In [5]:
# Prediction
P = tf.Variable(p, dtype=tf.float32, name='P')
Q = tf.Variable(q, dtype=tf.float32, name='Q')
R_hat = tf.matmul(P, Q)

# Cost function
R_hat_flat = tf.reshape(R_hat, [-1])
R_hat_known = tf.gather(R_hat_flat, user_indices * tf.shape(R_hat)[1] + 
              item_indices, name='extracting_user_rate')

## Cost: $\left \|  \hat{r}-r\right \|$

In [6]:
# Cost
diff_ratings = tf.subtract(R_hat_known, R_known, name='diff_ratings')
diff_ratings_abs = tf.abs(diff_ratings, name="diff_ratings_abs")
base_cost = tf.reduce_sum(diff_ratings_abs, name="sum_abs_error")

## Regularization: $\sum \lambda(\left \| p \right \|+\left \| q \right \|)$

In [7]:
# Regularization
lda = tf.constant(.001, name='lambda')
norm_sums = tf.add(tf.reduce_sum(tf.abs(P, name='user_abs'), name='user_norm'),
                   tf.reduce_sum(tf.abs(Q, name='item_abs'), name='item_norm'))
regularizer = tf.multiply(norm_sums, lda, 'regularizer')

## Total Cost: $\sum \left \|  \hat{r}-r\right \|+\lambda(\left \| p \right \|+\left \| q \right \|)$

In [8]:
# Total Cost
cost = tf.add(base_cost, regularizer)

In [9]:
# Create an optimizer to minimize the loss
optimizer = tf.train.GradientDescentOptimizer(0.001)
train = optimizer.minimize(cost)

In [10]:
# Execute the TF session
sess = tf.Session();
init = tf.global_variables_initializer()
sess.run(init)

for i in range(5000):
    sess.run(train)

In [11]:
# Accuracy for known R ratings vs predicted R_hat ratings
diff_ratings = tf.subtract(R_hat_known, R_known, name='diff_ratings')
diff_ratings_abs = tf.abs(diff_ratings, name="diff_ratings_abs")
sum_diff_ratings_abs = tf.reduce_sum(diff_ratings_abs, name="sum_diff_ratings_abs")
accuracy = tf.div(sum_diff_ratings_abs, R_known.size, name="accuracy")
print(sess.run(accuracy))

0.12156976


In [12]:
# Visualize known R ratings
movie_ratings

Unnamed: 0,Toy Story,Terminator,Terminator 2,Lion King,Despicable Me,Despicable Me 2,Die Hard,Die Hard 2,Toy Story 2,Die Hard 3
Justin,1.0,5.0,0.0,1.0,2.0,1.0,5.0,5.0,0.0,4.0
Mike,2.0,4.0,5.0,0.0,1.0,1.0,5.0,0.0,1.0,5.0
Stef,0.0,5.0,4.0,1.0,0.0,1.0,5.0,4.0,1.0,5.0
Jim,1.0,5.0,0.0,1.0,1.0,1.0,5.0,0.0,1.0,5.0
Claire,1.0,0.0,5.0,2.0,1.0,2.0,4.0,5.0,0.0,5.0
Joe,5.0,1.0,1.0,0.0,5.0,5.0,1.0,1.0,5.0,1.0
Amie,5.0,0.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,0.0
Charles,0.0,1.0,1.0,5.0,5.0,5.0,1.0,1.0,5.0,1.0
Katie,5.0,1.0,1.0,0.0,5.0,5.0,1.0,1.0,5.0,1.0
Chuck,5.0,1.0,0.0,5.0,5.0,5.0,1.0,1.0,5.0,1.0


In [13]:
# Visualize R_hat rating predictions
pred = sess.run(R_hat)
pred_df = pd.DataFrame(pred)
pred_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.013814,5.000956,4.987432,1.03034,1.013169,1.021186,5.009359,4.992245,1.014314,4.994885
1,1.027642,5.001032,4.98756,1.044259,1.027001,1.035105,5.009406,4.992388,1.028159,4.994987
2,0.998083,4.986865,4.97333,1.014478,0.997436,1.00535,4.995273,4.978117,0.998566,4.980789
3,1.014933,5.014312,5.000747,1.031493,1.014286,1.022315,5.022742,5.00557,1.015433,5.008223
4,1.817064,5.003448,4.992934,1.838857,1.816655,1.829708,5.010101,4.998588,1.818531,4.998799
5,5.005643,1.017142,1.032425,5.04045,5.006928,5.038639,1.008357,1.038417,5.011527,1.024423
6,5.008144,1.011107,1.026421,5.042954,5.009431,5.041155,1.002303,1.032411,5.014031,1.018402
7,4.997941,1.012207,1.027478,5.032687,4.999225,5.030886,1.003428,1.033458,5.003816,1.019482
8,5.007029,1.019993,1.035272,5.04185,5.008314,5.040034,1.011211,1.041267,5.012914,1.027272
9,5.005131,1.017375,1.032656,5.039934,5.006416,5.038124,1.008592,1.038647,5.011014,1.024655
