In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [2]:
df = pd.read_csv('data/ratings.csv')
# df.head()

In [3]:
Y_df = pd.DataFrame(index=sorted(df['movieId'].unique().tolist()), columns=df['userId'].unique().tolist())
# Y_df.head()

In [4]:
# Create a pivot table from the dataframe df
pivot_df = df.pivot(index='movieId', columns='userId', values='rating')

# Reindex to match Y_df's index and columns, filling missing values with 0 or NaN as needed
Y_df = pivot_df.reindex(index=Y_df.index, columns=Y_df.columns)

# Make a numpy array of Y
Y = Y_df.to_numpy()

Y_no_null = np.nan_to_num(Y, copy=True, nan=0)

means = (Y_no_null.sum(axis=1)/(Y.shape[1] - np.isnan(Y).sum(axis=1))).reshape(-1,1)

Y_normalized = Y - means

In [5]:
nm, nu = Y_normalized.shape
nf = 20

In [6]:
tf.random.set_seed(1234)

X = tf.Variable(tf.random.normal((nm, nf), dtype=tf.float64), name='X')
W = tf.Variable(tf.random.normal((nu, nf), dtype=tf.float64), name='W')
b = tf.Variable(tf.random.normal((1,  nu), dtype=tf.float64), name='b')

# X[0], W[0], b[2,0]

b[0][0:2]

<tf.Tensor: shape=(2,), dtype=float64, numpy=array([-0.08194051,  0.7000041 ])>

In [7]:
def calculate_cost(X, W, b, Y, lambda_):

    matrix = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)
    regularization = lambda_ * (tf.reduce_sum(W ** 2) + tf.reduce_sum(X ** 2))

    # Replace NaNs with zeros (or any value that doesn't affect the sum)
    matrix_no_nan = tf.where(tf.math.is_nan(matrix), tf.zeros_like(matrix), matrix)

    # Sum all non-NaN values
    sum_non_nan = tf.reduce_sum(matrix_no_nan ** 2)

    return (sum_non_nan + regularization) / 2

In [8]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

In [15]:
epochs = 1000
lambda_ = 0.8
for epoch in range(epochs):
    with tf.GradientTape() as tape:
        cost = calculate_cost(X, W, b, Y_normalized, lambda_)

    grads = tape.gradient(cost, [X,W,b])
    optimizer.apply_gradients(zip(grads, [X,W,b]))
    print(epoch, cost)

0 tf.Tensor(8839.77698616207, shape=(), dtype=float64)
1 tf.Tensor(8839.764368783519, shape=(), dtype=float64)
2 tf.Tensor(8839.695174279253, shape=(), dtype=float64)
3 tf.Tensor(8839.51724151485, shape=(), dtype=float64)
4 tf.Tensor(8839.205696500088, shape=(), dtype=float64)
5 tf.Tensor(8838.812178017124, shape=(), dtype=float64)
6 tf.Tensor(8838.44350900648, shape=(), dtype=float64)
7 tf.Tensor(8838.20282414652, shape=(), dtype=float64)
8 tf.Tensor(8838.123183093698, shape=(), dtype=float64)
9 tf.Tensor(8838.163591394914, shape=(), dtype=float64)
10 tf.Tensor(8838.247590587951, shape=(), dtype=float64)
11 tf.Tensor(8838.294164344581, shape=(), dtype=float64)
12 tf.Tensor(8838.235191885104, shape=(), dtype=float64)
13 tf.Tensor(8838.021211690362, shape=(), dtype=float64)
14 tf.Tensor(8837.65005033282, shape=(), dtype=float64)
15 tf.Tensor(8837.181717609554, shape=(), dtype=float64)
16 tf.Tensor(8836.732585642892, shape=(), dtype=float64)
17 tf.Tensor(8836.407863641562, shape=(), dtyp

In [16]:
Y_pred = tf.tensordot(X, tf.transpose(W), axes=1) + b + means

In [17]:
movie_df = pd.read_csv('data/movies.csv')
movie_df = movie_df.set_index('movieId')
movie_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [18]:
movie_df.loc[50]

title     Usual Suspects, The (1995)
genres        Crime|Mystery|Thriller
Name: 50, dtype: object

In [19]:
new_df = pd.DataFrame(Y_pred, index=Y_df.index, columns=Y_df.columns)
new_df['movie'] = movie_df['title']
new_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,602,603,604,605,606,607,608,609,610,movie
1,4.458194,3.785611,2.463377,3.490856,4.108457,4.594029,4.832552,3.750765,4.401797,2.492934,...,2.803998,3.636357,3.853113,3.286371,3.303864,3.943725,2.898362,3.427894,4.102014,Toy Story (1995)
2,3.533119,3.372849,1.888656,4.08238,3.346765,4.798604,5.55098,3.331697,3.353951,2.797858,...,2.843532,3.626481,3.632485,3.225822,2.936344,4.570332,2.177316,2.997797,3.29122,Jumanji (1995)
3,3.916412,3.808275,1.761731,2.361536,2.939818,4.849714,2.876672,2.156931,2.81053,2.018967,...,3.69871,2.260791,3.532127,2.713254,1.983638,4.578943,2.239201,3.226809,4.022022,Grumpier Old Men (1995)
4,3.104851,2.31682,0.623274,1.846938,2.116555,2.971177,2.93916,2.429337,2.167157,1.373569,...,1.640708,2.100108,2.290511,1.947532,2.310876,3.914377,2.363728,2.222921,2.233882,Waiting to Exhale (1995)
5,3.578339,2.570234,0.021246,2.411958,2.900644,4.862568,2.061179,1.620861,3.557551,6.793971,...,1.378319,-1.555199,2.884313,3.644645,2.04933,3.634344,3.516432,3.026309,1.372119,Father of the Bride Part II (1995)


#### predictions for user 1

In [23]:
new_df[[1, 'movie']]

Unnamed: 0,1,movie
1,4.458194,Toy Story (1995)
2,3.533119,Jumanji (1995)
3,3.916412,Grumpier Old Men (1995)
4,3.104851,Waiting to Exhale (1995)
5,3.578339,Father of the Bride Part II (1995)
...,...,...
193581,4.666257,Black Butler: Book of the Atlantic (2017)
193583,4.166266,No Game No Life: Zero (2017)
193585,4.166265,Flint (2017)
193587,4.166252,Bungo Stray Dogs: Dead Apple (2018)
