In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '20'

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
import time

## Loading dataset

In [2]:
movies_df = pd.read_csv('./temp/ml-latest-small/movies.csv')
# In collaborative filtering method, we imagine there is no information about the movies like their categories
movies_df = movies_df.drop('genres', axis=1) 
movies_df.index.name = 'index'
# helper dictionaries to help memory usage for embeddings
index_to_movieId = dict(movies_df.movieId)
movieId_to_index = {v:k for k, v in index_to_movieId.items()}
movies_df = movies_df.drop('movieId', axis=1)
movies_df

Unnamed: 0_level_0,title
index,Unnamed: 1_level_1
0,Toy Story (1995)
1,Jumanji (1995)
2,Grumpier Old Men (1995)
3,Waiting to Exhale (1995)
4,Father of the Bride Part II (1995)
...,...
9737,Black Butler: Book of the Atlantic (2017)
9738,No Game No Life: Zero (2017)
9739,Flint (2017)
9740,Bungo Stray Dogs: Dead Apple (2018)


In [3]:
users_df = pd.read_csv('./temp/ml-latest-small/ratings.csv')
users_df = users_df.drop('timestamp', axis=1)
users_df['index'] = users_df['movieId'].map(lambda x: movieId_to_index[x])

# Take the average rating for mean normalization
average_rating = users_df.drop('userId', axis=1).groupby('movieId').mean()

users_df = users_df.set_index('index')
users_df = users_df.drop('movieId', axis=1)

users_df

Unnamed: 0_level_0,userId,rating
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,4.0
2,1,4.0
5,1,4.0
43,1,5.0
46,1,5.0
...,...,...
9434,610,4.0
9461,610,5.0
9462,610,5.0
9463,610,5.0


### Create Movie/User rating dataset

In [4]:
movie_rating_df = users_df.join(movies_df, on='index')
# save the titles mapping, movieId to title, for evaluation
titles = movie_rating_df['title']
movie_rating_df = movie_rating_df.drop('title', axis=1)
movie_rating_df.pivot_table(values='rating', index='index', columns='userId').fillna(0)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Getting data ready

In [5]:
from sklearn.model_selection import train_test_split

movies = movie_rating_df.index.values
users = movie_rating_df['userId'].values
Y = movie_rating_df['rating'].values.astype('float32')


# mean normalize the labels
mu = np.mean(Y, axis=0)
Y = Y - mu


# split the data into training set and validation set
training_movies, test_movies, training_users, test_users, training_ratings, test_ratings = train_test_split(movies,
                                                                                                            users, 
                                                                                                            Y,
                                                                                                            test_size=0.2, shuffle=True)
print(f'Training movies has the shape', training_movies.shape)
print(f'Training users has the shape', training_users.shape)
print(f'Training ratings has the shape', training_ratings.shape)
print()
print(f'Test movies has the shape', test_movies.shape)
print(f'Test users has the shape', test_users.shape)
print(f'Test ratings has the shape', test_ratings.shape)

Training movies has the shape (80668,)
Training users has the shape (80668,)
Training ratings has the shape (80668,)

Test movies has the shape (20168,)
Test users has the shape (20168,)
Test ratings has the shape (20168,)


### Create the pipeline

In [6]:
BATCH_SIZE = 512
BUFFER = 1000
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = tf.data.Dataset.from_tensor_slices((training_movies, training_users, training_ratings))
train_ds = train_ds.shuffle(BUFFER)
train_ds = train_ds.batch(BATCH_SIZE, num_parallel_calls=AUTOTUNE)
train_ds = train_ds.prefetch(AUTOTUNE)

test_ds = tf.data.Dataset.from_tensor_slices((test_movies, test_users, test_ratings))
test_ds = test_ds.batch(BATCH_SIZE, num_parallel_calls=AUTOTUNE)
test_ds = test_ds.prefetch(AUTOTUNE)

In [7]:
for movie, user, rate in train_ds.take(1):
    print(movie)
    print(user)
    print(rate)

tf.Tensor(
[5366  474 6534 3640 8301   31    0 5334  314 9369 4361 2028  994 2091
 5166  793 8569 2045 5391    5  540 4425 5263 1224  596 6797 3151 3873
 1761  505  404 9552 5277 2514  422 1949 2806 5200 9720 7212  969 5819
  239  275 3638 8023 7697  794  704 1623  131 6422  907  504 8841 6310
 1807  398  733  418 2635  185 1592  302  722 2326 7230 1044 3207 6954
 1284 1102 4618 1493 3001 5021 4791 2953 2214 1979    6 6058 5729  948
   31  858 3288  828 6405  257  315 2834 1660 2806 4801 6522 2559  926
 6102 9706 2642  778 1070 4551  412  900 3232  337 6349 2067 7166 5897
  118  911  192  622 4133 6965  126 1719 1002  907 4923   68  239 3010
   47  820 7394 1989  123  939 7248 6040  920 2450  969  898 1592 2976
 2226 7302  276 5725 2949 2461 4529  339 1162 2867 5269  224 5943 3562
 3674 1460  409  506 7214 6689 3885 5794 2079 4309 8404 3002  515 2028
 1224    0 7415 4940 3903 6607 2491  957 6947  964 7609 2122 1778 4644
 2916 2656  443 8537 3589 2195  257 8964 5335 5911 1789  483 6331 

## Define the model

In [8]:
from keras.layers import Dense, Activation, Dropout, Input, Dot, Embedding, Layer
from keras.losses import MeanSquaredError
from keras.optimizers import Adam
from keras.metrics import MeanAbsoluteError, Metric
from keras.models import Model
from keras.regularizers import L2

In [9]:
class CollaborativeFiltering(Model):
    def __init__(self, 
                 units,
                 num_movies, 
                 num_users, 
                 user_embedding_dim, 
                 movie_embbeding_dim):
        
        super(CollaborativeFiltering, self).__init__()
        self.user_embedding = Embedding(input_dim=num_users, 
                                        output_dim=user_embedding_dim, 
                                        name='user_embed')
        self.movie_embedding = Embedding(input_dim=num_movies, 
                                         output_dim=movie_embbeding_dim, 
                                         name='movie_embed')

        self.user_mapping = Dense(units=units, kernel_regularizer=L2(0.1))
        self.movie_mapping = Dense(units=units, kernel_regularizer=L2(0.1))
        self.dot = Dot(axes=1, normalize=True, name='dot')

        self.build()

    
    def build(self):
        user = Input(shape=())
        movie = Input(shape=())
        self.call(movie, user)
        self.built = True
    
    def call(self, movie, user, training=False):
        user = self.user_embedding(user)
        movie = self.movie_embedding(movie)

        user = self.user_mapping(user)
        movie = self.movie_mapping(movie)
        return self.dot([user, movie])

In [10]:
model = CollaborativeFiltering(units=512,
                               num_movies=np.unique(movies).shape[0] + 1, 
                               num_users=np.unique(users).shape[0] + 1, 
                               user_embedding_dim=512, 
                               movie_embbeding_dim=512)
model.summary()

Model: "collaborative_filtering"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 user_embed (Embedding)      (None, 512)               312832    
                                                                 
 movie_embed (Embedding)     (None, 512)               4979200   
                                                                 
 dense (Dense)               (None, 512)               262656    
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 dot (Dot)                   (None, 1)                 0         
                                                                 
Total params: 5817344 (22.19 MB)
Trainable params: 5817344 (22.19 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
loss_func = MeanSquaredError()

In [12]:
def get_scheduler(initial_learning_rate=1e-3, min_learning_rate=1e-5, weight=0.9):
    def func(epoch):
        return max(initial_learning_rate * weight ** (epoch), min_learning_rate)
    return func

In [13]:
train_mae = MeanAbsoluteError(name='train_mae')
validation_mae = MeanAbsoluteError(name='test_mae')
optimizer = Adam()
scheduler = get_scheduler(initial_learning_rate=1e-2, weight=0.93)

## Train

In [14]:
@tf.function
def training_step(movie, user, y_true):
    loss = 0.0
    with tf.GradientTape() as tape:
        y_pred = model(movie, user, training=True)
        loss = loss_func(y_true, y_pred)
        train_mae.update_state(y_true, y_pred)
        
    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return loss

In [15]:
@tf.function
def validation_step(movie, user, y_true):
    y_pred = model(movie, user, training=False)
    loss = loss_func(y_true, y_pred)
    validation_mae.update_state(y_true, y_pred)
    return loss

In [16]:
EPOCHS = 100
train_mean_losses = []
valid_mean_losses = []
train_maes = []
valid_maes = []

for epoch in range(EPOCHS):
    train_mae.reset_states()
    validation_mae.reset_states()
    mean_train_loss = 0.0
    mean_valid_loss = 0.0
    start = time.time()
    learning_rate = scheduler(epoch)
    optimizer.learning_rate = learning_rate
    print(f'Epoch {epoch + 1:>3}')

    for step, (movie, user, y_true) in enumerate(train_ds):
        loss = training_step(movie, user, y_true)
        
        mae = train_mae.result()
        mean_train_loss = mean_train_loss + (1 / (step + 1)) * (loss - mean_train_loss)
        end = time.time()
        print(f'\r{int(end - start):>3} sec | Step {step:>3}\tLoss: {mean_train_loss:>2.4f}\t MAE: {mae:>2.4f}', end='')
    print()
    train_mean_losses.append(mean_train_loss)
    train_maes.append(mae)

    for step, (movie, user, y_true) in enumerate(test_ds):
        loss = validation_step(movie, user, y_true)

        mae = validation_mae.result()
        mean_valid_loss = mean_valid_loss + (1 / (step + 1)) * (loss - mean_valid_loss)
        end = time.time()
        print(f'\r{int(end - start):>3} sec | Step {step:>3}\tLoss: {mean_valid_loss:>2.4f}\t MAE: {mae:>2.4f}', end='')
    print()
    valid_mean_losses.append(mean_valid_loss)
    valid_maes.append(mae)

history = {'loss':train_mean_losses, 'val_loss':valid_mean_losses, 'mae':train_maes, 'val_mae':valid_maes}

Epoch   1


I0000 00:00:1715017203.010304   96659 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  8 sec | Step 157	Loss: 0.8541	 MAE: 0.7140
  8 sec | Step  39	Loss: 0.7701	 MAE: 0.6755
Epoch   2
  1 sec | Step 157	Loss: 0.6790	 MAE: 0.6300
  1 sec | Step  39	Loss: 0.8078	 MAE: 0.6931
Epoch   3
  0 sec | Step 157	Loss: 0.6282	 MAE: 0.6038
  0 sec | Step  39	Loss: 0.8085	 MAE: 0.6924
Epoch   4
  0 sec | Step 157	Loss: 0.6062	 MAE: 0.5927
  0 sec | Step  39	Loss: 0.8000	 MAE: 0.6884
Epoch   5
  0 sec | Step 157	Loss: 0.5954	 MAE: 0.5859
  0 sec | Step  39	Loss: 0.7913	 MAE: 0.6834
Epoch   6
  0 sec | Step 157	Loss: 0.5813	 MAE: 0.5765
  0 sec | Step  39	Loss: 0.7856	 MAE: 0.6805
Epoch   7
  0 sec | Step 157	Loss: 0.5695	 MAE: 0.5673
  0 sec | Step  39	Loss: 0.7823	 MAE: 0.6792
Epoch   8
  0 sec | Step 157	Loss: 0.5612	 MAE: 0.5614
  0 sec | Step  39	Loss: 0.7804	 MAE: 0.6785
Epoch   9
  0 sec | Step 157	Loss: 0.5548	 MAE: 0.5572
  0 sec | Step  39	Loss: 0.7787	 MAE: 0.6778
Epoch  10
  0 sec | Step 157	Loss: 0.5508	 MAE: 0.5544
  0 sec | Step  39	Loss: 0.7782	 MAE: 0.6777
Epoch  11


## Evaluation

In [17]:
unique_movies = np.unique(movies)
test_users = (np.ones(unique_movies.shape[0]) * 249).astype('float32')
test_movies = unique_movies.astype('float32')
pred = model(test_movies, test_users, training=False)[:, 0]
indx_pred = np.argsort(pred, axis=0)[::-1]

for i in range(10):
    print(titles.iloc[indx_pred[i]])
    print(pred[indx_pred[i]].numpy() + mu)
    print()

Nine to Five (a.k.a. 9 to 5) (1980)
4.495363

Third Man, The (1949)
4.423199

Dracula (Bram Stoker's Dracula) (1992)
4.409568

Gravity (2013)
4.4060683

Braveheart (1995)
4.3816185

Pest, The (1997)
4.3786983

Three Colors: Blue (Trois couleurs: Bleu) (1993)
4.376232

Face/Off (1997)
4.3742337

Mallrats (1995)
4.3741074

Smokin' Aces (2006)
4.372439



In [18]:
max_loss = 0
y_pred = model(test_movies, test_users, training=False)
for i in range(100):
    y_hat = y_pred[i].numpy()
    y_true = test_ratings[i]
    print(f'Predicted {y_hat[0] + mu:>2.2} and the true rating was {y_true + mu:>2.2}')
    max_loss = max(max_loss, np.abs(y_hat - y_true))

Predicted 4.2 and the true rating was 4.0
Predicted 3.7 and the true rating was 4.0
Predicted 3.6 and the true rating was 3.0
Predicted 3.4 and the true rating was 5.0
Predicted 3.4 and the true rating was 3.0
Predicted 4.2 and the true rating was 2.0
Predicted 3.2 and the true rating was 4.5
Predicted 3.1 and the true rating was 2.0
Predicted 3.3 and the true rating was 5.0
Predicted 3.8 and the true rating was 4.0
Predicted 3.5 and the true rating was 3.0
Predicted 3.2 and the true rating was 2.0
Predicted 3.3 and the true rating was 2.5
Predicted 3.8 and the true rating was 4.0
Predicted 3.6 and the true rating was 3.5
Predicted 4.1 and the true rating was 2.0
Predicted 4.0 and the true rating was 2.5
Predicted 3.9 and the true rating was 3.5
Predicted 3.3 and the true rating was 4.5
Predicted 3.7 and the true rating was 4.5
Predicted 3.5 and the true rating was 3.5
Predicted 3.6 and the true rating was 3.0
Predicted 3.5 and the true rating was 2.0
Predicted 3.2 and the true rating 

In [19]:
max_loss

array([2.6941442], dtype=float32)

In [20]:
model.save('./models/collaborative_filtering_v1.tf')

INFO:tensorflow:Assets written to: ./models/collaborative_filtering_v1.tf/assets


INFO:tensorflow:Assets written to: ./models/collaborative_filtering_v1.tf/assets
