In [1]:
# filter out unnecessary warnings
import warnings
warnings.filterwarnings('ignore')

In [14]:
# To store\load the data
import pandas as pd

# To do linear algebra
import numpy as np

# To create plots
import matplotlib.pyplot as plt
import seaborn as sns


# To compute similarities between vectors
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# data load progress bars
from tqdm import tqdm

from collections import deque

# To create deep learning models
import tensorflow as tf
import keras
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To stack sparse matrices
from scipy.sparse import vstack

In [15]:
# Report which TensorFlow / Keras builds this kernel is running
for label, module in (('TF Version:', tf), ('Keras Version:', keras)):
    print(label, module.__version__)
# Versions recorded in an earlier comment (they differ from the captured output below):
# TF Version: 1.15.0
# Keras Version: 2.2.5

TF Version: 2.3.1
Keras Version: 2.4.3


In [16]:
# Relative path to the ratings CSV inside the ml-25m dataset folder
path = "./DataSet/ml-25m/ratings.csv"

In [17]:
# Load the ratings file into a DataFrame
# (the preview below shows the columns userId, movieId, rating, timestamp)
df = pd.read_csv(path)

In [18]:
# Preview the first rows of the ratings table
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


### Reduce dimensionality - remove rarely rated movies and infrequent users

In [12]:
# Thresholds: a movie / user is kept only when it has strictly more ratings than these
min_movie_ratings = 1000
min_user_ratings = 200

# Movies with enough ratings
movie_counts = df['movieId'].value_counts()
filter_movies = movie_counts[movie_counts > min_movie_ratings].index.tolist()

# Users with enough ratings
user_counts = df['userId'].value_counts()
filter_users = user_counts[user_counts > min_user_ratings].index.tolist()

# Keep only the rows whose movie AND user both passed their threshold
keep_movie = df['movieId'].isin(filter_movies)
keep_user = df['userId'].isin(filter_users)
df_filtered = df[keep_movie & keep_user]

# Drop the helper objects so they do not linger in the kernel namespace
del filter_movies, filter_users, min_movie_ratings, min_user_ratings
del movie_counts, user_counts, keep_movie, keep_user

print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filtered.shape))

Shape User-Ratings unfiltered:	(25000095, 4)
Shape User-Ratings filtered:	(13642536, 4)


### Create Train Test set

In [15]:
# Seed for the shuffle so that the train/test split (and therefore the reported
# test RMSE) can be reproduced from one run to the next.
SPLIT_SEED = 42

# Drop the unused timestamp column and shuffle the rows.
# errors='ignore' keeps this cell re-runnable: on a second execution 'timestamp'
# is already gone and a plain drop would raise KeyError.
df_filtered = (
    df_filtered
    .drop(columns='timestamp', errors='ignore')
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

# Number of rows held out for testing
n = 200000

# Split train & test set: the last n shuffled rows form the test set
df_train = df_filtered[:-n]
df_test = df_filtered[-n:]
df_train.shape, df_test.shape

((13442536, 3), (200000, 3))

### Build Item2Vec model for extracting latent features

In [13]:
# Give every distinct raw userId / movieId in the filtered data a dense 0-based
# index, numbered in order of first appearance.
unique_user_ids = df_filtered['userId'].unique()
unique_movie_ids = df_filtered['movieId'].unique()

user_id_mapping = {raw_id: position for position, raw_id in enumerate(unique_user_ids)}
movie_id_mapping = {raw_id: position for position, raw_id in enumerate(unique_movie_ids)}

In [16]:
# Column -> lookup table used to translate raw ids into embedding indices
id_columns = (('userId', user_id_mapping), ('movieId', movie_id_mapping))

# Translate the training split ...
train_user_data, train_movie_data = [
    df_train[column].map(lookup) for column, lookup in id_columns
]

# ... and the test split with the very same lookup tables
test_user_data, test_movie_data = [
    df_test[column].map(lookup) for column, lookup in id_columns
]

### !!! Generate the full dataset without filtering

In [19]:
# Distinct movie and user ids present in the unfiltered ratings.
# Series.index holds the row labels (one entry per rating row), not the ids,
# so the ids are taken from the column values instead.
full_movies = df['movieId'].unique().tolist()

full_users = df['userId'].unique().tolist()

print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))

Shape User-Ratings unfiltered:	(25000095, 4)


In [20]:
# Seed for the shuffle so that the full-data train/test split is reproducible.
FULL_SPLIT_SEED = 42

# Drop the unused timestamp column and shuffle all rating rows
df_full_dataset = (
    df.drop('timestamp', axis=1)
      .sample(frac=1, random_state=FULL_SPLIT_SEED)
      .reset_index(drop=True)
)


## Test dataset size (number of rows held out)
n = 400000

## split train-test dataset: the last n shuffled rows form the test set
df_train_full = df_full_dataset[:-n]
df_test_full = df_full_dataset[-n:]
df_train_full.shape, df_test_full.shape

((24600095, 3), (400000, 3))

In [21]:
# Dense 0-based index for every distinct raw id in the unfiltered data,
# numbered in order of first appearance.
unique_user_ids_full = df_full_dataset['userId'].unique()
unique_movie_ids_full = df_full_dataset['movieId'].unique()

user_id_mapping_full = {raw_id: position for position, raw_id in enumerate(unique_user_ids_full)}
movie_id_mapping_full = {raw_id: position for position, raw_id in enumerate(unique_movie_ids_full)}

In [14]:
# Column -> lookup table used to translate raw ids into embedding indices
id_columns_full = (('userId', user_id_mapping_full), ('movieId', movie_id_mapping_full))

# Translate the full-data training split ...
train_user_data_full, train_movie_data_full = [
    df_train_full[column].map(lookup) for column, lookup in id_columns_full
]

# ... and the full-data test split with the same lookup tables
test_user_data_full, test_movie_data_full = [
    df_test_full[column].map(lookup) for column, lookup in id_columns_full
]

### !!! Missing movieIds in the rating data (some movies were never rated by any user)

In [49]:
# Movie metadata table from the same dataset folder
movie_meta_df = pd.read_csv("./DataSet/ml-25m/movies.csv")

# Count of distinct movieId values in the metadata (missing values, if any, count as one)
total_movie_in_DB = movie_meta_df['movieId'].nunique(dropna=False)

In [50]:
# Show how many distinct movieIds the metadata file contains
total_movie_in_DB

62423

In [52]:
# Sizes that parameterise the embedding layers built in the next cell
embedding_size = 100                 # length of every latent vector
users = len(user_id_mapping_full)    # number of rows of the user embedding table
movies = total_movie_in_DB           # number of rows of the movie embedding table (distinct ids in movies.csv)

#### Item2Vec Model

In [53]:
# Each sample carries one integer id per input
user_id_input = Input(shape=(1,), name='user')
movie_id_input = Input(shape=(1,), name='movie')

# Lookup table: user index -> latent vector of length embedding_size
user_embedding_layer = Embedding(input_dim=users,
                                 output_dim=embedding_size,
                                 input_length=1,
                                 name='user_embedding')
user_embedding = user_embedding_layer(user_id_input)

# Lookup table: movie index -> latent vector of length embedding_size
movie_embedding_layer = Embedding(input_dim=movies,
                                  output_dim=embedding_size,
                                  input_length=1,
                                  name='movie_embedding')
movie_embedding = movie_embedding_layer(movie_id_input)

# Drop the length-1 sequence axis: (None, 1, embedding_size) -> (None, embedding_size)
user_vector = Reshape((embedding_size,))(user_embedding)
movie_vector = Reshape((embedding_size,))(movie_embedding)

# The prediction is the plain (non-normalised) dot product of the two latent vectors
y = Dot(axes=1, normalize=False)([user_vector, movie_vector])

# Assemble, compile with Adam on mean squared error, and print the layer table
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 100)       16254100    user[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 100)       6242300     movie[0][0]                      
_______________________________________________________________________________________

### Train with filtered data

In [None]:
# Inputs: mapped user / movie indices of the filtered training split; target: the ratings
X = [train_user_data, train_movie_data]
y = df_train['rating']

# Training hyper-parameters
batch_size, epochs, validation_split = 1024, 10, 0.1

# Train, holding out validation_split of the training rows for validation
model.fit(
    x=X,
    y=y,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split,
    shuffle=True,
    verbose=1,
)

In [14]:
# Predict ratings for the held-out filtered test rows (flattened to 1-D)
y_pred = model.predict([test_user_data, test_movie_data]).ravel()
# Clip predictions into [1.0, 5.0] in one vectorised call instead of a Python-level
# lambda per element.
# NOTE(review): the lower bound 1.0 is kept from the original; confirm it matches the
# smallest rating actually present in the data.
y_pred = np.clip(y_pred, 1.0, 5.0)
# True ratings of the same rows
y_true = df_test['rating'].values

# Root-mean-squared error between predictions and true ratings
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
print('\n\nTesting Result With DL Matrix-Factorization: {:.4f} RMSE'.format(rmse))



Testing Result With DL Matrix-Factorization: 0.7874 RMSE


### Train with full dataset

In [None]:
# Start from freshly initialised weights. When the notebook is run top to bottom,
# `model` has already been fitted with indices from user_id_mapping / movie_id_mapping
# (filtered data); the *_full mappings assign the same indices to different users and
# movies, so continuing to train those weights would mix two incompatible index spaces.
# clone_model rebuilds the same architecture with new layers (and thus new weights).
model = keras.models.clone_model(model)
model.compile(loss='mse', optimizer='adam')

# Inputs: mapped user / movie indices of the full training split; target: the ratings
X = [train_user_data_full, train_movie_data_full]
y = df_train_full['rating']

# Training hyper-parameters
batch_size = 1024
epochs = 10
validation_split = 0.1

# Train, holding out validation_split of the training rows for validation
model.fit(X, y,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=validation_split,
          shuffle=True,
          verbose=1)

In [62]:
# Predict ratings for the held-out full-data test rows (flattened to 1-D)
y_pred_full = model.predict([test_user_data_full, test_movie_data_full]).ravel()
# Clip predictions into [1.0, 5.0] in one vectorised call instead of a Python-level
# lambda per element.
# NOTE(review): the lower bound 1.0 is kept from the original; confirm it matches the
# smallest rating actually present in the data.
y_pred_full = np.clip(y_pred_full, 1.0, 5.0)
# True ratings of the same rows
y_true_full = df_test_full['rating'].values

# Root-mean-squared error. The variable keeps its original name `mse`,
# but note that the value stored is the RMSE (square root already applied).
mse = np.sqrt(mean_squared_error(y_true=y_true_full, y_pred=y_pred_full))
print('\n\nTesting Result With DL Matrix-Factorization: {:.4f} RMSE'.format(mse))



Testing Result With DL Matrix-Factorization: 1.4455 RMSE


In [56]:
# Persist the current model under 'embedding_fulldata'
# (the log output below reports assets written to that directory)
model.save('embedding_fulldata')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: embedding_fulldata\assets


In [52]:
# Fetch the two embedding tables by layer name rather than by position in model.layers,
# so the lookup keeps working if layers are added or reordered.
user_embedding_weights = model.get_layer('user_embedding').get_weights()[0]
movie_embedding_weights = model.get_layer('movie_embedding').get_weights()[0]

# Score of every user against every movie: (n_users, dim) @ (dim, n_movies).
# NOTE(review): this materialises a dense n_users x n_movies matrix. With the table
# sizes shown in the model summary (16254100 and 6242300 parameters at 100 dims) that is
# 162541 x 62423 entries, roughly 40 GB assuming 4-byte floats; consider computing
# scores per user or in batches instead.
full_movie_ratings = np.dot(user_embedding_weights, movie_embedding_weights.T)

In [65]:
# Map every raw userId to its row of predicted scores over all movies.
# The rows of full_movie_ratings follow the user embedding table, whose size is
# len(user_id_mapping_full) and which was last trained with indices from
# user_id_mapping_full, so that mapping (not the filtered-data user_id_mapping)
# must be used to pick the row.
predict_user_rating = {
    raw_user_id: full_movie_ratings[row_index]
    for raw_user_id, row_index in user_id_mapping_full.items()
}

### Load Trained Model

In [1]:
from keras.models import load_model

# Restore a previously saved model from 'neural_cf'.
# NOTE(review): 'neural_cf' is written by a model.save('neural_cf') call further down in
# this notebook, so on a fresh top-to-bottom run this load depends on a copy saved by an
# earlier session; confirm the intended cell order.
model = load_model('neural_cf')



In [8]:
# Print the layer / parameter table of the current model
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 100)       16254100    user[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 100)       5904700     movie[0][0]                      
_______________________________________________________________________________________

In [14]:
# Shape of the first weight array of model.layers[2]
# (listed as user_embedding, the third layer, in the summary above)
model.layers[2].get_weights()[0].shape

(162541, 100)

In [7]:
# Shape of the first weight array of model.layers[3]
# (listed as movie_embedding, the fourth layer, in the summary above)
model.layers[3].get_weights()[0].shape

(59047, 100)

### Deep Learning Model - Neural CF

In [11]:
# Neural CF hyper-parameters
user_embed_dim = movie_embed_dim = 100          # latent vector length for users and movies
userid_input_shape = movieid_input_shape = 1    # one integer id per sample for each input

In [12]:
# Input tensors: one id per sample for the user and one for the movie
user_id_input = Input(name='user', shape=(userid_input_shape,))
movie_id_input = Input(name='movie', shape=(movieid_input_shape,))

In [13]:
# Embedding lookup tables sized by the full-data id mappings

# users: index -> vector of length user_embed_dim
user_embedding_layer = Embedding(input_dim=len(user_id_mapping_full),
                                 output_dim=user_embed_dim,
                                 input_length=userid_input_shape,
                                 name='user_embedding')
user_embedding = user_embedding_layer(user_id_input)

# movies: index -> vector of length movie_embed_dim
movie_embedding_layer = Embedding(input_dim=len(movie_id_mapping_full),
                                  output_dim=movie_embed_dim,
                                  input_length=movieid_input_shape,
                                  name='movie_embedding')
movie_embedding = movie_embedding_layer(movie_id_input)

In [14]:
# Flatten each embedding output to a single vector per sample
user_vectors = Reshape((user_embed_dim,))(user_embedding)
movie_vectors = Reshape((movie_embed_dim,))(movie_embedding)

In [15]:
# Join the user and movie vectors side by side along the last axis
hybrid_layer = Concatenate(axis=-1)([user_vectors, movie_vectors])

In [16]:
# Hidden block: 512 ReLU units followed by 20% dropout
hidden = Dense(units=512, activation='relu')(hybrid_layer)
dense = Dropout(rate=0.2)(hidden)
# Single linear unit producing the predicted rating
output = Dense(units=1)(dense)

In [18]:
# Assemble the Neural CF model, compile with Adam on mean squared error, print the layer table
model = Model(outputs=output, inputs=[user_id_input, movie_id_input])
model.compile(optimizer='adam', loss='mse')
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 100)       16254100    user[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 100)       5904700     movie[0][0]                      
_______________________________________________________________________________________

In [None]:
# Training hyper-parameters (change epochs to look for a better model)
batch_size, epochs = 1024, 10

# Inputs: mapped user / movie indices of the full training split; target: the ratings
X = [train_user_data_full, train_movie_data_full]
y = df_train_full['rating']

# Train, holding out 10% of the training rows for validation
model.fit(
    x=X,
    y=y,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    shuffle=True,
)

In [20]:
# Predict ratings for the held-out full-data test rows (flattened to 1-D)
y_pred_full = model.predict([test_user_data_full, test_movie_data_full]).ravel()
# Clip predictions into [1.0, 5.0] in one vectorised call instead of a Python-level
# lambda per element.
# NOTE(review): the lower bound 1.0 is kept from the original; confirm it matches the
# smallest rating actually present in the data.
y_pred_full = np.clip(y_pred_full, 1.0, 5.0)
# True ratings of the same rows
y_true_full = df_test_full['rating'].values

# Root-mean-squared error. The variable keeps its original name `mse`,
# but note that the value stored is the RMSE (square root already applied).
mse = np.sqrt(mean_squared_error(y_true=y_true_full, y_pred=y_pred_full))
# The model evaluated here is the Neural CF network (concatenated embeddings + dense
# layers), so the label no longer repeats the matrix-factorization wording.
print('\n\nTesting Result With Neural CF: {:.4f} RMSE'.format(mse))



Testing Result With DL Matrix-Factorization: 0.7718 RMSE


In [21]:
# Persist the current model under 'neural_cf'
# (the log output below reports assets written to that directory)
model.save('neural_cf')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: neural_cf\assets


In [45]:
# Side-by-side table of predicted vs. actual ratings for the full-data test rows.
# 'UserId' / 'MovieId' hold the mapped embedding indices (test_*_data_full), not raw ids.
# The predictions come from y_pred_full computed in the evaluation cell; the previously
# referenced name `pred_ratings` is not defined anywhere in this notebook.
predict_df = pd.DataFrame(
    {
        'UserId': test_user_data_full.values,
        'MovieId': test_movie_data_full.values,
        'Predict_rating': np.round(y_pred_full, 1),
        'Actual_rating': y_true_full
    }
)

In [46]:
# Preview the first rows of the prediction table
predict_df.head()

Unnamed: 0,UserId,MovieId,Predict_rating,Actual_rating
0,13957,3159,2.7,3.5
1,1547,976,3.5,4.0
2,32372,866,3.7,4.5
3,10942,1766,3.8,4.0
4,35973,1628,3.0,2.5


### Save user and movie ID mappings as JSON

In [34]:
# Rebuild the full-data mappings with plain Python ints as keys and values
user_id_mapping_full = {
    int(raw_id): int(position)
    for position, raw_id in enumerate(df_full_dataset['userId'].unique())
}
movie_id_mapping_full = {
    int(raw_id): int(position)
    for position, raw_id in enumerate(df_full_dataset['movieId'].unique())
}

In [38]:
import json

# Write each id mapping to its own JSON file.
# JSON object keys are always strings, so the integer ids come back as str keys on load.
for filename, mapping in (('user_id_map.json', user_id_mapping_full),
                          ('movie_id_map.json', movie_id_mapping_full)):
    with open(filename, 'w+') as f:
        json.dump(mapping, f)