In [1]:
# filter out unncessary warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# To store\load the data
import pandas as pd

# To do linear algebra
import numpy as np

# To create plots
import matplotlib.pyplot as plt
import seaborn as sns


# To compute similarities between vectors
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# data load progress bars
from tqdm import tqdm

from collections import deque

# To create deep learning models
import tensorflow as tf
import keras
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To stack sparse matrices
from scipy.sparse import vstack

In [3]:
# check keras and TF version used
print('TF Version:', tf.__version__)
print('Keras Version:', keras.__version__)
# TF Version: 1.15.0
# Keras Version: 2.2.5

TF Version: 2.3.1
Keras Version: 2.4.3


In [4]:
path = "./DataSet/ml-25m/ratings.csv"

In [5]:
df = pd.read_csv(path)

In [9]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


### Reduce dimension - remove rarely rating movie and user

In [12]:
# Filter sparse movies
min_movie_ratings = 1000
filter_movies = (df['movieId'].value_counts()>min_movie_ratings)
filter_movies = filter_movies[filter_movies].index.tolist()

# Filter sparse users
min_user_ratings = 200
filter_users = (df['userId'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

# Actual filtering
df_filtered = df[(df['movieId'].isin(filter_movies)) & (df['userId'].isin(filter_users))]
del filter_movies, filter_users, min_movie_ratings, min_user_ratings
print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filtered.shape))

Shape User-Ratings unfiltered:	(25000095, 4)
Shape User-Ratings filtered:	(13642536, 4)


### Create Train Test set

In [13]:
# Shuffle DataFrame
df_filtered = df_filtered.drop('timestamp', axis=1).sample(frac=1).reset_index(drop=True)

# Testingsize
n = 200000

# Split train- & testset
df_train = df_filtered[:-n]
df_test = df_filtered[-n:]
df_train.shape, df_test.shape

((13442536, 3), (200000, 3))

### Build item2Vec model for extracting latent feature

In [15]:
# Create user and movie-id mapping to convert to numbers
user_id_mapping = {id:i for i, id in enumerate(df_filtered['userId'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_filtered['movieId'].unique())}

In [19]:
# use dataframe map function to map users & movies to mapped ids based on above mapping
train_user_data = df_train['userId'].map(user_id_mapping)
train_movie_data = df_train['movieId'].map(movie_id_mapping)


# do the same for test data
test_user_data = df_test['userId'].map(user_id_mapping)
test_movie_data = df_test['movieId'].map(movie_id_mapping)

In [20]:
# Get input variable-sizes
users = len(user_id_mapping)
movies = len(movie_id_mapping)
embedding_size = 100

#### Item2Vec Model

In [23]:
# use Input() to create tensors for - 'user' and 'movie'
user_id_input = Input(shape=(1,), name='user')
movie_id_input = Input(shape=(1,), name='movie')

# Create embedding layer for users 
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)

# create embedding layer for movies just like users
movie_embedding = Embedding(output_dim=embedding_size,
                            input_dim=movies, 
                            input_length=1, 
                            name='movie_embedding')(movie_id_input)

# Reshape the embedding layers
user_vector = Reshape([embedding_size])(user_embedding)
movie_vector = Reshape([embedding_size])(movie_embedding)


# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, movie_vector])

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 100)       3284800     user[0][0]                       
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 100)       379000      movie[0][0]                      
_______________________________________________________________________________________

In [None]:
# Fit model
X = [train_user_data, train_movie_data]
y = df_train['rating']

batch_size = 1024
epochs = 5
validation_split = 0.1

model.fit(X, y,
          batch_size=batch_size, 
          epochs=epochs,
          validation_split=validation_split,
          shuffle=True,
          verbose=1)

Epoch 1/5


  291/11815 [..............................] - ETA: 0s - loss: 13.11 - ETA: 6:33 - loss: 13.30 - ETA: 8:37 - loss: 13.33 - ETA: 9:33 - loss: 13.31 - ETA: 10:02 - loss: 13.308 - ETA: 10:22 - loss: 13.272 - ETA: 10:38 - loss: 13.267 - ETA: 10:48 - loss: 13.251 - ETA: 11:05 - loss: 13.258 - ETA: 11:12 - loss: 13.247 - ETA: 11:18 - loss: 13.240 - ETA: 11:19 - loss: 13.230 - ETA: 11:23 - loss: 13.239 - ETA: 11:28 - loss: 13.245 - ETA: 11:31 - loss: 13.233 - ETA: 11:32 - loss: 13.232 - ETA: 11:35 - loss: 13.212 - ETA: 11:36 - loss: 13.215 - ETA: 11:39 - loss: 13.225 - ETA: 11:40 - loss: 13.231 - ETA: 11:42 - loss: 13.235 - ETA: 11:43 - loss: 13.230 - ETA: 11:44 - loss: 13.222 - ETA: 11:46 - loss: 13.217 - ETA: 11:51 - loss: 13.215 - ETA: 11:51 - loss: 13.222 - ETA: 11:52 - loss: 13.220 - ETA: 11:53 - loss: 13.230 - ETA: 11:53 - loss: 13.226 - ETA: 11:53 - loss: 13.217 - ETA: 11:52 - loss: 13.208 - ETA: 11:52 - loss: 13.204 - ETA: 11:52 - loss: 13.207 - ETA: 11:51 - loss: 13.203 - ETA: 11:50 

  582/11815 [>.............................] - ETA: 12:02 - loss: 13.173 - ETA: 12:02 - loss: 13.173 - ETA: 12:02 - loss: 13.173 - ETA: 12:03 - loss: 13.173 - ETA: 12:03 - loss: 13.172 - ETA: 12:03 - loss: 13.171 - ETA: 12:03 - loss: 13.170 - ETA: 12:03 - loss: 13.171 - ETA: 12:03 - loss: 13.169 - ETA: 12:03 - loss: 13.168 - ETA: 12:03 - loss: 13.168 - ETA: 12:04 - loss: 13.168 - ETA: 12:04 - loss: 13.168 - ETA: 12:04 - loss: 13.168 - ETA: 12:04 - loss: 13.170 - ETA: 12:04 - loss: 13.169 - ETA: 12:04 - loss: 13.169 - ETA: 12:04 - loss: 13.169 - ETA: 12:05 - loss: 13.169 - ETA: 12:05 - loss: 13.168 - ETA: 12:05 - loss: 13.167 - ETA: 12:05 - loss: 13.169 - ETA: 12:05 - loss: 13.169 - ETA: 12:05 - loss: 13.168 - ETA: 12:05 - loss: 13.168 - ETA: 12:05 - loss: 13.168 - ETA: 12:05 - loss: 13.168 - ETA: 12:05 - loss: 13.168 - ETA: 12:05 - loss: 13.168 - ETA: 12:06 - loss: 13.168 - ETA: 12:06 - loss: 13.168 - ETA: 12:06 - loss: 13.168 - ETA: 12:06 - loss: 13.168 - ETA: 12:07 - loss: 13.168 - E

  873/11815 [=>............................] - ETA: 11:55 - loss: 12.936 - ETA: 11:54 - loss: 12.933 - ETA: 11:54 - loss: 12.930 - ETA: 11:54 - loss: 12.928 - ETA: 11:54 - loss: 12.924 - ETA: 11:54 - loss: 12.921 - ETA: 11:54 - loss: 12.918 - ETA: 11:54 - loss: 12.915 - ETA: 11:54 - loss: 12.911 - ETA: 11:54 - loss: 12.908 - ETA: 11:53 - loss: 12.905 - ETA: 11:53 - loss: 12.902 - ETA: 11:53 - loss: 12.899 - ETA: 11:53 - loss: 12.896 - ETA: 11:53 - loss: 12.892 - ETA: 11:53 - loss: 12.888 - ETA: 11:52 - loss: 12.885 - ETA: 11:52 - loss: 12.882 - ETA: 11:52 - loss: 12.878 - ETA: 11:52 - loss: 12.875 - ETA: 11:52 - loss: 12.871 - ETA: 11:52 - loss: 12.868 - ETA: 11:52 - loss: 12.864 - ETA: 11:52 - loss: 12.861 - ETA: 11:53 - loss: 12.857 - ETA: 11:53 - loss: 12.853 - ETA: 11:53 - loss: 12.849 - ETA: 11:53 - loss: 12.845 - ETA: 11:53 - loss: 12.842 - ETA: 11:53 - loss: 12.838 - ETA: 11:54 - loss: 12.834 - ETA: 11:54 - loss: 12.830 - ETA: 11:54 - loss: 12.827 - ETA: 11:54 - loss: 12.822 - E

 1179/11815 [=>............................] - ETA: 11:42 - loss: 10.851 - ETA: 11:42 - loss: 10.841 - ETA: 11:42 - loss: 10.832 - ETA: 11:42 - loss: 10.823 - ETA: 11:42 - loss: 10.814 - ETA: 11:41 - loss: 10.805 - ETA: 11:41 - loss: 10.796 - ETA: 11:41 - loss: 10.786 - ETA: 11:41 - loss: 10.777 - ETA: 11:41 - loss: 10.768 - ETA: 11:41 - loss: 10.759 - ETA: 11:41 - loss: 10.750 - ETA: 11:41 - loss: 10.741 - ETA: 11:41 - loss: 10.732 - ETA: 11:41 - loss: 10.723 - ETA: 11:40 - loss: 10.714 - ETA: 11:40 - loss: 10.705 - ETA: 11:40 - loss: 10.696 - ETA: 11:40 - loss: 10.687 - ETA: 11:40 - loss: 10.677 - ETA: 11:40 - loss: 10.669 - ETA: 11:40 - loss: 10.659 - ETA: 11:40 - loss: 10.650 - ETA: 11:39 - loss: 10.641 - ETA: 11:39 - loss: 10.632 - ETA: 11:39 - loss: 10.624 - ETA: 11:39 - loss: 10.614 - ETA: 11:39 - loss: 10.606 - ETA: 11:39 - loss: 10.597 - ETA: 11:39 - loss: 10.588 - ETA: 11:39 - loss: 10.579 - ETA: 11:38 - loss: 10.570 - ETA: 11:38 - loss: 10.561 - ETA: 11:38 - loss: 10.552 - E

 1493/11815 [==>...........................] - ETA: 11:33 - loss: 8.48 - ETA: 11:33 - loss: 8.47 - ETA: 11:33 - loss: 8.47 - ETA: 11:33 - loss: 8.46 - ETA: 11:33 - loss: 8.46 - ETA: 11:33 - loss: 8.45 - ETA: 11:33 - loss: 8.44 - ETA: 11:33 - loss: 8.44 - ETA: 11:33 - loss: 8.43 - ETA: 11:32 - loss: 8.42 - ETA: 11:32 - loss: 8.42 - ETA: 11:32 - loss: 8.41 - ETA: 11:32 - loss: 8.41 - ETA: 11:32 - loss: 8.40 - ETA: 11:32 - loss: 8.39 - ETA: 11:32 - loss: 8.39 - ETA: 11:32 - loss: 8.38 - ETA: 11:31 - loss: 8.38 - ETA: 11:31 - loss: 8.37 - ETA: 11:31 - loss: 8.36 - ETA: 11:31 - loss: 8.36 - ETA: 11:31 - loss: 8.35 - ETA: 11:31 - loss: 8.35 - ETA: 11:31 - loss: 8.34 - ETA: 11:31 - loss: 8.33 - ETA: 11:31 - loss: 8.33 - ETA: 11:31 - loss: 8.32 - ETA: 11:31 - loss: 8.32 - ETA: 11:30 - loss: 8.31 - ETA: 11:30 - loss: 8.30 - ETA: 11:30 - loss: 8.30 - ETA: 11:30 - loss: 8.29 - ETA: 11:30 - loss: 8.29 - ETA: 11:30 - loss: 8.28 - ETA: 11:30 - loss: 8.27 - ETA: 11:30 - loss: 8.27 - ETA: 11:30 - loss

 1807/11815 [===>..........................] - ETA: 11:10 - loss: 6.90 - ETA: 11:10 - loss: 6.89 - ETA: 11:10 - loss: 6.89 - ETA: 11:10 - loss: 6.88 - ETA: 11:10 - loss: 6.88 - ETA: 11:10 - loss: 6.88 - ETA: 11:09 - loss: 6.87 - ETA: 11:09 - loss: 6.87 - ETA: 11:09 - loss: 6.86 - ETA: 11:09 - loss: 6.86 - ETA: 11:09 - loss: 6.86 - ETA: 11:09 - loss: 6.85 - ETA: 11:09 - loss: 6.85 - ETA: 11:09 - loss: 6.84 - ETA: 11:09 - loss: 6.84 - ETA: 11:08 - loss: 6.84 - ETA: 11:08 - loss: 6.83 - ETA: 11:08 - loss: 6.83 - ETA: 11:08 - loss: 6.82 - ETA: 11:08 - loss: 6.82 - ETA: 11:08 - loss: 6.82 - ETA: 11:08 - loss: 6.81 - ETA: 11:08 - loss: 6.81 - ETA: 11:08 - loss: 6.80 - ETA: 11:07 - loss: 6.80 - ETA: 11:07 - loss: 6.80 - ETA: 11:07 - loss: 6.79 - ETA: 11:07 - loss: 6.79 - ETA: 11:07 - loss: 6.78 - ETA: 11:07 - loss: 6.78 - ETA: 11:07 - loss: 6.78 - ETA: 11:07 - loss: 6.77 - ETA: 11:06 - loss: 6.77 - ETA: 11:06 - loss: 6.77 - ETA: 11:06 - loss: 6.76 - ETA: 11:06 - loss: 6.76 - ETA: 11:06 - loss

 2121/11815 [====>.........................] - ETA: 10:52 - loss: 5.84 - ETA: 10:52 - loss: 5.83 - ETA: 10:52 - loss: 5.83 - ETA: 10:52 - loss: 5.83 - ETA: 10:51 - loss: 5.83 - ETA: 10:51 - loss: 5.82 - ETA: 10:51 - loss: 5.82 - ETA: 10:51 - loss: 5.82 - ETA: 10:51 - loss: 5.82 - ETA: 10:51 - loss: 5.81 - ETA: 10:51 - loss: 5.81 - ETA: 10:51 - loss: 5.81 - ETA: 10:51 - loss: 5.80 - ETA: 10:51 - loss: 5.80 - ETA: 10:51 - loss: 5.80 - ETA: 10:51 - loss: 5.80 - ETA: 10:51 - loss: 5.79 - ETA: 10:50 - loss: 5.79 - ETA: 10:50 - loss: 5.79 - ETA: 10:50 - loss: 5.79 - ETA: 10:50 - loss: 5.78 - ETA: 10:50 - loss: 5.78 - ETA: 10:50 - loss: 5.78 - ETA: 10:50 - loss: 5.77 - ETA: 10:50 - loss: 5.77 - ETA: 10:50 - loss: 5.77 - ETA: 10:50 - loss: 5.77 - ETA: 10:50 - loss: 5.76 - ETA: 10:49 - loss: 5.76 - ETA: 10:49 - loss: 5.76 - ETA: 10:49 - loss: 5.75 - ETA: 10:49 - loss: 5.75 - ETA: 10:49 - loss: 5.75 - ETA: 10:49 - loss: 5.75 - ETA: 10:49 - loss: 5.74 - ETA: 10:49 - loss: 5.74 - ETA: 10:49 - loss

 2435/11815 [=====>........................] - ETA: 10:26 - loss: 5.09 - ETA: 10:26 - loss: 5.09 - ETA: 10:26 - loss: 5.08 - ETA: 10:26 - loss: 5.08 - ETA: 10:26 - loss: 5.08 - ETA: 10:26 - loss: 5.08 - ETA: 10:26 - loss: 5.07 - ETA: 10:26 - loss: 5.07 - ETA: 10:26 - loss: 5.07 - ETA: 10:26 - loss: 5.07 - ETA: 10:26 - loss: 5.07 - ETA: 10:25 - loss: 5.06 - ETA: 10:25 - loss: 5.06 - ETA: 10:25 - loss: 5.06 - ETA: 10:25 - loss: 5.06 - ETA: 10:25 - loss: 5.06 - ETA: 10:25 - loss: 5.05 - ETA: 10:25 - loss: 5.05 - ETA: 10:25 - loss: 5.05 - ETA: 10:25 - loss: 5.05 - ETA: 10:25 - loss: 5.05 - ETA: 10:25 - loss: 5.04 - ETA: 10:25 - loss: 5.04 - ETA: 10:24 - loss: 5.04 - ETA: 10:24 - loss: 5.04 - ETA: 10:24 - loss: 5.04 - ETA: 10:24 - loss: 5.03 - ETA: 10:24 - loss: 5.03 - ETA: 10:24 - loss: 5.03 - ETA: 10:24 - loss: 5.03 - ETA: 10:24 - loss: 5.03 - ETA: 10:24 - loss: 5.02 - ETA: 10:24 - loss: 5.02 - ETA: 10:24 - loss: 5.02 - ETA: 10:24 - loss: 5.02 - ETA: 10:24 - loss: 5.02 - ETA: 10:23 - loss

 2749/11815 [=====>........................] - ETA: 10:04 - loss: 4.53 - ETA: 10:04 - loss: 4.53 - ETA: 10:04 - loss: 4.53 - ETA: 10:03 - loss: 4.52 - ETA: 10:03 - loss: 4.52 - ETA: 10:03 - loss: 4.52 - ETA: 10:03 - loss: 4.52 - ETA: 10:03 - loss: 4.52 - ETA: 10:03 - loss: 4.52 - ETA: 10:03 - loss: 4.51 - ETA: 10:03 - loss: 4.51 - ETA: 10:03 - loss: 4.51 - ETA: 10:03 - loss: 4.51 - ETA: 10:03 - loss: 4.51 - ETA: 10:03 - loss: 4.51 - ETA: 10:02 - loss: 4.51 - ETA: 10:02 - loss: 4.50 - ETA: 10:02 - loss: 4.50 - ETA: 10:02 - loss: 4.50 - ETA: 10:02 - loss: 4.50 - ETA: 10:02 - loss: 4.50 - ETA: 10:02 - loss: 4.50 - ETA: 10:02 - loss: 4.49 - ETA: 10:02 - loss: 4.49 - ETA: 10:02 - loss: 4.49 - ETA: 10:02 - loss: 4.49 - ETA: 10:02 - loss: 4.49 - ETA: 10:01 - loss: 4.49 - ETA: 10:01 - loss: 4.49 - ETA: 10:01 - loss: 4.48 - ETA: 10:01 - loss: 4.48 - ETA: 10:01 - loss: 4.48 - ETA: 10:01 - loss: 4.48 - ETA: 10:01 - loss: 4.48 - ETA: 10:01 - loss: 4.48 - ETA: 10:01 - loss: 4.48 - ETA: 10:01 - loss





































