# Movies Recommender System using Neural Network

# import important libraries
- We will use familiar packages, NumPy, TensorFlow and helpful routines from scikit-learn. We will also use tabulate to neatly print tables and Pandas to organize tabular data.

In [1]:
import numpy as np
import numpy.ma as ma
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
#from recsysNN_utils import *
import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.precision", 1)

In [2]:
movies = pd.read_csv("H:\\Data Science Repository\\Projects\\movie recommender system\\ml-latest-small\\movies.csv")
links = pd.read_csv("H:\\Data Science Repository\\Projects\\movie recommender system\\ml-latest-small\\links.csv")
ratings = pd.read_csv("H:\\Data Science Repository\\Projects\\movie recommender system\\ml-latest-small\\ratings.csv")
tags = pd.read_csv("H:\\Data Science Repository\\Projects\\movie recommender system\\ml-latest-small\\tags.csv")

# Movies DataFrame Exploration and Manipulation

In [3]:
# we have 9742 movie
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# split year from title to use it as a feature engineering 
movies[['title','year','0','1','2']] = movies.title.apply(lambda x: pd.Series(str(x).split("(")))
movies = movies.drop(['0','1','2'], axis=1)
movies['year'] = movies['year'].str.replace(')','')

In [5]:
# ectract years from movies data Frame
years = pd.DataFrame()
years['year'] = movies['year']
# generate new list to identify the non integer values
lis = pd.DataFrame()
lis['x'] = years.year.str.isnumeric()
# list the index of non integer values inside array
y = lis.index[lis['x'] == False].tolist()
# insert the year mode inside the cells
for i in y:
    years.at[i, 'year'] = 2015

In [6]:
movies = movies.drop('year', axis=1)
movies = pd.concat([movies, years], axis=1)

In [7]:
movies = movies.fillna(0)

In [8]:
# add the features' labels of the movies to the original dataFrame
movies[['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
          'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
          'War', 'Western']] = 0

In [9]:
# split the movie features to be able to reformulate the features in proper way for the algorithm learning phase
# g symbole refers to genre
movies[['g0','g1','g2','g3','g4','g5','g6','g7','g8','g9']] = movies.genres.apply(lambda x: pd.Series(str(x).split("|")))

In [10]:
# the original data Frame into two dataFrames 
# the dataFrame in which the features will be extracted
# the other dataFrame that the features will be assigned
movies_1 = movies.filter(movies.loc[:, 'Action':'Western'].columns, axis=1)
movies_2 = movies.drop(movies.loc[:, 'movieId':'Western'].columns, axis=1)

In [11]:
# loop through the features of the movies
# extract the desirable index
# append on the features dataFrame in a proper way
for j in movies_1:
    for i in movies_2.columns:
        indx = movies_2[movies_2[i]==j].index.values
        for i in indx:
            movies_1.at[i, j] = 1

In [12]:
movies = movies.filter(['movieId','year'])
movies_2 = movies

In [13]:
# merge the two dataframes
movies = pd.concat([movies_2, movies_1], axis=1)

In [14]:
# sort the movies' ids
ratings = ratings.sort_values('movieId')
# extract the rating average of each movie
movies_features_list = []
for i in ratings.movieId:
    movies_features_list.append(ratings.loc[ratings['movieId'] == i,'rating'].mean().round(1))

In [15]:
# add the extracted rating average of each movie to the original dataFrame
movies_features = pd.DataFrame()
movies_features['avg rating'] = movies_features_list
movies.insert(loc = 2,column = 'avg rating',value = movies_features)

In [16]:
movie_Id_rating = pd.DataFrame()
movie_Id_rating = movies.filter(['movieId','avg rating'])

In [17]:
# generate vector of movie vectors for prediction phase
item_vecs = np.array(movies)
item_vecs = item_vecs.astype('float64')

In [18]:
movieId = pd.DataFrame()
movieId['movieIds'] = movies.filter(['movieId'])

In [19]:
movies

Unnamed: 0,movieId,year,avg rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.9,0,1,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,2,1995,3.9,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,1995,3.9,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1995,3.9,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,1995,3.9,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,2017,3.6,1,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
9738,193583,2017,3.6,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
9739,193585,2017,3.6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9740,193587,2018,3.6,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Users DataFrame Exploration and Manipulation

In [20]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
81531,517,1,4.0,1487954343
30517,213,1,3.5,1316196157
81082,514,1,4.0,1533872400
30601,214,1,3.0,853937855
...,...,...,...,...
27256,184,193581,4.0,1537109082
27257,184,193583,3.5,1537109545
27258,184,193585,3.5,1537109805
27259,184,193587,3.5,1537110021


In [19]:
# append the user features to the user dataFrame
users = ratings.merge(movies,on='movieId')
# drop un-necessary columns
users = users.drop(['timestamp', 'year', 'avg rating'], axis=1)

In [20]:
# sort the users data according to users' id's
users = users.sort_values('userId')

In [21]:
# extract non-repeated users' id's ascending
# to be able to extract the feature of each user for each movies' categories
ids = pd.DataFrame()
ids['userId'] = users["userId"].unique()

In [22]:
# extract the features of each user for each category
users_features_list = []
for i in ids.userId:
    for j in users[['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
          'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
          'War', 'Western']]:
        users_features_list.append(round(users.loc[(users['userId'] == i) & (users[j] == 1),'rating'].mean(),1))

In [23]:
# append the average rating for features of each user for each category
n = 0
for i in ids.index:
    for j in users[['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
          'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
          'War', 'Western']]:
        ids.loc[i,j] = users_features_list[n]
        n = n + 1

In [24]:
# df = df.filter(['a', 'b']) eliminate all except 'a' and 'b'
users = users.drop(users.loc[:, 'Action':'Western'].columns, axis=1)

In [25]:
# merge the two dataFrames according to the userId
users = users.merge(ids,on='userId')

In [26]:
# fill the nan values with zero
users = users.fillna(0)

In [27]:
# ratings should be sorted to be able to extract the y train with respect to users' id's
ratings.sort_values('userId', inplace=True)

In [28]:
# extract the moviie ratings given by the users
y_train = ratings.rating
y_train = np.array(y_train)

In [29]:
# the next 4 cells related to handling the movie structure
# by generating new columns of movies w.r.t. the users 
users_2 = users.drop(users.loc[:, 'rating':'Western'].columns, axis=1)

In [30]:
movies = users_2.merge(movies, on='movieId')

In [31]:
movies = movies.sort_values(['userId', 'movieId'])

In [32]:
users = users.sort_values(['userId', 'movieId'])

In [33]:
userid_2 = users.query('userId == 2')

In [34]:
userid_2_indx = users.query('userId == 2').index

In [35]:
userid_2_vec = users.query('userId == 2')

In [37]:
userid_2_vec = userid_2_vec.drop(['movieId', 'rating'], axis=1)
userid_2_vec

Unnamed: 0,userId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
233,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
243,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
248,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
247,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
255,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
238,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
254,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
240,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
253,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
257,2,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5


In [38]:
userid_2_vec = np.array(userid_2_vec)
userid_2_vec = userid_2_vec[0:1]
userid_2_vec

array([[2. , 4. , 4.2, 0. , 0. , 4. , 3.8, 4.3, 3.9, 0. , 0. , 3. , 0. ,
        4. , 4.5, 3.9, 3.7, 4.5, 3.5]])

In [39]:
users

Unnamed: 0,userId,movieId,rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
61,1,3,4.0,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
104,1,6,4.0,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
196,1,47,5.0,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
130,1,50,5.0,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99756,610,166534,4.0,3.6,3.7,3.9,3.7,3.7,3.8,4.2,...,3.6,4.4,3.5,3.9,3.8,3.7,3.7,3.6,3.8,3.7
99546,610,168248,5.0,3.6,3.7,3.9,3.7,3.7,3.8,4.2,...,3.6,4.4,3.5,3.9,3.8,3.7,3.7,3.6,3.8,3.7
99554,610,168250,5.0,3.6,3.7,3.9,3.7,3.7,3.8,4.2,...,3.6,4.4,3.5,3.9,3.8,3.7,3.7,3.6,3.8,3.7
99782,610,168252,5.0,3.6,3.7,3.9,3.7,3.7,3.8,4.2,...,3.6,4.4,3.5,3.9,3.8,3.7,3.7,3.6,3.8,3.7


In [40]:
movies

Unnamed: 0,userId,movieId,year,avg rating,Action,Adventure,Animation,Children,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,1995,3.9,0,1,1,1,1,0,...,1,0,0,0,0,0,0,0,0,0
4651,1,3,1995,3.9,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
7462,1,6,1995,3.9,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
13795,1,47,2015,3.9,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
9259,1,50,1995,3.9,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81379,610,166534,2017,3.5,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
67352,610,168248,2017,3.3,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
80938,610,168250,2017,3.3,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
53912,610,168252,2017,3.3,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# desirable dataFrames
-  A single training example consists of a row from both the user and movie arrays and a rating from y_train.

<figure>
    <center> <img src="movies_data.png"   style="width:500px;height:280px;" ></center>
</figure>

# user table

In [41]:
users = users.drop(['movieId','rating'], axis=1)
users

Unnamed: 0,userId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,4.3,4.4,4.7,4.5,4.3,4.4,0.0,4.5,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
61,1,4.3,4.4,4.7,4.5,4.3,4.4,0.0,4.5,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
104,1,4.3,4.4,4.7,4.5,4.3,4.4,0.0,4.5,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
196,1,4.3,4.4,4.7,4.5,4.3,4.4,0.0,4.5,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
130,1,4.3,4.4,4.7,4.5,4.3,4.4,0.0,4.5,4.3,5.0,3.5,4.7,4.2,4.3,4.2,4.1,4.5,4.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99756,610,3.6,3.7,3.9,3.7,3.7,3.8,4.2,3.9,3.6,4.4,3.5,3.9,3.8,3.7,3.7,3.6,3.8,3.7
99546,610,3.6,3.7,3.9,3.7,3.7,3.8,4.2,3.9,3.6,4.4,3.5,3.9,3.8,3.7,3.7,3.6,3.8,3.7
99554,610,3.6,3.7,3.9,3.7,3.7,3.8,4.2,3.9,3.6,4.4,3.5,3.9,3.8,3.7,3.7,3.6,3.8,3.7
99782,610,3.6,3.7,3.9,3.7,3.7,3.8,4.2,3.9,3.6,4.4,3.5,3.9,3.8,3.7,3.7,3.6,3.8,3.7


# movie table

In [42]:
movies = movies.drop('userId', axis=1)
movies

Unnamed: 0,movieId,year,avg rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.9,0,1,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4651,3,1995,3.9,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7462,6,1995,3.9,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
13795,47,2015,3.9,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
9259,50,1995,3.9,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81379,166534,2017,3.5,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
67352,168248,2017,3.3,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
80938,168250,2017,3.3,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
53912,168252,2017,3.3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# ratings (the movie rating given by the user)

In [39]:
y_train

array([4. , 5. , 5. , ..., 4.5, 2.5, 3. ])

In [40]:
print(len(y_train))

100836


In [43]:
num_user_features = users.shape[1] - 1 # remove userid during training
num_item_features = movies.shape[1] - 1 # remove movieid during training

uvs = 3  # user genre vector start
ivs = 3  # item genre vector start
u_s = 1  # start of columns to use in training, user
i_s = 1  # start of columns to use in training, items
print(f"Number of training vectors: {len(movies)}")

Number of training vectors: 100836


# Data Scaling

In [44]:
# scale training data
item_train_unscaled = movies
user_train_unscaled = users
y_train_unscaled    = y_train

scalerItem = StandardScaler()
scalerItem.fit(movies)
movies = scalerItem.transform(movies)

scalerUser = StandardScaler()
scalerUser.fit(users)
users = scalerUser.transform(users)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y_train.reshape(-1, 1))
y_train = scalerTarget.transform(y_train.reshape(-1, 1))
#ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

#print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(movies)))
#print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(users)))

In [43]:
print(len(y_train))

100836


In [45]:
movies_train, movies_test = train_test_split(movies, train_size=0.80, shuffle=True, random_state=1)
users_train, users_test = train_test_split(users, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test = train_test_split(y_train, train_size=0.80, shuffle=True, random_state=1)

In [46]:
print(f"movie training data shape: {movies_train.shape}")
print(f"movie test data shape: {movies_test.shape}")
print(f"users training data shape: {movies_train.shape}")
print(f"users test data shape: {movies_test.shape}")
print(f"y_train data shape: {y_train.shape}")
print(f"y_test data shape: {y_test.shape}")

movie training data shape: (80668, 21)
movie test data shape: (20168, 21)
users training data shape: (80668, 21)
users test data shape: (20168, 21)
y_train data shape: (80668, 1)
y_test data shape: (20168, 1)


In [47]:
print(np.any(np.isnan(movies_train)))
print(np.any(np.isnan(movies_test)))
print("----------------------")
print(np.any(np.isnan(users_train)))
print(np.any(np.isnan(users_test)))
print("----------------------")
print(np.any(np.isnan(y_train)))
print(np.any(np.isnan(y_test)))

False
False
----------------------
False
False
----------------------
False
False


# Model Build

In [48]:
# GRADED_CELL
# UNQ_C1

num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    ### START CODE HERE ###     
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(32)
    ### END CODE HERE ###  
])

item_NN = tf.keras.models.Sequential([
    ### START CODE HERE ###     
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(32)
    ### END CODE HERE ###  
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 18)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           41888       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           42400       ['input_2[0][0]']                
                                                                                              

In [49]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss=cost_fn)

# Model Train

In [50]:
tf.random.set_seed(1)
model.fit([users_train[:, u_s:], movies_train[:, i_s:]], y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x217558a0550>

# Model Evaluate

In [51]:
model.evaluate([users_test[:, u_s:], movies_test[:, i_s:]], y_test)



0.17443960905075073

# Predictions of a new user

In [50]:
new_user_id = 5000
#new_rating_ave = 0.0
new_action = 0.0
new_adventure = 0.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 0.0
new_Film_Noir = 0.0
new_horror = 0.0
new_Musical = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0
new_War = 5.0
new_Western = 0.0
#new_rating_count = 3
u_s = 1 # start column for users data
i_s = 1 # start column for movies data

user_vec = np.array([[new_user_id, new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary, new_drama, new_fantasy, new_Film_Noir, 
                      new_horror, new_Musical, new_mystery, new_romance, new_scifi, new_thriller,
                      new_War, new_Western]])

- The new user enjoys movies from the adventure, fantasy genres. Let's find the top-rated movies for the new user.
- Below, we'll use a set of movie/item vectors, item_vecs that have a vector for each movie in the training/test set. This is matched with the new user vector above and the scaled vectors are used to predict ratings for all the movies.

In [51]:
# we have 9742 movie
# generate and replicate the user vector to match the number movies in the data set.
repetitions = 9742
user_vecs = np.repeat(user_vec, repetitions, 0)

In [52]:
# scale our user and item vectors
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

In [53]:
# make a prediction
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

In [54]:
# unscale y prediction 
y_pu = scalerTarget.inverse_transform(y_p)

In [55]:
n = []
for i in y_pu:
    for j in i:
        n.append(round(j,1))

In [56]:
prediction = pd.DataFrame()
prediction['y_pred'] = n
prediction['movieId'] = movieId
movies_df = pd.read_csv("H:\\Data Science Repository\\Projects\\movie recommender system\\ml-latest-small\\movies.csv")

In [57]:
# merge the two dataFrames according to the userId
prediction = prediction.merge(movies_df,on='movieId')

In [58]:
prediction = prediction.sort_values('y_pred', ascending = False)

In [59]:
prediction

Unnamed: 0,y_pred,movieId,title,genres
0,0.4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
6497,0.4,53322,Ocean's Thirteen (2007),Crime|Thriller
6490,0.4,53129,Mr. Brooks (2007),Crime|Drama|Thriller
6491,0.4,53138,"Librarian: Return to King Solomon's Mines, The...",Action|Adventure|Fantasy
6492,0.4,53140,"Librarian: Quest for the Spear, The (2004)",Action|Adventure|Comedy|Fantasy|Romance
...,...,...,...,...
3248,0.4,4390,Rape Me (Baise-moi) (2000),Crime|Drama|Thriller
3249,0.4,4392,Alice (1990),Comedy|Drama|Fantasy|Romance
3250,0.4,4393,Another Woman (1988),Drama
3251,0.4,4394,Beach Blanket Bingo (1965),Comedy|Musical


# Prediction of an existing user
- Let's look at the predictions for "user 2", one of the users in the data set. We can compare the predicted ratings with the model's ratings.

In [52]:
movies_df = pd.read_csv("H:\\Data Science Repository\\Projects\\movie recommender system\\ml-latest-small\\movies.csv")
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [53]:
movie_Id_rating

Unnamed: 0,movieId,avg rating
0,1,3.9
1,2,3.9
2,3,3.9
3,4,3.9
4,5,3.9
...,...,...
9737,193581,3.6
9738,193583,3.6
9739,193585,3.6
9740,193587,3.6


In [63]:
userid_2

Unnamed: 0,userId,movieId,rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
233,2,318,3.0,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
243,2,333,4.0,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
248,2,1704,4.5,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
247,2,3578,4.0,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
255,2,6874,4.0,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
238,2,8798,3.5,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
254,2,46970,4.0,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
240,2,48516,4.0,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
253,2,58559,4.5,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5
257,2,60756,5.0,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,0.0,3.0,0.0,4.0,4.5,3.9,3.7,4.5,3.5


In [54]:
userid_2_moviesss = pd.DataFrame()
userid_2_moviesss['movieId'] = userid_2.filter(['movieId'])

In [55]:
userid_2_moviesss

Unnamed: 0,movieId
233,318
243,333
248,1704
247,3578
255,6874
238,8798
254,46970
240,48516
253,58559
257,60756


In [56]:
# merge the two dataFrames according to the userId
movie_Id_rating = movie_Id_rating.merge(movies_df,on='movieId')

In [57]:
movie_Id_rating

Unnamed: 0,movieId,avg rating,title,genres
0,1,3.9,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.9,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.9,Grumpier Old Men (1995),Comedy|Romance
3,4,3.9,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,3.9,Father of the Bride Part II (1995),Comedy
...,...,...,...,...
9737,193581,3.6,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,3.6,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,3.6,Flint (2017),Drama
9740,193587,3.6,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [46]:
userid_2_vec

array([[2. , 4. , 4.2, 0. , 0. , 4. , 3.8, 4.3, 3.9, 0. , 0. , 3. , 0. ,
        4. , 4.5, 3.9, 3.7, 4.5, 3.5]])

In [58]:
# we have 9742 movie
# generate and replicate the user vector to match the number movies in the data set.
repetitions = 9742
user_vecs = np.repeat(userid_2_vec, repetitions, 0)

In [59]:
# extract the moviie ratings given by the users
y_vecs_userid_2 = userid_2.rating
y_vecs_userid_2 = np.array(y_vecs_userid_2)

In [60]:
y_vec_userid_2 = pd.DataFrame()
y_vec_userid_2['y'] = userid_2.rating
y_vec_userid_2

Unnamed: 0,y
233,3.0
243,4.0
248,4.5
247,4.0
255,4.0
238,3.5
254,4.0
240,4.0
253,4.5
257,5.0


In [61]:
y_vecs = [0] * 9742
n = 0
for i in userid_2_indx:
    y_vecs[i] = y_vecs_userid_2[n]
    n = n + 1

In [62]:
user_vecs

array([[2. , 4. , 4.2, ..., 3.7, 4.5, 3.5],
       [2. , 4. , 4.2, ..., 3.7, 4.5, 3.5],
       [2. , 4. , 4.2, ..., 3.7, 4.5, 3.5],
       ...,
       [2. , 4. , 4.2, ..., 3.7, 4.5, 3.5],
       [2. , 4. , 4.2, ..., 3.7, 4.5, 3.5],
       [2. , 4. , 4.2, ..., 3.7, 4.5, 3.5]])

In [63]:
print(y_vecs)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.5, 3.0, 4.0, 5.0, 4.0, 5.0, 3.5, 3.0, 4.0, 3.0, 2.5, 4.0, 3.5, 4.0, 4.0, 4.0, 4.5, 4.5, 3.0, 3.5, 5.0, 4.5, 4.0, 4.0, 4.5, 5.0, 5.0, 5.0, 2.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [64]:
item_vecs

array([[1.00000e+00, 1.99500e+03, 3.90000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [2.00000e+00, 1.99500e+03, 3.90000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.00000e+00, 1.99500e+03, 3.90000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [1.93585e+05, 2.01700e+03, 3.60000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.93587e+05, 2.01800e+03, 3.60000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.93609e+05, 1.99100e+03, 3.60000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [65]:
# scale our user and item vectors
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

In [66]:
# make a prediction
u_s = 1
i_s = 1
y_p = model.predict([suser_vecs[:, u_s:], sitem_vecs[:, i_s:]])

In [67]:
# unscale y prediction 
y_pu = scalerTarget.inverse_transform(y_p)

In [68]:
n = []
for i in y_pu:
    for j in i:
        n.append(round(j,1))

In [69]:
prediction2 = pd.DataFrame()
prediction2['y_pred'] = n
prediction2['y'] = y_vec_userid_2
prediction2['movieId'] = userid_2_moviesss

In [70]:
# merge the two dataFrames according to the userId
prediction2 = prediction2.merge(movie_Id_rating,on='movieId')

In [79]:
u_id_2 = pd.DataFrame()
u_id_2['user'] = userid_2.userId

In [77]:
prediction2 = prediction2.sort_values('y_pred', ascending = False)

In [78]:
prediction2

Unnamed: 0,y_pred,y,user,movieId,avg rating,title,genres
20,3.9,5.0,,89774.0,3.4,Warrior (2011),Drama
19,3.9,3.5,,115713.0,3.4,Ex Machina (2015),Drama|Sci-Fi|Thriller
13,3.9,4.0,,112552.0,3.4,Whiplash (2014),Drama
17,3.9,4.5,,80489.0,3.2,"Town, The (2010)",Crime|Drama|Thriller
10,3.9,2.5,,91658.0,4.0,"Girl with the Dragon Tattoo, The (2011)",Drama|Thriller
1,3.8,3.0,,318.0,3.4,"Shawshank Redemption, The (1994)",Crime|Drama
2,3.8,4.0,,79132.0,3.2,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
3,3.8,5.0,,106782.0,4.2,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama
4,3.8,4.0,,86345.0,3.2,Louis C.K.: Hilarious (2010),Comedy
5,3.8,5.0,,131724.0,4.4,The Jinx: The Life and Deaths of Robert Durst ...,Documentary


# finding similar items
- The neural network above produces two feature vectors, a user feature vector $v_u$, and a movie feature vector, $v_m$. These are 32 entry vectors whose values are difficult to interpret. However, similar items will have similar vectors. This information can be used to make recommendations. For example, if a user has rated "Toy Story 3" highly, one could recommend similar movies by selecting movies with similar movie feature vectors.

- A similarity measure is the squared distance between the two vectors $ \mathbf{v_m^{(k)}}$ and $\mathbf{v_m^{(i)}}$ :
$$\left\Vert \mathbf{v_m^{(k)}} - \mathbf{v_m^{(i)}}  \right\Vert^2 = \sum_{l=1}^{n}(v_{m_l}^{(k)} - v_{m_l}^{(i)})^2\tag{1}$$

- A matrix of distances between movies can be computed once when the model is trained and then reused for new recommendations without retraining. The first step, once a model is trained, is to obtain the movie feature vector,  𝑣𝑚, for each of the movies. To do this, we will use the trained item_NN and build a small model to allow us to run the movie vectors through it to generate  𝑣𝑚.

In [45]:
# create the movie network to generate the movies vector of vectors Vm
input_item_m = tf.keras.layers.Input(shape=(num_item_features))    # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # incorporate normalization as was done in the original model
model_m = tf.keras.Model(input_item_m, vm_m)                                
model_m.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 20)]              0         
                                                                 
 sequential_1 (Sequential)   (None, 32)                42400     
                                                                 
 tf.math.l2_normalize_2 (TFO  (None, 32)               0         
 pLambda)                                                        
                                                                 
Total params: 42,400
Trainable params: 42,400
Non-trainable params: 0
_________________________________________________________________


- Once you have a movie model, you can create a set of movie feature vectors by using the model to predict using a set of item/movie vectors as input. `item_vecs` is a set of all of the movie vectors. It must be scaled to use with the trained model. The result of the prediction is a 32 entry feature vector for each movie.

In [46]:
scaled_item_vecs = scalerItem.transform(item_vecs)
vms = model_m.predict(scaled_item_vecs[:,i_s:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

size of all predicted movie feature vectors: (9742, 32)


- Let's now compute a matrix of the squared distance between each movie feature vector and all other movie feature vectors:
<figure>
    <center> <img src="matrix_of_the_squared_distance.png"   style="width:500px;height:280px;" ></center>
</figure>

- a function to compute the square distance.

In [47]:
def sq_dist(a,b):
    d = sum(np.square(a-b))
    return d

- We can then find the closest movie by finding the minimum along each row. We will make use of numpy masked arrays to avoid selecting the same movie. The masked values along the diagonal won't be included in the computation.

- The results show the model will generally suggest a movie with similar genre's.

In [None]:
movies_df = pd.read_csv("H:\\Data Science Repository\\Projects\\movie recommender system\\ml-latest-small\\movies.csv")

In [None]:
count = 50  # number of movies to display
dim = len(vms)
dist = np.zeros((dim,dim))

# get the difference distance between all movies
# 9472 * 9742
for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])
        print(dist[i,j])
        
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))  # mask the diagonal

disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(item_vecs[i,0])
    movie2_id = int(item_vecs[min_idx,0])
    disp.append([movies_df[movies_df['movieId'] == movie1_id]['title'].tolist(),
                 movies_df[movies_df['movieId'] == movie1_id]['genres'].tolist(),
                 movies_df[movies_df['movieId'] == movie2_id]['title'].tolist(),
                 movies_df[movies_df['movieId'] == movie2_id]['genres'].tolist()])

res=[]
res2=[]
for i in disp:
    res.extend(i)
for j in res:
    res2.extend(j)
clusters = [res2[x:x+4] for x in range(0, len(res), 4)]

table = tabulate.tabulate(clusters, tablefmt='html', headers="firstrow")
table

- This structure is the basis of many commercial recommender systems. The user content can be greatly expanded to incorporate more information about the user if it is available. Items are not limited to movies. This can be used to recommend any item, books, cars or items that are similar to an item in your 'shopping cart'.