In [5]:
#Importing libraries

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from joblib import dump, load

# tabulate to neatly print tables
import tabulate
# Show floating-point values in displayed DataFrames with 2 decimal places.
pd.set_option("display.precision", 2)

<h2> Movie Dataframe with engineered features </h2>

In [2]:
movieDF = pd.read_csv("dataset/movies.csv")
# remove rows from movieDF with condition
# movieDF = movieDF[movieDF['genres'].str.contains("no genres listed") == False]

# Separate the 4-digit release year "(YYYY)" from the title.
# Raw strings avoid invalid escape sequences, and regex=True is explicit
# because Series.str.replace treats the pattern literally by default in
# pandas >= 2.0 (which would leave the year in the title).
movieDF["year"] = movieDF["title"].str.extract(r"\((\d{4})\)", expand=False)
movieDF["title"] = (
    movieDF["title"].str.replace(r"\(\d{4}\)", "", regex=True).str.strip()
)

# One-hot encode the pipe-separated genre list, one 0/1 column per genre
genreDF = movieDF["genres"].str.get_dummies(sep="|")
movieDF = pd.concat([movieDF, genreDF], axis=1)

# The raw genre string and the title are not used as features
movieDF = movieDF.drop(columns=["genres", "title"])

del genreDF
movieDF.head()

  movieDF["title"] = movieDF["title"].str.replace("\(\d{4}\)", "")


Unnamed: 0,movieId,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1995,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Genre column names: everything after movieId, year and "(no genres listed)".
genre_columns = movieDF.columns[3:]
categories = genre_columns.to_numpy()

# Written as a single comma-separated line of names
categories.tofile('data/categories.csv', sep=',', format='%s')

In [4]:
# Load the ratings; the timestamp column is not used anywhere below
rateDF = pd.read_csv("dataset/ratings.csv").drop(columns=["timestamp"])
rateDF.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# Mean rating per movie, stored as columns (movieId, avgRating)
avgRatingDF = (
    rateDF.groupby("movieId", as_index=False)["rating"]
    .mean()
    .rename(columns={"rating": "avgRating"})
)
avgRatingDF.to_csv("data/avgRatingDF.csv", index=False)
avgRatingDF.head()

Unnamed: 0,movieId,avgRating
0,1,3.92
1,2,3.43
2,3,3.26
3,4,2.36
4,5,3.07


In [6]:
# Attach each movie's average rating (left join: unrated movies get NaN)
movieDF = movieDF.join(avgRatingDF.set_index("movieId"), on="movieId")

# avgRating was appended last; move it to sit right after movieId and year
cols = list(movieDF.columns)
leading, rating_col, remaining = cols[:2], cols[-1:], cols[2:-1]
cols = leading + rating_col + remaining
movieVector = movieDF[cols]

del movieDF
# Persist the per-movie feature table for the prediction section
movieVector.to_csv("data/movieVector.csv", index=False)
movieVector.head()

Unnamed: 0,movieId,year,avgRating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.92,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1995,3.43,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1995,3.26,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1995,2.36,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,1995,3.07,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


<h2> Rating Dataframe with engineered features </h2>

In [7]:
# Per-user statistics in a single pass:
#   userRatingCount - number of ratings the user gave
#   userAvgRating   - average of those ratings
userDF = (
    rateDF.groupby("userId")["rating"]
    .agg(userRatingCount="count", userAvgRating="mean")
    .reset_index()
)
userDF.head()

Unnamed: 0,userId,userRatingCount,userAvgRating
0,1,232,4.37
1,2,29,3.95
2,3,39,2.44
3,4,216,3.56
4,5,44,3.64


In [8]:
# Keep movieId plus the genre flags only
# (skips year, avgRating and the "(no genres listed)" column)
cols = list(movieVector.columns)
id_and_genres = [cols[0]] + cols[4:]
mmovieDF = movieVector[id_and_genres]
mmovieDF.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Empty user vector frame: userId followed by one column per genre
userVector = pd.DataFrame(columns=['userId', *cols[4:]])

In [10]:
# Average rating per genre for every user.
#
# Each rating is paired with the genre flags of its movie through a merge on
# movieId, so ratings and movies are aligned by key rather than by row
# position. One groupby then replaces the per-user loop and the repeated
# DataFrame concatenation.
genreCols = [c for c in mmovieDF.columns if c != 'movieId']

ratedGenres = rateDF[['userId', 'movieId', 'rating']].merge(
    mmovieDF, on='movieId', how='inner'
)

# rating * flag equals the rating where the movie has the genre and 0
# elsewhere; the zeros become NaN so the mean only covers movies that
# actually belong to the genre.
genreRatings = ratedGenres[genreCols].multiply(ratedGenres['rating'], axis='index')
genreRatings = genreRatings.mask(genreRatings == 0)

# One row per user that has ratings; genres a user never rated become 0
userVector = (
    genreRatings.groupby(ratedGenres['userId'])
    .mean()
    .fillna(0)
    .reset_index()
)

del mmovieDF
userVector.head()

Unnamed: 0,userId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,4.32,4.39,4.69,4.55,4.28,4.36,0.0,4.53,4.3,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
1,2.0,3.95,4.17,0.0,0.0,4.0,3.8,4.33,3.88,0.0,0.0,3.0,3.75,0.0,4.0,4.5,3.88,3.7,4.5,3.5
2,3.0,3.57,2.73,0.5,0.5,1.0,0.5,0.0,0.75,3.38,0.0,4.69,0.0,0.5,5.0,0.5,4.2,4.14,0.5,0.0
3,4.0,3.32,3.66,4.0,3.8,3.51,3.81,4.0,3.48,3.68,4.0,4.25,3.0,4.0,3.48,3.38,2.83,3.55,3.57,3.8
4,5.0,3.11,3.25,4.33,4.11,3.47,3.83,0.0,3.8,4.14,0.0,3.0,3.67,4.4,4.0,3.09,2.5,3.56,3.33,3.0


In [11]:
# Add userRatingCount / userAvgRating to the per-genre averages
joined = userVector.join(userDF.set_index("userId"), on="userId")

# Final column order: userId, the two user statistics, then the genres
cols = list(joined.columns)
id_col, genre_part, stat_part = cols[:1], cols[1:-2], cols[-2:]
userVector = joined[id_col + stat_part + genre_part]

del userDF
userVector.to_csv("data/userVector.csv", index=False)
userVector.head()

Unnamed: 0,userId,userRatingCount,userAvgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,232,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
1,2.0,29,3.95,3.95,4.17,0.0,0.0,4.0,3.8,4.33,...,0.0,3.0,3.75,0.0,4.0,4.5,3.88,3.7,4.5,3.5
2,3.0,39,2.44,3.57,2.73,0.5,0.5,1.0,0.5,0.0,...,0.0,4.69,0.0,0.5,5.0,0.5,4.2,4.14,0.5,0.0
3,4.0,216,3.56,3.32,3.66,4.0,3.8,3.51,3.81,4.0,...,4.0,4.25,3.0,4.0,3.48,3.38,2.83,3.55,3.57,3.8
4,5.0,44,3.64,3.11,3.25,4.33,4.11,3.47,3.83,0.0,...,0.0,3.0,3.67,4.4,4.0,3.09,2.5,3.56,3.33,3.0


In [12]:
# One user-feature row per rating: left join keyed on each rating's userId
user_lookup = userVector.set_index("userId")
userVector = rateDF[['userId']].join(user_lookup, on="userId").astype('float64')
userVector.head()

Unnamed: 0,userId,userRatingCount,userAvgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
1,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
2,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
3,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
4,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29


In [13]:
# One movie-feature row per rating (left join on the rating's movieId),
# without the "(no genres listed)" placeholder column
movieVector = (
    rateDF[['movieId']]
    .join(movieVector.set_index("movieId"), on="movieId")
    .astype('float64')
    .drop(columns=['(no genres listed)'])
)
movieVector.head()

Unnamed: 0,movieId,year,avgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,1995.0,3.92,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,1995.0,3.26,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,6.0,1995.0,3.95,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,47.0,1995.0,3.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,50.0,1995.0,4.24,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [14]:
# Target: the rating itself, in the same row order as the two feature frames.
# y_train1 is a second name for the same array.
y_train = y_train1 = rateDF['rating'].values
del rateDF
y_train

array([4., 4., 4., ..., 5., 5., 3.])

#### Removing Null values from movie vector

In [15]:
# Rows whose title carried no "(YYYY)" year
movieVector[movieVector['year'].isna()]

Unnamed: 0,movieId,year,avgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
9147,176601.0,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16886,147250.0,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16913,171749.0,,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17879,171631.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17881,171891.0,,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30039,140956.0,,3.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
30090,40697.0,,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
46593,140956.0,,3.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
46711,149334.0,,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
49851,171495.0,,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Replace every missing value in the movie features with 0
movieVector = movieVector.fillna(0)

<h2> Normalizing the training data </h2>

In [17]:
# Unscaled copies are kept so the scaling round trip can be verified below
item_train_unscaled = movieVector.to_numpy()
user_train_unscaled = userVector.to_numpy()
y_train_unscaled    = y_train

# NOTE(review): all three scalers are fitted on the full data set, before the
# train/test split in a later cell, so test rows influence the scaling.
scalerItem = StandardScaler()
item_train = scalerItem.fit_transform(item_train_unscaled)

scalerUser = StandardScaler()
user_train = scalerUser.fit_transform(user_train_unscaled)

# Ratings are mapped to [-1, 1]
scalerTarget = MinMaxScaler((-1, 1))
y_train = scalerTarget.fit_transform(y_train.reshape(-1, 1))
#ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

# Sanity check: inverse_transform must give back the original features
print(np.allclose(item_train_unscaled, scalerItem.inverse_transform(item_train)))
print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train)))

True
True


In [18]:
# Persist the fitted scalers so the prediction section can reload them
scaler_files = (
    (scalerItem, 'data/scalerItem.bin'),
    (scalerUser, 'data/scalerUser.bin'),
    (scalerTarget, 'data/scalerTarget.bin'),
)
for scaler, path in scaler_files:
    dump(scaler, path, compress = True)

['data/scalerTarget.bin']

In [19]:
# Split all three arrays in ONE call so every row keeps its user, item and
# target together by construction. Separate calls stay aligned only while the
# seeds and array lengths happen to match; a single call also raises if the
# arrays ever differ in length.
(item_train, item_test,
 user_train, user_test,
 y_train, y_test) = train_test_split(
    item_train, user_train, y_train,
    train_size=0.80, shuffle=True, random_state=1,
)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test data shape: {item_test.shape}")

movie/item training data shape: (80668, 22)
movie/item test data shape: (20168, 22)


<h2> Neural Network for content-based filtering </h2>

In [20]:
# The first three user columns (userId, userRatingCount, userAvgRating) and
# the movieId column are not fed to the networks.
num_user_features = user_train[:,3:].shape[1]
num_item_features = item_train[:,1:].shape[1]

# Size of the embedding produced by each tower
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

item_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

# create the user input and point to the base network
# shape must be a tuple: "(n)" is just the int n, which newer Keras rejects
input_user = tf.keras.layers.Input(shape=(num_user_features,))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features,))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two unit-length vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 19)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           42144       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           42656       ['input_2[0][0]']                
                                                                                              

In [21]:
tf.random.set_seed(1)

# Mean squared error on the scaled ratings, optimised with Adam
opt = keras.optimizers.Adam(learning_rate=0.01)
cost_fn = tf.keras.losses.MeanSquaredError()
model.compile(loss=cost_fn, optimizer=opt)

In [22]:
tf.random.set_seed(1)

# User inputs skip the first three columns, item inputs skip movieId
train_inputs = [user_train[:,3:], item_train[:,1:]]
# NOTE(review): the original carried a bare "# 30" here, presumably the
# intended number of epochs for a full run; confirm.
model.fit(train_inputs, y_train, epochs=1)



<keras.callbacks.History at 0x1d98a49c1c0>

In [23]:
# Loss on the held-out split, using the same column slices as in training
test_inputs = [user_test[:,3:], item_test[:,1:]]
model.evaluate(test_inputs, y_test)



0.13260267674922943

In [24]:
# Persist the trained model for the prediction section
model.save(filepath='data/my_model')

INFO:tensorflow:Assets written to: data/my_model\assets


<h2> Predictions </h2>

In [3]:
# Human-readable movie table (title, genres) plus the stored average rating
avgRatingDF = pd.read_csv('data/avgRatingDF.csv')
# movies = movies[movies['genres'].str.contains("no genres listed") == False]
movies = pd.read_csv('dataset/movies.csv').join(
    avgRatingDF.set_index("movieId"), on="movieId"
)
movies.head()

Unnamed: 0,movieId,title,genres,avgRating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.43
2,3,Grumpier Old Men (1995),Comedy|Romance,3.26
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.36
4,5,Father of the Bride Part II (1995),Comedy,3.07


In [6]:
# Restore the trained model and the three fitted scalers.
# NOTE: joblib files are pickles; only load artifacts from a trusted source.
model = keras.models.load_model('data/my_model')
scalerUser, scalerItem, scalerTarget = [
    load(path)
    for path in ('data/scalerUser.bin', 'data/scalerItem.bin', 'data/scalerTarget.bin')
]

In [7]:
# Per-movie feature matrix for prediction, prepared like the training items:
# the "(no genres listed)" column is dropped and missing values are filled
# with 0, as was done before scalerItem was fitted. Without the fill, movies
# with a missing year or avgRating flow through the scaler and the model as
# NaN and come out with NaN predictions.
item_train1 = (
    pd.read_csv("data/movieVector.csv")
    .astype('float64')
    .drop(columns=['(no genres listed)'])
    .fillna(0)
)
item_train1.head()

Unnamed: 0,movieId,year,avgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,1995.0,3.92,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,1995.0,3.43,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1995.0,3.26,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4.0,1995.0,2.36,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5.0,1995.0,3.07,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h3> Predictions for a new user </h3>

In [8]:
# Feature vector of a hypothetical new user. Key order must match the column
# order of the stored user vectors: id, two statistics, then the genres.
genre_names = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
               'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
               'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

user_vec = {'userId': 5000, 'userRatingCount': 3, 'userAvgRating': 0}
user_vec.update(dict.fromkeys(genre_names, 0))

# This user only has a preference for Action and Adventure
user_vec['Action'] = 5
user_vec['Adventure'] = 5

In [9]:
# Repeat the single user row once per movie so both model inputs line up
n_movies = item_train1.shape[0]
single_user = pd.DataFrame(user_vec, index=[0])
user_train1 = pd.DataFrame(
    np.repeat(single_user.values, n_movies, axis=0),
    columns=single_user.columns,
)

# Apply the scalers that were fitted during training
suser_vecs = scalerUser.transform(user_train1.to_numpy())
sitem_vecs = scalerItem.transform(item_train1.to_numpy())

# Predict (same column slices as in training), then map the scaled output
# back to the rating scale
y_p = model.predict([suser_vecs[:, 3:], sitem_vecs[:, 1:]])
y_pu = scalerTarget.inverse_transform(y_p)

# Order movies by predicted rating, best first (argsort of the negated values)
sorted_index = np.argsort(-y_pu, axis=0).ravel().tolist()
sorted_ypu   = y_pu[sorted_index]
sorted_items = movies.iloc[sorted_index]  # unscaled table, for display
# sorted_items['rating'] = sorted_ypu
sorted_items.insert(loc=1, column='y_predict', value=sorted_ypu)



In [10]:
# Ten highest predicted ratings for the new user
sorted_items.iloc[:10]

Unnamed: 0,movieId,y_predict,title,genres,avgRating
9394,164226,3.3,Maximum Ride (2016),Action|Adventure|Comedy|Fantasy|Sci-Fi|Thriller,4.5
224,260,3.29,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.23
898,1196,3.29,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4.22
911,1210,3.28,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,4.14
9731,191005,3.27,Gintama (2017),Action|Adventure|Comedy|Sci-Fi,4.5
2765,3703,3.26,"Road Warrior, The (Mad Max 2) (1981)",Action|Adventure|Sci-Fi|Thriller,4.04
8475,112852,3.25,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,4.05
8114,100882,3.25,Journey to the West: Conquering the Demons (Da...,Adventure|Comedy|Fantasy|Romance|IMAX,4.75
7063,69524,3.25,Raiders of the Lost Ark: The Adaptation (1989),Action|Adventure|Thriller,4.33
7668,88932,3.24,Final Destination 5 (2011),Horror|Thriller|IMAX,4.5


<h3> Predictions for an existing user </h3>

In [11]:
# First rows of the per-movie feature matrix
item_train1.iloc[:5]

Unnamed: 0,movieId,year,avgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,1995.0,3.92,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,1995.0,3.43,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1995.0,3.26,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4.0,1995.0,2.36,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5.0,1995.0,3.07,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# User whose recommendations are computed
uid = 1

# Pick that user's stored feature row and repeat it once per movie
all_users = pd.read_csv("data/userVector.csv").astype('float64')
selected_user = all_users.loc[all_users['userId'] == uid]
user_train2 = pd.DataFrame(
    np.repeat(selected_user.values, item_train1.shape[0], axis=0),
    columns=selected_user.columns,
)
user_train2.head()

Unnamed: 0,userId,userRatingCount,userAvgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
1,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
2,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
3,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29
4,1.0,232.0,4.37,4.32,4.39,4.69,4.55,4.28,4.36,0.0,...,5.0,3.47,0.0,4.68,4.17,4.31,4.22,4.15,4.5,4.29


In [34]:
# Raw matrices: every movie paired with the same (repeated) user row
item_vecs = item_train1.to_numpy()
user_vecs = user_train2.to_numpy()

# Apply the scalers that were fitted during training
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# Predict (same column slices as in training), then map the scaled output
# back to the rating scale
y_p = model.predict([suser_vecs[:, 3:], sitem_vecs[:, 1:]])
y_pu = scalerTarget.inverse_transform(y_p)

# Order movies by predicted rating, best first (argsort of the negated values)
sorted_index = np.argsort(-y_pu, axis=0).ravel().tolist()
sorted_ypu   = y_pu[sorted_index]
sorted_items = movies.iloc[sorted_index]  # unscaled table, for display
sorted_items.insert(loc=1, column='y_predict', value=sorted_ypu)




In [36]:
# For every recommended movie, list the user's own average rating for each of
# the movie's genres (rounded to one decimal); movies without genres get 0.
#
# All rows of user_train2 are copies of the selected user's vector, so the
# first row is read once. Indexing with .iloc[uid] would treat the user id as
# a row position.
user_profile = user_train2.iloc[0]

genres = sorted_items['genres'].str.split('|', expand=False)
user_genre_ave = [
    0 if movie_genres == ['(no genres listed)']
    else user_profile[movie_genres].values.round(1)
    for movie_genres in genres
]

# No blanket try/except: a failed lookup should surface here instead of
# leaving a short list that breaks the insert below.
sorted_items.insert(2, 'genre', user_genre_ave)

In [37]:
# Ten highest predicted ratings for the existing user
sorted_items.iloc[:10]

Unnamed: 0,movieId,y_predict,genre,title,genres,avgRating
9394,164226,4.65,"[4.3, 4.4, 4.3, 4.3, 4.2, 4.1]",Maximum Ride (2016),Action|Adventure|Comedy|Fantasy|Sci-Fi|Thriller,4.5
224,260,4.64,"[4.3, 4.4, 4.2]",Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.23
898,1196,4.64,"[4.3, 4.4, 4.2]",Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4.22
911,1210,4.63,"[4.3, 4.4, 4.2]",Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,4.14
9731,191005,4.63,"[4.3, 4.4, 4.3, 4.2]",Gintama (2017),Action|Adventure|Comedy|Sci-Fi,4.5
8114,100882,4.63,"[4.4, 4.3, 4.3, 4.3, 0.0]",Journey to the West: Conquering the Demons (Da...,Adventure|Comedy|Fantasy|Romance|IMAX,4.75
7063,69524,4.63,"[4.3, 4.4, 4.1]",Raiders of the Lost Ark: The Adaptation (1989),Action|Adventure|Thriller,4.33
3734,5181,4.62,"[4.3, 4.2, 4.1]",Hangar 18 (1980),Action|Sci-Fi|Thriller,4.5
9150,147382,4.62,"[4.3, 4.2]",Doctor Who: Voyage Of The Damned (2007),Action|Sci-Fi,4.5
5604,26985,4.61,"[4.3, 4.2]",Nirvana (1997),Action|Sci-Fi,4.5


<h3> Finding Similar Items </h3>

In [None]:
# NOTE(review): always raises AssertionError. Presumably a deliberate stop so
# that "Run All" does not continue into the cells below - confirm before
# removing.
assert False

In [None]:
def sq_dist(a,b):
    """Return the squared Euclidean distance between `a` and `b`.

    The element-wise difference `a - b` is squared and summed over all
    elements.
    """
    return np.square(a - b).sum()

In [None]:
# Stand-alone model that maps item features to their normalised embedding.
# shape must be a tuple: "(n)" is just the int n, which newer Keras rejects.
input_item_m = tf.keras.layers.Input(shape=(num_item_features,))   # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # same normalization as in the original model
model_m = tf.keras.Model(input_item_m, vm_m)
model_m.summary()

In [None]:
# Embed every movie: scale the raw features, drop the movieId column, then
# run the item model
raw_items = item_train1.to_numpy()
scaled_item_vecs = scalerItem.transform(raw_items)
vms = model_m.predict(scaled_item_vecs[:,1:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

In [None]:
# Unscaled per-movie features as a NumPy array (movieId is the first column)
item_vecs = item_train1.to_numpy()

In [None]:
count = 50  # number of movies to display
dim = len(vms)
dist = np.zeros((dim,dim))

# Squared distance between every pair of movie embeddings. Each row is
# computed with one vectorised NumPy expression, giving dim Python iterations
# instead of dim*dim scalar calls, with the same formula:
#   dist[i, j] = sum((vms[j] - vms[i]) ** 2)
for i in range(dim):
    dist[i] = np.sum(np.square(vms - vms[i]), axis=1)

In [None]:
# Mask the diagonal so a movie is never reported as its own nearest neighbour
m_dist = np.ma.masked_array(dist, mask=np.identity(dist.shape[0]))

# movieId is an identifier, not a row position, so titles and genres are
# looked up by id. Using movies.iloc[movieId] would show a different movie
# (or raise IndexError for ids beyond the number of rows).
movies_by_id = movies.set_index("movieId")

disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1 = movies_by_id.loc[int(item_vecs[i,0])]
    movie2 = movies_by_id.loc[int(item_vecs[min_idx,0])]
    disp.append([movie1['title'], movie1['genres'],
                 movie2['title'], movie2['genres']])
table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
table