In [1]:
#Importing libraries

import joblib
import numpy as np
from numpy import float16, float32, save
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from joblib import dump, load

# tabulate to neatly print tables
import tabulate
pd.set_option("display.precision", 2)

<h2> Movie Dataframe with engineered features </h2>

In [None]:
movieDF = pd.read_csv("dataset2/movies.csv")
#remove rows from movieDF with condition
# movieDF = movieDF[movieDF['genres'].str.contains("no genres listed") == False]

# separating year from title
movieDF["year"] = movieDF["title"].str.extract("\((\d{4})\)", expand=False)
movieDF["title"] = movieDF["title"].str.replace("\(\d{4}\)", "")
movieDF["title"] = movieDF["title"].str.strip()

# genre one hot encoding
genreDF = movieDF["genres"].str.get_dummies(sep="|")
movieDF = pd.concat([movieDF, genreDF], axis=1)
movieDF = movieDF.drop(columns=["genres"])

# dropping title
movieDF = movieDF.drop(columns=["title"])

del genreDF
movieDF.head()

In [None]:
categories= movieDF.columns[3:].to_numpy()
categories.tofile('data/categories.csv', sep=',', format='%s')

In [None]:
# Reading ratings file
rateDF = pd.read_csv("dataset2/ratings.csv")
rateDF = rateDF.drop(columns=["timestamp"])
rateDF.head()

In [None]:
# average rating for each movie
avgRatingDF = rateDF[['movieId','rating']].groupby("movieId").mean()
avgRatingDF = avgRatingDF.rename(columns={"rating": "avgRating"})
avgRatingDF.reset_index(inplace=True)
avgRatingDF.to_feather("data/avgRatingDF.csv")

avgRatingDF.head()

In [None]:
#Joining average Rating with the movie
movieDF = movieDF.join(avgRatingDF.set_index("movieId"), on="movieId")

# change avgRating position
cols = list(movieDF.columns)
cols = cols[0:2] + [cols[-1]] + cols[2:-1]
movieVector = movieDF[cols]

del movieDF,avgRatingDF
movieVector.head()
# MOVIE VECTOR

#### Removing Null values from movie vector

In [None]:
movieVector.loc[movieVector['year'].isnull()]

In [None]:
movieVector.fillna(0, inplace=True)

In [None]:
movieVector.info()

In [None]:
convert_dict = {'movieId': 'int32',
                'year': 'int16',
                'avgRating': 'float16',
                '(no genres listed)': 'bool',
                'Action': 'bool',
                'Adventure': 'bool',
                'Animation': 'bool',
                'Children': 'bool',
                'Comedy': 'bool',
                'Crime': 'bool',
                'Documentary': 'bool',
                'Drama': 'bool',
                'Fantasy': 'bool',
                'Film-Noir': 'bool',
                'Horror': 'bool',
                'IMAX': 'bool',
                'Musical': 'bool',
                'Mystery': 'bool',
                'Romance': 'bool',
                'Sci-Fi': 'bool',
                'Thriller': 'bool',
                'War': 'bool',
                'Western': 'bool',
                }
movieVector = movieVector.astype(convert_dict)
movieVector.info()

In [None]:
movieVector.to_feather("data2/movieVector.csv")

<h2> Rating Dataframe with engineered features </h2>

In [None]:
# Rating count for each user
userRatingCountDF = rateDF[['userId','rating']].groupby("userId").count()
userRatingCountDF = userRatingCountDF.rename(columns={"rating": "userRatingCount"})

# Avarage rating for each user
userAvgRatingDF = rateDF[['userId','rating']].groupby("userId").mean()
userAvgRatingDF = userAvgRatingDF.rename(columns={"rating": "userAvgRating"})
userAvgRatingDF.reset_index(inplace=True)
userRatingCountDF.reset_index(inplace=True)

# Joining userRatingCount and userAvgRating
userDF = userRatingCountDF.join(userAvgRatingDF.set_index("userId"), on="userId")

del userRatingCountDF, userAvgRatingDF
userDF.head()

In [None]:
# Movie vector only with genres
cols = list(movieVector.columns)
mmovieDF = movieVector[cols[0:1] + cols[4:]]

del movieVector
mmovieDF.head()

In [None]:
rateDF.info()

In [None]:
convert_dict = {'userId': 'int32',
                'movieId': 'int32',
                'rating': 'float16',
                }
rateDF = rateDF.astype(convert_dict)

In [None]:
rateDF.info()

In [None]:
import dask.dataframe as dd
rateDF = dd.from_pandas(rateDF,npartitions=8)
mmovieDFdd = dd.from_pandas(mmovieDF,npartitions=8)

In [None]:
# Initializing the movie vector
userVector = pd.DataFrame(columns= ['userId'] + list(cols[4:]))

In [None]:
#SKIPPING THIS PART, ALREADY DONE


# # get max userId
# n = rateDF['userId'].max()

# # Calculating average rating for each genre for each user
# for i in range(n):
#     userId = i+1

#     # get all movies rated by one user
#     userMovies = rateDFdd.loc[rateDFdd['userId'] == userId].compute()
#     userRatings = userMovies['rating'].values
#     movieIds = userMovies['movieId'].values
    
#     # Each movie is represented by a one hot encoding vector of genre
#     userMovies = mmovieDFdd.loc[mmovieDFdd['movieId'].isin(movieIds)].compute()
#     # Calculating average rating for each genre for each user
#     userMovies = userMovies.iloc[:,1:].multiply(userRatings, axis="index").replace(0, np.NaN)
    
#     userVec = userMovies.mean(axis=0).fillna(0)
#     userVec = userVec.to_frame().T
#     userVec['userId'] = int(userId)

#     # appending row to userVector dataframe
#     userVector = pd.concat([userVector, userVec], ignore_index=True, )

del mmovieDF,mmovieDFdd
# userVector['userId'] = userVector['userId'].astype('int32')
# userVector.head()

In [None]:
# userVector.to_feather("data2/userVectorfirst.csv")
userVector = pd.read_feather("data2/userVectorfirst.csv")

In [None]:
convert_dict = {'userId': 'int32',
                'userRatingCount': 'float16',
                'userAvgRating': 'float16',
                }
userDF = userDF.astype(convert_dict)
userDF.info()

In [None]:
# join userVector with userDF
userVector = userVector.join(userDF.set_index("userId"), on="userId")

# Changing the order of columns
cols = list(userVector.columns)
userVector = userVector[cols[0:1] + cols[-2:] + cols[1:-2] ]

del userDF
userVector.to_feather("data/userVector.csv")
userVector.head()

In [None]:
rateDF.head()

In [None]:
# Left Joining Ratings with User vector
userVectordd = dd.from_pandas(userVector,npartitions=8)
userVectorRepeated = rateDF[['userId']].join(userVectordd.set_index("userId"), on="userId").compute()

userVectorRepeated.reset_index(drop=True, inplace=True)
userVectorRepeated.to_feather("data2/userVectorRepeated.csv")
del userVectorRepeated, userVector, userVectordd

In [None]:
# Left Joining Ratings with Movie vector
movieVector = pd.read_feather("data2/movieVector.csv")
movieVector.drop(columns=['(no genres listed)'], inplace=True)
# movieVector = dd.from_pandas(movieVector,npartitions=8)

movieVectorrepeated = rateDF[['movieId']].join(movieVector.set_index("movieId"), on="movieId").compute()
movieVectorrepeated.to_feather("data2/movieVectorRepeated.csv")
movieVectorrepeated.head()

In [None]:
# Y is the rating
yVec = rateDF['rating'].values.compute()

del rateDF

<h2> Normalizing the training data </h2>

In [2]:
movieVectorrepeated = pd.read_feather("data2/movieVectorRepeated.csv")
userVectorRepeated = pd.read_feather("data2/userVectorRepeated.csv")

In [3]:
movieVectorrepeated.iloc[:10]

Unnamed: 0,movieId,year,avgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,307,1993,3.97,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,481,1993,3.34,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,1091,1989,2.81,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1257,1985,3.83,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
4,1449,1996,3.92,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
5,1590,1997,2.97,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,True,False,False
6,1591,1997,2.65,True,True,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
7,2134,1985,3.29,False,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
8,2478,1986,3.12,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
9,2840,1999,3.03,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [6]:
userVectorRepeated.loc[userVectorRepeated['userId']==2]

Unnamed: 0,userId,userRatingCount,userAvgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
16,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
17,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
18,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
19,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
20,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
21,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
22,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
23,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
24,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0
25,2,15.0,3.67,3.5,3.5,0.0,0.0,3.61,3.5,0.0,...,0.0,4.0,0.0,4.0,0.0,3.7,3.5,3.5,3.0,0.0


In [7]:
# In movie vector, it does not need to scale all the columns as genres are in binary format

scalerItem = StandardScaler()
scalerItem.fit(movieVectorrepeated.iloc[:,1:3])
movie_train = scalerItem.transform(movieVectorrepeated.iloc[:,1:3])
movieVectorrepeated.iloc[:,[1,2]] = float16(movie_train)
cols = {'year': 'float16','avgRating': 'float16'}
movieVectorrepeated = movieVectorrepeated.astype(cols)
del movie_train

# float 32 makes inverce more accurate
uid = userVectorRepeated['userId']
uDetails = userVectorRepeated.iloc[:,1:].astype('float32')
uDetailsCols = uDetails.columns
del userVectorRepeated

scalerUser = StandardScaler()
scalerUser.fit(uDetails)
user_train = scalerUser.transform(uDetails)
userVectorRepeated = pd.DataFrame(user_train, columns=uDetailsCols, dtype='float16')
userVectorRepeated.insert(0, "userId", uid, True)
del uid,uDetails,user_train

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(yVec.reshape(-1, 1))
y_train = scalerTarget.transform(yVec.reshape(-1, 1))
#ynorm_test = scalerTarget.transform(y_test.reshape(-1, 1))

# print(np.allclose(movie_train_unscaled, scalerItem.inverse_transform(movie_train)))
# print(np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train)))

  movieVectorrepeated.iloc[:,[1,2]] = float16(movie_train)


NameError: name 'yVec' is not defined

In [None]:
# save the scaler
dump(scalerItem, 'data/scalerItem.bin', compress = True)
dump(scalerUser, 'data/scalerUser.bin', compress = True)
dump(scalerTarget, 'data/scalerTarget.bin', compress = True)

In [None]:
movie_train, movie_test = train_test_split(movieVectorrepeated, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(userVectorRepeated, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, shuffle=True, random_state=1)
print(f"movie/movie training data shape: {movie_train.shape}")
print(f"movie/movie test data shape: {movie_test.shape}")

del movieVectorrepeated, userVectorRepeated, yVec, movieVector

In [None]:
movie_train.reset_index(drop=True, inplace=True)
movie_test.reset_index(drop=True, inplace=True)
user_train.reset_index(drop=True, inplace=True)
user_test.reset_index(drop=True, inplace=True)
movie_train.to_feather("data2/test-train/movie_train.csv" )
movie_test.to_feather("data2/test-train/movie_test.csv" )
user_train.to_feather("data2/test-train/user_train.csv" )
user_test.to_feather("data2/test-train/user_test.csv" )

save("data2/test-train/y_train.npy", y_train )
save("data2/test-train/y_test.npy", y_test )

del movie_train,movie_test,user_train,user_test,y_train,y_test

<h2> Neural Network for content-based filtering </h2>

In [1]:
import numpy as np
from numpy import float16, float32, save
import pandas as pd
import tensorflow as tf
from tensorflow import keras


In [2]:
user_train = pd.read_feather("data2/test-train/user_train.csv")
item_train = pd.read_feather("data2/test-train/movie_train.csv")
y_train = np.load("data2/test-train/y_train.npy")

In [3]:
num_user_features = len(user_train.columns)-3
num_item_features = len(item_train.columns)-1

num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

item_NN = tf.keras.models.Sequential([
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(num_outputs),
])

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 19)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 21)]         0           []                               
                                                                                                  
 sequential (Sequential)        (None, 32)           42144       ['input_1[0][0]']                
                                                                                                  
 sequential_1 (Sequential)      (None, 32)           42656       ['input_2[0][0]']                
                                                                                              

In [4]:
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=cost_fn)

In [5]:
n = 3000000
y_train = np.asarray(y_train[:n])
user_train = np.asarray(user_train[:n])
item_train = np.asarray(item_train[:n]).astype(np.float16)

In [6]:
tf.random.set_seed(1)
model.fit([user_train[:,3:], item_train[:,1:]], y_train, epochs=100, batch_size=26000)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
user_test = pd.read_feather("data2/test-train/user_test.csv")
item_test = pd.read_feather("data2/test-train/movie_test.csv")
y_test = np.load("data2/test-train/y_test.npy")

In [None]:
model.evaluate([user_test[:,3:], item_test[:,1:]], y_test)

In [None]:
model.save('data/my_model')

<h2> Predictions </h2>

In [None]:
movies = pd.read_csv('dataset/movies.csv')
avgRatingDF = pd.read_csv('data/avgRatingDF.csv')
# movies = movies[movies['genres'].str.contains("no genres listed") == False]
movies = movies.join(avgRatingDF.set_index("movieId"), on="movieId")
movies.head()

In [None]:
model = keras.models.load_model('data/my_model')
scalerUser = load('data/scalerUser.bin')
scalerItem = load('data/scalerItem.bin')
scalerTarget = load('data/scalerTarget.bin')

In [None]:
item_train1 = pd.read_csv("data/movieVector.csv").astype('float64').drop(columns=['(no genres listed)'])
# item_train1 = pd.read_csv("movieVector.csv").astype('float64')
item_train1.head()

<h3> Predictions for a new user </h3>

In [None]:
user_vec = {'userId':5000, 'userRatingCount':3, 'userAvgRating':0, 'Action':5, 'Adventure':5,
       'Animation':0, 'Children':0, 'Comedy':0, 'Crime':0, 'Documentary':0, 'Drama':0,
       'Fantasy':0, 'Film-Noir':0, 'Horror':0, 'IMAX':0, 'Musical':0, 'Mystery':0,
       'Romance':0, 'Sci-Fi':0, 'Thriller':0, 'War':0, 'Western':0}

In [None]:
# generate and replicate the user vector to match the number movies in the data set.
user_train1 = pd.DataFrame(user_vec, index=[0])
user_train1 = pd.DataFrame(np.repeat(user_train1.values, item_train1.shape[0], axis=0), columns=user_train1.columns) 

# scale our user and item vectors
suser_vecs = scalerUser.transform(user_train1.to_numpy())
sitem_vecs = scalerItem.transform(item_train1.to_numpy())

# make a prediction
y_p = model.predict([suser_vecs[:, 3:], sitem_vecs[:, 1:]])

# unscale y prediction 
y_pu = scalerTarget.inverse_transform(y_p)

# sort the results, highest prediction first
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = movies.iloc[sorted_index]  #using unscaled vectors for display
# sorted_items['rating'] = sorted_ypu
sorted_items.insert(1, 'y_predict', sorted_ypu)

In [None]:
sorted_items.head(10)

<h3> Predictions for an existing user </h3>

In [None]:
item_train1.head()

In [None]:
uid = 1 
user_train2 = pd.read_csv("data/userVector.csv").astype('float64')
user_train2 = user_train2.loc[user_train2['userId'] == uid]
user_train2 = pd.DataFrame(np.repeat(user_train2.values, item_train1.shape[0], axis=0), columns=user_train2.columns) 
user_train2.head()

In [None]:
# form a set of user vectors. This is the same vector, transformed and repeated.

item_vecs = item_train1.to_numpy()
user_vecs = user_train2.to_numpy()

# scale our user and item vectors
suser_vecs = scalerUser.transform(user_vecs)
sitem_vecs = scalerItem.transform(item_vecs)

# # make a prediction
y_p = model.predict([suser_vecs[:, 3:], sitem_vecs[:, 1:]])

# # unscale y prediction 
y_pu = scalerTarget.inverse_transform(y_p)

# sort the results, highest prediction first
sorted_index = np.argsort(-y_pu,axis=0).reshape(-1).tolist()  #negate to get largest rating first
sorted_ypu   = y_pu[sorted_index]
sorted_items = movies.iloc[sorted_index]  #using unscaled vectors for display
sorted_items.insert(1, 'y_predict', sorted_ypu)


In [None]:
genres = sorted_items['genres'].str.split('|', expand=False)
user_genre_ave = []
try:
    for i in range(len(genres)):
        if genres.iloc[i] != ['(no genres listed)']:
            user_genre_ave.append(user_train2.iloc[uid][genres.iloc[i]].values.round(1)) 
        else:
            user_genre_ave.append(0)
except Exception as e:
    print(genres[i])
    print(e)

sorted_items.insert(2, 'genre', user_genre_ave)

In [None]:
sorted_items.head(10)

<h3> Finding Similar Items </h3>

In [None]:
assert False

In [None]:
def sq_dist(a,b):
    d = np.sum(np.square(a-b))
    return d

In [None]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features))    # input layer
vm_m = item_NN(input_item_m)                                       # use the trained item_NN
vm_m = tf.linalg.l2_normalize(vm_m, axis=1)                        # incorporate normalization as was done in the original model
model_m = tf.keras.Model(input_item_m, vm_m)                                
model_m.summary()

In [None]:
# item_vecs = item_train.to_numpy()

scaled_item_vecs = scalerItem.transform(item_train1.to_numpy())
vms = model_m.predict(scaled_item_vecs[:,1:])
print(f"size of all predicted movie feature vectors: {vms.shape}")

In [None]:
item_vecs = item_train1.to_numpy()

In [None]:
count = 50  # number of movies to display
dim = len(vms)
dist = np.zeros((dim,dim))

for i in range(dim):
    for j in range(dim):
        dist[i,j] = sq_dist(vms[i, :], vms[j, :])

In [None]:
import numpy.ma as ma
m_dist = ma.masked_array(dist, mask=np.identity(dist.shape[0]))  # mask the diagonal

disp = [["movie1", "genres", "movie2", "genres"]]
for i in range(count):
    min_idx = np.argmin(m_dist[i])
    movie1_id = int(item_vecs[i,0])
    movie2_id = int(item_vecs[min_idx,0])
    disp.append( [movies.iloc[movie1_id]['title'], movies.iloc[movie1_id]['genres'],
                  movies.iloc[movie2_id]['title'], movies.iloc[movie2_id]['genres']]
               )
table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
table