In [1]:
#Importing libraries

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# tabulate to neatly print tables
import tabulate
# Render floats in DataFrame output with one decimal place (affects display only,
# not the stored values)
pd.set_option("display.precision", 1)

### Movie Dataframe with engineered features

In [2]:
movieDF = pd.read_csv("dataset/movies.csv")

# Separating year from title: the title is expected to carry a parenthesised
# four-digit year such as "Some Title (1995)". Rows without one get NaN.
# The patterns are raw strings so the backslashes reach the regex engine untouched
# (no invalid-escape warnings), and regex=True is spelled out because
# pandas >= 2.0 treats the pattern as a literal string by default.
movieDF["year"] = movieDF["title"].str.extract(r"\((\d{4})\)", expand=False)
movieDF["title"] = (
    movieDF["title"]
    .str.replace(r"\(\d{4}\)", "", regex=True)
    .str.strip()
)

# Genre one-hot encoding: "A|B|C" becomes one 0/1 column per genre
genreDF = movieDF["genres"].str.get_dummies(sep="|")
movieDF = pd.concat([movieDF, genreDF], axis=1)

# The raw genres string and the title are not used as features
movieDF = movieDF.drop(columns=["genres", "title"])

del genreDF
movieDF.head()

  movieDF["title"] = movieDF["title"].str.replace("\(\d{4}\)", "")


Unnamed: 0,movieId,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1995,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Load the ratings and discard the timestamp column, which is not used as a feature
rateDF = pd.read_csv("dataset/ratings.csv").drop(columns=["timestamp"])
rateDF.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
# Mean rating per movie, as a flat frame with columns movieId / avgRating
avgRatingDF = (
    rateDF[['movieId', 'rating']]
    .groupby("movieId")
    .mean()
    .rename(columns={"rating": "avgRating"})
    .reset_index()
)
avgRatingDF.head()

Unnamed: 0,movieId,avgRating
0,1,3.9
1,2,3.4
2,3,3.3
3,4,2.4
4,5,3.1


In [5]:
# Attach each movie's average rating, looked up by movieId
avg_by_movie = avgRatingDF.set_index("movieId")
movieDF = movieDF.join(avg_by_movie, on="movieId")

# avgRating was appended as the last column; move it to sit right after
# the first two columns (movieId, year)
cols = list(movieDF.columns)
cols = cols[:2] + cols[-1:] + cols[2:-1]
movieVector = movieDF[cols]

del avgRatingDF, movieDF, avg_by_movie
movieVector.head()
# MOVIE VECTOR

Unnamed: 0,movieId,year,avgRating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.9,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1995,3.4,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1995,3.3,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1995,2.4,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,1995,3.1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### User Dataframe with engineered features (derived from ratings)

In [6]:
# Group the ratings by user once and reuse the grouping for both statistics
ratingsByUser = rateDF[['userId', 'rating']].groupby("userId")

# Rating count for each user
userRatingCountDF = ratingsByUser.count().rename(columns={"rating": "userRatingCount"})

# Average rating for each user
userAvgRatingDF = ratingsByUser.mean().rename(columns={"rating": "userAvgRating"})

# Both frames are indexed by userId, so an index join lines them up;
# reset_index then turns userId back into an ordinary column
userDF = userRatingCountDF.join(userAvgRatingDF).reset_index()

del userRatingCountDF, userAvgRatingDF, ratingsByUser
userDF.head()

Unnamed: 0,userId,userRatingCount,userAvgRating
0,1,232,4.4
1,2,29,3.9
2,3,39,2.4
3,4,216,3.6
4,5,44,3.6


In [7]:
# Movie vector reduced to movieId plus the genre flags.
# Columns 1-3 of movieVector (year, avgRating, "(no genres listed)") are skipped,
# so the genre block starts at position 4.
cols = list(movieVector.columns)
mmovieDF = movieVector[[cols[0], *cols[4:]]]
mmovieDF.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Initializing the user vector: an empty frame with userId followed by one column per genre
userVector = pd.DataFrame(columns=['userId', *cols[4:]])

In [9]:
# get max userId; users are treated as numbered 1..n and every id in that range
# gets a row (all zeros when the user has no ratings)
n = rateDF['userId'].max()

# Genre flag columns: everything in mmovieDF except the movieId key
genre_cols = [c for c in mmovieDF.columns if c != 'movieId']

# Pair every rating with its movie's one-hot genre flags by movieId.
# Joining on the key (rather than multiplying by a positional array of ratings)
# keeps each rating attached to the right movie even if ratings.csv and
# movies.csv are not sorted the same way.
rated = rateDF[['userId', 'movieId', 'rating']].merge(mmovieDF, on='movieId', how='inner')

# Rating where the movie carries the genre, NaN elsewhere, so genres a movie
# does not have are ignored by the mean instead of counting as a 0 rating
genre_ratings = (
    rated[genre_cols]
    .multiply(rated['rating'], axis='index')
    .replace(0, np.nan)
)

# Average rating per genre for each user, computed in a single grouped pass
# instead of growing the frame row by row; genres a user never rated become 0
userVector = (
    genre_ratings
    .groupby(rated['userId'])
    .mean()
    .reindex(range(1, int(n) + 1))
    .fillna(0)
    .rename_axis('userId')
    .reset_index()
)

del mmovieDF, rated, genre_ratings, genre_cols
userVector.head()

Unnamed: 0,userId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,4.3,4.4,4.7,4.5,4.3,4.4,0.0,4.5,4.3,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3
1,2.0,4.0,4.2,0.0,0.0,4.0,3.8,4.3,3.9,0.0,0.0,3.0,3.8,0.0,4.0,4.5,3.9,3.7,4.5,3.5
2,3.0,3.6,2.7,0.5,0.5,1.0,0.5,0.0,0.8,3.4,0.0,4.7,0.0,0.5,5.0,0.5,4.2,4.1,0.5,0.0
3,4.0,3.3,3.7,4.0,3.8,3.5,3.8,4.0,3.5,3.7,4.0,4.2,3.0,4.0,3.5,3.4,2.8,3.6,3.6,3.8
4,5.0,3.1,3.2,4.3,4.1,3.5,3.8,0.0,3.8,4.1,0.0,3.0,3.7,4.4,4.0,3.1,2.5,3.6,3.3,3.0


In [10]:
# Add the per-user statistics (userRatingCount, userAvgRating) to the genre averages
userVector = userVector.join(userDF.set_index("userId"), on="userId")

# The two joined statistics land at the end; move them directly after userId
cols = list(userVector.columns)
userVector = userVector[[cols[0], *cols[-2:], *cols[1:-2]]]

del userDF
userVector.head()

Unnamed: 0,userId,userRatingCount,userAvgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,232,4.4,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3
1,2.0,29,3.9,4.0,4.2,0.0,0.0,4.0,3.8,4.3,...,0.0,3.0,3.8,0.0,4.0,4.5,3.9,3.7,4.5,3.5
2,3.0,39,2.4,3.6,2.7,0.5,0.5,1.0,0.5,0.0,...,0.0,4.7,0.0,0.5,5.0,0.5,4.2,4.1,0.5,0.0
3,4.0,216,3.6,3.3,3.7,4.0,3.8,3.5,3.8,4.0,...,4.0,4.2,3.0,4.0,3.5,3.4,2.8,3.6,3.6,3.8
4,5.0,44,3.6,3.1,3.2,4.3,4.1,3.5,3.8,0.0,...,0.0,3.0,3.7,4.4,4.0,3.1,2.5,3.6,3.3,3.0


In [11]:
# Left join: one row per rating, each carrying the feature vector of the rating's user.
# Note: this overwrites userVector with the expanded frame, so the cell is meant
# to be run once per kernel session.
userVector = (
    rateDF[['userId']]
    .join(userVector.set_index("userId"), on="userId")
    .astype(np.float32)
)
userVector.head()

Unnamed: 0,userId,userRatingCount,userAvgRating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,232.0,4.4,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3
1,1.0,232.0,4.4,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3
2,1.0,232.0,4.4,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3
3,1.0,232.0,4.4,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3
4,1.0,232.0,4.4,4.3,4.4,4.7,4.5,4.3,4.4,0.0,...,5.0,3.5,0.0,4.7,4.2,4.3,4.2,4.1,4.5,4.3


In [12]:
# Left join: one row per rating, each carrying the feature vector of the rated movie.
# Note: this overwrites movieVector with the expanded frame, so the cell is meant
# to be run once per kernel session.
movieVector = (
    rateDF[['movieId']]
    .join(movieVector.set_index("movieId"), on="movieId")
    .astype(np.float32)
)
movieVector.head()

Unnamed: 0,movieId,year,avgRating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,1995.0,3.9,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,1995.0,3.3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,6.0,1995.0,3.9,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,47.0,1995.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,50.0,1995.0,4.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [13]:
# Target values: the rating column as a NumPy array, in the same row order as
# the per-rating user and movie vectors
y_train = rateDF.loc[:, 'rating'].values
del rateDF
y_train

array([4., 4., 4., ..., 5., 5., 3.])

### Normalizing the training data