In [19]:
#Importing libraries

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
# Render floats with one digit after the decimal point in displayed frames.
# This only changes how values are shown, not the values stored in the data.
pd.set_option("display.precision", 1)

### Movie Dataframe with engineered features

In [20]:
# Build the movie feature table: movieId, release year and one indicator
# column per genre.
movieDF = pd.read_csv("dataset/movies.csv")

# Separate the year from the title. The year is expected as a parenthesised
# 4-digit number inside the title; titles without one get NaN in "year".
# The extracted year stays a string (no numeric conversion happens here).
# Raw strings keep "\(" and "\d" from being parsed as invalid Python escapes,
# and regex=True is passed explicitly because str.replace treats the pattern
# as literal text by default in pandas >= 2.0.
movieDF["year"] = movieDF["title"].str.extract(r"\((\d{4})\)", expand=False)
movieDF["title"] = movieDF["title"].str.replace(r"\(\d{4}\)", "", regex=True)
movieDF["title"] = movieDF["title"].str.strip()

# One-hot encode genres: the "genres" column holds "|"-separated labels, and
# get_dummies produces one 0/1 column per distinct label.
genreDF = movieDF["genres"].str.get_dummies(sep="|")
movieDF = pd.concat([movieDF, genreDF], axis=1)
movieDF = movieDF.drop(columns=["genres"])

# The title is not kept as a feature.
movieDF = movieDF.drop(columns=["title"])

movieDF.head()

  movieDF["title"] = movieDF["title"].str.replace("\(\d{4}\)", "")


Unnamed: 0,movieId,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1995,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,1995,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Load the ratings table and discard its timestamp column in one step
rateDF = pd.read_csv("dataset/ratings.csv").drop(columns=["timestamp"])
rateDF.head()


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [22]:
# Mean rating per movie, returned as a flat frame with columns
# movieId and avgRating (movieId is moved back out of the index).
avgRatingDF = (
    rateDF[["movieId", "rating"]]
    .groupby("movieId")
    .mean()
    .rename(columns={"rating": "avgRating"})
    .reset_index()
)
avgRatingDF.head()

Unnamed: 0,movieId,avgRating
0,1,3.9
1,2,3.4
2,3,3.3
3,4,2.4
4,5,3.1


In [23]:
# Attach each movie's average rating. DataFrame.join is a left join by
# default, so every movie row is kept and movies without any rating get NaN.
# The result is assigned in full: calling .head() here would silently cut
# movieDF down to five rows for every later cell.
movieDF = movieDF.join(avgRatingDF.set_index("movieId"), on="movieId")

# The join appends avgRating as the last column; move it to sit directly
# after the first two columns (movieId, year).
cols = list(movieDF.columns)
cols = cols[0:2] + [cols[-1]] + cols[2:-1]
movieDF = movieDF[cols]

# Preview only; movieDF itself keeps all rows.
movieDF.head()

Unnamed: 0,movieId,year,avgRating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1995,3.9,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1995,3.4,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1995,3.3,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,1995,2.4,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,1995,3.1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Rating Dataframe with engineered features

In [24]:
# Number of ratings per user (count() tallies non-null rating values),
# flattened to columns userId and userRatingCount.
userRatingCountDF = (
    rateDF[["userId", "rating"]]
    .groupby("userId")
    .count()
    .rename(columns={"rating": "userRatingCount"})
    .reset_index()
)

# Average rating per user, flattened to columns userId and userAvgRating.
userAvgRatingDF = (
    rateDF[["userId", "rating"]]
    .groupby("userId")
    .mean()
    .rename(columns={"rating": "userAvgRating"})
    .reset_index()
)

userRatingCountDF

Unnamed: 0,userId,userRatingCount
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44
...,...,...
605,606,1115
606,607,187
607,608,831
608,609,37


In [28]:
# Show the ratings frame (userId, movieId, rating) as the cell output.
rateDF

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [30]:
# average rating for each genre for each user



Unnamed: 0,userId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,4.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1,4.0,,,,,,,,,...,,,,,,,,,,
3,1,5.0,,,,,,,,,...,,,,,,,,,,
4,1,5.0,,,,,,,,,...,,,,,,,,,,
