## Data Preprocessing

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_movies = pd.read_csv("movies.csv")
df_rating = pd.read_csv("ratings.csv")

In [3]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Data Manipulation

In [5]:
#locating where the year is in the title column.
df_movies['year'] = df_movies.title.str.extract('(\(\d\d\d\d\))', expand = False)

#removing parentheses that enclose the year
df_movies['year'] = df_movies.year.str.extract('(\d\d\d\d)', expand = False)

#removing the years
df_movies['title'] = df_movies.title.str.replace('(\(\d\d\d\d\))', '')

#removing any residual whitespace characters from end 
df_movies['title'] = df_movies['title'].apply(lambda x: x.strip())

df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [7]:
#removing "|" in the separation of genres for easier reading
df_movies['genres'] = df_movies.genres.str.split("|")

df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


## Data Encoding

In [8]:
#encoding the genres using OHE to convert to vector
df_moviegenre = df_movies.copy()

#iterating through list of genres to put 1 if movie corresponds to genre
for index, row in df_movies.iterrows():
    for genre in row['genres']:
        df_moviegenre.at[index, genre] = 1
        
#fills column with 0 if any movie does not equal in the genre
df_moviegenre = df_moviegenre.fillna(0)

df_moviegenre.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [10]:
#removing timestamp as it is redundant
df_rating = df_rating.drop("timestamp", 1)

df_rating.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


## CBF

In [14]:
userInput = [
    {'title': 'Mortal Kombat', 'rating': 4.5},
    {'title': 'Pulp Fiction', 'rating': 5},
    {'title': 'Akira', 'rating': 3},
    {'title': 'Jumanji', 'rating': 4},
    {'title': 'Canadian Bacon', 'rating': 2}   
    ]

movie_input = pd.DataFrame(userInput)

movie_input.head

<bound method NDFrame.head of             title  rating
0   Mortal Kombat     4.5
1    Pulp Fiction     5.0
2           Akira     3.0
3         Jumanji     4.0
4  Canadian Bacon     2.0>

In [15]:
#filtering movie by title
movie_id = df_movies[df_movies['title'].isin(movie_input['title'].tolist())]

#Merging title and MovieID
movie_input = pd.merge(movie_id, movie_input)

#remving redundant columns
movie_input = movie_input.drop('genres', 1).drop('year', 1)

movie_input

Unnamed: 0,movieId,title,rating
0,2,Jumanji,4.0
1,44,Mortal Kombat,4.5
2,157,Canadian Bacon,2.0
3,296,Pulp Fiction,5.0
4,1274,Akira,3.0


In [17]:
#filtering based on the user input
userMovies = df_moviegenre[df_moviegenre['movieId'].isin(movie_input['movieId'])]

userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,44,Mortal Kombat,"[Action, Adventure, Fantasy]",1995,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
155,157,Canadian Bacon,"[Comedy, War]",1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
293,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1246,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
#correcting the index to avoid confusion
userMovies = userMovies.reset_index(drop = True)

#removing redundant columns
userGenre = userMovies.drop('movieId', 1).drop("title", 1).drop("genres", 1).drop('year', 1)
userGenre

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
movie_input['rating']

0    4.0
1    4.5
2    2.0
3    5.0
4    3.0
Name: rating, dtype: float64

In [20]:
#dot product; weight = user movie ratings x user movie genre ratings 
#weight is the preference of the user for a particular genre
user_profile = userGenre.transpose().dot(movie_input['rating'])

user_profile

Adventure             11.5
Animation              3.0
Children               4.0
Comedy                 7.0
Fantasy                8.5
Romance                0.0
Drama                  5.0
Action                 7.5
Crime                  5.0
Thriller               5.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 3.0
IMAX                   0.0
Documentary            0.0
War                    2.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [21]:
#retrieving genre of each movie in list
genre_list = df_moviegenre.set_index(df_moviegenre['movieId'])

#dropping redundant columns
genre_list = genre_list.drop('movieId', 1).drop('title', 1).drop('genres', 1).drop('year', 1)

genre_list.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
genre_list.shape

(34208, 20)

In [24]:
#taking weighted average of genre x weights
df_recommendation = (genre_list * user_profile).sum(axis = 1)/(user_profile.sum())

df_recommendation.head()

movieId
1    0.552846
2    0.390244
3    0.113821
4    0.195122
5    0.113821
dtype: float64

In [25]:
df_recommendation = df_recommendation.sort_values(ascending = False)
df_recommendation.head()

movieId
5018      0.731707
117646    0.723577
27344     0.691057
108932    0.674797
546       0.674797
dtype: float64

## Final Recommendation System

In [26]:
df_movies.loc[df_movies['movieId'].isin(df_recommendation.head(10).keys())]

Unnamed: 0,movieId,title,genres,year
542,546,Super Mario Bros.,"[Action, Adventure, Children, Comedy, Fantasy,...",1993
4923,5018,Motorama,"[Adventure, Comedy, Crime, Drama, Fantasy, Mys...",1991
8783,26340,"Twelve Tasks of Asterix, The (Les douze travau...","[Action, Adventure, Animation, Children, Comed...",1976
9296,27344,Revolutionary Girl Utena: Adolescence of Utena...,"[Action, Adventure, Animation, Comedy, Drama, ...",1999
11716,51632,Atlantis: Milo's Return,"[Action, Adventure, Animation, Children, Comed...",2003
11751,51939,TMNT (Teenage Mutant Ninja Turtles),"[Action, Adventure, Animation, Children, Comed...",2007
12021,54278,Underdog,"[Action, Adventure, Children, Comedy, Fantasy,...",2007
22881,108932,The Lego Movie,"[Action, Adventure, Animation, Children, Comed...",2014
25218,117646,Dragonheart 2: A New Beginning,"[Action, Adventure, Comedy, Drama, Fantasy, Th...",2000
26442,122787,The 39 Steps,"[Action, Adventure, Comedy, Crime, Drama, Thri...",1959
