In [1]:
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# reading movies
movies = pd.read_csv('movies.csv')
movies.head(4)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance


In [3]:
# check if there's any NAN value
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [3]:
# remove dates from titles and insert them in year column
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
# remove parentheses
movies['year'] = movies.year.str.extract('(\d\d\d\d)',expand=False)

# remove date from title
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '', regex=True)
print(movies['title'])
# delete whitespaces
movies['title'] = movies['title'].apply(lambda x: x.strip())

0                          Toy Story 
1                            Jumanji 
2                   Grumpier Old Men 
3                  Waiting to Exhale 
4        Father of the Bride Part II 
                     ...             
34203                     Grand Slam 
34204                     Bloodmoney 
34205           The Butterfly Circus 
34206                           Zero 
34207          The 2000 Year Old Man 
Name: title, Length: 34208, dtype: object


In [4]:
# split 'genres' column into a list of genres
movies['genres'] = movies.genres.str.split('|')

In [6]:
movies.head(5)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [5]:
#we want to store every genre of each movie in a single column with values of 0 and 1.Where 0 means the movie doesn't have that genre and 1 means the movie has that genre  
# first we copy the dataframe
moviesWithGenres_df = movies.copy()

# in this loop we assign value 1 for ervery genre column of each movie
for index , row in movies.iterrows():
    for genre in row['genres']:
        moviesWithGenres_df.at[index, genre] = 1
# We replace nan values with 0.
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
ratings = pd.read_csv('ratings.csv')
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [102]:
# check if there's any NAN value
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [7]:
# delete timestamp column as we don't need it
ratings.drop(['timestamp'], axis =1)

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0
...,...,...,...
22884372,247753,49530,5.0
22884373,247753,69481,3.0
22884374,247753,74458,4.0
22884375,247753,76093,5.0


In [8]:
# These are the data for input user so we can see genres of ervery movie he/she has rated
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 

inputMovies = pd.DataFrame(userInput)

In [13]:
# extract movies data she/he has watched with their id
inputId = movies[movies['title'].isin(inputMovies['title'].tolist())]

# merging inputId and inputMovies so we can see every movie with title and id
inputMovies_hat = pd.merge(inputId, inputMovies )

# we drop columns we don't need 
inputMovies_hat = inputMovies_hat.drop(['genres'], axis=1)
inputMovies_hat = inputMovies_hat.drop(['year'], axis=1)
# This is our final dataframe to work with
inputMovies_hat

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [14]:
# filtering which movies user_input has watched 
userMovies = moviesWithGenres_df[moviesWithGenres_df['movieId'].isin(inputMovies_hat['movieId'].tolist())]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
293,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1246,1274,Akira,"[Action, Adventure, Animation, Sci-Fi]",1988,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1885,1968,"Breakfast Club, The","[Comedy, Drama]",1985,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Resetting the index to avoid future issues
userMovies = userMovies.reset_index(drop=True)
# Dropping unnecessary issues due to save memory and to avoid issues
userGenreTable = userMovies.drop('movieId', axis=1).drop('title', axis=1).drop('genres', axis=1).drop('year', axis=1)
userGenreTable

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [107]:
inputMovies['rating']

0    5.0
1    3.5
2    2.0
3    5.0
4    4.5
Name: rating, dtype: float64

In [16]:
#now we calculate user scores for each genre by multipling genres numbers into rating of each movie
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])
#The user profile
userProfile

Adventure             13.5
Animation             10.0
Children               8.5
Comedy                11.5
Fantasy                8.5
Romance                0.0
Drama                  6.5
Action                 5.0
Crime                  2.0
Thriller               2.0
Horror                 0.0
Mystery                0.0
Sci-Fi                 5.0
IMAX                   0.0
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [17]:
# get genre of every movie due to movieid
#puting movieid in an index so that it won't get affected 
genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['movieId'])

# drop column we don't need
genreTable = genreTable.drop('movieId', axis=1).drop('title', axis=1).drop('genres', axis=1).drop('year', axis=1)
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
genreTable.shape

(34208, 20)

In [18]:
#Multiply the genres by the weights of rates and then take the weighted average
recommendationTable_df = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()

movieId
1    0.717241
2    0.420690
3    0.158621
4    0.248276
5    0.158621
dtype: float64

In [19]:
#then we sort recommendations in descending order so the most recommended movies will be at top
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
#showing first 20 rows
recommendationTable_df.head(20)

movieId
26093     0.806897
673       0.786207
130520    0.786207
108932    0.786207
32031     0.786207
51632     0.786207
51939     0.786207
26340     0.786207
27344     0.758621
2987      0.744828
148775    0.737931
106240    0.737931
52462     0.737931
52287     0.737931
62956     0.737931
85261     0.737931
40339     0.737931
3754      0.717241
546       0.717241
3114      0.717241
dtype: float64

In [21]:
# final recommendation table
# we choose rows from movies dataframe which exist in the recommendation table
final_recommendation = movies.loc[movies['movieId'].isin(recommendationTable_df.head(20).keys())]
final_recommendation

Unnamed: 0,movieId,title,genres,year
542,546,Super Mario Bros.,"[Action, Adventure, Children, Comedy, Fantasy,...",1993
664,673,Space Jam,"[Adventure, Animation, Children, Comedy, Fanta...",1996
2902,2987,Who Framed Roger Rabbit?,"[Adventure, Animation, Children, Comedy, Crime...",1988
3028,3114,Toy Story 2,"[Adventure, Animation, Children, Comedy, Fantasy]",1999
3664,3754,"Adventures of Rocky and Bullwinkle, The","[Adventure, Animation, Children, Comedy, Fantasy]",2000
8605,26093,"Wonderful World of the Brothers Grimm, The","[Adventure, Animation, Children, Comedy, Drama...",1962
8783,26340,"Twelve Tasks of Asterix, The (Les douze travau...","[Action, Adventure, Animation, Children, Comed...",1976
9296,27344,Revolutionary Girl Utena: Adolescence of Utena...,"[Action, Adventure, Animation, Comedy, Drama, ...",1999
9825,32031,Robots,"[Adventure, Animation, Children, Comedy, Fanta...",2005
10575,40339,Chicken Little,"[Action, Adventure, Animation, Children, Comed...",2005
