In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.appName('Movie Hybrid Recommender Engine').getOrCreate()

In [571]:
import numpy as np
import pandas as pd
import os

# Check dataset in data folder
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data\links_small.csv
./data\movies_small.csv
./data\ratings_small.csv
./data\tags_small.csv
./data\.ipynb_checkpoints\link-checkpoint.csv
./data\.ipynb_checkpoints\links_small-checkpoint.csv
./data\.ipynb_checkpoints\movie-checkpoint.csv
./data\.ipynb_checkpoints\movies_small-checkpoint.csv
./data\.ipynb_checkpoints\ratings_small-checkpoint.csv
./data\.ipynb_checkpoints\tag-checkpoint.csv
./data\.ipynb_checkpoints\tags_small-checkpoint.csv


In [371]:
pd.set_option('display.max_columns', 20)

movies = pd.read_csv('data/movies_small.csv')
ratings = pd.read_csv('data/ratings_small.csv')
links = pd.read_csv('data/links_small.csv')
tags = pd.read_csv('data/tags_small.csv')

movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [374]:
df = movies.merge(tags, how='left', on='movieId')
df

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1.139046e+09
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1.137207e+09
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1.525286e+09
3,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1.528844e+09
4,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1.528844e+09
...,...,...,...,...,...,...
11848,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,,,
11849,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,,,
11850,193585,Flint (2017),Drama,,,
11851,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,,,


In [219]:
df.shape

(11853, 6)

In [220]:
movies.shape

(9742, 3)

In [226]:
tags.shape

(3683, 4)

In [375]:
# Count number of unique movie's title (equal to number of title in movie.csv)
df['title'].nunique()

9737

In [224]:
# Count how many ratings of each movie
df['title'].value_counts().head()

Pulp Fiction (1994)                                               181
Fight Club (1999)                                                  54
2001: A Space Odyssey (1968)                                       41
Léon: The Professional (a.k.a. The Professional) (Léon) (1994)     35
Eternal Sunshine of the Spotless Mind (2004)                       34
Name: title, dtype: int64

In [225]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11853 entries, 0 to 11852
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    11853 non-null  int64  
 1   title      11853 non-null  object 
 2   genres     11853 non-null  object 
 3   userId     3683 non-null   float64
 4   tag        3683 non-null   object 
 5   timestamp  3683 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 648.2+ KB


In [376]:
# Check if there are duplicate values
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [255]:
# Drop rows with null values
df.dropna(inplace=True)
df.isnull().sum()

movieId      0
title        0
genres       0
userId       0
tag          0
timestamp    0
dtype: int64

In [378]:
# Check df agian after drop duplicates and null values
df['title'].nunique()

9737

In [295]:
# Convert genres from a string 'Adventure|Animation|Children|Comedy|Fantasy' to a list
df.iloc[0].genres

'Adventure|Animation|Children|Comedy|Fantasy'

In [377]:
df['genres'] = df['genres'].apply(lambda x: x.split('|'))
df

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",336.0,pixar,1.139046e+09
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",474.0,pixar,1.137207e+09
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",567.0,fun,1.525286e+09
3,2,Jumanji (1995),"[Adventure, Children, Fantasy]",62.0,fantasy,1.528844e+09
4,2,Jumanji (1995),"[Adventure, Children, Fantasy]",62.0,magic board game,1.528844e+09
...,...,...,...,...,...,...
11848,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]",,,
11849,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]",,,
11850,193585,Flint (2017),[Drama],,,
11851,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]",,,


In [379]:
df['genres'] = df['genres'].apply(lambda x: [i.replace(" ","") for i in x]).apply(lambda x: " ".join(x))
df

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1.139046e+09
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1.137207e+09
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1.525286e+09
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy,1.528844e+09
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game,1.528844e+09
...,...,...,...,...,...,...
11848,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,,,
11849,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,,,
11850,193585,Flint (2017),Drama,,,
11851,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,,,


In [567]:
df['tag'] = df['tag'].fillna(0)
df['tag'] = df['tag'].apply(lambda x: str(x))
df = df[['movieId','title','genres','tag']]
df.drop_duplicates(inplace=True)
df

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,fun
3,2,Jumanji (1995),Adventure Children Fantasy,fantasy
4,2,Jumanji (1995),Adventure Children Fantasy,magic board game
5,2,Jumanji (1995),Adventure Children Fantasy,Robin Williams
...,...,...,...,...
11848,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,0
11849,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,0
11850,193585,Flint (2017),Drama,0
11851,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,0


In [562]:
tags = tags[['movieId','tag']]
tags

Unnamed: 0,movieId,tag
0,60756,funny
1,60756,Highly quotable
2,60756,will ferrell
3,89774,Boxing story
4,89774,MMA
...,...,...
3678,7382,for katie
3679,7936,austere
3680,3265,gun fu
3681,3265,heroic bloodshed


In [569]:
temp_df = df.groupby(by=['movieId','title','genres']).count()
temp_df.reset_index()
# temp_df = temp_df.merge(df, on=['movieId','title','genres'])
# temp_df

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,2
1,2,Jumanji (1995),Adventure Children Fantasy,4
2,3,Grumpier Old Men (1995),Comedy Romance,2
3,4,Waiting to Exhale (1995),Comedy Drama Romance,1
4,5,Father of the Bride Part II (1995),Comedy,2
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action Animation Comedy Fantasy,1
9738,193583,No Game No Life: Zero (2017),Animation Comedy Fantasy,1
9739,193585,Flint (2017),Drama,1
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action Animation,1


In [475]:
new_df = df[['movieId','title','tags']]
new_df

KeyError: "['tags'] not in index"

In [274]:
new_df.shape

(11853, 3)

In [416]:
new_df['title'].nunique()

9737

In [424]:
new_df.drop_duplicates(inplace=True)
new_df.duplicated().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


0

In [431]:
new_df.isnull().sum()

movieId    0
title      0
tags       0
dtype: int64

In [409]:
type(new_df)

pandas.core.frame.DataFrame

In [434]:
new_df.groupby(by='movieId').nunique(dropna=True)

Unnamed: 0_level_0,title,tags
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2
2,1,4
3,1,2
4,1,1
5,1,2
...,...,...
193581,1,1
193583,1,1
193585,1,1
193587,1,1


In [388]:
# import CountVectorizer from sklearn
# CountVectorizer converts a collection of text documents to a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')

In [389]:
# fit_transform: Learn the vocabulary dictionary and return document-term matrix
vectors = cv.fit_transform(new_df['tags']).toarray()

In [390]:
vectors.shape

(11749, 1676)

In [391]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [392]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [393]:
len(cv.get_feature_names())



1676

In [394]:
# Use cosine similarity to compute the similarity between X and Y
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [395]:
similarity.shape

(11749, 11749)

In [396]:
similarity[0]

array([1.        , 0.83333333, 0.66666667, ..., 0.        , 0.28867513,
       0.40824829])

In [397]:
similarity[1]

array([0.83333333, 1.        , 0.66666667, ..., 0.        , 0.28867513,
       0.40824829])

In [398]:
len(sorted(similarity[0], reverse=True))

11749

In [399]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x: x[1])[1:6]

[(3163, 1.0000000000000002),
 (2379, 0.9128709291752769),
 (2441, 0.9128709291752769),
 (3697, 0.9128709291752769),
 (3907, 0.9128709291752769)]

In [402]:
# Define a function to recommend movies base on similarity
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    for i in movies_list:
        print(new_df.iloc[i[0]].title) # print the movie's title
        # print(i) # print the movies' id

# Use iloc() to select a particular cell of the dataset
# to check the movie's info by id

In [403]:
recommend('Toy Story (1995)')

Toy Story 2 (1999)
Antz (1998)
Bug's Life, A (1998)
Adventures of Rocky and Bullwinkle, The (2000)
Emperor's New Groove, The (2000)


In [407]:
recommend("Emperor's New Groove, The (2000)")

Hands on a Hard Body (1996)
It Came from Hollywood (1982)
Original Kings of Comedy, The (2000)
Eddie Murphy Raw (1987)
Forgotten Silver (1996)


In [404]:
import pickle

pickle.dump(new_df.to_dict(),open('movies_dict.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

# Get the movies with less than 50 ratings
ratings_count = pd.DataFrame(df['title'].value_counts())
ratings_less_than_50 = ratings_count[ratings_count['title'] <= 50].index

# Now get the movies with more than 50 ratinsg by using ~isin()
common_movies = df[~df['title'].isin(ratings_less_than_50)]

In [92]:
# Check size of the common_movies df 
common_movies.shape

(40712, 6)

In [93]:
# Count number of unique movie with more than 1000 ratings
common_movies['title'].nunique()

437

In [94]:
common_movies

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,9.649827e+08
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.474350e+08
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.106636e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.510578e+09
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.305696e+09
...,...,...,...,...,...,...
98328,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,561.0,2.0,1.491095e+09
98329,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,586.0,4.0,1.529899e+09
98330,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,596.0,4.0,1.535709e+09
98331,122904,Deadpool (2016),Action|Adventure|Comedy|Sci-Fi,599.0,3.5,1.519458e+09


In [95]:
# Create user pivot table
user_movie_df = common_movies.pivot_table(index=['userId'], columns=['title'], values='rating')
user_movie_df

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Wild Wild West (1999),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,4.0,,,...,,5.0,5.0,,5.0,,,5.0,,
2.0,,,,,,,,,,,...,,,,5.0,,,,,3.0,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,5.0,,,,,,,,,...,,4.0,5.0,,,,,,,
5.0,,,,,,,,,3.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606.0,,,5.0,,,,3.5,,,2.0,...,,,,,,,,3.5,,
607.0,,,,,,,,,,,...,,,5.0,,3.0,,,,,
608.0,,,3.0,3.5,5.0,,4.5,3.0,3.5,2.0,...,2.5,3.5,2.5,,4.0,4.0,4.0,,,3.0
609.0,,,,,,,,,,,...,,,,,,,,,,


In [96]:
# Check size of the user pivot table
type(user_movie_df)

pandas.core.frame.DataFrame

In [102]:
# Item-based  movie recommendation
movie_name = 'Matrix, The (1999)'
movie_name = user_movie_df[movie_name]

# Compute pairwise correlation, then sort in descending order
user_movie_df.corrwith(movie_name).sort_values(ascending=False)

title
Matrix, The (1999)                                     1.000000
Slumdog Millionaire (2008)                             0.613839
Kung Fu Panda (2008)                                   0.612549
Interstellar (2014)                                    0.599040
Legends of the Fall (1994)                             0.567155
                                                         ...   
Fantasia (1940)                                       -0.241073
Election (1999)                                       -0.247055
First Knight (1995)                                   -0.268982
City Slickers II: The Legend of Curly's Gold (1994)   -0.293258
Arachnophobia (1990)                                  -0.387551
Length: 437, dtype: float64

In [98]:
# Choose a random user
# random_user = pd.Series(user_movie_df.index).sample(1, random_state=45).values
random_user = 300
random_user

300

In [101]:
user_movie_df.index

Float64Index([  1.0,   2.0,   3.0,   4.0,   5.0,   6.0,   7.0,   8.0,   9.0,
               10.0,
              ...
              601.0, 602.0, 603.0, 604.0, 605.0, 606.0, 607.0, 608.0, 609.0,
              610.0],
             dtype='float64', name='userId', length=606)

In [99]:
# Reduce the #dataset to user 28491:
random_user_df = user_movie_df[user_movie_df.index == random_user]
random_user_df

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Wild Wild West (1999),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300.0,,,,,,,,,,,...,,,,,,,,,,


In [103]:
# Choose non-NaN movies watched by 28491, then convert the result to a list
movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
movies_watched

["Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",
 'American Beauty (1999)',
 'American History X (1998)',
 'City of God (Cidade de Deus) (2002)',
 'Django Unchained (2012)',
 'Eternal Sunshine of the Spotless Mind (2004)',
 'Fight Club (1999)',
 'Forrest Gump (1994)',
 'Good Will Hunting (1997)',
 'I Am Legend (2007)',
 'Inception (2010)',
 'Interstellar (2014)',
 'Life Is Beautiful (La Vita è bella) (1997)',
 'Lost in Translation (2003)',
 'Matrix, The (1999)',
 'Mulholland Drive (2001)',
 "Pan's Labyrinth (Laberinto del fauno, El) (2006)",
 'Saving Private Ryan (1998)',
 "Schindler's List (1993)",
 'Shawshank Redemption, The (1994)',
 'Silence of the Lambs, The (1991)',
 'Sixth Sense, The (1999)',
 'Slumdog Millionaire (2008)']

In [21]:
# Check if a movie is REALLY watched by 28491. If yes, rating score will not be NaN
user_movie_df.loc[user_movie_df.index == random_user, user_movie_df.columns == "Young Guns (1988)"]

title,Young Guns (1988)
userId,Unnamed: 1_level_1
28941.0,


In [22]:
# Count how many movies that 28491 watched
len(movies_watched)

33

In [23]:
# we have reduced the dataset based on movies watched by user 28491:
movies_watched_df = user_movie_df[movies_watched]
movies_watched_df
# movies_watched_df.shape 

title,Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Aladdin (1992),"American President, The (1995)",Apollo 13 (1995),Babe (1995),Bullets Over Broadway (1994),Clueless (1995),Disclosure (1994),Forrest Gump (1994),...,Ready to Wear (Pret-A-Porter) (1994),"Remains of the Day, The (1993)",Sabrina (1995),Schindler's List (1993),"Secret Garden, The (1993)",Sense and Sensibility (1995),Shadowlands (1993),"Silence of the Lambs, The (1991)",Star Trek: Generations (1994),Stargate (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,3.5,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,5.0,5.0,5.0
4.0,,3.0,,,,,,,,4.0,...,,,,,3.0,,,,3.0,
5.0,,,5.0,5.0,5.0,,,,,,...,,3.0,,,5.0,3.0,,3.0,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138489.0,,,,,,,,,,,...,,,,,,,,4.0,,
138490.0,,,,,4.0,5.0,,,,,...,,,,,4.0,4.0,,5.0,,
138491.0,,,,,,,,,,,...,,,,,,,,,,
138492.0,,,,,,,,,,,...,,,,,,,,,,


In [24]:
# Count how many movies each user watched
user_movie_count = movies_watched_df.T.notnull().sum()

user_movie_count = user_movie_count.reset_index()
user_movie_count.columns = ['userId', 'movie_count']
user_movie_count

Unnamed: 0,userId,movie_count
0,1.0,1
1,2.0,2
2,3.0,4
3,4.0,6
4,5.0,11
...,...,...
138488,138489.0,1
138489,138490.0,7
138490,138491.0,0
138491,138492.0,2


In [108]:
# Calculate 60% of movies watched by 300
percentage = len(movies_watched)*60/100
percentage

13.8

In [114]:
# Count how many users have watched more than 60% movies as 300
same_watched_movies = user_movie_count[user_movie_count['movie_count'] > percentage]['userId']
same_watched_movies.shape

(15804,)

In [115]:
# Combine the data of 300 and similar users
final_df = pd.concat([movies_watched_df[movies_watched_df.index.isin(same_watched_movies)], 
                      random_user_df[movies_watched]])

In [116]:
# Check size of the final_df
final_df.shape

(15805, 53)

In [117]:
# Compute pairwise correlation of columns, excluding NA/null values
final_df.T.corr()

userId,13.0,14.0,15.0,24.0,26.0,29.0,32.0,34.0,46.0,54.0,...,138387.0,138404.0,138411.0,138415.0,138422.0,138431.0,138456.0,138483.0,138484.0,300.0
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13.0,1.000000,0.100871,0.416667,0.038180,0.454606,0.012268,-0.111111,0.641203,0.024263,0.587968,...,-0.094916,-9.267738e-02,0.176712,0.065000,0.576018,0.499230,0.362113,0.330556,0.739894,
14.0,0.100871,1.000000,0.196574,-0.444711,0.457888,0.661438,0.513870,0.499131,-0.091287,0.362050,...,0.056063,2.593806e-01,0.906103,0.242764,0.051709,0.372104,0.059871,0.608164,0.434173,
15.0,0.416667,0.196574,1.000000,0.362620,0.653391,0.426143,-0.169791,0.132453,0.309426,0.302372,...,-0.010040,1.340491e-16,0.626783,0.498724,0.113107,0.389249,0.265144,0.430847,0.810093,0.5
24.0,0.038180,-0.444711,0.362620,1.000000,0.111283,0.107143,-0.054554,-0.066157,0.333333,-0.104686,...,0.412113,-2.251472e-01,-0.213498,-0.119523,0.076974,0.289474,0.020851,0.086468,0.102658,
26.0,0.454606,0.457888,0.653391,0.111283,1.000000,0.748803,0.279073,0.824086,0.598734,0.506900,...,0.181369,-9.840274e-02,0.659618,0.714430,-0.192925,0.565685,0.405222,0.768122,0.656147,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138431.0,0.499230,0.372104,0.389249,0.289474,0.565685,0.398410,0.114708,0.463586,0.245256,0.383713,...,0.081111,-5.659165e-01,0.317925,0.000000,0.722315,1.000000,0.051232,0.177667,0.344584,
138456.0,0.362113,0.059871,0.265144,0.020851,0.405222,0.079556,-0.237356,0.362738,0.138343,-0.021437,...,-0.281430,6.713577e-01,-0.195580,0.721956,-0.257143,0.051232,1.000000,0.159351,0.282984,
138483.0,0.330556,0.608164,0.430847,0.086468,0.768122,0.467684,0.581166,0.501688,0.405995,0.412479,...,0.454179,1.819487e-01,0.582934,0.709139,-0.014456,0.177667,0.159351,1.000000,0.529514,
138484.0,0.739894,0.434173,0.810093,0.102658,0.656147,0.467155,-0.075165,0.466667,0.080322,0.545880,...,0.055989,-7.548944e-02,0.280479,0.480500,0.215207,0.344584,0.282984,0.529514,1.000000,0.5


In [129]:
# corr for all users
corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
corr_df = pd.DataFrame(corr_df, columns=['corr'])
corr_df.index.names = ['userId_1', 'userId_2']
corr_df = corr_df.reset_index()
corr_df.head()

Unnamed: 0,userId_1,userId_2,corr
0,69950.0,15349.0,-1.0
1,66290.0,16097.0,-1.0
2,77933.0,93179.0,-1.0
3,37194.0,9026.0,-1.0
4,38320.0,88451.0,-1.0


In [123]:
# Users with a correlation of %50 or more with 300 users:
top_users = corr_df[(corr_df['userId_1'] == random_user) & (corr_df['corr'] >= 0.5)][
    ['userId_2', 'corr']].reset_index(drop=True)

# Sort by column 'corr' in descending order
top_users = top_users.sort_values(by='corr', ascending=False)
top_users.rename(columns={'userId_2': 'userId'}, inplace=True)
top_users

Unnamed: 0,userId,corr


In [124]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [127]:
# Merge top_users with ratings df on 3 columns
top_users_ratings = top_users.merge(ratings[['userId', 'movieId', 'rating']], how='inner')

top_users_ratings = top_users_ratings[top_users_ratings['userId'] != random_user]
top_users_ratings.head()

Unnamed: 0,corr,userId,movieId,rating


In [128]:
# Calculate the Weighted Average Recommendation Score and keep the first 5 movies
# Do a single score with the most similar by corr * rating:
top_users_ratings['weighted_rating'] = top_users_ratings['corr'] * top_users_ratings['rating']
top_users_ratings.groupby('movieId').agg({'weighted_rating': 'mean'})

recommendation_df = top_users_ratings.groupby('movieId').agg({'weighted_rating': 'mean'})
recommendation_df = recommendation_df.reset_index()
recommendation_df

Unnamed: 0,movieId,weighted_rating


In [40]:
pd.__version__

'1.3.4'

In [41]:
# Weighted rating greater than 4:
recommendation_df[recommendation_df["weighted_rating"] > 3.7]

# Movies 28491 will like:
recommended_movies = recommendation_df[recommendation_df["weighted_rating"] > 3.7].sort_values("weighted_rating", ascending=False)

# Merge recommended_movies with movies df, then get the top 5 movies:
recommended_movies.merge(movies[["movieId", "title"]])[:5]

Unnamed: 0,movieId,weighted_rating,title
0,1922,3.76358,Whatever (1998)
1,2057,3.76358,"Incredible Journey, The (1963)"
2,2077,3.76358,"Journey of Natty Gann, The (1985)"
3,2485,3.76358,She's All That (1999)
4,2577,3.76358,Metroland (1997)


In [37]:
import pickle
pickle.dump(recommendation_df.to_dict(),open('movies_dict.pkl','wb'))

In [201]:
# The last highly-rated movie by user 108170:

user = 300
movie_id = ratings[(ratings["userId"] == user) & (ratings["rating"] == 5.0)].sort_values(by="timestamp")["movieId"][0:1].values[0]
movie_id

527

In [218]:
# 5 recommend movies user-based
recommended_movies.merge(movies[['movieId', 'title']])[:5]['title'].to_list()

['Whatever (1998)',
 'Incredible Journey, The (1963)',
 'Journey of Natty Gann, The (1985)',
 "She's All That (1999)",
 'Metroland (1997)']

In [239]:
# 5 recommend movies item-based
movie_name = movies[movies['movieId'] == movie_id]['title'].values[0]
# type(movie_name)
movie_name = user_movie_df[movie_name]
movies_from_item_based = user_movie_df.corrwith(movie_name).sort_values(ascending=False)
movies_from_item_based[1:6]

title
Interstellar (2014)                            0.689583
12 Angry Men (1957)                            0.674274
Wallace & Gromit: The Wrong Trousers (1993)    0.650442
King's Speech, The (2010)                      0.603023
When Harry Met Sally... (1989)                 0.592458
dtype: float64

In [240]:
type(pd.DataFrame(movies_from_item_based))

pandas.core.frame.DataFrame

In [243]:
pd.DataFrame(movies_from_item_based).head()

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Schindler's List (1993),1.0
Interstellar (2014),0.689583
12 Angry Men (1957),0.674274
Wallace & Gromit: The Wrong Trousers (1993),0.650442
"King's Speech, The (2010)",0.603023


In [215]:
# Define hybrid_recommender function
def hybrid_recommender(userId):
    # get the highly-rated movie by userId
    movie_id = ratings[(ratings["userId"] == userId) & (ratings["rating"] == 5.0)].sort_values(by="timestamp")["movieId"][0:1].values[0]
    
    # user-based 5 recommend movies list
    user_based_movies = []
    user_based_recommended = recommended_movies.merge(movies[['movieId', 'title']])[:5]['title'].to_list()
    for i in user_based_recommended:
        user_based_movies.append(i)
    
    # item-based 5 recommend movies list
    item_based_movies= []
    movie_name = movies[movies['movieId'] == movie_id]['title'].values[0]
    movie_name = user_movie_df[movie_name]
    movies_from_item_based = user_movie_df.corrwith(movie_name).sort_values(ascending=False)
    item_based_recommended = movies_from_item_based[1:6].index.to_list()
    for i in item_based_recommended:
        item_based_movies.append(i)
    
    return user_based_movies, item_based_movies

In [216]:
hybrid_recommender(300)

(['Whatever (1998)',
  'Incredible Journey, The (1963)',
  'Journey of Natty Gann, The (1985)',
  "She's All That (1999)",
  'Metroland (1997)'],
 ['Interstellar (2014)',
  '12 Angry Men (1957)',
  'Wallace & Gromit: The Wrong Trousers (1993)',
  "King's Speech, The (2010)",
  'When Harry Met Sally... (1989)'])