In [None]:
from google.colab import drive

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Installing surprise module**


In [None]:
!pip install matplotlib-venn



In [None]:
!apt-get -qq install -y libfluidsynth1 && pip install scikit-surprise



# **Library**

In [None]:
import pandas as pd
import numpy as np

from ast import literal_eval

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem.snowball import SnowballStemmer

import warnings; warnings.simplefilter('ignore')


# **Data extraction**

In [None]:
md = pd.read_csv('/content/drive/MyDrive/movies_metadata.csv')
links_small = pd.read_csv('/content/drive/MyDrive/links_small.csv')
credits = pd.read_csv('/content/drive/MyDrive/credits.csv')
keywords = pd.read_csv('/content/drive/MyDrive/keywords.csv')
ratings = pd.read_csv('/content/drive/MyDrive/ratings.csv.zip')

# **DATA EXPLORATION**

In [None]:
features = ['id', 'imdb_id', 'title', 
            'genres', 'overview', 'tagline', 'release_date', 
            'popularity', 'vote_average', 'vote_count']
md = md[features]
md[:10]

Unnamed: 0,id,imdb_id,title,genres,overview,tagline,release_date,popularity,vote_average,vote_count
0,862,tt0114709,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,1995-10-30,21.9469,7.7,5415.0
1,8844,tt0113497,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,1995-12-15,17.0155,6.9,2413.0
2,15602,tt0113228,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,1995-12-22,11.7129,6.5,92.0
3,31357,tt0114885,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,1995-12-22,3.85949,6.1,34.0
4,11862,tt0113041,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,1995-02-10,8.38752,5.7,173.0
5,949,tt0113277,Heat,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,1995-12-15,17.9249,7.7,1886.0
6,11860,tt0114319,Sabrina,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",An ugly duckling having undergone a remarkable...,You are cordially invited to the most surprisi...,1995-12-15,6.67728,6.2,141.0
7,45325,tt0112302,Tom and Huck,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","A mischievous young boy, Tom Sawyer, witnesses...",The Original Bad Boys.,1995-12-22,2.56116,5.4,45.0
8,9091,tt0114576,Sudden Death,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",International action superstar Jean Claude Van...,Terror goes into overtime.,1995-12-22,5.23158,5.5,174.0
9,710,tt0113189,GoldenEye,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,1995-11-16,14.686,6.6,1194.0


In [None]:
ratings[:5]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [None]:
credits[:5]

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [None]:
keywords[:1]

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."


In [None]:
features = ['id', 'cast', 'crew']
credits = credits[features]
credits[:10]

Unnamed: 0,id,cast,crew
0,862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
5,949,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de..."
6,11860,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de..."
7,45325,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de..."
8,9091,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de..."
9,710,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de..."


In [None]:
keywords[:10]

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
5,949,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
6,11860,"[{'id': 90, 'name': 'paris'}, {'id': 380, 'nam..."
7,45325,[]
8,9091,"[{'id': 949, 'name': 'terrorist'}, {'id': 1562..."
9,710,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam..."


In [None]:
features = ['tmdbId', 'movieId', 'imdbId']
links_small = links_small[features]
links_small.columns = ['id', 'movieId', 'imdbId']
links_small[:1]

Unnamed: 0,id,movieId,imdbId
0,862.0,1,114709


In [None]:
ratings[:1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529


# Data Manipulation

In [None]:
def clean_id(x):
    try:
        x = int(x)
    except:
        x = np.NaN
    return x
md['id'] = md['id'].apply(clean_id)
md.dropna(subset=['id'], inplace=True)
df = pd.merge(md, credits, how='inner', on='id')
df = pd.merge(df, keywords, how='inner', on='id')
df[:10]

Unnamed: 0,id,imdb_id,title,genres,overview,tagline,release_date,popularity,vote_average,vote_count,cast,crew,keywords
0,862.0,tt0114709,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,1995-10-30,21.9469,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844.0,tt0113497,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,1995-12-15,17.0155,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602.0,tt0113228,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,1995-12-22,11.7129,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357.0,tt0114885,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,1995-12-22,3.85949,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862.0,tt0113041,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,1995-02-10,8.38752,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
5,949.0,tt0113277,Heat,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,1995-12-15,17.9249,7.7,1886.0,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...","[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
6,11860.0,tt0114319,Sabrina,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",An ugly duckling having undergone a remarkable...,You are cordially invited to the most surprisi...,1995-12-15,6.67728,6.2,141.0,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de...","[{'id': 90, 'name': 'paris'}, {'id': 380, 'nam..."
7,45325.0,tt0112302,Tom and Huck,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","A mischievous young boy, Tom Sawyer, witnesses...",The Original Bad Boys.,1995-12-22,2.56116,5.4,45.0,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de...",[]
8,9091.0,tt0114576,Sudden Death,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",International action superstar Jean Claude Van...,Terror goes into overtime.,1995-12-22,5.23158,5.5,174.0,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de...","[{'id': 949, 'name': 'terrorist'}, {'id': 1562..."
9,710.0,tt0113189,GoldenEye,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,1995-11-16,14.686,6.6,1194.0,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...","[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam..."


In [None]:
df['tagline'] = df['tagline'].fillna('')
df['overview'] = df['overview'].fillna('')
df['description'] = df['overview'] + df['tagline']
df[:5]

Unnamed: 0,id,imdb_id,title,genres,overview,tagline,release_date,popularity,vote_average,vote_count,cast,crew,keywords,description
0,862.0,tt0114709,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,1995-10-30,21.9469,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","Led by Woody, Andy's toys live happily in his ..."
1,8844.0,tt0113497,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,1995-12-15,17.0155,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",When siblings Judy and Peter discover an encha...
2,15602.0,tt0113228,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,1995-12-22,11.7129,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",A family wedding reignites the ancient feud be...
3,31357.0,tt0114885,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,1995-12-22,3.85949,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...","Cheated on, mistreated and stepped on, the wom..."
4,11862.0,tt0113041,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,1995-02-10,8.38752,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",Just when George Banks has recovered from his ...


In [None]:
df['genres'] = df['genres'].fillna('[]')
df['genres'] = df['genres'].apply(literal_eval)
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df.head(5)

Unnamed: 0,id,imdb_id,title,genres,overview,tagline,release_date,popularity,vote_average,vote_count,cast,crew,keywords,description
0,862.0,tt0114709,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",,1995-10-30,21.9469,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","Led by Woody, Andy's toys live happily in his ..."
1,8844.0,tt0113497,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,1995-12-15,17.0155,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",When siblings Judy and Peter discover an encha...
2,15602.0,tt0113228,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,1995-12-22,11.7129,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",A family wedding reignites the ancient feud be...
3,31357.0,tt0114885,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,1995-12-22,3.85949,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...","Cheated on, mistreated and stepped on, the wom..."
4,11862.0,tt0113041,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,1995-02-10,8.38752,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",Just when George Banks has recovered from his ...


In [None]:
def genres_list(df_genres):
    genres = set()
        
    for genres_list in df_genres:
        try:
            genres.update(genres_list)
        except AttributeError:
            pass
    
    return genres
    
genres = genres_list(df['genres'])
print(genres)

{'Science Fiction', 'Romance', 'Music', 'Animation', 'Western', 'Documentary', 'Foreign', 'Thriller', 'TV Movie', 'Mystery', 'Horror', 'Drama', 'Crime', 'History', 'Action', 'Adventure', 'War', 'Fantasy', 'Family', 'Comedy'}


In [None]:
def split_genres(var):
    try:
        if gene in var:
            return 1
        else:
            return 0
    except AttributeError:
        return 0

for gene in genres:        
    df[gene] = df['genres'].apply(split_genres)

In [None]:
df[:5]

Unnamed: 0,id,imdb_id,title,genres,overview,tagline,release_date,popularity,vote_average,vote_count,cast,crew,keywords,description,Science Fiction,Romance,Music,Animation,Western,Documentary,Foreign,Thriller,TV Movie,Mystery,Horror,Drama,Crime,History,Action,Adventure,War,Fantasy,Family,Comedy
0,862.0,tt0114709,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",,1995-10-30,21.9469,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","Led by Woody, Andy's toys live happily in his ...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,8844.0,tt0113497,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,1995-12-15,17.0155,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",When siblings Judy and Peter discover an encha...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0
2,15602.0,tt0113228,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,1995-12-22,11.7129,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",A family wedding reignites the ancient feud be...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,31357.0,tt0114885,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,1995-12-22,3.85949,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...","Cheated on, mistreated and stepped on, the wom...",0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
4,11862.0,tt0113041,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,1995-02-10,8.38752,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",Just when George Banks has recovered from his ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# **clean year**

In [None]:
def clean_year(x):
    if x != np.nan:
        year = str(x).split('-')[0]
        return year
    else:
        return np.NaN
    
df.dropna(subset=['release_date'], inplace=True)
df['year'] = df['release_date'].apply(clean_year)
df = df.drop(['release_date'], axis=1)

In [None]:
df[:5]

Unnamed: 0,id,imdb_id,title,genres,overview,tagline,popularity,vote_average,vote_count,cast,crew,keywords,description,Science Fiction,Romance,Music,Animation,Western,Documentary,Foreign,Thriller,TV Movie,Mystery,Horror,Drama,Crime,History,Action,Adventure,War,Fantasy,Family,Comedy,year
0,862.0,tt0114709,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",,21.9469,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","Led by Woody, Andy's toys live happily in his ...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1995
1,8844.0,tt0113497,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,17.0155,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",When siblings Judy and Peter discover an encha...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1995
2,15602.0,tt0113228,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,11.7129,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",A family wedding reignites the ancient feud be...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1995
3,31357.0,tt0114885,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,3.85949,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...","Cheated on, mistreated and stepped on, the wom...",0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1995
4,11862.0,tt0113041,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,8.38752,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",Just when George Banks has recovered from his ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1995


**clean cast,crew and keywords**

In [None]:
df['cast'] = df['cast'].apply(literal_eval)
df['crew'] = df['crew'].apply(literal_eval)
df['keywords'] = df['keywords'].apply(literal_eval)
df[:5]

Unnamed: 0,id,imdb_id,title,genres,overview,tagline,popularity,vote_average,vote_count,cast,crew,keywords,description,Science Fiction,Romance,Music,Animation,Western,Documentary,Foreign,Thriller,TV Movie,Mystery,Horror,Drama,Crime,History,Action,Adventure,War,Fantasy,Family,Comedy,year
0,862.0,tt0114709,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",,21.9469,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","Led by Woody, Andy's toys live happily in his ...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1995
1,8844.0,tt0113497,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,17.0155,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",When siblings Judy and Peter discover an encha...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1995
2,15602.0,tt0113228,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,11.7129,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",A family wedding reignites the ancient feud be...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1995
3,31357.0,tt0114885,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,3.85949,6.1,34.0,"[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...","Cheated on, mistreated and stepped on, the wom...",0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1995
4,11862.0,tt0113041,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,8.38752,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",Just when George Banks has recovered from his ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1995


In [None]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.NaN

In [None]:
#df['crew'] = df['crew'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df['director'] = df['crew'].apply(get_director)
df['director'] = df['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
df['director'] = df['director'].apply(lambda x: [x])

clean cast

In [None]:
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df['cast'] = df['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
df['cast'] = df['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

clean keywords and crew


In [None]:
df['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df['crew'] = df['crew'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
dictionary = {}

def count_words(word_list):
    for word in word_list:
        if dictionary.get(word) == None:
            dictionary[word] = 1
        else:
            dictionary[word] += 1


df['keywords'].apply(count_words)


dictionary_copy = dictionary.copy()

In [None]:
def filter_keywords(word_list):
    words = []
    for word in word_list:
        if dictionary.get(word):
            words.append(word)
    return words

stemming keywords


In [None]:
stemmer = SnowballStemmer('english')
stemmer.stem('chilling')

'chill'

In [None]:
df['keywords'] = df['keywords'].apply(filter_keywords)
df['keywords'] = df['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
df['keywords'] = df['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
df[:5]

Unnamed: 0,id,imdb_id,title,genres,overview,tagline,popularity,vote_average,vote_count,cast,crew,keywords,description,Science Fiction,Romance,Music,Animation,Western,Documentary,Foreign,Thriller,TV Movie,Mystery,Horror,Drama,Crime,History,Action,Adventure,War,Fantasy,Family,Comedy,year,director
0,862.0,tt0114709,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",,21.9469,7.7,5415.0,"[tomhanks, timallen, donrickles]","[John Lasseter, Joss Whedon, Andrew Stanton, J...","[jealousi, toy, boy, friendship, friend, rival...","Led by Woody, Andy's toys live happily in his ...",0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1995,[johnlasseter]
1,8844.0,tt0113497,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,17.0155,6.9,2413.0,"[robinwilliams, jonathanhyde, kirstendunst]","[Larry J. Franco, Jonathan Hensleigh, James Ho...","[boardgam, disappear, basedonchildren'sbook, n...",When siblings Judy and Peter discover an encha...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1995,[joejohnston]
2,15602.0,tt0113228,Grumpier Old Men,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,11.7129,6.5,92.0,"[waltermatthau, jacklemmon, ann-margret]","[Howard Deutch, Mark Steven Johnson, Mark Stev...","[fish, bestfriend, duringcreditssting, oldmen]",A family wedding reignites the ancient feud be...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1995,[howarddeutch]
3,31357.0,tt0114885,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,3.85949,6.1,34.0,"[whitneyhouston, angelabassett, lorettadevine]","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...","[basedonnovel, interracialrelationship, single...","Cheated on, mistreated and stepped on, the wom...",0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1995,[forestwhitaker]
4,11862.0,tt0113041,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,8.38752,5.7,173.0,"[stevemartin, dianekeaton, martinshort]","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...","[babi, midlifecrisi, confid, age, daughter, mo...",Just when George Banks has recovered from his ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1995,[charlesshyer]


# **Simple Recommender**

In [None]:
def weighted_rating(x, C, m):
    v = x['vote_count']
    R = x['vote_average']
    #display(x)
    return (v/(v+m) * R) + (m/(m+v) * C)

In [None]:
def simple_recommender(genre, percentile=0.85):
    df_genre = df[df[genre] == 1]
     
    vote_counts = df_genre[df_genre['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df_genre[df_genre['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    display(m)
    
    features = ['title', 'year', 'vote_count', 'vote_average', 'popularity']
    
    weighted = df_genre[(df_genre['vote_count'] >= m) & (df_genre['vote_count'].notnull()) & (df_genre['vote_average'].notnull())][features]
    weighted['wr'] = weighted.apply(lambda x: weighted_rating(x, C, m), axis=1)
    weighted = weighted.sort_values('wr', ascending=True)
    
    return weighted

In [None]:
simple_recommender('Horror').tail(5)


656.6000000000001

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
1827,The Thing,1982,1629.0,7.8,16.8313,7.141587
974,Aliens,1986,3282.0,7.7,21.7612,7.334589
987,Alien,1979,4564.0,7.9,23.3774,7.599167
992,Psycho,1960,2405.0,8.3,36.8263,7.701239
1029,The Shining,1980,3890.0,8.1,19.6116,7.725688


# **content based**

In [None]:
df = pd.merge(df, links_small, how='inner', on='id')

features = ['id', 'movieId','imdb_id', 'title', 'genres', 'overview', 'tagline', 'description','popularity',
       'vote_average', 'vote_count', 'year', 'cast', 'crew','director', 'keywords',
       'History', 'Horror', 'Foreign', 'Action', 'Documentary',
       'Science Fiction', 'Music', 'Romance', 'Animation', 'TV Movie',
       'Fantasy', 'War', 'Comedy', 'Family', 'Drama', 'Western', 'Crime',
       'Adventure', 'Mystery', 'Thriller']
df = df[features]
df[:1]

Unnamed: 0,id,movieId,imdb_id,title,genres,overview,tagline,description,popularity,vote_average,vote_count,year,cast,crew,director,keywords,History,Horror,Foreign,Action,Documentary,Science Fiction,Music,Romance,Animation,TV Movie,Fantasy,War,Comedy,Family,Drama,Western,Crime,Adventure,Mystery,Thriller
0,862.0,1,tt0114709,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",,"Led by Woody, Andy's toys live happily in his ...",21.9469,7.7,5415.0,1995,"[tomhanks, timallen, donrickles]","[John Lasseter, Joss Whedon, Andrew Stanton, J...",[johnlasseter],"[jealousi, toy, boy, friendship, friend, rival...",0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0


In [None]:
df = df.reset_index()
df[:5]
indices_to_titles = df['title'] 
title_to_index = pd.Series(data=df.index, index=df['title'])

In [None]:
df['mix'] = df['cast'] + df['director'] + df['keywords'] + df['genres']
df['mix'] = df['mix'].apply(lambda x: ' '.join(x))
df['mix'][:1]

0    tomhanks timallen donrickles johnlasseter jeal...
Name: mix, dtype: object

**Count vectorizer**

In [None]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),stop_words='english')
count_matrix = count.fit_transform(df['mix'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
def updated_recommendations(title):
    idx = title_to_index[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:30]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = df.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    display(m)
    
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(lambda x: weighted_rating(x, C, m), axis=1)    
    qualified = qualified.sort_values('wr', ascending=True).head(10)
    return qualified

In [None]:
updated_recommendations('Toy Story').head(5)

572.6000000000001

Unnamed: 0,title,vote_count,vote_average,year,wr
5456,Garfield,851,5,2004,5.443829
7404,Cloudy with a Chance of Meatballs,1799,6,2009,6.024977
1662,One Hundred and One Dalmatians,1643,6,1961,6.026735
3016,Chicken Run,1190,6,2000,6.033606
6968,Horton Hears a Who!,927,6,2008,6.0395


## collaberative filtering

In [None]:
ratings[:5]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [None]:
from surprise import Reader, Dataset,accuracy
#from surprise import NormalPredictor
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']][:10000], reader)


In [None]:
from surprise import SVD
algo = SVD()
from surprise.model_selection import cross_validate
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9396  0.9209  0.9280  0.9295  0.0077  
MAE (testset)     0.7279  0.7148  0.7164  0.7197  0.0059  
Fit time          0.34    0.34    0.36    0.34    0.01    
Test time         0.02    0.05    0.02    0.03    0.01    


In [None]:
algo.fit(trainset)

predictions = algo.test(testset)


accuracy.mae(predictions)

MAE:  0.7175


0.7174828738199972

In [None]:
user_id = 514
movie_id = 505
real_rating = 3
algo.predict(user_id, movie_id, real_rating, verbose=True)

user: 514        item: 505        r_ui = 3.00   est = 3.59   {'was_impossible': False}


Prediction(uid=514, iid=505, r_ui=3, est=3.5858922963601048, details={'was_impossible': False})

In [None]:
indices_map = df.set_index('id')
indices_map.head(1)

Unnamed: 0_level_0,index,movieId,imdb_id,title,genres,overview,tagline,description,popularity,vote_average,vote_count,year,cast,crew,director,keywords,History,Horror,Foreign,Action,Documentary,Science Fiction,Music,Romance,Animation,TV Movie,Fantasy,War,Comedy,Family,Drama,Western,Crime,Adventure,Mystery,Thriller,mix
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
862.0,0,1,tt0114709,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",,"Led by Woody, Andy's toys live happily in his ...",21.9469,7.7,5415.0,1995,"[tomhanks, timallen, donrickles]","[John Lasseter, Joss Whedon, Andrew Stanton, J...",[johnlasseter],"[jealousi, toy, boy, friendship, friend, rival...",0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,tomhanks timallen donrickles johnlasseter jeal...


# Hybrid Filtering

In [None]:
def hybrid(userId, title):
    idx = title_to_index[title]
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = df.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: algo.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.tail(10)

In [None]:
hybrid(1, 'Toy Story')

Unnamed: 0,title,vote_count,vote_average,year,id,est
8432,Despicable Me 2,4729.0,7.0,2013,93456.0,4.048964
320,The Flintstones,559.0,5.0,1994,888.0,4.047518
543,The Aristocats,1287.0,7.1,1970,10112.0,4.046641
4272,Jonah: A VeggieTales Movie,22.0,6.4,2002,15173.0,4.006746
7404,Cloudy with a Chance of Meatballs,1799.0,6.5,2009,22794.0,3.910365
7629,Toy Story 3,4710.0,7.6,2010,10193.0,3.889925
607,Oliver & Company,372.0,6.5,1988,12233.0,3.857942
1832,Antz,1320.0,6.0,1998,8916.0,3.788548
3016,Chicken Run,1190.0,6.5,2000,7443.0,3.710403
8595,The Lego Movie,3127.0,7.5,2014,137106.0,3.665922
