# Cosine Similarity using Movies MetaData.

Assigned to Jaini Patel (jp1891)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import pickle
# from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
# !python3 -m pip install pickle-mixin

### Simple Recommendation

In [15]:
md = pd.read_csv('data/movies_metadata.csv')
md.head()

Unnamed: 0.1,Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,...,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director,soup
0,0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,...,7.7,5415.0,1995,"['tomhanks', 'timallen', 'donrickles']","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","['jealousi', 'toy', 'boy', 'friendship', 'frie...",13,106,"['johnlasseter', 'johnlasseter', 'johnlasseter']",jealousi toy boy friendship friend rivalri boy...
1,1,1,False,,65000000,"['Adventure', 'Fantasy', 'Family']",,8844,tt0113497,en,...,6.9,2413.0,1995,"['robinwilliams', 'jonathanhyde', 'kirstendunst']","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","['boardgam', 'disappear', ""basedonchildren'sbo...",26,16,"['joejohnston', 'joejohnston', 'joejohnston']",boardgam disappear basedonchildren'sbook newho...
2,2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"['Romance', 'Comedy']",,15602,tt0113228,en,...,6.5,92.0,1995,"['waltermatthau', 'jacklemmon', 'ann-margret']","[{'credit_id': '52fe466a9251416c75077a89', 'de...","['fish', 'bestfriend', 'duringcreditssting', '...",7,4,"['howarddeutch', 'howarddeutch', 'howarddeutch']",fish bestfriend duringcreditssting oldmen walt...
3,3,3,False,,16000000,"['Comedy', 'Drama', 'Romance']",,31357,tt0114885,en,...,6.1,34.0,1995,"['whitneyhouston', 'angelabassett', 'lorettade...","[{'credit_id': '52fe44779251416c91011acb', 'de...","['basedonnovel', 'interracialrelationship', 's...",10,10,"['forestwhitaker', 'forestwhitaker', 'forestwh...",basedonnovel interracialrelationship singlemot...
4,4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,['Comedy'],,11862,tt0113041,en,...,5.7,173.0,1995,"['stevemartin', 'dianekeaton', 'martinshort']","[{'credit_id': '52fe44959251416c75039ed7', 'de...","['babi', 'midlifecrisi', 'confid', 'age', 'dau...",12,7,"['charlesshyer', 'charlesshyer', 'charlesshyer']",babi midlifecrisi confid age daughter motherda...


In [17]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval)
md['genres']

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
46623                 [Drama, Family]
46624                         [Drama]
46625       [Action, Drama, Thriller]
46626                              []
46627                              []
Name: genres, Length: 46628, dtype: object

In [18]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.238696808510638

In [19]:
m = vote_counts.quantile(0.95) #top 5% rank
m

425.0

In [20]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [21]:
qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2335, 6)

In [22]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [23]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [24]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

#### Top Movies

In [25]:
qualified.head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15651,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.919065
12589,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.907551
23076,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.898936
2870,Fight Club,1999,9678,8,63.869599,[Drama],7.883841
4904,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.874042
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.870967
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.866383
7069,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.864345
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.863095
5876,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.854506


In [26]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [27]:
def build_chart(genre, percentile=0.85):
    df = gen_md[gen_md['genre'] == genre]
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

#### Top Romance Movies

In [28]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10397,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,8.572162
351,Forrest Gump,1994,8147,8,48.307194,7.971846
890,Vertigo,1958,1162,8,18.20822,7.814646
41418,Your Name.,2016,1030,8,34.461252,7.792781
897,Some Like It Hot,1959,835,8,11.845107,7.74905
1153,Cinema Paradiso,1988,834,8,14.177005,7.748778
20097,Paperman,2012,734,8,7.198633,7.718252
39032,Sing Street,2016,669,8,10.672862,7.694092
896,The Apartment,1960,498,8,11.994281,7.604977
39887,The Handmaiden,2016,453,8,16.727405,7.57218


### Content Based Filtering:

In [31]:
links_small = pd.read_csv('data/links.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [32]:
md = md.drop([19730, 29503, 35587])

In [33]:
#Check EDA Notebook for how and why I got these indices.
md['id'] = md['id'].astype('int')

In [34]:
smd = md[md['id'].isin(links_small)]
smd.shape


(46625, 34)

#### Movie Based Recommender

In [35]:
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = smd['overview'] + smd['tagline']
smd['description'] = smd['description'].fillna('')

In [36]:
smd.head()

Unnamed: 0.1,Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,...,vote_count,year,cast,crew,keywords,cast_size,crew_size,director,soup,description
0,0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,...,5415.0,1995,"['tomhanks', 'timallen', 'donrickles']","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","['jealousi', 'toy', 'boy', 'friendship', 'frie...",13,106,"['johnlasseter', 'johnlasseter', 'johnlasseter']",jealousi toy boy friendship friend rivalri boy...,"Led by Woody, Andy's toys live happily in his ..."
1,1,1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,...,2413.0,1995,"['robinwilliams', 'jonathanhyde', 'kirstendunst']","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","['boardgam', 'disappear', ""basedonchildren'sbo...",26,16,"['joejohnston', 'joejohnston', 'joejohnston']",boardgam disappear basedonchildren'sbook newho...,When siblings Judy and Peter discover an encha...
2,2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,...,92.0,1995,"['waltermatthau', 'jacklemmon', 'ann-margret']","[{'credit_id': '52fe466a9251416c75077a89', 'de...","['fish', 'bestfriend', 'duringcreditssting', '...",7,4,"['howarddeutch', 'howarddeutch', 'howarddeutch']",fish bestfriend duringcreditssting oldmen walt...,A family wedding reignites the ancient feud be...
3,3,3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,...,34.0,1995,"['whitneyhouston', 'angelabassett', 'lorettade...","[{'credit_id': '52fe44779251416c91011acb', 'de...","['basedonnovel', 'interracialrelationship', 's...",10,10,"['forestwhitaker', 'forestwhitaker', 'forestwh...",basedonnovel interracialrelationship singlemot...,"Cheated on, mistreated and stepped on, the wom..."
4,4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,...,173.0,1995,"['stevemartin', 'dianekeaton', 'martinshort']","[{'credit_id': '52fe44959251416c75039ed7', 'de...","['babi', 'midlifecrisi', 'confid', 'age', 'dau...",12,7,"['charlesshyer', 'charlesshyer', 'charlesshyer']",babi midlifecrisi confid age daughter motherda...,Just when George Banks has recovered from his ...


In [37]:
# smd.to_csv("/Users/jainipatel/Spring2021/MDM/archive/smd.csv")

In [38]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [39]:
tfidf_matrix.shape

(46625, 1104444)

#### Cosine Similarity.

In [40]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# cosine_sim.shape

In [43]:
cosine_df = pd.DataFrame(cosine_sim)

In [44]:
# cosine_df.shape

cosine_df.to_csv('/common/users/am2229/cosine_similarity_matrix_metadata.csv')

In [28]:
cosine_sim[0]
# cosine_sim.shape

array([1.        , 0.00511811, 0.        , ..., 0.        , 0.00236862,
       0.        ])

In [29]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [30]:
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45458
Century of Birthing            45459
Betrayal                       45460
Satan Triumphant               45461
Queerama                       45462
Length: 45463, dtype: int64

In [31]:
def get_recommendations(title):
    idx = indices[title]
    if idx.size > 1:
        idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [32]:
get_recommendations('The Godfather').head(10)

44027    The Godfather Trilogy: 1972-1990
1178               The Godfather: Part II
31971                    Honor Thy Father
21613                          The Family
23125                          Blood Ties
38027            A Mother Should Be Loved
18322                     The Outside Man
11297                    Household Saints
10821                            Election
4324                                 Made
Name: title, dtype: object

In [33]:
get_recommendations('The Dark Knight').head(10)

18252                                The Dark Knight Rises
150                                         Batman Forever
1328                                        Batman Returns
21193    Batman Unmasked: The Psychology of the Dark Kn...
15511                           Batman: Under the Red Hood
20231              Batman: The Dark Knight Returns, Part 2
41973                                The Lego Batman Movie
585                                                 Batman
25266                                    Batman vs Dracula
18035                                     Batman: Year One
Name: title, dtype: object

In [34]:
indices['The Dark Knight']

The Dark Knight    12481
The Dark Knight    28699
dtype: int64

#### Metadata Based Recommender

In [35]:
credits = pd.read_csv('/Users/jainipatel/Spring2021/MDM/archive/credits.csv')
keywords = pd.read_csv('/Users/jainipatel/Spring2021/MDM/archive/keywords.csv')

In [36]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [37]:
md.shape

(45463, 25)

In [38]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [39]:
smd = md[md['id'].isin(links_small)]
smd.shape

(46628, 28)

In [40]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [41]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [42]:
smd['director'] = smd['crew'].apply(get_director)

In [43]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [44]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [45]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [46]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

#### Keywords

In [47]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [48]:
s = s.value_counts()
s[:5]

woman director      3128
independent film    1942
murder              1314
based on novel       841
musical              734
Name: keyword, dtype: int64

In [49]:
s = s[s > 1]

In [50]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [51]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [52]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [53]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [54]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [55]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [56]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [57]:
get_recommendations('The Dark Knight').head(10)

18442    The Dark Knight Rises
10210            Batman Begins
11463             The Prestige
26110                Doodlebug
26111                Doodlebug
2486                 Following
45843                  Dunkirk
5302                  Insomnia
15651                Inception
4126                   Memento
Name: title, dtype: object

In [58]:
get_recommendations('Mean Girls').head(10)

4042                Head Over Heels
6513                  Freaky Friday
1590               The House of Yes
10422              Just Like Heaven
13994    Ghosts of Girlfriends Past
17472         Mr. Popper's Penguins
42333                   Bad Santa 2
23453               Vampire Academy
12524     The Spiderwick Chronicles
27879                      The DUFF
Name: title, dtype: object

#### Popularity and Ratings

In [59]:
def improved_recommendations(title):
    idx = indices[title]
    if idx.size > 1:
        idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [60]:
improved_recommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
15651,Inception,14075,8,2010,7.917588
23076,Interstellar,11187,8,2014,7.897107
11463,The Prestige,4510,8,2006,7.758148
4126,Memento,4168,8,2000,7.740175
18442,The Dark Knight Rises,9263,7,2012,6.921448
10210,Batman Begins,7511,7,2005,6.904127
45843,Dunkirk,2712,7,2017,6.757878
1349,Batman Returns,1706,6,1992,5.846862
31282,Batman v Superman: Dawn of Justice,7189,5,2016,5.013943
1511,Batman & Robin,1447,4,1997,4.287233


In [61]:
improved_recommendations('Mean Girls')

Unnamed: 0,title,vote_count,vote_average,year,wr
1879,The Breakfast Club,2189,7,1985,6.709602
40941,The Edge of Seventeen,952,7,2016,6.450422
27879,The DUFF,1372,6,2015,5.818541
4612,The Princess Diaries,1063,6,2001,5.781086
6513,Freaky Friday,919,6,2003,5.757786
10422,Just Like Heaven,595,6,2005,5.681521
12524,The Spiderwick Chronicles,593,6,2008,5.680901
23453,Vampire Academy,603,5,2014,5.102493
13994,Ghosts of Girlfriends Past,716,5,2009,5.092422
17472,Mr. Popper's Penguins,775,5,2011,5.087912


In [62]:
improved_recommendations('Avatar')

Unnamed: 0,title,vote_count,vote_average,year,wr
1659,Titanic,7770,7,1997,6.907153
21123,Star Trek Into Darkness,4479,7,2013,6.844959
582,Terminator 2: Judgment Day,4274,7,1991,6.838208
1216,The Terminator,4208,7,1984,6.835908
1179,Aliens,3282,7,1986,6.795018
1113,The Abyss,822,7,1989,6.393539
375,True Lies,1138,6,1994,5.79153
1831,Small Soldiers,522,6,1998,5.657202
24091,Jupiter Ascending,2816,5,2015,5.032703
13710,Dragonball Evolution,475,2,2009,3.549269


In [63]:
# pickle.dump(cosine_sim, open('cosine_sim.csv','wb'))

In [64]:
get_recommendations('Avatar').head(15) #cast and keywords

26774                          Avatar 2
9741                 Aliens of the Deep
45149        T2 3-D: Battle Across Time
1179                             Aliens
6220                Ghosts of the Abyss
1216                     The Terminator
582          Terminator 2: Judgment Day
1113                          The Abyss
5696     Piranha Part Two: The Spawning
375                           True Lies
1659                            Titanic
21123           Star Trek Into Darkness
17686                     Almighty Thor
26776                    Justice League
24091                 Jupiter Ascending
Name: title, dtype: object