# Content-based Recommendation

### Jaccard Similarity

In [1]:
import pandas as pd

In [2]:
# Load the movie catalogue; the preview below shows a movieId, a title and a
# pipe-separated genres string for each row.
movie = pd.read_csv('movies.csv')

In [3]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### 1. Preprocess the raw data

In [8]:
mv = movie['genres']

In [9]:
type(mv)

pandas.core.series.Series

In [10]:
mv

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

In [11]:
mv.str.split('|', expand = True) #expand: expand the split strings into separate columns

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,Adventure,Children,Fantasy,,,,,,,
2,Comedy,Romance,,,,,,,,
3,Comedy,Drama,Romance,,,,,,,
4,Comedy,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
9737,Action,Animation,Comedy,Fantasy,,,,,,
9738,Animation,Comedy,Fantasy,,,,,,,
9739,Drama,,,,,,,,,
9740,Action,Animation,,,,,,,,


In [12]:
genres = mv.str.split('|', expand = True)

In [13]:
genres.index # get the index of genres

RangeIndex(start=0, stop=9742, step=1)

In [14]:
movie['title'] #get the title from movie dataframe

0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object

- replace the index:

In [15]:
genres.index = movie['title'] #assign the index of genres equal to the title of movie dataframe

In [16]:
genres

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Toy Story (1995),Adventure,Animation,Children,Comedy,Fantasy,,,,,
Jumanji (1995),Adventure,Children,Fantasy,,,,,,,
Grumpier Old Men (1995),Comedy,Romance,,,,,,,,
Waiting to Exhale (1995),Comedy,Drama,Romance,,,,,,,
Father of the Bride Part II (1995),Comedy,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),Action,Animation,Comedy,Fantasy,,,,,,
No Game No Life: Zero (2017),Animation,Comedy,Fantasy,,,,,,,
Flint (2017),Drama,,,,,,,,,
Bungo Stray Dogs: Dead Apple (2018),Action,Animation,,,,,,,,


- stack

In [17]:
# Pivot the genre columns into a second index level: the result is a long Series
# with one entry per (title, column position) pair. The length shown below (22084)
# is far smaller than 9742 x 10, i.e. the empty cells do not appear in the result.
genres.stack()

title                                 
Toy Story (1995)                     0    Adventure
                                     1    Animation
                                     2     Children
                                     3       Comedy
                                     4      Fantasy
                                            ...    
No Game No Life: Zero (2017)         2      Fantasy
Flint (2017)                         0        Drama
Bungo Stray Dogs: Dead Apple (2018)  0       Action
                                     1    Animation
Andrew Dice Clay: Dice Rules (1991)  0       Comedy
Length: 22084, dtype: object

In [23]:
# Keep the stacked (long-format) Series; its two-level index is inspected in the next cell
genres = genres.stack()

In [24]:
genres.index # It contains multiple index, two level of index

MultiIndex([(                         'Toy Story (1995)', 0),
            (                         'Toy Story (1995)', 1),
            (                         'Toy Story (1995)', 2),
            (                         'Toy Story (1995)', 3),
            (                         'Toy Story (1995)', 4),
            (                           'Jumanji (1995)', 0),
            (                           'Jumanji (1995)', 1),
            (                           'Jumanji (1995)', 2),
            (                  'Grumpier Old Men (1995)', 0),
            (                  'Grumpier Old Men (1995)', 1),
            ...
            ('Black Butler: Book of the Atlantic (2017)', 1),
            ('Black Butler: Book of the Atlantic (2017)', 2),
            ('Black Butler: Book of the Atlantic (2017)', 3),
            (             'No Game No Life: Zero (2017)', 0),
            (             'No Game No Life: Zero (2017)', 1),
            (             'No Game No Life: Zero (2017

In [25]:
genres

title                                 
Toy Story (1995)                     0    Adventure
                                     1    Animation
                                     2     Children
                                     3       Comedy
                                     4      Fantasy
                                            ...    
No Game No Life: Zero (2017)         2      Fantasy
Flint (2017)                         0        Drama
Bungo Stray Dogs: Dead Apple (2018)  0       Action
                                     1    Animation
Andrew Dice Clay: Dice Rules (1991)  0       Comedy
Length: 22084, dtype: object

- remove second index

In [28]:
# Drop the second index level (the column position) so that only the title remains.
# NOTE(review): this mutates genres in place; re-running the cell once the level is
# gone presumably fails because no second level is left -- confirm, and re-create
# genres first if the cell has to be run again.
genres.index = genres.index.droplevel(1)

In [29]:
genres

title
Toy Story (1995)                       Adventure
Toy Story (1995)                       Animation
Toy Story (1995)                        Children
Toy Story (1995)                          Comedy
Toy Story (1995)                         Fantasy
                                         ...    
No Game No Life: Zero (2017)             Fantasy
Flint (2017)                               Drama
Bungo Stray Dogs: Dead Apple (2018)       Action
Bungo Stray Dogs: Dead Apple (2018)    Animation
Andrew Dice Clay: Dice Rules (1991)       Comedy
Length: 22084, dtype: object

- convert to DataFrame

In [30]:
# Wrap the Series in a one-column DataFrame (the column is labelled 0).
# Note: this rebinds `movie`, so the table loaded from movies.csv is no longer
# reachable under that name; earlier cells that read movie['genres'] or
# movie['title'] need the CSV to be reloaded before they can be re-run.
movie = pd.DataFrame(genres)

In [31]:
movie

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Toy Story (1995),Adventure
Toy Story (1995),Animation
Toy Story (1995),Children
Toy Story (1995),Comedy
Toy Story (1995),Fantasy
...,...
No Game No Life: Zero (2017),Fantasy
Flint (2017),Drama
Bungo Stray Dogs: Dead Apple (2018),Action
Bungo Stray Dogs: Dead Apple (2018),Animation


- reset index and rename column

In [32]:
movie = movie.reset_index()
movie

Unnamed: 0,title,0
0,Toy Story (1995),Adventure
1,Toy Story (1995),Animation
2,Toy Story (1995),Children
3,Toy Story (1995),Comedy
4,Toy Story (1995),Fantasy
...,...,...
22079,No Game No Life: Zero (2017),Fantasy
22080,Flint (2017),Drama
22081,Bungo Stray Dogs: Dead Apple (2018),Action
22082,Bungo Stray Dogs: Dead Apple (2018),Animation


In [34]:
movie = movie.rename(columns = {0: 'Genre'})

In [35]:
movie

Unnamed: 0,title,Genre
0,Toy Story (1995),Adventure
1,Toy Story (1995),Animation
2,Toy Story (1995),Children
3,Toy Story (1995),Comedy
4,Toy Story (1995),Fantasy
...,...,...
22079,No Game No Life: Zero (2017),Fantasy
22080,Flint (2017),Drama
22081,Bungo Stray Dogs: Dead Apple (2018),Action
22082,Bungo Stray Dogs: Dead Apple (2018),Animation


- reshape data

In [36]:
# Cross-tabulate titles against genres: one row per distinct title, one column per
# genre, each cell holding the number of (title, genre) rows in the long table.
movie_final = pd.crosstab(movie['title'], movie['Genre'])

In [37]:
movie_final

Genre,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
'71 (2014),0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
'Hellboy': The Seeds of Creation (2004),0,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
'Round Midnight (1986),0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
'Salem's Lot (2004),0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
'Til There Was You (1997),0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
xXx (2002),0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
xXx: State of the Union (2005),0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
¡Three Amigos! (1986),0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


- remove the '(no genres listed)' movies

In [38]:
# Rows flagged '(no genres listed)' carry no genre information, so they cannot
# contribute to a genre-based similarity and should be removed.
movie_final.loc[movie_final['(no genres listed)'] == 1]

Genre,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
A Christmas Story Live! (2017),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A Cosmic Christmas (1977),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
A Midsummer Night's Dream (2016),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Ali Wong: Baby Cobra (2016),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Ben-hur (2016),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Black Mirror,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cosmos,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Death Note: Desu nôto (2006–2007),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Ethel & Ernest (2016),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Generation Iron 2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
movie.loc[movie['Genre'] == '(no genres listed)']

Unnamed: 0,title,Genre
19492,La cravate (1957),(no genres listed)
19881,Ben-hur (2016),(no genres listed)
19888,Pirates of the Caribbean: Dead Men Tell No Tal...,(no genres listed)
20100,Superfast! (2015),(no genres listed)
20229,Let It Be Me (1995),(no genres listed)
20373,Trevor Noah: African American (2013),(no genres listed)
20664,Guardians (2016),(no genres listed)
20699,Green Room (2015),(no genres listed)
20733,The Brand New Testament (2015),(no genres listed)
20772,Hyena Road,(no genres listed)


In [40]:
movie_keep = movie.loc[movie['Genre'] != '(no genres listed)']

In [41]:
movie_final = pd.crosstab(movie_keep['title'], movie_keep['Genre'])

In [42]:
movie_final

Genre,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
'71 (2014),1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
'Hellboy': The Seeds of Creation (2004),1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
'Round Midnight (1986),0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
'Salem's Lot (2004),0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
'Til There Was You (1997),0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
xXx (2002),1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
xXx: State of the Union (2005),1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
¡Three Amigos! (1986),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


#### 2. Make Content-based Recommendations – Jaccard Similarity

In [43]:
# Import numpy and the Jaccard similarity function
import numpy as np
from sklearn.metrics import jaccard_score

# Import functions from scipy
from scipy.spatial.distance import pdist, squareform

- Calculate all pairwise distances

In [44]:
# pdist() (pairwise distance) returns the condensed vector of Jaccard *distances*
# between every pair of rows; similarity is obtained later as 1 - distance.
# movie_final comes from a crosstab, i.e. it holds counts: a title that occurs more
# than once in the data can produce entries greater than 1. Jaccard is defined on
# presence/absence, so cast to bool to make every genre a plain flag (0/1 data is
# unaffected by the cast).
jaccard_distances = pdist(movie_final.values.astype(bool), metric = 'jaccard')

In [45]:
jaccard_distances

array([0.875     , 0.8       , 0.66666667, ..., 1.        , 1.        ,
       0.66666667])

In [46]:
jaccard_distances.shape

(47069253,)

- Convert the distances to a square matrix

In [47]:
# squareform() expands the condensed distance vector into a square matrix
# (shape shown below: 9703 x 9703); subtracting from 1 turns distances into similarities.
jaccard_similarity_array = 1 - squareform(jaccard_distances)

In [48]:
jaccard_similarity_array.shape

(9703, 9703)

- Wrap the array in a pandas DataFrame

In [49]:
# Label rows and columns with the movie titles so similarities can be looked up by name
jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array, index=movie_final.index, columns=movie_final.index)

In [50]:
jaccard_similarity_df

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,0.125000,0.200000,0.333333,0.200000,0.0,0.0,0.25,0.166667,0.000000,...,0.40,0.40,0.20,0.20,0.200000,0.400000,0.400000,0.400000,0.000000,0.000000
'Hellboy': The Seeds of Creation (2004),0.125000,1.000000,0.000000,0.000000,0.000000,0.0,0.2,0.00,0.142857,0.285714,...,0.00,0.00,0.00,0.00,0.000000,0.142857,0.142857,0.142857,0.166667,0.166667
'Round Midnight (1986),0.200000,0.000000,1.000000,0.200000,0.333333,0.0,0.0,0.50,0.250000,0.000000,...,0.25,0.25,0.00,0.00,0.333333,0.000000,0.000000,0.000000,0.000000,0.333333
'Salem's Lot (2004),0.333333,0.000000,0.200000,1.000000,0.200000,0.0,0.0,0.25,0.166667,0.000000,...,0.40,0.75,0.50,0.50,0.200000,0.166667,0.166667,0.166667,0.000000,0.000000
'Til There Was You (1997),0.200000,0.000000,0.333333,0.200000,1.000000,0.5,0.0,0.50,0.666667,0.000000,...,0.25,0.25,0.00,0.00,0.333333,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.400000,0.142857,0.000000,0.166667,0.000000,0.0,0.0,0.00,0.000000,0.166667,...,0.20,0.20,0.25,0.25,0.000000,1.000000,0.500000,0.500000,0.000000,0.000000
xXx (2002),0.400000,0.142857,0.000000,0.166667,0.000000,0.0,0.0,0.00,0.000000,0.000000,...,0.50,0.20,0.25,0.25,0.000000,0.500000,1.000000,1.000000,0.000000,0.000000
xXx: State of the Union (2005),0.400000,0.142857,0.000000,0.166667,0.000000,0.0,0.0,0.00,0.000000,0.000000,...,0.50,0.20,0.25,0.25,0.000000,0.500000,1.000000,1.000000,0.000000,0.000000
¡Three Amigos! (1986),0.000000,0.166667,0.000000,0.000000,0.000000,0.0,0.5,0.00,0.250000,0.200000,...,0.00,0.00,0.00,0.00,0.000000,0.000000,0.000000,0.000000,1.000000,0.333333


- Make a recommendation

In [51]:
# Find the values for the movie Thor
jaccard_similarity_series = jaccard_similarity_df.loc['Thor (2011)']

In [52]:
jaccard_similarity_series

title
'71 (2014)                                   0.285714
'Hellboy': The Seeds of Creation (2004)      0.428571
'Round Midnight (1986)                       0.166667
'Salem's Lot (2004)                          0.125000
'Til There Was You (1997)                    0.166667
                                               ...   
eXistenZ (1999)                              0.142857
xXx (2002)                                   0.142857
xXx: State of the Union (2005)               0.142857
¡Three Amigos! (1986)                        0.000000
À nous la liberté (Freedom for Us) (1931)    0.000000
Name: Thor (2011), Length: 9703, dtype: float64

In [53]:
# Sort these values from highest to lowest. The first entry is the query movie
# itself (similarity 1.0), so the actual recommendations start at the second row.
ordered_similarities = jaccard_similarity_series.sort_values(ascending = False)

In [54]:
ordered_similarities

title
Thor (2011)                                            1.000000
Harry Potter and the Deathly Hallows: Part 2 (2011)    0.833333
Harry Potter and the Order of the Phoenix (2007)       0.800000
Oz the Great and Powerful (2013)                       0.800000
Harry Potter and the Deathly Hallows: Part 1 (2010)    0.800000
                                                         ...   
La Belle Verte (1996)                                  0.000000
La vérité si je mens ! (1997)                          0.000000
Ladies Man, The (2000)                                 0.000000
Lady Bird (2017)                                       0.000000
L.A. Story (1991)                                      0.000000
Name: Thor (2011), Length: 9703, dtype: float64

### Text-based Similarities

#### 1. Clean the data

In [55]:
# Load Movies Metadata.
# low_memory=False makes pandas infer each column's dtype from the whole file in one
# pass instead of chunk by chunk.
# NOTE(review): the truncated stderr line that followed this cell looks like the tail
# of a pandas DtypeWarning (mixed types in a column) -- confirm it disappears.
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [56]:
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [57]:
metadata[['original_title', 'overview']]

Unnamed: 0,original_title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...
45461,رگ خواب,Rising and falling between a man and woman.
45462,Siglo ng Pagluluwal,An artist struggles to finish his work while a...
45463,Betrayal,"When one of her hits goes wrong, a professiona..."
45464,Satana likuyushchiy,"In a small town live two brothers, one a minis..."


In [59]:
df_plots = metadata[['original_title', 'overview']].head(100) # Only calculate for 100 samples

In [60]:
# Drop rows with a missing value in either column (dropna() with no arguments checks
# original_title as well as overview); 99 of the 100 sampled rows remain.
df_plots = df_plots.dropna()
df_plots

Unnamed: 0,original_title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...
95,La Haine,Aimlessly whiling away their days in the concr...
96,Shopping,"A dark, hip, urban story of a barren and anony..."
97,Heidi Fleiss: Hollywood Madam,A documentary crew from the BBC arrives in L.A...
98,City Hall,The accidental shooting of a boy in New York l...


#### 2. Create the TF-IDF DataFrame

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer with default settings.
# NOTE: this object is replaced by the filtered vectorizer created in the next cell
# before it is ever fitted.
vectorizer = TfidfVectorizer()

- Filter the data

In [62]:
# max_df = 0.7: ignore terms that occur in more than 70% of the overviews (very common words such as "the")
# min_df = 2: ignore terms that occur in fewer than two overviews (too rare to link movies together)
vectorizer = TfidfVectorizer(max_df = 0.7, min_df = 2) 

- Vectorize the data

In [63]:
vectorized_data = vectorizer.fit_transform(df_plots['overview']) 

- Format the data to a DataFrame

In [64]:
# Create a DataFrame from the TF-IDF array, with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only when
# the installed version does not provide the new one.
tfidf_df = pd.DataFrame(
    vectorized_data.toarray(),
    columns=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, 'get_feature_names_out')
        else vectorizer.get_feature_names()
    ),
)

In [65]:
tfidf_df

Unnamed: 0,12,1930s,able,about,accompanies,action,actress,adventure,african,after,...,works,world,would,writing,year,years,yet,york,young,younger
0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
1,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.120011,0.0,0.0,0.00000,0.166538,0.000000,0.000000,0.000000,0.0
2,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
3,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
4,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.245526,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
95,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
96,0.0,0.0,0.0,0.08471,0.0,0.0,0.0,0.0,0.000000,0.080034,...,0.0,0.000000,0.0,0.0,0.07608,0.000000,0.000000,0.000000,0.080034,0.0
97,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.377727,0.000000,0.0


In [66]:
# Assign the movie titles to the index and inspect
tfidf_df.index = df_plots['original_title']
tfidf_df.head()

Unnamed: 0_level_0,12,1930s,able,about,accompanies,action,actress,adventure,african,after,...,works,world,would,writing,year,years,yet,york,young,younger
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.120011,0.0,0.0,0.0,0.166538,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
tfidf_df

Unnamed: 0_level_0,12,1930s,able,about,accompanies,action,actress,adventure,african,after,...,works,world,would,writing,year,years,yet,york,young,younger
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
Jumanji,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.120011,0.0,0.0,0.00000,0.166538,0.000000,0.000000,0.000000,0.0
Grumpier Old Men,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
Waiting to Exhale,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
La Haine,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.245526,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
Shopping,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
Heidi Fleiss: Hollywood Madam,0.0,0.0,0.0,0.08471,0.0,0.0,0.0,0.0,0.000000,0.080034,...,0.0,0.000000,0.0,0.0,0.07608,0.000000,0.000000,0.000000,0.080034,0.0
City Hall,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.00000,0.000000,0.000000,0.377727,0.000000,0.0


#### 3. Calculate Cosine Similarity and make recommendations

In [68]:
# Import cosine_similarity measure
from sklearn.metrics.pairwise import cosine_similarity

In [69]:
# Create the array of cosine similarity values
cosine_similarity_array = cosine_similarity(tfidf_df)

In [70]:
# Wrap the array in a pandas DataFrame
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index = tfidf_df.index, columns = tfidf_df.index)

In [71]:
cosine_similarity_df

original_title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Mary Reilly,Vampire in Brooklyn,Beautiful Girls,Broken Arrow,A Midwinter's Tale,La Haine,Shopping,Heidi Fleiss: Hollywood Madam,City Hall,Bottle Rocket
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,1.000000,0.083435,0.033010,0.042938,0.108803,0.053493,0.040938,0.016796,0.070338,0.043062,...,0.053632,0.144492,0.069437,0.084063,0.111870,0.213796,0.039431,0.044669,0.055772,0.060196
Jumanji,0.083435,1.000000,0.089436,0.081010,0.074653,0.169403,0.051432,0.025393,0.241365,0.023399,...,0.000000,0.018047,0.026605,0.092593,0.095535,0.071925,0.218799,0.100770,0.039802,0.077979
Grumpier Old Men,0.033010,0.089436,1.000000,0.080034,0.116561,0.010122,0.017476,0.068761,0.033146,0.000000,...,0.036639,0.130231,0.098723,0.055085,0.056003,0.073173,0.102159,0.116453,0.016807,0.079741
Waiting to Exhale,0.042938,0.081010,0.080034,1.000000,0.050331,0.114753,0.044514,0.039893,0.004369,0.062590,...,0.000000,0.065712,0.128644,0.042261,0.150829,0.101080,0.098775,0.063312,0.010688,0.099695
Father of the Bride Part II,0.108803,0.074653,0.116561,0.050331,1.000000,0.037094,0.122301,0.048021,0.104825,0.066489,...,0.044239,0.109381,0.052696,0.069120,0.170679,0.068189,0.046965,0.098946,0.017069,0.118322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
La Haine,0.213796,0.071925,0.073173,0.101080,0.068189,0.116398,0.046821,0.017485,0.046318,0.010526,...,0.011297,0.022999,0.103334,0.010402,0.091391,1.000000,0.069927,0.091275,0.047245,0.014804
Shopping,0.039431,0.218799,0.102159,0.098775,0.046965,0.085966,0.215632,0.089433,0.149161,0.027934,...,0.070443,0.061118,0.043051,0.094899,0.136448,0.069927,1.000000,0.086151,0.059287,0.044796
Heidi Fleiss: Hollywood Madam,0.044669,0.100770,0.116453,0.063312,0.098946,0.159031,0.209288,0.051196,0.037264,0.026358,...,0.014979,0.054320,0.034086,0.070174,0.082610,0.091275,0.086151,1.000000,0.004656,0.034500
City Hall,0.055772,0.039802,0.016807,0.010688,0.017069,0.116465,0.035682,0.155283,0.058818,0.018264,...,0.019602,0.039907,0.036190,0.018049,0.054066,0.047245,0.059287,0.004656,1.000000,0.123509


In [74]:
cosine_similarity_df.describe()

original_title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Mary Reilly,Vampire in Brooklyn,Beautiful Girls,Broken Arrow,A Midwinter's Tale,La Haine,Shopping,Heidi Fleiss: Hollywood Madam,City Hall,Bottle Rocket
count,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,0.077779,0.074446,0.06896,0.069546,0.110514,0.069556,0.069819,0.063056,0.068759,0.049852,...,0.059755,0.092694,0.066591,0.07209,0.117103,0.06117,0.090484,0.088757,0.059823,0.074207
std,0.100788,0.104219,0.105361,0.105096,0.109854,0.10784,0.105882,0.104631,0.103754,0.106541,...,0.107977,0.102954,0.103596,0.101182,0.10422,0.103906,0.103288,0.105578,0.107247,0.104357
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.039466,0.037814,0.028011,0.024097,0.05734,0.018226,0.029413,0.023068,0.025954,0.009533,...,0.019074,0.050642,0.025878,0.034678,0.068206,0.021004,0.046105,0.046161,0.017156,0.029294
50%,0.069437,0.055163,0.052021,0.051941,0.098946,0.048165,0.050375,0.044586,0.054027,0.027934,...,0.037914,0.085851,0.048348,0.059738,0.106713,0.043918,0.073963,0.07198,0.039727,0.05767
75%,0.090662,0.083085,0.07996,0.090452,0.130698,0.092945,0.084113,0.076251,0.077762,0.056631,...,0.060558,0.110367,0.079672,0.084821,0.143956,0.07637,0.110476,0.109014,0.060616,0.095423
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


- Make recommendation

In [72]:
# Find the values for the movie Toy Story
cosine_similarity_series = cosine_similarity_df.loc['Toy Story']

In [73]:
# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending = False)
ordered_similarities

original_title
Toy Story                      1.000000
La Haine                       0.213796
Last Summer in the Hamptons    0.154279
Vampire in Brooklyn            0.144492
The Indian in the Cupboard     0.137623
                                 ...   
Casino                         0.010277
بادکنک سفید                    0.008088
Richard III                    0.000000
White Squall                   0.000000
When Night Is Falling          0.000000
Name: Toy Story, Length: 99, dtype: float64

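This recommendation system is not a successful one: apart from Toy Story itself, the highest cosine similarity in the list above is only about 0.21.
One probable reason is the movie overviews themselves, which are not good (informative) enough to produce meaningful matches.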