### Content Based Recommendation System

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from warnings import filterwarnings
filterwarnings('ignore')


### Read the Dataset `movies_metadata.csv`

In [2]:
df = pd.read_csv('movies_metadata.csv')

In [3]:
df.shape

(45466, 24)

In [4]:
df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

In [6]:
df.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

**There are columns which has null values**

In [7]:
df[df['overview'].isna() == True].shape

(954, 24)

In [8]:
df[df['tagline'].isna() == True].shape

(25054, 24)

In [9]:
df[(df['overview'].isna() == True) & (df['tagline'].isna() == True)].shape

(946, 24)

### Inference:
    Individually column 'overview' has null values for 954 datasets & column 'tagline' has null values for 25054 datasets.
    In the given dataset, combinely both 'ovreview' & 'tagline' has null values for 946 datasets

**Handle the null values for 'overview' & 'tagline' with empty space**

In [10]:
df['overview'].replace(np.nan,' ',inplace=True)
df['tagline'].replace(np.nan,' ',inplace=True)

In [11]:
df.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                     0
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                      0
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

### Create a new column with name 'description' combining `'overview' and 'tagline'` columns in the given dataset

In [12]:
df['description'] = df['overview'] + df['tagline']

In [13]:
df['overview'].str.isspace().sum()

959

In [14]:
df['tagline'].str.isspace().sum()

25055

In [15]:
df['description'].str.isspace().sum()

951

In [16]:
df[df['description'].str.isspace()==True].sample(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
31560,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,169069,tt2573226,it,Studio illegale,,...,0.0,0.0,[],Released,,Studio illegale,False,4.3,9.0,
33068,False,,0,[],,154671,tt0157153,sv,Vem älskar Yngve Frej?,,...,0.0,104.0,"[{'iso_639_1': 'sv', 'name': 'svenska'}]",Released,,Vem älskar Yngve Frej?,False,8.0,1.0,
40631,False,,0,[],,73160,tt1205903,pt,Bellini e o Demônio,,...,0.0,120.0,"[{'iso_639_1': 'pt', 'name': 'Português'}]",Released,,Bellini e o Demônio,False,4.0,1.0,
25912,False,,0,[],,58189,tt0110808,it,Perdiamoci di vista!,,...,0.0,0.0,[],Released,,Perdiamoci di vista!,False,6.3,11.0,
19951,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,64362,tt0246278,fr,"Oui, mais...",,...,0.0,104.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,"Oui, mais...",False,6.9,5.0,


### Lets drop the null values in `description` column

Please refer the In[7] - In[16] where the null values are handled at column wise and combined as new column named 'description'

### Keep the first occurance and drop duplicates of each title in column `title`

In [17]:
df['title'].value_counts()

Cinderella                                    11
Hamlet                                         9
Alice in Wonderland                            9
Les Misérables                                 8
Beauty and the Beast                           8
Blackout                                       7
Treasure Island                                7
The Three Musketeers                           7
A Christmas Carol                              7
The Hound of the Baskervilles                  6
The Journey                                    6
Aftermath                                      6
Bluebeard                                      6
The Stranger                                   6
Mother                                         6
Wuthering Heights                              6
King Lear                                      6
The Hunters                                    6
First Love                                     6
Love                                           6
Countdown           

In [18]:
df[df['title']=='Cinderella']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
993,False,"{'id': 55419, 'name': 'Cinderella Collection',...",2900000,"[{'id': 10751, 'name': 'Family'}, {'id': 14, '...",http://movies.disney.com/cinderella-1950,11224,tt0042332,en,Cinderella,Cinderella has faith her dreams of a better li...,...,263591415.0,74.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The greatest love story ever told.,Cinderella,False,6.8,1760.0,Cinderella has faith her dreams of a better li...
13076,False,,0,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 107...",,42884,tt0128996,en,Cinderella,Updated version of the classic Rodgers and Ham...,...,0.0,88.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Cinderella,False,6.1,28.0,Updated version of the classic Rodgers and Ham...
23507,False,,0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",,92349,tt0003772,en,Cinderella,Based on Charles Perrault's fairy tale: Cinder...,...,0.0,52.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Cinderella,False,5.4,7.0,Based on Charles Perrault's fairy tale: Cinder...
23518,False,,0,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",,105875,tt0910852,en,Cinderella,"Cinderella, the beautiful and kind-hearted rag...",...,0.0,48.0,"[{'iso_639_1': 'af', 'name': 'Afrikaans'}]",Released,The version children love!,Cinderella,False,6.0,2.0,"Cinderella, the beautiful and kind-hearted rag..."
28391,False,,0,"[{'id': 10751, 'name': 'Family'}, {'id': 10770...",http://www.betafilm.com/en/product/do/detail.h...,261985,tt1781790,en,Cenerentola,Once upon a time in post-war Rome: 13-year old...,...,0.0,180.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Cinderella,False,5.3,9.0,Once upon a time in post-war Rome: 13-year old...
28664,False,,95000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",http://movies.disney.com/cinderella,150689,tt1661199,en,Cinderella,"When her father unexpectedly passes away, youn...",...,543514353.0,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Midnight is just the beginning.,Cinderella,False,6.7,2426.0,"When her father unexpectedly passes away, youn..."
34254,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",,42651,tt0168269,ru,Zolushka,"Based on a classic fairytale ""Cinderella"" bril...",...,0.0,80.0,"[{'iso_639_1': 'ru', 'name': 'Pусский'}]",Released,,Cinderella,False,4.8,5.0,"Based on a classic fairytale ""Cinderella"" bril..."
35593,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,44459,tt0129672,en,Cinderella,The first of three TV-versions of the classic ...,...,0.0,77.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Cinderella,False,5.9,5.0,The first of three TV-versions of the classic ...
35595,False,,0,[],,289673,tt0218891,en,Cinderella,Cinderella (named Zezolla) and her family live...,...,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Cinderella,False,5.6,5.0,Cinderella (named Zezolla) and her family live...
41074,False,,0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 27, 'na...",,114108,tt0000230,fr,Cendrillon,A fairy godmother magically turns Cinderella's...,...,0.0,5.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]",Released,,Cinderella,False,5.5,19.0,A fairy godmother magically turns Cinderella's...


In [19]:
df.drop_duplicates(subset='title',keep='first',inplace=True)

In [20]:
df['title'].value_counts()

Who Is Harry Nilsson (And Why Is Everybody Talkin' About Him?)            1
Ironclad                                                                  1
Larceny Inc.                                                              1
Dark Tide                                                                 1
Intolerable Cruelty                                                       1
Wife                                                                      1
September Eleven 1683                                                     1
Big Fish & Begonia                                                        1
Van Diemen's Land                                                         1
The Other Woman                                                           1
Ricky Gervais: Out of England 2 - The Stand-Up Special                    1
Swept Under                                                               1
The Aristocrats                                                           1
Everything Y

### As we might have dropped a few rows with duplicate `title` in above step, just reset the index [make sure you are not adding any new column to the dataframe while doing reset index]

In [21]:
# After dropping the duplicates the dataset shape is
df.shape

(42278, 25)

In [22]:

df.tail(3)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0,An artist struggles to finish his work while a...
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0,"In a small town live two brothers, one a minis..."
45465,False,,0,[],,461257,tt6980792,en,Queerama,50 years after decriminalisation of homosexual...,...,0.0,75.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Queerama,False,0.0,0.0,50 years after decriminalisation of homosexual...


In [23]:
df.reset_index(inplace=True)

In [24]:
df.tail(3)

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
42275,45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,...,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0,An artist struggles to finish his work while a...
42276,45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,...,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0,"In a small town live two brothers, one a minis..."
42277,45465,False,,0,[],,461257,tt6980792,en,Queerama,...,0.0,75.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Queerama,False,0.0,0.0,50 years after decriminalisation of homosexual...


### Based on genre, movies count in the given data-set

In [25]:
df['genres'].value_counts().head()

[{'id': 18, 'name': 'Drama'}]                                      4565
[{'id': 35, 'name': 'Comedy'}]                                     3465
[{'id': 99, 'name': 'Documentary'}]                                2626
[]                                                                 2282
[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]    1191
Name: genres, dtype: int64

### Top 10 movies based on 'vote count'

In [26]:
df.sort_values(by='vote_count',ascending=False).head(10).title

14898                  Inception
12075            The Dark Knight
14015                     Avatar
25066                   Deadpool
21743               Interstellar
19146           Django Unchained
22528    Guardians of the Galaxy
2814                  Fight Club
17463           The Hunger Games
25057         Mad Max: Fury Road
Name: title, dtype: object

### Movies for people whose age are 18+

In [27]:
df['adult'].value_counts()

False    42269
True         9
Name: adult, dtype: int64

In [28]:
df[df['adult'] == 'True'].title

18618    Erotic Nights of the Living Dead
26989                            Standoff
29963                     Electrical Girl
30130                         Diet of Sex
37208          Amateur Porn Star Killer 2
37209                            The Band
37820                    The Sinful Dwarf
38207                          Adulterers
40126                          Half -Life
Name: title, dtype: object

In [29]:
df['spoken_languages'].value_counts()

[{'iso_639_1': 'en', 'name': 'English'}]                                                                                                                                                                                                         20852
[]                                                                                                                                                                                                                                                3575
[{'iso_639_1': 'fr', 'name': 'Français'}]                                                                                                                                                                                                         1698
[{'iso_639_1': 'ja', 'name': '日本語'}]                                                                                                                                                                                                              1236
[{'iso_639_1

In [30]:
df['original_language'].value_counts()

en    30064
fr     2242
it     1450
ja     1290
de     1008
es      928
ru      726
hi      483
ko      395
zh      373
sv      352
pt      294
cn      289
fi      278
nl      221
da      204
pl      203
tr      136
cs      118
el      103
fa       94
hu       92
no       91
ta       77
th       66
he       60
sr       59
ro       50
te       43
ml       33
      ...  
kn        3
ky        3
ku        3
eu        3
am        2
lo        2
ne        2
iu        2
bo        2
ps        2
mn        2
af        2
pa        2
ay        1
hy        1
fy        1
la        1
lb        1
sm        1
jv        1
mt        1
eo        1
gl        1
zu        1
qu        1
uz        1
si        1
cy        1
tg        1
rw        1
Name: original_language, Length: 89, dtype: int64

### Reasoning for two different columns

In [31]:
df[(df['original_title'] == df['title']) == False].shape

(10478, 26)

In [32]:
df[(df['original_title'] == df['title']) == False].sample(3)

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,description
23560,24910,False,,0,"[{'id': 18, 'name': 'Drama'}]",,78399,tt0195232,fr,Le Sauveur,...,0.0,90.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,The Savior,False,5.0,3.0,"This is rural France. It's the summer of 1943,..."
2458,2479,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,22826,tt0110570,it,Il mostro,...,0.0,112.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,,The Monster,False,6.8,126.0,"A vicious serial sex killer is on the loose, a..."
14632,15203,False,,0,"[{'id': 18, 'name': 'Drama'}]",,31345,tt0065695,fr,L'Enfance nue,...,0.0,83.0,"[{'iso_639_1': 'fr', 'name': 'Français'}]",Released,,Naked Childhood,False,6.4,15.0,Handed over to foster care by his mother—who's...


### Inference:
    
    It can be infered as the name suggest, Original title refer to it's country production and tiltle can be interpreted as tranlated movie into english language

### Top 10 movies w.r.t revenue

In [33]:
df.sort_values(by='revenue',ascending=False).head(10).title

14015                                          Avatar
25059                    Star Wars: The Force Awakens
1635                                          Titanic
23721                                  Jurassic World
27105                                       Furious 7
25061                         Avengers: Age of Ultron
16721    Harry Potter and the Deathly Hallows: Part 2
40277                         The Fate of the Furious
19861                                      Iron Man 3
28824                                         Minions
Name: title, dtype: object

In [34]:
df.columns

Index(['index', 'adult', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'description'],
      dtype='object')

### From the above analysis, create a new data-frame with required columns

* assume the columns 'title','description','original_language','genres', shall be helpful in recommneding a movies.
* also, as in the below questions it can be inferred that the recommendation is based on the description. Hence except 'description columns dropping other columns/features

In [35]:
movie_df = df.copy()

In [36]:
movie_df = movie_df[['title','description']]
movie_df.head()

Unnamed: 0,title,description
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...


### Generate tf-idf matrix using the column `description`. Consider till 3-grams, with minimum document frequency as 0.

In [37]:
tf_idf_vect = TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english', min_df = 0.0)


In [38]:
tf_idf_vect.fit(movie_df['description'])
matrix = tf_idf_vect.transform(movie_df["description"])

In [39]:
matrix.shape

(42278, 2237328)

In [40]:
tf_idf_vect.vocabulary_

{'led': 1130844,
 'woody': 2185240,
 'andy': 86970,
 'toys': 2017366,
 'live': 1167418,
 'happily': 880214,
 'room': 1688716,
 'birthday': 198754,
 'brings': 239773,
 'buzz': 263314,
 'lightyear': 1156597,
 'scene': 1722397,
 'afraid': 51874,
 'losing': 1193587,
 'place': 1501681,
 'heart': 895505,
 'plots': 1516039,
 'circumstances': 336992,
 'separate': 1757333,
 'owner': 1442310,
 'duo': 581768,
 'eventually': 642917,
 'learns': 1126425,
 'aside': 119481,
 'differences': 522730,
 'led woody': 1131684,
 'woody andy': 2185258,
 'andy toys': 87192,
 'toys live': 2017434,
 'live happily': 1168306,
 'happily room': 880366,
 'room andy': 1688764,
 'andy birthday': 86989,
 'birthday brings': 198792,
 'brings buzz': 239857,
 'buzz lightyear': 263344,
 'lightyear scene': 1156600,
 'scene afraid': 1722434,
 'afraid losing': 51970,
 'losing place': 1193873,
 'place andy': 1501787,
 'andy heart': 87076,
 'heart woody': 896706,
 'woody plots': 2185288,
 'plots buzz': 1516049,
 'buzz circumstance

In [41]:
# Find vocabulary
features = tf_idf_vect.get_feature_names()
features

['00',
 '00 agent',
 '00 agent dead',
 '00 body',
 '00 body vanish',
 '00 editor',
 '00 editor jabez',
 '00 foot',
 '00 foot tall',
 '00 furnish',
 '00 furnish ascent',
 '00 joey',
 '00 joey stabbed',
 '00 middle',
 '00 middle dance',
 '00 movie',
 '00 movie andre',
 '00 pm',
 '00 pm august',
 '00 pm clock',
 '00 rescue',
 '00 rescue dr',
 '00 schneider',
 '00 schneider asked',
 '000',
 '000 00',
 '000 00 editor',
 '000 000',
 '000 000 000',
 '000 000 american',
 '000 000 bloody',
 '000 000 brazilian',
 '000 000 cash',
 '000 000 citizens',
 '000 000 debt',
 '000 000 dollars',
 '000 000 fmk',
 '000 000 gambling',
 '000 000 instead',
 '000 000 instructions',
 '000 000 kroner',
 '000 000 make',
 '000 000 people',
 '000 000 pounds',
 '000 000 rebuild',
 '000 000 stashed',
 '000 000 stolen',
 '000 000 tournament',
 '000 000 worth',
 '000 000 years',
 '000 24',
 '000 24 hours',
 '000 30',
 '000 30 000',
 '000 50',
 '000 50 000',
 '000 accept',
 '000 accept denard',
 '000 additional',
 '000 a

### create cosine similarity matrix

In [42]:
cosine_sim = cosine_similarity(matrix)

In [43]:
cosine_sim.shape

(42278, 42278)

### Write a function with name `recommend` which takes `title` as argument and returns a list of 10 recommended title names in the output based on the above cosine similarities

In [44]:
# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use later to match the indexes
indices = pd.Series(movie_df['title'])
indices[:5]

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [45]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommend(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    print(top_10_indexes)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(indices)[i])
        
    return recommended_movies

### Give the recommendations from above functions for movies `The Godfather` and `The Dark Knight Rises`

In [46]:
recommend('The Godfather')

[40983, 1176, 29998, 21970, 11608, 35503, 17533, 10957, 4278, 29]


['The Godfather Trilogy: 1972-1990',
 'The Godfather: Part II',
 'Honor Thy Father',
 'Blood Ties',
 'The Cave of the Yellow Dog',
 'A Mother Should Be Loved',
 'The Outside Man',
 'Household Saints',
 'Made',
 'Shanghai Triad']

In [47]:
indices[indices == 'The Godfather']

834    The Godfather
Name: title, dtype: object

In [48]:
pd.Series(cosine_sim[indices[indices == 'The Godfather'].index[0]]).sort_values(ascending=False).head(10)

834      1.000000
40983    0.174265
1176     0.147131
29998    0.095468
21970    0.050380
11608    0.045011
35503    0.043921
17533    0.042806
10957    0.038974
4278     0.034122
dtype: float64

In [49]:
recommend('The Dark Knight Rises')

[12075, 150, 1325, 3060, 585, 20389, 14924, 8993, 23890, 20195]


['The Dark Knight',
 'Batman Forever',
 'Batman Returns',
 'Batman: Mask of the Phantasm',
 'Batman',
 'Batman: Mystery of the Batwoman',
 'Batman: Under the Red Hood',
 'Batman Beyond: Return of the Joker',
 'Batman vs Dracula',
 'Batman Unmasked: The Psychology of the Dark Knight']