In [97]:
#Content-based Movie Recommendation System using cosine similarity
#dataset is taken from https://www.kaggle.com/carolzhangdc/imdb-5000-movie-dataset/notebooks

In [98]:
#importing libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer         # Convert a collection of text documents to a matrix of token counts.
from sklearn.metrics.pairwise import cosine_similarity
import difflib 

In [99]:
#defining fuction to get movie from index
def get_movie_from_index(index):
    return df[df.index==index].movie_title.values[0]
    #return df.loc[index, 'movie_title']

In [100]:
#defining fuction to get index from movie
def get_index_from_movie(movie):
    movie_list = df['movie_title'].tolist()
    close_matches = difflib.get_close_matches(movie, movie_list, n=1)
    closest_movie = close_matches[0]
    return df[df.movie_title == closest_movie].index.values[0]

In [101]:
# Creating Dataframe of the dataset
df = pd.read_csv("movie_metadata.csv", encoding = 'utf-8')

In [102]:
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [103]:
df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [104]:
df.movie_title[0]

'Avatar\xa0'

In [105]:
#handlinf \xa0
#since movie tile is suffixed with '\xa0', we need to remove it. \xa0 is actually non-breaking space in Latin1 (ISO 8859-1).
df.replace(u'\xa0',u'', regex =True , inplace=True)
df.movie_title[0]

'Avatar'

In [106]:
df['plot_keywords'].head()

0               avatar|future|marine|native|paraplegic
1    goddess|marriage ceremony|marriage proposal|pi...
2                  bomb|espionage|sequel|spy|terrorist
3    deception|imprisonment|lawlessness|police offi...
4                                                  NaN
Name: plot_keywords, dtype: object

In [107]:
# Taking features to be used.
features = ['director_name', 'genres', 'language', 'plot_keywords' ]

In [108]:
# Checking for nan value.
df[features].isnull().values.any()

True

In [109]:
df[features].isnull().sum()

director_name    104
genres             0
language          12
plot_keywords    153
dtype: int64

In [110]:
# Handling nan values.
df[features] = df[features].fillna('')
df[features].isnull().values.any()

False

In [111]:
#Combining features to form a string
df['combined_features'] = df['director_name'] +" "+ df['genres'] +" "+ df['language'] +' '+ df['plot_keywords']
df['combined_features'].head()

0    James Cameron Action|Adventure|Fantasy|Sci-Fi ...
1    Gore Verbinski Action|Adventure|Fantasy Englis...
2    Sam Mendes Action|Adventure|Thriller English b...
3    Christopher Nolan Action|Thriller English dece...
4                            Doug Walker Documentary  
Name: combined_features, dtype: object

In [112]:
# Extracting features of data using CountVectoriser, creating count matrix.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])

In [113]:
cv.get_feature_names()

['007',
 '10',
 '1000000',
 '11',
 '1190s',
 '12',
 '12th',
 '13',
 '130',
 '13th',
 '14',
 '14th',
 '15',
 '1520s',
 '15th',
 '16',
 '16th',
 '1743',
 '1770s',
 '17th',
 '18',
 '1800s',
 '1810s',
 '1818',
 '1830s',
 '1850s',
 '1860s',
 '1874',
 '1879',
 '1880s',
 '1890s',
 '18th',
 '1910s',
 '1920s',
 '1930',
 '1930s',
 '1937',
 '1940s',
 '1949',
 '1950s',
 '1954',
 '1955',
 '1957',
 '1959',
 '1960s',
 '1964',
 '1969',
 '1970s',
 '1978',
 '1979',
 '1980s',
 '1983',
 '1987',
 '1988',
 '1990',
 '1990s',
 '1994',
 '1996',
 '1997',
 '19th',
 '1st',
 '20',
 '2000',
 '2000s',
 '2001',
 '2002',
 '2005',
 '2009',
 '2010s',
 '2015',
 '2019',
 '2020s',
 '2027',
 '2029',
 '2030s',
 '2046',
 '2047',
 '2054',
 '206',
 '2065',
 '20th',
 '21',
 '21st',
 '22',
 '22nd',
 '23',
 '23rd',
 '250',
 '27th',
 '2nd',
 '30',
 '35',
 '3d',
 '40',
 '40k',
 '40th',
 '41st',
 '50s',
 '51',
 '5th',
 '66',
 '70mm',
 '747',
 '80s',
 '911',
 '95',
 'aarniokoski',
 'aaron',
 'abandoned',
 'abandonment',
 'abascal',
 '

In [114]:
count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [115]:
# Computing cosine similarity based on count matrix (extracted features)
cosine_sim = cosine_similarity(count_matrix)

In [116]:
cosine_sim

array([[1.        , 0.28644595, 0.2508726 , ..., 0.1132277 , 0.1132277 ,
        0.06726728],
       [0.28644595, 1.        , 0.23354968, ..., 0.10540926, 0.10540926,
        0.06262243],
       [0.2508726 , 0.23354968, 1.        , ..., 0.24618298, 0.12309149,
        0.07312724],
       ...,
       [0.1132277 , 0.10540926, 0.24618298, ..., 1.        , 0.33333333,
        0.09901475],
       [0.1132277 , 0.10540926, 0.12309149, ..., 0.33333333, 1.        ,
        0.09901475],
       [0.06726728, 0.06262243, 0.07312724, ..., 0.09901475, 0.09901475,
        1.        ]])

In [117]:
# Taking movie from the user and finding its index
user_movie = input('Enter a movie name to know the similar movies:\n')
movie_index = get_index_from_movie(user_movie)

Enter a movie name to know the similar movies:
Interstellar


In [119]:
#finding similar movies from cosine_sim and enumerate it to form it a tuple of form (index, cosine_value) and at last listing all such tuples. 
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [120]:
#sorting similar_movies by cosine_values i.e. tuple[1] in descending order.
similar_movies_sorted = sorted(similar_movies, key=lambda x:x[1], reverse=True)

In [121]:
#printing recomended movies
print("Top 50 Recomended movies for you are")
i = 1
for sim_index in similar_movies_sorted:
    print(i, ") ", get_movie_from_index(sim_index[0]))
    i+=1
    if(i == 51):
        break

Top 50 Recomended movies for you are
1 )  Interstellar
2 )  The Astronaut Farmer
3 )  Heroes            
4 )  Midnight Special
5 )  American Hero
6 )  Stargate: The Ark of Truth
7 )  Frances Ha
8 )  The Prestige
9 )  Seeking a Friend for the End of the World
10 )  Divergent
11 )  Harlock: Space Pirate
12 )  Mr. Peabody & Sherman
13 )  Gravity
14 )  Inception
15 )  Dune
16 )  H.
17 )  Dawn of the Crescent Moon
18 )  Star Trek Into Darkness
19 )  Silent Running
20 )  The Martian
21 )  Stargate SG-1            
22 )  Star Trek: The Motion Picture
23 )  Suicide Squad
24 )  Edge of Tomorrow
25 )  Knowing
26 )  Primer
27 )  A.I. Artificial Intelligence
28 )  12 Monkeys            
29 )  The Black Hole
30 )  The Day After Tomorrow
31 )  The Hunger Games
32 )  Space Chimps
33 )  That Awkward Moment
34 )  Instructions Not Included
35 )  Bang Bang Baby
36 )  Daredevil            
37 )  The Postman
38 )  Star Trek: Nemesis
39 )  The Descendants
40 )  Brazil
41 )  Prom
42 )  Mission to Mars
43 )  