# Movie Recommendation System

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
movies_data = pd.read_csv('data\movies.csv')
print(movies_data.head())

   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy based on novel sec

In [3]:
movies_data.shape

(4803, 24)

In [4]:
# Selecting the relevant features for recommendation
selected_features = ['genres','keywords','tagline','cast','director']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [5]:
# replacing the null values with null string or empty string
for feature in selected_features:
    movies_data[feature] = movies_data[feature].fillna('')


In [6]:
# Combining All the 5 features together 
combined_features = movies_data ['genres']+''+movies_data['keywords']+''+movies_data['tagline']+''+movies_data['cast']+''+movies_data['director']
print(combined_features)

0       Action Adventure Fantasy Science Fictioncultur...
1       Adventure Fantasy Actionocean drug abuse exoti...
2       Action Adventure Crimespy based on novel secre...
3       Action Crime Drama Thrillerdc comics crime fig...
4       Action Adventure Science Fictionbased on novel...
                              ...                        
4798    Action Crime Thrillerunited states\u2013mexico...
4799    Comedy RomanceA newlywed couple's honeymoon is...
4800    Comedy Drama Romance TV Moviedate love at firs...
4801    A New Yorker in ShanghaiDaniel Henney Eliza Co...
4802    Documentaryobsession camcorder crush dream gir...
Length: 4803, dtype: object


In [7]:
# Converting the text data to feature vectors
vectorizer = TfidfVectorizer() # creating an instance of this TfidfVectorizer


In [8]:
feature_vector = vectorizer.fit_transform(combined_features) # creating another variable as feature_vector to store all the numerical values.
print(feature_vector)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 111820 stored elements and shape (4803, 27580)>
  Coords	Values
  (0, 245)	0.07858581760804864
  (0, 444)	0.09109355212252296
  (0, 8936)	0.11806131645084655
  (0, 21836)	0.09966592997173947
  (0, 9304)	0.27385789694599244
  (0, 4456)	0.2179967521551067
  (0, 10193)	0.1653243242081299
  (0, 22916)	0.3366875640692919
  (0, 26334)	0.130597230719163
  (0, 4676)	0.24529097704249617
  (0, 22778)	0.27385789694599244
  (0, 24197)	0.07518543993419267
  (0, 27161)	0.12619886579371511
  (0, 18164)	0.08690831799482268
  (0, 18558)	0.26121683351048536
  (0, 21521)	0.15623566659433683
  (0, 27182)	0.23480088356130557
  (0, 27540)	0.19771357974524179
  (0, 21487)	0.21550128478931552
  (0, 22464)	0.2020708956871175
  (0, 26547)	0.19638671147741735
  (0, 23267)	0.16128139780622522
  (0, 14550)	0.22368093636480682
  (0, 16614)	0.1569607387745327
  (0, 20729)	0.27385789694599244
  :	:
  (4801, 18790)	0.22450140828104795
  (4801, 11776)	0.3005

# Cosine Similarity

In [9]:
# Getting the similarity scores using cosine similarity
similarity = cosine_similarity(feature_vector)
print(similarity)
print(similarity.shape)

[[1.         0.06865296 0.01492221 ... 0.         0.         0.        ]
 [0.06865296 1.         0.02799128 ... 0.01243107 0.         0.        ]
 [0.01492221 0.02799128 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.01243107 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
(4803, 4803)


In [None]:
# Getting the movie name from user
import ipywidgets as widgets
from IPython.display import display

movie_name_widget = widgets.Text(
    description="Movie Name:",
    placeholder="Enter your favorite movie name",
)
display(movie_name_widget)

def get_movie_name(change):
    print(f"Your favorite movie is: {change.new}")

movie_name_widget.observe(get_movie_name, names='value')
print(movie_name_widget)


Text(value='', description='Movie Name:', placeholder='Enter your favorite movie name')

Text(value='', description='Movie Name:', placeholder='Enter your favorite movie name')


Your favorite movie is: S
Your favorite movie is: Sp
Your favorite movie is: Spi
Your favorite movie is: Spid
Your favorite movie is: Spide
Your favorite movie is: Spider
Your favorite movie is: Spider-
Your favorite movie is: Spider-M
Your favorite movie is: Spider-Ma
Your favorite movie is: Spider-Man


In [11]:
# Creating a list with all movie names 
movies_list = movies_data['title'].tolist()
print(movies_list)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [16]:
# Finding the close match
current_movie_name = movie_name_widget.value
close_match = difflib.get_close_matches(current_movie_name, movies_list)
print(close_match)

['Spider-Man', 'Spider-Man 3', 'Spider-Man 2']


In [17]:
closest_match = close_match[0]
print(closest_match)

Spider-Man


In [18]:
# Finding the index of the movie with title
movie_index = movies_data[movies_data.title == closest_match]['index'].values[0]
print(movie_index)

159


In [19]:
# Getting a list of similar movies
similarity_score = list(enumerate(similarity[movie_index]))
print(similarity_score)

[(0, np.float64(0.020183824687633835)), (1, np.float64(0.02384792579315554)), (2, np.float64(0.0)), (3, np.float64(0.0)), (4, np.float64(0.07009794704787743)), (5, np.float64(0.2776168685313856)), (6, np.float64(0.0)), (7, np.float64(0.0)), (8, np.float64(0.021290749462889353)), (9, np.float64(0.0)), (10, np.float64(0.030167520837360632)), (11, np.float64(0.0)), (12, np.float64(0.013354027570894695)), (13, np.float64(0.012615674024968767)), (14, np.float64(0.013315406905890543)), (15, np.float64(0.0)), (16, np.float64(0.0)), (17, np.float64(0.0)), (18, np.float64(0.0)), (19, np.float64(0.006222184959527053)), (20, np.float64(0.006863022286507036)), (21, np.float64(0.0)), (22, np.float64(0.006788685003265008)), (23, np.float64(0.0)), (24, np.float64(0.006662042970246023)), (25, np.float64(0.0)), (26, np.float64(0.0)), (27, np.float64(0.0)), (28, np.float64(0.0)), (29, np.float64(0.0)), (30, np.float64(0.2644062917792628)), (31, np.float64(0.029417049027168095)), (32, np.float64(0.024167

In [20]:
len(similarity_score)

4803

In [21]:
# Sorting the movies based on their similarity score (Higher similarity score to the lower similarity score)
sorted_similar_movies = sorted(similarity_score, key=lambda x:x[1], reverse = True)
print(sorted_similar_movies)

[(159, np.float64(1.0000000000000002)), (5, np.float64(0.2776168685313856)), (30, np.float64(0.2644062917792628)), (1559, np.float64(0.15858528525624224)), (382, np.float64(0.15720878026755258)), (1364, np.float64(0.12629417230225548)), (328, np.float64(0.11912416311928875)), (1523, np.float64(0.11207176546249262)), (677, np.float64(0.11030053799319536)), (1193, np.float64(0.10672945972305733)), (2369, np.float64(0.09839132454513651)), (1598, np.float64(0.09614328463026375)), (37, np.float64(0.09561515347118849)), (3046, np.float64(0.0936576497302635)), (3188, np.float64(0.09209454623229585)), (1427, np.float64(0.09022848651916353)), (764, np.float64(0.08902547715715463)), (2361, np.float64(0.08795107948874566)), (2529, np.float64(0.08773623790779296)), (3479, np.float64(0.08535825871723512)), (868, np.float64(0.084569820116243)), (4441, np.float64(0.08405757925364286)), (1796, np.float64(0.08344848621760617)), (1435, np.float64(0.08317732832206096)), (3130, np.float64(0.08178982041539

In [22]:
# Printing the name of similar movies based on index
print('Movies suggested for you : \n')
i = 1
for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movies_data[movies_data.index==index]['title'].values[0]
    if (i<30):
     print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Spider-Man
2 . Spider-Man 3
3 . Spider-Man 2
4 . The Notebook
5 . Seabiscuit
6 . Horrible Bosses
7 . Finding Nemo
8 . The Good German
9 . Clear and Present Danger
10 . The Count of Monte Cristo
11 . Labor Day
12 . Drag Me to Hell
13 . Oz: The Great and Powerful
14 . Drop Dead Gorgeous
15 . Velvet Goldmine
16 . Wonder Boys
17 . The Legend of Bagger Vance
18 . The Ice Storm
19 . The Queen
20 . The Virgin Suicides
21 . Elizabethtown
22 . Bambi
23 . Brothers
24 . Wimbledon
25 . Adam Resurrected
26 . The Great Gatsby
27 . Interview with the Vampire
28 . Small Soldiers
29 . Midnight Special


# All code in one cell

In [None]:
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1