In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie metadata table from the CSV that ships alongside the notebook.
df = pd.read_csv(filepath_or_buffer='./Dataset/movies.csv')

In [3]:
# Peek at two randomly drawn rows; no random_state is set, so the rows shown
# differ from run to run.
df.sample(n=2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
1609,1609,25000000,Horror Comedy Music,,10776,flower solar eclipse florist assistant plants,en,Little Shop of Horrors,Seymour Krelborn is a nerdy orphan working at ...,12.113235,...,94.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Don't feed the plants.,Little Shop of Horrors,6.6,384,Rick Moranis Ellen Greene Vincent Gardenia Ste...,"[{'name': 'Roy Walker', 'gender': 2, 'departme...",Frank Oz
944,944,50000000,Crime Drama Thriller,,66,corruption assassination washington d.c. rape ...,en,Absolute Power,A master thief coincidentally is robbing a hou...,13.576765,...,121.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Corrupts Absolutely.,Absolute Power,6.4,223,Clint Eastwood Gene Hackman Ed Harris Scott Gl...,"[{'name': 'Clint Eastwood', 'gender': 2, 'depa...",Clint Eastwood


In [4]:
# (number of rows, number of columns) of the loaded table.
df.shape

(4803, 24)

In [5]:
# Count the missing values in every column to see which text fields need filling.
df.isna().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [6]:
# Text columns that feed the combined description used for similarity.
sel = [
    'genres',
    'keywords',
    'tagline',
    'overview',
]
# Replace missing text with an empty string so the columns can be concatenated
# without producing NaN.
df[sel] = df[sel].fillna(value='')

In [7]:
# Build one space-separated text blob per movie from the selected text columns.
comb = (
    df['genres'] + ' '
    + df['keywords'] + ' '
    + df['tagline'] + ' '
    + df['overview']
)
comb

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai When ambitious New ...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [8]:
# Turn each movie's combined text into a TF-IDF weighted sparse row vector.
vectors = TfidfVectorizer().fit_transform(raw_documents=comb)
# Printing the sparse matrix lists its (row, column) -> weight entries.
print(vectors)

  (0, 3847)	0.18998320818587605
  (0, 849)	0.14140201647755615
  (0, 1007)	0.06066240935688215
  (0, 15872)	0.19163182137882953
  (0, 1037)	0.03677454757508785
  (0, 14354)	0.18553705883719787
  (0, 7965)	0.1593846069840042
  (0, 2246)	0.11479569406889707
  (0, 20499)	0.17430377383961507
  (0, 2056)	0.11285306325729084
  (0, 3048)	0.0713088889560065
  (0, 13170)	0.12590540666133332
  (0, 21195)	0.1782884272848316
  (0, 14255)	0.056491574582560884
  (0, 13328)	0.1782884272848316
  (0, 20414)	0.0356962560421724
  (0, 5936)	0.20693666762079285
  (0, 10705)	0.047995037933818895
  (0, 12530)	0.16486601155612282
  (0, 14710)	0.22167034908713734
  (0, 3512)	0.14388465597129593
  (0, 239)	0.22692902762100967
  (0, 10185)	0.04198426088609064
  (0, 14667)	0.4344483994769183
  (0, 14171)	0.03667985130848312
  :	:
  (4802, 20222)	0.07585493360346558
  (4802, 14036)	0.07834333228811927
  (4802, 9064)	0.09997883073148389
  (4802, 22050)	0.09878309660779563
  (4802, 7119)	0.09072232451233114
  (4802,

In [9]:
# Pairwise cosine similarity between every pair of movie vectors; entry [i, j]
# scores movie i against movie j, and the diagonal is each movie against itself.
similarity = cosine_similarity(X=vectors)
print(similarity)

[[1.         0.07119511 0.02957865 ... 0.03511847 0.01620524 0.01066013]
 [0.07119511 1.         0.06180734 ... 0.06863578 0.04226665 0.02364066]
 [0.02957865 0.06180734 1.         ... 0.03423982 0.01524207 0.02043494]
 ...
 [0.03511847 0.06863578 0.03423982 ... 1.         0.04327462 0.04021146]
 [0.01620524 0.04226665 0.01524207 ... 0.04327462 1.         0.04433985]
 [0.01066013 0.02364066 0.02043494 ... 0.04021146 0.04433985 1.        ]]


In [10]:
# Sanity check: the matrix should be square, with one row and one column per movie.
similarity.shape

(4803, 4803)

In [13]:
# Movie Recommendation based on another movie
#
# Reads a (possibly misspelled) title from the user, fuzzy-matches it against
# the known titles, and prints the 9 movies whose combined-text vectors are most
# similar to the best match.
# Note: the matched movie scores 1.0 against itself, so it normally appears as
# the first suggestion.

get_name = input('Enter your favorite movie name: ')

all_movies = df['title'].tolist()
close_matches_movie = difflib.get_close_matches(get_name, all_movies)

if not close_matches_movie:
    # get_close_matches returns an empty list when no title clears its
    # similarity cutoff; report that instead of failing with an IndexError.
    print(f"No movie title close to {get_name!r} was found. Please try another title.")
else:
    # Rows of `similarity` follow the row order of `df`, so look the match up by
    # position rather than trusting the CSV's own 'index' column to equal it.
    title_hits = np.flatnonzero(df['title'].to_numpy() == close_matches_movie[0])
    index_of_the_movie = int(title_hits[0])

    # Pair every movie position with its score against the matched movie, then
    # rank from most to least similar.
    similarity_score = list(enumerate(similarity[index_of_the_movie]))
    sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

    print('Suggested for you:\n')
    for i, movie in enumerate(sorted_similar_movies[:9], start=1):
        index = movie[0]
        # Positional lookup, matching the positional indices from enumerate().
        title_from_index = df['title'].iloc[index]
        print(f"{i}. {title_from_index}")

Enter your favorite movie name:  Fast and Furious


Suggested for you:

1. 2 Fast 2 Furious
2. The Final Destination
3. The Wash
4. The Fast and the Furious: Tokyo Drift
5. The Fast and the Furious
6. Bad Boys II
7. Ride Along 2
8. Fast Five
9. Underclassman


In [14]:
# Movie Recommendation based on Genre
#
# Reads a genre from the user, fuzzy-matches it against the movies' genre
# strings, takes the first movie carrying the best-matching string, and prints
# the 10 movies most similar to that movie.
# NOTE(review): the input is compared with each movie's whole genres string
# (e.g. "Crime Drama Thriller"), not with individual genre words, and the
# ranking is seeded from a single movie - confirm this is the intended behaviour.

get_genre = input('Enter your favorite genre name: ')

all_genres = df['genres'].tolist()
close_matches_genre = difflib.get_close_matches(get_genre, all_genres)

if not close_matches_genre:
    # get_close_matches returns an empty list when no genre string clears its
    # similarity cutoff; report that instead of failing with an IndexError.
    print(f"No genre close to {get_genre!r} was found. Please try another genre.")
else:
    # Rows of `similarity` follow the row order of `df`, so look the match up by
    # position rather than trusting the CSV's own 'index' column to equal it.
    genre_hits = np.flatnonzero(df['genres'].to_numpy() == close_matches_genre[0])
    index_of_the_movie1 = int(genre_hits[0])

    # Pair every movie position with its score against the seed movie, then
    # rank from most to least similar.
    similarity_score1 = list(enumerate(similarity[index_of_the_movie1]))
    sorted_similar_movies1 = sorted(similarity_score1, key=lambda x: x[1], reverse=True)

    print('Suggested for you:\n')
    for i, movie in enumerate(sorted_similar_movies1[:10], start=1):
        index1 = movie[0]
        # Positional lookup, matching the positional indices from enumerate().
        title_from_index1 = df['genres'].index[index1] if False else df['title'].iloc[index1]
        print(f"{i}. {title_from_index1}")

Enter your favorite genre name:  Action


Suggested for you:

1. Furious 7
2. The Final Destination
3. The Fast and the Furious: Tokyo Drift
4. The Fast and the Furious
5. Stranded
6. Dead Man's Shoes
7. Speed Racer
8. Gone in Sixty Seconds
9. Blue Ruin
10. Turbo
