## **Importing Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
import difflib

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# **Data Collection & Pre-processing**

In [None]:
# Loading the data from a csv to pandas data frame
df = pd.read_csv('/content/sample_data/movies.csv')

In [None]:
# Showing first 5 data frames
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [None]:
# Showing the last 5 of the dataframe
df.tail()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
4798,4798,220000,Action Crime Thriller,,9367,united states\u2013mexico barrier legs arms pa...,es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,...,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,Carlos Gallardo Jaime de Hoyos Peter Marquardt...,"[{'name': 'Robert Rodriguez', 'gender': 0, 'de...",Robert Rodriguez
4799,4799,9000,Comedy Romance,,72766,,en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,...,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,Edward Burns Kerry Bish\u00e9 Marsha Dietlein ...,"[{'name': 'Edward Burns', 'gender': 2, 'depart...",Edward Burns
4800,4800,0,Comedy Drama Romance TV Movie,http://www.hallmarkchannel.com/signedsealeddel...,231617,date love at first sight narration investigati...,en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,...,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6,Eric Mabius Kristin Booth Crystal Lowe Geoff G...,"[{'name': 'Carla Hetland', 'gender': 0, 'depar...",Scott Smith
4801,4801,0,,http://shanghaicalling.com/,126186,,en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,...,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,Daniel Henney Eliza Coupe Bill Paxton Alan Ruc...,"[{'name': 'Daniel Hsia', 'gender': 2, 'departm...",Daniel Hsia
4802,4802,0,Documentary,,25975,obsession camcorder crush dream girl,en,My Date with Drew,Ever since the second grade when he first saw ...,1.929883,...,90.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,My Date with Drew,6.3,16,Drew Barrymore Brian Herzlinger Corey Feldman ...,"[{'name': 'Clark Peterson', 'gender': 2, 'depa...",Brian Herzlinger


In [None]:
# Checking the shape of the data

df.shape

(4803, 24)

In [None]:
# Checking the size of the data

df.size

115272

In [None]:
# Checking for any duplicates

df.duplicated().sum()

np.int64(0)

In [None]:
# Checking for any null values

df.isna().sum()

Unnamed: 0,0
index,0
budget,0
genres,28
homepage,3091
id,0
keywords,412
original_language,0
original_title,0
overview,3
popularity,0


In [None]:
# Identifying the relevant features

selected_features = ['genres', 'keywords', 'overview', 'tagline', 'director', 'cast']


In [None]:
# Replacing the null values with empty string
for feat in selected_features:
  df[feat] = df[feat].fillna("")

In [None]:
# Combining the feataure into a single dataframe

df['combined_features'] = df[selected_features].agg(" ".join, axis=1)

In [None]:
# Converting the text into numerical values using TFID Vectorizer

vector = TfidfVectorizer()

feature_vect = vector.fit_transform(df['combined_features'])

# **Cosine Similarity**

In [None]:
similarity = cosine_similarity(feature_vect)

# **Getting a Movie Name from  User**

In [None]:
search_term = input("Enter a movie to search")

Enter a movie to searchiron man


In [None]:
# Creating a list which contains all the movies

movie_list = df['title'].tolist()


In [None]:
# Getting best match based on the user input

movie_match = difflib.get_close_matches(search_term, movie_list)

close_match = movie_match[0]

In [None]:
# Finding the index based on the title

index_of_movie = df[df.title == close_match].index[0]

print(index_of_movie)

68


In [None]:
# Getting the similarity score

similarity_score = list(enumerate(similarity[index_of_movie]))

In [None]:
# Sort the similarity score based on descending order

sorted_similarity_score = sorted(similarity_score, key = lambda x: x[1], reverse=True)

print(sorted_similarity_score)

[(68, np.float64(1.0000000000000002)), (79, np.float64(0.32928784708836273)), (31, np.float64(0.2905211440511144)), (7, np.float64(0.1934346428669005)), (16, np.float64(0.13208839109747222)), (511, np.float64(0.11735356375367764)), (4401, np.float64(0.11644899793001534)), (26, np.float64(0.11563787911286949)), (64, np.float64(0.11260372706734423)), (182, np.float64(0.10817936918053872)), (3623, np.float64(0.10427579910401667)), (94, np.float64(0.10008582067970911)), (46, np.float64(0.08808497545952931)), (4033, np.float64(0.0865367140010388)), (783, np.float64(0.08640309672854926)), (174, np.float64(0.08596763747178215)), (618, np.float64(0.08422372953874693)), (85, np.float64(0.08317925570431418)), (3133, np.float64(0.08003408207059728)), (101, np.float64(0.07997032837045653)), (1740, np.float64(0.07985500618953217)), (30, np.float64(0.07969083021504449)), (3466, np.float64(0.0786385267898631)), (33, np.float64(0.07770036840942475)), (203, np.float64(0.07717739012595405)), (1177, np.f

In [None]:
print("Movies Suggested for you \n")

i = 0

for similar_movie in sorted_similarity_score:
  index = similar_movie[0]

  title_from_movie = df[df.index == index]['title'].values
  if i < 30:
     print(i, ' ', title_from_movie)
     i = i + 1


Movies Suggested for you 

0   ['Iron Man']
1   ['Iron Man 2']
2   ['Iron Man 3']
3   ['Avengers: Age of Ultron']
4   ['The Avengers']
5   ['X-Men']
6   ['The Helix... Loaded']
7   ['Captain America: Civil War']
8   ['X-Men: Apocalypse']
9   ['Ant-Man']
10   ['Made']
11   ['Guardians of the Galaxy']
12   ['X-Men: Days of Future Past']
13   ['Super']
14   ['Mortdecai']
15   ['The Incredible Hulk']
16   ['Mystery Men']
17   ['Captain America: The Winter Soldier']
18   ["The Devil's Tomb"]
19   ['X-Men: First Class']
20   ['Kick-Ass 2']
21   ['Spider-Man 2']
22   ['Sliding Doors']
23   ['X-Men: The Last Stand']
24   ['X2']
25   ['Sin City']
26   ['Hellboy II: The Golden Army']
27   ['G-Force']
28   ['The Nativity Story']
29   ["Surf's Up"]
