In [4]:
import pandas as pd
import numpy as np
import difflib #rectify of spelling mistake and find closed values by user 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Data Collection & Processing

In [5]:
# Read the movie metadata CSV (path is relative to the working directory)
# into a pandas DataFrame.
movies_data = pd.read_csv("movies.csv")

In [23]:
# Preview the last rows to check the columns of the data set.
# Only the final expression of a cell is rendered, so a single preview
# call is kept here.
movies_data.tail()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
4798,4798,220000,Action Crime Thriller,,9367,united states\u2013mexico barrier legs arms pa...,es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,...,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,Carlos Gallardo Jaime de Hoyos Peter Marquardt...,"[{'name': 'Robert Rodriguez', 'gender': 0, 'de...",Robert Rodriguez
4799,4799,9000,Comedy Romance,,72766,,en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,...,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,Edward Burns Kerry Bish\u00e9 Marsha Dietlein ...,"[{'name': 'Edward Burns', 'gender': 2, 'depart...",Edward Burns
4800,4800,0,Comedy Drama Romance TV Movie,http://www.hallmarkchannel.com/signedsealeddel...,231617,date love at first sight narration investigati...,en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,...,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6,Eric Mabius Kristin Booth Crystal Lowe Geoff G...,"[{'name': 'Carla Hetland', 'gender': 0, 'depar...",Scott Smith
4801,4801,0,,http://shanghaicalling.com/,126186,,en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,...,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,Daniel Henney Eliza Coupe Bill Paxton Alan Ruc...,"[{'name': 'Daniel Hsia', 'gender': 2, 'departm...",Daniel Hsia
4802,4802,0,Documentary,,25975,obsession camcorder crush dream girl,en,My Date with Drew,Ever since the second grade when he first saw ...,1.929883,...,90.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,My Date with Drew,6.3,16,Drew Barrymore Brian Herzlinger Corey Feldman ...,"[{'name': 'Clark Peterson', 'gender': 2, 'depa...",Brian Herzlinger


In [7]:
# Number of rows and columns in the DataFrame, as a (rows, columns) tuple
movies_data.shape

(4803, 24)

In [8]:
# Text columns whose contents feed the feature-extraction step
selected_feature = [
    'genres',
    'keywords',
    'overview',
    'tagline',
    'cast',
    'director',
]
print(selected_feature)


['genres', 'keywords', 'overview', 'tagline', 'cast', 'director']


In [24]:
# Replace missing values in every selected text column with an empty string,
# so that the later string concatenation does not yield NaN for those rows.
movies_data[selected_feature] = movies_data[selected_feature].fillna('')

In [25]:
# Join the selected text features of each movie into one space-separated
# string. Iterating over selected_feature (rather than repeating the column
# names) keeps this step in sync with the list used for null-filling.
combined_feature = movies_data[selected_feature[0]]
for feature in selected_feature[1:]:
    # Same left-to-right order as chaining `+ ' ' +` between the columns.
    combined_feature = combined_feature + ' ' + movies_data[feature]

In [26]:
# Inspect the combined per-movie text (one string per row)
combined_feature

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      When ambitious New York attorney Sam is sent...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [27]:
# Turn the combined text of every movie into TF-IDF feature vectors
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_feature)

In [28]:
# Show the stored (row, column) -> weight entries of the TF-IDF matrix
print(feature_vectors)

  (0, 4288)	0.13213124585063996
  (0, 14180)	0.08418056281586364
  (0, 23134)	0.1498786462809525
  (0, 17784)	0.12285843797047787
  (0, 15569)	0.171366691592621
  (0, 25868)	0.11590870590502833
  (0, 29614)	0.15064979633862852
  (0, 24802)	0.158925784923944
  (0, 23578)	0.16802034155542864
  (0, 30543)	0.15064979633862852
  (0, 30145)	0.18243919685121024
  (0, 23619)	0.11047375838269538
  (0, 19410)	0.0318696249813729
  (0, 30121)	0.07272849549697494
  (0, 9065)	0.14505772980982928
  (0, 5245)	0.16506865163441015
  (0, 1014)	0.12285843797047787
  (0, 1243)	0.05270709034262741
  (0, 21523)	0.1665010643166998
  (0, 1281)	0.03195190270050669
  (0, 19650)	0.1612055740237117
  (0, 10368)	0.13848277654301597
  (0, 2974)	0.09974129089792806
  (0, 27515)	0.14569220419090678
  (0, 2685)	0.09805341831299116
  :	:
  (4802, 27153)	0.05940079357098276
  (4802, 19243)	0.061349419053434766
  (4802, 11941)	0.07829183421073846
  (4802, 29680)	0.07735547381237222
  (4802, 9346)	0.07104321122746965
  (48


# Cosine Similarity

In [31]:
# Pairwise cosine similarity between the feature vectors of all movies
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.05083168 0.0332947  ... 0.02749812 0.0304889  0.0072518 ]
 [0.05083168 1.         0.04356836 ... 0.05077045 0.03100979 0.01521198]
 [0.0332947  0.04356836 1.         ... 0.02646984 0.04751623 0.01372603]
 ...
 [0.02749812 0.05077045 0.02646984 ... 1.         0.03481447 0.03546821]
 [0.0304889  0.03100979 0.04751623 ... 0.03481447 1.         0.03098945]
 [0.0072518  0.01521198 0.01372603 ... 0.03546821 0.03098945 1.        ]]


In [32]:
# Dimensions of the similarity matrix (one row and one column per movie)
print(similarity.shape)

(4803, 4803)


In [34]:
# Ask the user for a movie title
movie_name = input("enter the movie name :- ")

enter the movie name :- iron man


In [35]:
#creating list with all movies names in dataset
