In [3]:
#Importing libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import difflib #comparing datasets sequences
import ast #for literal evaluation (maps,dictionary) iregardless of the type we parse python will understand
from sklearn.feature_extraction.text import TfidfVectorizer # Mathematical statistic that is planned to reflect how significant a word is to a record in a collection(quantifies relevance of string representation)
from sklearn.metrics.pairwise import cosine_similarity # finding similarity score of a feature with regards to other features

**Data collection and preprosessing**

In [4]:
credits=pd.read_csv(r"C:\Users\Ernest\Desktop\datasets\netflix\credits.csv")
titles=pd.read_csv(r"C:\Users\Ernest\Desktop\datasets\netflix\titles.csv",
usecols=['index','title', 'type', 'description', 'release_year',
       'age_certification','genres', 'production_countries', 'imdb_score','tmdb_popularity',
       'tmdb_score'])

In [36]:
credits.tail()

Unnamed: 0,person_id,id,name,character,role
77796,736339,tm1059008,Adelaida Buscato,María Paz,ACTOR
77797,399499,tm1059008,Luz Stella Luengas,Karen Bayona,ACTOR
77798,373198,tm1059008,Inés Prieto,Fanny,ACTOR
77799,378132,tm1059008,Isabel Gaona,Cacica,ACTOR
77800,1950416,tm1059008,Julian Gaviria,,DIRECTOR


In [35]:
titles.head()

Unnamed: 0,index,title,type,description,release_year,age_certification,genres,production_countries,imdb_score,tmdb_popularity,tmdb_score,character,role
1,1,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,"['drama', 'crime']",['US'],8.2,40.965,8.179,Iris Steensma,ACTOR
2,2,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,"['drama', 'action', 'thriller', 'european']",['US'],7.7,10.01,7.3,Tom,ACTOR
3,3,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,"['fantasy', 'action', 'comedy']",['GB'],8.2,15.461,7.811,Matthew 'Sport' Higgins,ACTOR
5,5,Monty Python's Flying Circus,SHOW,A British sketch comedy series with the shows ...,1969,TV-14,"['comedy', 'european']",['GB'],8.8,17.617,8.306,Wizard,ACTOR
6,6,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,['comedy'],['GB'],8.0,17.77,7.8,Senator Charles Palantine,ACTOR


In [6]:
#adding index to the dataframe
#index=pd.Index(range(0,2750,1))
#titles=titles.set_index(index)
#titles.index.name='index'
titles.head(3)

Unnamed: 0,index,title,type,description,release_year,age_certification,genres,production_countries,imdb_score,tmdb_popularity,tmdb_score
0,0,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,['documentation'],['US'],,0.6,
1,1,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,"['drama', 'crime']",['US'],8.2,40.965,8.179
2,2,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,"['drama', 'action', 'thriller', 'european']",['US'],7.7,10.01,7.3


In [7]:
credits.columns.to_list()

['person_id', 'id', 'name', 'character', 'role']

In [8]:
#Adding 'character' and 'role' to titles dataframe(creating a new column named 'character' and 'role')
titles['character']=credits['character']
titles['role']=credits['role']

In [9]:
#checking if the new columns were added to the dataframe
titles.columns

Index(['index', 'title', 'type', 'description', 'release_year',
       'age_certification', 'genres', 'production_countries', 'imdb_score',
       'tmdb_popularity', 'tmdb_score', 'character', 'role'],
      dtype='object')

In [10]:
#Handling all the null values
titles=titles.dropna()

In [11]:
titles.isnull().sum()

index                   0
title                   0
type                    0
description             0
release_year            0
age_certification       0
genres                  0
production_countries    0
imdb_score              0
tmdb_popularity         0
tmdb_score              0
character               0
role                    0
dtype: int64

In [12]:
#selecting features that are relevant
features=[['title','character','genres']]
features

[['title', 'character', 'genres']]

In [13]:
titles[['title', 'type', 'description', 'release_year', 'age_certification',
       'genres', 'production_countries', 'imdb_score', 'tmdb_popularity',
       'tmdb_score', 'character', 'role']].to_csv('titles.csv')

In [14]:
titles.shape

(2750, 13)

In [15]:
#combining all the selected features of interest
features_combined=titles['title'] +' '+ titles['character'] +' '+ titles['genres']
features_combined

1            Taxi Driver Iris Steensma ['drama', 'crime']
2       Deliverance Tom ['drama', 'action', 'thriller'...
3       Monty Python and the Holy Grail Matthew 'Sport...
5       Monty Python's Flying Circus Wizard ['comedy',...
6       Life of Brian Senator Charles Palantine ['come...
                              ...                        
5798    Convergence: Courage in a Crisis Felix Leiter ...
5800           Stuck Apart Gregg Beam ['comedy', 'drama']
5801    We Are: The Brooklyn Saints Mr. White ['docume...
5819    Alma Matters: Inside the IIT Dream Additional ...
5831      Pitta Kathalu Coach Cotton ['drama', 'romance']
Length: 2750, dtype: object

In [16]:
# Applying vectorization to change the text data into vectors
vectorizer=TfidfVectorizer()
feature_vectors=vectorizer.fit_transform(features_combined)
print(feature_vectors)

  (0, 1209)	0.17744670360494605
  (0, 1518)	0.10909738959997123
  (0, 4862)	0.5640089963565359
  (0, 2524)	0.5164872317557975
  (0, 1530)	0.3783466611511891
  (0, 5037)	0.47812030120052845
  (1, 1681)	0.30585138326266437
  (1, 5115)	0.1950594435467086
  (1, 93)	0.19694933796296413
  (1, 5165)	0.5901766102747942
  (1, 1358)	0.6811388561346206
  (1, 1518)	0.13175405293078413
  (2, 1095)	0.09441338921647555
  (2, 1742)	0.13679575568679203
  (2, 2310)	0.3985134481775531
  (2, 4807)	0.20308587104471212
  (2, 3244)	0.3985134481775531
  (2, 2090)	0.3985134481775531
  (2, 2353)	0.37887181774527084
  (2, 5091)	0.11398683917143393
  (2, 226)	0.22698151354822027
  (2, 4085)	0.3452942287407319
  (2, 3435)	0.3378268274193724
  (2, 93)	0.11522901546581994
  (3, 5597)	0.4599402436079409
  :	:
  (2747, 723)	0.46459412152242285
  (2747, 287)	0.3863034111951049
  (2747, 5519)	0.34050628149880136
  (2747, 5553)	0.34050628149880136
  (2747, 3476)	0.29265529601622103
  (2747, 1469)	0.16680247454342498
  (2

In [17]:
#finding the similarity score (cosinesimilarity)
similarity=cosine_similarity(feature_vectors)
similarity

array([[1.        , 0.01437402, 0.        , ..., 0.        , 0.00824115,
        0.01079452],
       [0.01437402, 1.        , 0.02269428, ..., 0.        , 0.00995261,
        0.01303626],
       [0.        , 0.02269428, 1.        , ..., 0.06323032, 0.01273243,
        0.        ],
       ...,
       [0.        , 0.        , 0.06323032, ..., 1.        , 0.03823078,
        0.        ],
       [0.00824115, 0.00995261, 0.01273243, ..., 0.03823078, 1.        ,
        0.00747416],
       [0.01079452, 0.01303626, 0.        , ..., 0.        , 0.00747416,
        1.        ]])

In [18]:
similarity.shape

(2750, 2750)

In [19]:
#create a Pool of all the movies in the dataset
movies_list=titles['title'].tolist()
movies_list

['Taxi Driver',
 'Deliverance',
 'Monty Python and the Holy Grail',
 "Monty Python's Flying Circus",
 'Life of Brian',
 'Dirty Harry',
 'Bonnie and Clyde',
 'The Blue Lagoon',
 'The Professionals',
 'Richard Pryor: Live in Concert',
 'Hitler: A Career',
 'FTA',
 "Monty Python's Fliegender Zirkus",
 'Seinfeld',
 'Full Metal Jacket',
 'Once Upon a Time in America',
 'When Harry Met Sally...',
 'A Nightmare on Elm Street',
 'Steel Magnolias',
 'Police Academy',
 'Christine',
 'Knight Rider',
 'Thomas & Friends',
 'Saved by the Bell',
 'Awakenings',
 'Wheel of Fortune',
 "National Lampoon's Christmas Vacation",
 'Lean On Me',
 'Eddie Murphy Raw',
 "She's Gotta Have It",
 'Major Dad',
 'Endless Love',
 'Danger Mouse',
 'Sam Kinison: Breaking the Rules',
 'Monty Python Live at the Hollywood Bowl',
 'Survivor',
 'Mission: Impossible',
 'Stargate SG-1',
 'Se7en',
 'Pokémon',
 'Boogie Nights',
 'Forrest Gump',
 'Snatch',
 'The Talented Mr. Ripley',
 'One Piece',
 'The Challenge',
 'Titanic',
 '

In [20]:
#Prompt the user to give a movie title
user_movie_request=input('Enter a movie name: ')

In [28]:
#checking if there is a close match for the movie given by the user
close_match=difflib.get_close_matches(user_movie_request,movies_list)
close_match

['Girlfriends', 'Unfriended', 'LEGO Friends']

In [29]:
closest_match=close_match[0]
closest_match

'Girlfriends'

In [30]:
#getting movie index based on the user input
movie_index=titles[titles['title']==closest_match]['index'].values[0]
movie_index

151

In [24]:
#Taking the movie index and use similarity score to find the list of similar movies
similarity_score=list(enumerate(similarity[movie_index]))
similarity_score

[(0, 0.012114301014518603),
 (1, 0.014630123259033106),
 (2, 0.03155683007909191),
 (3, 0.014819661960007795),
 (4, 0.01594117075037447),
 (5, 0.0),
 (6, 0.010572922697072232),
 (7, 0.062436544083082045),
 (8, 0.025341794097851394),
 (9, 0.014075853439446237),
 (10, 0.0),
 (11, 0.017377895451880027),
 (12, 0.013631929945373083),
 (13, 0.01554451431779151),
 (14, 0.00985084488288633),
 (15, 0.009503046577522241),
 (16, 0.05383958237759574),
 (17, 0.0),
 (18, 0.07524398166704661),
 (19, 0.040316158785605835),
 (20, 0.0),
 (21, 0.011431761629290039),
 (22, 0.030279048644710628),
 (23, 0.07231233954183373),
 (24, 0.014092255772510111),
 (25, 0.0),
 (26, 0.014902979306914692),
 (27, 0.013266434526782188),
 (28, 0.01828465293949742),
 (29, 0.04667951622668044),
 (30, 0.019452541163245476),
 (31, 0.04933785055024047),
 (32, 0.01708208986128395),
 (33, 0.026786698448722716),
 (34, 0.021565704003531146),
 (35, 0.0),
 (36, 0.0),
 (37, 0.010294710950708151),
 (38, 0.011769092978708649),
 (39, 0.0

In [31]:
len(similarity_score)

2750

In [32]:
sorted_movies=sorted(similarity_score,key=lambda x:x[1],reverse=True)
sorted_movies

[(151, 1.0000000000000002),
 (972, 0.313813798213977),
 (957, 0.2749838363839142),
 (149, 0.22899701066809003),
 (1741, 0.12213486770679395),
 (675, 0.11415483381990948),
 (1383, 0.10968596707221147),
 (2359, 0.10937456244818085),
 (1247, 0.10720252934731461),
 (611, 0.1040989524965503),
 (701, 0.10344470447573896),
 (1638, 0.10267250934690964),
 (2215, 0.10170736499956853),
 (1103, 0.10156199852710227),
 (1640, 0.09678397765876626),
 (1568, 0.09445225939761935),
 (838, 0.09382501564558068),
 (933, 0.09362080952017074),
 (2625, 0.09137925624699997),
 (2196, 0.09126876397059673),
 (2585, 0.09020477887075594),
 (2486, 0.08936253252390891),
 (481, 0.08858980240121363),
 (296, 0.08838619267284432),
 (1215, 0.0877168205992512),
 (2582, 0.08694597657806946),
 (2041, 0.0869247482433262),
 (1714, 0.086636911075291),
 (1665, 0.08646474136865973),
 (503, 0.08592918956749364),
 (1379, 0.08586053911613091),
 (2170, 0.08507726167629504),
 (2155, 0.083387655901269),
 (1124, 0.08332219180656047),
 (2

In [33]:
#Suggestions
i=1 
for movie in sorted_movies:
    index=movie[0]
    choose_from=titles[titles.index==index]['title'].values
    if (i<11):
        print(i,'.',choose_from)
        i+=1

1 . ['Girlfriends']
2 . ['Fukrey']
3 . []
4 . ['Adventures of Sonic the Hedgehog']
5 . ['Castlevania']
6 . ['The Smurfs']
7 . []
8 . []
9 . []
10 . []
