In [482]:

#importing libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import warnings
import difflib #comparing datasets
import ast #for literal evaluation (maps,dictionary) iregardless of the type we parse pytohn will understand
from sklearn.feature_extraction.text import TfidfVectorizer  # Mathematical statistic that is planned to reflect how significant a word is to a record in a collection
from sklearn.metrics.pairwise import cosine_similarity # finding similarity score of a feature with regards to other features

**Data collection and preprosessing**

In [483]:
credits=pd.read_csv(r"C:\Users\Ernest\Desktop\datasets\netflix\credits.csv")
titles=pd.read_csv(r"C:\Users\Ernest\Desktop\datasets\netflix\titles.csv",usecols=['index','title', 'type', 'description', 'release_year',
       'age_certification','genres', 'production_countries', 'imdb_score','tmdb_popularity',
       'tmdb_score'])

In [484]:
#adding index to the dataframe
#index=pd.Index(range(0,2750,1))
#titles=titles.set_index(index)
#titles.index.name='index'
titles

Unnamed: 0,index,title,type,description,release_year,age_certification,genres,production_countries,imdb_score,tmdb_popularity,tmdb_score
0,0,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,['documentation'],['US'],,0.600,
1,1,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,"['drama', 'crime']",['US'],8.2,40.965,8.179
2,2,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,"['drama', 'action', 'thriller', 'european']",['US'],7.7,10.010,7.300
3,3,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,"['fantasy', 'action', 'comedy']",['GB'],8.2,15.461,7.811
4,4,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,"['war', 'action']","['GB', 'US']",7.7,20.398,7.600
...,...,...,...,...,...,...,...,...,...,...,...
5845,5845,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,"['romance', 'drama']",['NG'],6.8,1.466,
5846,5846,C/O Kaadhal,MOVIE,A heart warming film that explores the concept...,2021,,['drama'],[],7.7,,
5847,5847,Lokillo,MOVIE,A controversial TV host and comedian who has b...,2021,,['comedy'],['CO'],3.8,26.005,6.300
5848,5848,Dad Stop Embarrassing Me - The Afterparty,MOVIE,"Jamie Foxx, David Alan Grier and more from the...",2021,PG-13,[],['US'],,1.296,10.000


In [485]:
titles.shape

(5850, 11)

In [486]:
titles.dtypes

index                     int64
title                    object
type                     object
description              object
release_year              int64
age_certification        object
genres                   object
production_countries     object
imdb_score              float64
tmdb_popularity         float64
tmdb_score              float64
dtype: object

In [487]:
#adding 'character' and 'role' to titles df
titles['character']=credits['character']
titles['role']=credits['role']

In [488]:
#checking if the columns were added to the df
titles.columns

Index(['index', 'title', 'type', 'description', 'release_year',
       'age_certification', 'genres', 'production_countries', 'imdb_score',
       'tmdb_popularity', 'tmdb_score', 'character', 'role'],
      dtype='object')

In [489]:
#handling all the nulls
# note-This step has been sequential 
titles=titles.dropna()

In [490]:
titles.isnull().sum()

index                   0
title                   0
type                    0
description             0
release_year            0
age_certification       0
genres                  0
production_countries    0
imdb_score              0
tmdb_popularity         0
tmdb_score              0
character               0
role                    0
dtype: int64

In [491]:
#selecting relevant features
features=['title','character','genres']

In [492]:
titles[['title', 'type', 'description', 'release_year', 'age_certification',
       'genres', 'production_countries', 'imdb_score', 'tmdb_popularity',
       'tmdb_score', 'character', 'role']].to_csv('titles.csv')

In [493]:
titles.shape

(2750, 13)

In [494]:
#let us combine all the selected features
features_combined=titles['title']+' '+titles['character']+' '+titles['genres']
features_combined

1            Taxi Driver Iris Steensma ['drama', 'crime']
2       Deliverance Tom ['drama', 'action', 'thriller'...
3       Monty Python and the Holy Grail Matthew 'Sport...
5       Monty Python's Flying Circus Wizard ['comedy',...
6       Life of Brian Senator Charles Palantine ['come...
                              ...                        
5798    Convergence: Courage in a Crisis Felix Leiter ...
5800           Stuck Apart Gregg Beam ['comedy', 'drama']
5801    We Are: The Brooklyn Saints Mr. White ['docume...
5819    Alma Matters: Inside the IIT Dream Additional ...
5831      Pitta Kathalu Coach Cotton ['drama', 'romance']
Length: 2750, dtype: object

In [495]:
# We will apply the vectorization to change the text data into vectors
#since this is an instance is not static, which is to say that you'll need to initialize that instance first
vectorizer=TfidfVectorizer()
feature_vectors=vectorizer.fit_transform(features_combined)
print(feature_vectors)


  (0, 1209)	0.17744670360494605
  (0, 1518)	0.10909738959997123
  (0, 4862)	0.5640089963565359
  (0, 2524)	0.5164872317557975
  (0, 1530)	0.3783466611511891
  (0, 5037)	0.47812030120052845
  (1, 1681)	0.30585138326266437
  (1, 5115)	0.1950594435467086
  (1, 93)	0.19694933796296413
  (1, 5165)	0.5901766102747942
  (1, 1358)	0.6811388561346206
  (1, 1518)	0.13175405293078413
  (2, 1095)	0.09441338921647555
  (2, 1742)	0.13679575568679203
  (2, 2310)	0.3985134481775531
  (2, 4807)	0.20308587104471212
  (2, 3244)	0.3985134481775531
  (2, 2090)	0.3985134481775531
  (2, 2353)	0.37887181774527084
  (2, 5091)	0.11398683917143393
  (2, 226)	0.22698151354822027
  (2, 4085)	0.3452942287407319
  (2, 3435)	0.3378268274193724
  (2, 93)	0.11522901546581994
  (3, 5597)	0.4599402436079409
  :	:
  (2747, 723)	0.46459412152242285
  (2747, 287)	0.3863034111951049
  (2747, 5519)	0.34050628149880136
  (2747, 5553)	0.34050628149880136
  (2747, 3476)	0.29265529601622103
  (2747, 1469)	0.16680247454342498
  (2

In [496]:
#finding the similarity score -cosinesimilarity
#create a similarity score instance since it is not static
similarity=cosine_similarity(feature_vectors)
similarity

array([[1.        , 0.01437402, 0.        , ..., 0.        , 0.00824115,
        0.01079452],
       [0.01437402, 1.        , 0.02269428, ..., 0.        , 0.00995261,
        0.01303626],
       [0.        , 0.02269428, 1.        , ..., 0.06323032, 0.01273243,
        0.        ],
       ...,
       [0.        , 0.        , 0.06323032, ..., 1.        , 0.03823078,
        0.        ],
       [0.00824115, 0.00995261, 0.01273243, ..., 0.03823078, 1.        ,
        0.00747416],
       [0.01079452, 0.01303626, 0.        , ..., 0.        , 0.00747416,
        1.        ]])

In [497]:
similarity.shape

(2750, 2750)

In [498]:
#Prompt the user to give a movie title
user_movie_request=input('Enter a movie name: ')

In [504]:
#create a pool of all the movies in the dataset
movies_list=titles['title'].tolist()
movies_list

['Taxi Driver',
 'Deliverance',
 'Monty Python and the Holy Grail',
 "Monty Python's Flying Circus",
 'Life of Brian',
 'Dirty Harry',
 'Bonnie and Clyde',
 'The Blue Lagoon',
 'The Professionals',
 'Richard Pryor: Live in Concert',
 'Hitler: A Career',
 'FTA',
 "Monty Python's Fliegender Zirkus",
 'Seinfeld',
 'Full Metal Jacket',
 'Once Upon a Time in America',
 'When Harry Met Sally...',
 'A Nightmare on Elm Street',
 'Steel Magnolias',
 'Police Academy',
 'Christine',
 'Knight Rider',
 'Thomas & Friends',
 'Saved by the Bell',
 'Awakenings',
 'Wheel of Fortune',
 "National Lampoon's Christmas Vacation",
 'Lean On Me',
 'Eddie Murphy Raw',
 "She's Gotta Have It",
 'Major Dad',
 'Endless Love',
 'Danger Mouse',
 'Sam Kinison: Breaking the Rules',
 'Monty Python Live at the Hollywood Bowl',
 'Survivor',
 'Mission: Impossible',
 'Stargate SG-1',
 'Se7en',
 'Pokémon',
 'Boogie Nights',
 'Forrest Gump',
 'Snatch',
 'The Talented Mr. Ripley',
 'One Piece',
 'The Challenge',
 'Titanic',
 '

In [505]:
#checking if there is a close match for the movie given by the user
close_match=difflib.get_close_matches(user_movie_request,movies_list)
close_match

['Life of Brian', 'Prince of Peoria']

In [506]:
closest_match=close_match[0]
closest_match

'Life of Brian'

In [507]:
#getting movie index based on the user input
movie_index=titles[titles.title==closest_match]['index'].values[0]
movie_index

6

In [510]:
#we will now take the movie index and use similarity score to find the list similar movies
similarity_score=list(enumerate(similarity[movie_index]))
similarity_score


[(0, 0.03786884617096068),
 (1, 0.04057725786803015),
 (2, 0.08003935544261771),
 (3, 0.0),
 (4, 0.0),
 (5, 0.05150415971000298),
 (6, 1.0000000000000002),
 (7, 0.027461480595495406),
 (8, 0.022206427304149873),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.008446961881781024),
 (15, 0.029706163695979823),
 (16, 0.008215496065816378),
 (17, 0.0),
 (18, 0.08884398330542687),
 (19, 0.024807059824944824),
 (20, 0.0),
 (21, 0.05763914630641337),
 (22, 0.03359053707068449),
 (23, 0.0),
 (24, 0.012083912471863363),
 (25, 0.0),
 (26, 0.0),
 (27, 0.01137578228945156),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.010270193418883486),
 (32, 0.0),
 (33, 0.0),
 (34, 0.04889951036641042),
 (35, 0.0),
 (36, 0.021293917664244315),
 (37, 0.028552810767728698),
 (38, 0.03678973892495453),
 (39, 0.02448599037480646),
 (40, 0.010304260369736673),
 (41, 0.007761596004506142),
 (42, 0.027168403517199898),
 (43, 0.02521974059346887),
 (44, 0.03545409621603639),
 (45, 0.00912801255428023)

In [511]:
len(similarity_score)

2750

In [514]:
sorted_movies=sorted(similarity_score,key=lambda x:x[1],reverse=True)
sorted_movies

[(6, 1.0000000000000002),
 (1105, 0.29443214152384484),
 (1173, 0.25646802384923434),
 (157, 0.243893931906158),
 (162, 0.2231015672760182),
 (2194, 0.22154563594174065),
 (1172, 0.2186426359708129),
 (1983, 0.21654401570763324),
 (122, 0.20606043050509035),
 (159, 0.20518168985639718),
 (1848, 0.19982975945748246),
 (1368, 0.19482737607207093),
 (2510, 0.17970035807378104),
 (1792, 0.17581447351124632),
 (149, 0.17246166833123655),
 (942, 0.1518089141768919),
 (837, 0.14635937111178263),
 (943, 0.14264452115616066),
 (457, 0.13858933321188582),
 (219, 0.13692676857599514),
 (2319, 0.13602418922123713),
 (939, 0.13560898159860107),
 (695, 0.13309254044819813),
 (2211, 0.13103680350035338),
 (1587, 0.12862197550472154),
 (2680, 0.12703073647859361),
 (88, 0.12645547516214753),
 (932, 0.12349553389050058),
 (1822, 0.12095462342833399),
 (403, 0.11774496089850017),
 (1343, 0.11622795354849201),
 (2569, 0.11462751011353155),
 (1349, 0.11362912537084902),
 (243, 0.11262699075761917),
 (214,

In [518]:
#suggestions
i=1 
for movie in sorted_movies:
    index=movie[0]
    choose_from=titles[titles.index==index]['title'].values[0]
    if (i<11):
        print(i,'.',choose_from)
        i+=1 

1 . Life of Brian


IndexError: index 0 is out of bounds for axis 0 with size 0