In [None]:
import numpy as np

import pandas as pd

import difflib

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Step 1: Load Data
# Read the titles CSV; the relative path means the file is expected in the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')

In [None]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(8807, 12)

In [None]:
# List the column names available for feature selection.
print(df.columns)

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [None]:
# Step 2: Feature Selection
# Columns used as text features: their missing values are filled in Step 3 and
# the same columns are concatenated into one string per title in Step 4.
selected_features = [
    'listed_in',
    'rating',
    'cast',
    'director',
    'description',
    'type',
]

print(selected_features)

['listed_in', 'rating', 'cast', 'director', 'description', 'type']


In [None]:
# Step 3: Handle Missing Values
# Replace NaN with an empty string in every selected column in a single
# assignment, so the string concatenation in Step 4 never meets a missing value.
df[selected_features] = df[selected_features].fillna('')

In [None]:
# Step 4: Combine Selected Features into a Single Text Column
# Concatenate the columns left to right, separated by single spaces, producing
# one text document per row.
feature_order = ('listed_in', 'rating', 'cast', 'director', 'description', 'type')

combined_features = df[feature_order[0]]
for column in feature_order[1:]:
    combined_features = combined_features + ' ' + df[column]

In [None]:
# Inspect the combined text; pandas abbreviates long Series when printing.
print(combined_features)

0       Documentaries PG-13  Kirsten Johnson As her fa...
1       International TV Shows, TV Dramas, TV Mysterie...
2       Crime TV Shows, International TV Shows, TV Act...
3       Docuseries, Reality TV TV-MA   Feuds, flirtati...
4       International TV Shows, Romantic TV Shows, TV ...
                              ...                        
8802    Cult Movies, Dramas, Thrillers R Mark Ruffalo,...
8803    Kids' TV, Korean TV Shows, TV Comedies TV-Y7  ...
8804    Comedies, Horror Movies R Jesse Eisenberg, Woo...
8805    Children & Family Movies, Comedies PG Tim Alle...
8806    Dramas, International Movies, Music & Musicals...
Length: 8807, dtype: object


In [None]:
# Step 5: Convert Text Data to Feature Vectors
# TF-IDF vectorizer created with library defaults (no arguments are overridden).


vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the combined text and transform it in one call;
# the result has one row per entry of combined_features.
feature_vectors = vectorizer.fit_transform(combined_features)

In [None]:
# Printed as "(row, column)  weight" entries, one per stored non-zero value.
print(feature_vectors)

  (0, 12266)	0.10354748610427798
  (0, 33973)	0.08378366832291632
  (0, 14)	0.12043666356437348
  (0, 23862)	0.46108464740854765
  (0, 22241)	0.332135232121077
  (0, 3050)	0.09468071336714876
  (0, 19274)	0.08621907905244168
  (0, 14934)	0.1354747321458056
  (0, 30901)	0.25436562291944753
  (0, 44673)	0.0930576488749591
  (0, 13777)	0.18669226257481034
  (0, 32024)	0.05456434815115673
  (0, 19599)	0.14107388126062162
  (0, 25733)	0.1049402372533327
  (0, 15312)	0.19108663322218225
  (0, 42321)	0.23269893234429126
  (0, 10956)	0.15134294669039386
  (0, 20686)	0.05790016690494484
  (0, 21130)	0.25918413069880414
  (0, 2139)	0.04882041596401707
  (0, 9338)	0.25918413069880414
  (0, 48048)	0.18936555119397866
  (0, 45132)	0.04943642742415164
  (0, 19178)	0.13721460267433158
  (0, 44700)	0.13587548669973534
  :	:
  (8806, 6269)	0.11374878628023477
  (8806, 453)	0.09792562008181219
  (8806, 23189)	0.15020277854681752
  (8806, 27086)	0.14618178156530662
  (8806, 14643)	0.15585634770278456
  (

In [None]:
# Step 6: Calculate Cosine Similarity
# Pairwise cosine similarity between every pair of rows of feature_vectors.
# NOTE(review): the result is a dense square array (8807 x 8807 for this data),
# roughly 600 MB if the dtype is float64 - confirm this fits in memory.


similarity_matrix = cosine_similarity(feature_vectors)

In [None]:
# NumPy abbreviates large arrays when printing, so only the corners are shown.
print(similarity_matrix)

[[1.         0.00507153 0.02009151 ... 0.01063627 0.02078336 0.03070197]
 [0.00507153 1.         0.0215393  ... 0.00102913 0.         0.00540465]
 [0.02009151 0.0215393  1.         ... 0.00557927 0.01031976 0.03403668]
 ...
 [0.01063627 0.00102913 0.00557927 ... 1.         0.05670707 0.00602502]
 [0.02078336 0.         0.01031976 ... 0.05670707 1.         0.01042238]
 [0.03070197 0.00540465 0.03403668 ... 0.00602502 0.01042238 1.        ]]


In [None]:
# Sanity check: the matrix should be square, one row and one column per title.
print(similarity_matrix.shape)

(8807, 8807)


In [None]:
# Step 7: Create a List of Content Titles
# Titles in DataFrame row order; this list is the candidate pool for the fuzzy
# title matching in Step 9.
list_of_all_titles = df['title'].tolist()

# Show a short preview and the total count instead of dumping every title
# into the notebook output.
print(list_of_all_titles[:10])
print(f'... {len(list_of_all_titles)} titles in total')



In [None]:
# Step 8: User Input for Content Recommendation
# Blocks until the user types a title. The raw string is fuzzy-matched against
# the catalogue titles in Step 9, so the spelling does not have to be exact.


content_name = input(' Enter your favourite Content name : ')

 Enter your favourite Content name : SparkingJoy


In [None]:
# Step 9: Find Closest Matching Title
# Fuzzy-match the typed name against all titles. n and cutoff are spelled out
# with difflib's default values: at most 3 matches, similarity ratio >= 0.6,
# best match first.
find_close_match = difflib.get_close_matches(
    word=content_name,
    possibilities=list_of_all_titles,
    n=3,
    cutoff=0.6,
)

print(find_close_match)

['Sparking Joy', 'Sparring', 'Spark']


In [None]:
# difflib returns an empty list when nothing clears its similarity cutoff.
# Fail with a clear message instead of a bare IndexError from indexing [0].
if not find_close_match:
    raise ValueError('No title close to the entered name was found; try a different name.')

# Matches are ordered best-first, so the first entry is the closest title.
close_match = find_close_match[0]

print(close_match)

Sparking Joy


In [None]:
# Step 10: Get Index of the Selected Content
# NOTE: despite its name, this variable holds the show_id string (e.g. 's185')
# of the first row whose title equals the matched title. The next cell
# overwrites it with the positional row index.
title_matches = df['title'] == close_match
index_of_the_content = df.loc[title_matches, 'show_id'].iloc[0]

print(index_of_the_content)

s185


In [None]:
# Map every show_id to its 0-based row position in df; that position is what
# Step 11 uses to pick a row of similarity_matrix.
index_mapping = dict(zip(df['show_id'], range(len(df))))

# show_id of the first row whose title equals the matched title
alphanumeric_index = df.loc[df['title'] == close_match, 'show_id'].iloc[0]

# Positional (numeric) index of that row
index_of_the_content = index_mapping[alphanumeric_index]

print(f"The alphanumeric index {alphanumeric_index} corresponds to the numerical index : {index_of_the_content}")

The alphanumeric index s185 corresponds to the numerical index : 184


In [None]:
# Step 11: Get Similarity Score
# Materialise the show_ids once, in mapping (row) order. Rebuilding
# list(index_mapping.keys()) for every element of the row would copy the whole
# key list once per title, making this step quadratic in the catalogue size.
show_ids_by_position = list(index_mapping)

# Pair each show_id with its similarity to the selected content.
similarity_score = [
    (show_ids_by_position[position], score)
    for position, score in enumerate(similarity_matrix[index_of_the_content])
]

# Preview only; the full list has one (show_id, score) tuple per title.
print(similarity_score[:10])

[('s1', 0.0455660207349408), ('s2', 0.02336792379438231), ('s3', 0.026854830099765078), ('s4', 0.12137845662979244), ('s5', 0.03761959312621416), ('s6', 0.043123491222605614), ('s7', 0.01805506583428667), ('s8', 0.024367223309115782), ('s9', 0.05572638008080594), ('s10', 0.056257652353698395), ('s11', 0.027018412038431), ('s12', 0.03091320145584698), ('s13', 0.014912437464009859), ('s14', 0.022397892558854448), ('s15', 0.027766000558391263), ('s16', 0.02757995474262674), ('s17', 0.03888475867586173), ('s18', 0.05744918379548933), ('s19', 0.019881671735961046), ('s20', 0.02888005474689522), ('s21', 0.024373090964876068), ('s22', 0.0211129023120691), ('s23', 0.03135875696752318), ('s24', 0.043929565638130295), ('s25', 0.0036122899499373574), ('s26', 0.07537146902065424), ('s27', 0.03447073497396331), ('s28', 0.008803444378428652), ('s29', 0.03359271720792005), ('s30', 0.012719874288569069), ('s31', 0.019330478310734048), ('s32', 0.02583826608681506), ('s33', 0.025256826066616778), ('s34'

In [None]:
# One (show_id, score) pair per title is expected here.
len(similarity_score)

8807

In [None]:
# Step 12: Sort Content Based on Similarity Score
# Highest similarity first. sorted() is stable, so entries with equal scores
# keep their original relative order.
sorted_similar_content = sorted(similarity_score, key=lambda pair: pair[1], reverse=True)

# Preview the top of the ranking instead of printing every tuple.
print(sorted_similar_content[:10])

[('s185', 1.0), ('s4228', 0.535634449523322), ('s958', 0.18374226851690426), ('s1147', 0.1834153014147558), ('s467', 0.1750666398033522), ('s4409', 0.1740126024021505), ('s1095', 0.16786335068812872), ('s7868', 0.1672318769809983), ('s6361', 0.1595621390103405), ('s1727', 0.15884658580273903), ('s2407', 0.155217709880969), ('s1624', 0.1540576877409653), ('s75', 0.14567418578561836), ('s4808', 0.1426067983306879), ('s7436', 0.1424933280718202), ('s3054', 0.14177372638546815), ('s5300', 0.14125553063420115), ('s1314', 0.1406574308714688), ('s8606', 0.14048233523066828), ('s6764', 0.13745381617971913), ('s6379', 0.13611572749431902), ('s2570', 0.13377059779904246), ('s5576', 0.13132335779021845), ('s6619', 0.12932600258437665), ('s8244', 0.1274181620228366), ('s2113', 0.12586024828993628), ('s6895', 0.12548844286991956), ('s1401', 0.12403411456376719), ('s4', 0.12137845662979244), ('s1128', 0.12040058984067464), ('s871', 0.11905114058832636), ('s7437', 0.11863115220605178), ('s2923', 0.11

In [None]:
# Step 13: Display Recommended Content
# Number of recommendations to print.
TOP_N = 30

print('Content suggested for you : \n')

# Only the first TOP_N entries are ever printed, so slice before looping rather
# than running a DataFrame lookup for every title in the catalogue.
# Loop-local names are used so the notebook-level alphanumeric_index is not
# overwritten. The top-ranked entry is normally the selected title itself
# (similarity 1.0) and is kept in the list.
for rank, (show_id, _score) in enumerate(sorted_similar_content[:TOP_N], start=1):
  title_from_index = df.loc[df['show_id'] == show_id, 'title'].values[0]
  print(rank, '.', title_from_index)

Content suggested for you : 

1 . Sparking Joy
2 . Tidying Up with Marie Kondo
3 . Pet Stars
4 . Haunted: Latin America
5 . My Unorthodox Life
6 . Westside
7 . The Wedding Coach
8 . Rica, Famosa, Latina
9 . Border Patrol
10 . Country Ever After
11 . Lenox Hill
12 . The Bachelorette
13 . The World's Most Amazing Vacation Rentals
14 . Churchill’s Secret Agents: The New Recruits
15 . Mega Food
16 . Border Security: America's Front Line
17 . High Risk
18 . Buried by the Bernards
19 . Top Grier
20 . Fit for Fashion
21 . Bringing Sexy Back
22 . Hoarders
23 . Cheer Squad
24 . Diva Brides
25 . The Chefs' Line
26 . DeMarcus Family Rules
27 . Great Interior Design Challenge
28 . Bling Empire
29 . Jailbirds New Orleans
30 . Prank Encounters
