In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

<h2>Preprocesing<h2>

In [2]:
netflix_titles_df = pd.read_csv('netflix_titles.csv')
netflix_titles_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
netflix_titles_df.drop(netflix_titles_df.columns[[0,1,5,6,7,9]], axis=1, inplace=True)

In [4]:
netflix_titles_df.count()

title          8807
director       6173
cast           7982
rating         8803
listed_in      8807
description    8807
dtype: int64

In [5]:
null_rows = len(netflix_titles_df[netflix_titles_df.isna().any(axis=1)])

In [6]:
print(f'Rows with NaNs: {null_rows} ({(null_rows/netflix_titles_df.shape[0])*100:.0f}%)')

Rows with NaNs: 3108 (35%)


In [7]:
netflix_titles_df.fillna('', inplace=True)

In [8]:
netflix_titles_df.head()

Unnamed: 0,title,director,cast,rating,listed_in,description
0,Dick Johnson Is Dead,Kirsten Johnson,,PG-13,Documentaries,"As her father nears the end of his life, filmm..."
1,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,Jailbirds New Orleans,,,TV-MA,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",TV-MA,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [9]:
netflix_titles_df[['director','cast']] = netflix_titles_df[['director','cast']].applymap(lambda x: ' '.join(x.replace(' ', '').split(',')[:3]))
netflix_titles_df.head()

Unnamed: 0,title,director,cast,rating,listed_in,description
0,Dick Johnson Is Dead,KirstenJohnson,,PG-13,Documentaries,"As her father nears the end of his life, filmm..."
1,Blood & Water,,AmaQamata KhosiNgema GailMabalane,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,Ganglands,JulienLeclercq,SamiBouajila TracyGotoas SamuelJouy,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,Jailbirds New Orleans,,,TV-MA,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,Kota Factory,,MayurMore JitendraKumar RanjanRaj,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [10]:
netflix_titles_df['title_dup'] = netflix_titles_df['title']
titles_corpus = netflix_titles_df.apply(' '.join, axis=1)

In [11]:
titles_corpus.head

<bound method NDFrame.head of 0       Dick Johnson Is Dead KirstenJohnson  PG-13 Doc...
1       Blood & Water  AmaQamata KhosiNgema GailMabala...
2       Ganglands JulienLeclercq SamiBouajila TracyGot...
3       Jailbirds New Orleans   TV-MA Docuseries, Real...
4       Kota Factory  MayurMore JitendraKumar RanjanRa...
                              ...                        
8802    Zodiac DavidFincher MarkRuffalo JakeGyllenhaal...
8803    Zombie Dumb   TV-Y7 Kids' TV, Korean TV Shows,...
8804    Zombieland RubenFleischer JesseEisenberg Woody...
8805    Zoom PeterHewitt TimAllen CourteneyCox ChevyCh...
8806    Zubaan MozezSingh VickyKaushal Sarah-JaneDias ...
Length: 8807, dtype: object>

In [12]:
tfidf_vectorizer_params = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 3), max_df = .5)

<h2>Text Vectorization <h2>

In [13]:
tfidf_vectorizer = tfidf_vectorizer_params.fit_transform(titles_corpus)

In [15]:
pd.DataFrame(tfidf_vectorizer.toarray(), columns=tfidf_vectorizer_params.get_feature_names_out())

Unnamed: 0,000,000 afghans,000 afghans 21,000 chance,000 chance big,000 circle,000 counting,000 counting life,000 euro,000 euro prize,...,영웅의 탄생 youngjunlee,잡는다,잡는다 hong,잡는다 hong seonkim,최강전사,최강전사 미니특공대,최강전사 미니특공대 영웅의,탄생,탄생 youngjunlee,탄생 youngjunlee umsang
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
pickle.dump(tfidf_vectorizer, open('tfidf_vectorizer.pickle', 'wb'))

In [17]:
vects_cos_sim = cosine_similarity(tfidf_vectorizer, tfidf_vectorizer)

In [18]:
pd.DataFrame(data=vects_cos_sim, index=netflix_titles_df['title'], columns=netflix_titles_df['title']).head()

title,Dick Johnson Is Dead,Blood & Water,Ganglands,Jailbirds New Orleans,Kota Factory,Midnight Mass,My Little Pony: A New Generation,Sankofa,The Great British Baking Show,The Starling,...,Zak Storm,Zed Plus,Zenda,Zindagi Gulzar Hai,Zinzana,Zodiac,Zombie Dumb,Zombieland,Zoom,Zubaan
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dick Johnson Is Dead,1.0,0.0,0.0,0.0,0.001775,0.0,0.001066,0.0,0.004502,0.008746,...,0.0,0.0,0.0,0.001156,0.0,0.0,0.0,0.0,0.003253,0.0
Blood & Water,0.0,1.0,0.013853,0.001383,0.021823,0.031228,0.005726,0.002863,0.002278,0.000785,...,0.003435,0.006547,0.014951,0.016674,0.007021,0.000742,0.011447,0.0,0.0,0.001377
Ganglands,0.0,0.013853,1.0,0.001379,0.017773,0.001354,0.001378,0.002094,0.004543,0.0,...,0.0,0.001961,0.000633,0.016028,0.002103,0.002475,0.009696,0.0,0.001679,0.002284
Jailbirds New Orleans,0.0,0.001383,0.001379,1.0,0.001309,0.001266,0.010838,0.001416,0.00907,0.0,...,0.0,0.001326,0.0,0.0,0.001422,0.0,0.0,0.0,0.006606,0.0
Kota Factory,0.001775,0.021823,0.017773,0.001309,1.0,0.001285,0.0,0.001987,0.00431,0.002997,...,0.0,0.00279,0.000601,0.031513,0.001995,0.0,0.016576,0.005647,0.010199,0.000563
