In [1]:
#!/bin/bash
#curl -L -o ./netflix-shows.zip https://www.kaggle.com/api/v1/datasets/download/shivamb/netflix-shows

##### - Load the dataset 
##### - view the dataframe
##### - Select useful columns required for the task
##### - Data preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
netflix = pd.read_csv("/ds/text/netflix/netflix_titles.csv")

In [4]:
netflix.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [5]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [6]:
netflix.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [7]:
netflix['country'].nunique()

748

In [8]:
shows = netflix.loc[netflix['type']=='TV Show',['title','cast','country','listed_in', 'description']]
movies = netflix.loc[netflix['type']=='Movie',['title','director','cast','country','listed_in', 'description']]

In [9]:
shows.isnull().sum()

title            0
cast           350
country        391
listed_in        0
description      0
dtype: int64

In [10]:
shows.dropna(inplace=True)

In [11]:
shows.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2024 entries, 1 to 8800
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        2024 non-null   object
 1   cast         2024 non-null   object
 2   country      2024 non-null   object
 3   listed_in    2024 non-null   object
 4   description  2024 non-null   object
dtypes: object(5)
memory usage: 94.9+ KB


In [12]:
shows.duplicated().sum()

np.int64(0)

In [13]:
movies.isnull().sum()

title            0
director       188
cast           475
country        440
listed_in        0
description      0
dtype: int64

In [14]:
movies.duplicated().sum()

np.int64(0)

In [15]:
movies.dropna(inplace=True)

In [16]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5189 entries, 7 to 8806
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        5189 non-null   object
 1   director     5189 non-null   object
 2   cast         5189 non-null   object
 3   country      5189 non-null   object
 4   listed_in    5189 non-null   object
 5   description  5189 non-null   object
dtypes: object(6)
memory usage: 283.8+ KB


In [17]:
shows['description'] = shows['description'].apply(lambda x:x.split())

In [18]:
shows['description']

1       [After, crossing, paths, at, a, party,, a, Cap...
4       [In, a, city, of, coaching, centers, known, to...
8       [A, talented, batch, of, amateur, bakers, face...
15      [Students, of, color, navigate, the, daily, sl...
17      [Strangers, Diego, and, Isabel, flee, their, h...
                              ...                        
8780    [Nate, frees, a, mythical, being, trapped, in,...
8795    [Now, that, he's, discovered, the, Pendulum, S...
8796    [During, the, Mongol, invasions,, Yunus, Emre,...
8797    [Teen, surfer, Zak, Storm, is, mysteriously, t...
8800    [Strong-willed,, middle-class, Kashaf, and, ca...
Name: description, Length: 2024, dtype: object

In [19]:
movies['description']= movies['description'].apply(lambda x:x.split())
movies['description']

7       [On, a, photo, shoot, in, Ghana,, an, American...
9       [A, woman, adjusting, to, life, after, a, loss...
12      [After, most, of, her, family, is, murdered, i...
24      [When, the, father, of, the, man, she, loves, ...
27      [Mourning, the, loss, of, their, beloved, juni...
                              ...                        
8801    [Recovering, alcoholic, Talal, wakes, up, insi...
8802    [A, political, cartoonist,, a, crime, reporter...
8804    [Looking, to, survive, in, a, world, taken, ov...
8805    [Dragged, from, civilian, life,, a, former, su...
8806    [A, scrappy, but, poor, boy, worms, his, way, ...
Name: description, Length: 5189, dtype: object

In [20]:
movies.head(2)

Unnamed: 0,title,director,cast,country,listed_in,description
7,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","Dramas, Independent Movies, International Movies","[On, a, photo, shoot, in, Ghana,, an, American..."
9,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"Comedies, Dramas","[A, woman, adjusting, to, life, after, a, loss..."


In [21]:
shows.head(2)

Unnamed: 0,title,cast,country,listed_in,description
1,Blood & Water,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"International TV Shows, TV Dramas, TV Mysteries","[After, crossing, paths, at, a, party,, a, Cap..."
4,Kota Factory,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"International TV Shows, Romantic TV Shows, TV ...","[In, a, city, of, coaching, centers, known, to..."


In [22]:
shows.iloc[5,1]

'Engin Altan Düzyatan, Serdar Gökhan, Hülya Darcan, Kaan Taşaner, Esra Bilgiç, Osman Soykut, Serdar Deniz, Cengiz Coşkun, Reshad Strik, Hande Subaşı'

In [23]:
shows['cast'] = shows['cast'].apply(lambda x: x.strip().split(","))

In [24]:
shows['cast'] = shows['cast'].apply(lambda x:[i.replace(" ", "") for i in x])

In [25]:
shows['cast'].apply(lambda x:x[:5])

1       [AmaQamata, KhosiNgema, GailMabalane, ThabangM...
4       [MayurMore, JitendraKumar, RanjanRaj, AlamKhan...
8       [MelGiedroyc, SuePerkins, MaryBerry, PaulHolly...
15      [LoganBrowning, BrandonP.Bell, DeRonHorton, An...
17      [LuisErnestoFranco, CamilaSodi, SergioGoyri, S...
                              ...                        
8780    [JohnnyYongBosch, J.W.Terry, AlicynPackard, Me...
8795    [MikeLiscio, EmilyBauer, BillyBobThompson, Aly...
8796    [GökhanAtalay, PayidarTüfekçioglu, BaranAkbulu...
8797    [MichaelJohnston, JessicaGee-George, Christine...
8800    [SanamSaeed, FawadKhan, AyeshaOmer, MehreenRah...
Name: cast, Length: 2024, dtype: object

In [26]:
shows['cast'] = shows['cast'].apply(lambda x:x[:5])

In [27]:
shows['listed_in']= shows['listed_in'].apply(lambda x: x.strip().split(","))

In [28]:
shows['listed_in'] = shows['listed_in'].apply(lambda x:[i.replace(" ", "") for i in x])

In [29]:
shows['listed_in'][1]

['InternationalTVShows', 'TVDramas', 'TVMysteries']

In [30]:
movies['director'].apply(lambda x:x.replace(" ", ""))

7              HaileGerima
9            TheodoreMelfi
12      ChristianSchwochow
24               S.Shankar
27             DennisDugan
               ...        
8801         MajidAlAnsari
8802          DavidFincher
8804        RubenFleischer
8805           PeterHewitt
8806            MozezSingh
Name: director, Length: 5189, dtype: object

In [31]:
movies['director'] = movies['director'].apply(lambda x:x.replace(" ", ""))

In [32]:
x="fan"
a = ["".join(x)]

In [33]:
a

['fan']

In [34]:
movies['director'] = movies['director'].apply(lambda x:["".join(x)])

In [35]:
movies['director']

7              [HaileGerima]
9            [TheodoreMelfi]
12      [ChristianSchwochow]
24               [S.Shankar]
27             [DennisDugan]
                ...         
8801         [MajidAlAnsari]
8802          [DavidFincher]
8804        [RubenFleischer]
8805           [PeterHewitt]
8806            [MozezSingh]
Name: director, Length: 5189, dtype: object

In [36]:
movies['cast'] = movies['cast'].apply(lambda x: x.strip().split(","))

In [37]:
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ", "") for i in x])

In [38]:
movies['cast'] = movies['cast'].apply(lambda x:x[:5])

In [39]:
movies['listed_in']= movies['listed_in'].apply(lambda x: x.strip().split(","))
movies['listed_in'] = movies['listed_in'].apply(lambda x:[i.replace(" ", "") for i in x])

In [40]:
movies['listed_in']

7       [Dramas, IndependentMovies, InternationalMovies]
9                                     [Comedies, Dramas]
12                         [Dramas, InternationalMovies]
24       [Comedies, InternationalMovies, RomanticMovies]
27                                            [Comedies]
                              ...                       
8801            [Dramas, InternationalMovies, Thrillers]
8802                     [CultMovies, Dramas, Thrillers]
8804                            [Comedies, HorrorMovies]
8805                   [Children&FamilyMovies, Comedies]
8806       [Dramas, InternationalMovies, Music&Musicals]
Name: listed_in, Length: 5189, dtype: object

In [41]:
shows.head(2)

Unnamed: 0,title,cast,country,listed_in,description
1,Blood & Water,"[AmaQamata, KhosiNgema, GailMabalane, ThabangM...",South Africa,"[InternationalTVShows, TVDramas, TVMysteries]","[After, crossing, paths, at, a, party,, a, Cap..."
4,Kota Factory,"[MayurMore, JitendraKumar, RanjanRaj, AlamKhan...",India,"[InternationalTVShows, RomanticTVShows, TVCome...","[In, a, city, of, coaching, centers, known, to..."


In [42]:
movies.head(2)

Unnamed: 0,title,director,cast,country,listed_in,description
7,Sankofa,[HaileGerima],"[KofiGhanaba, OyafunmikeOgunlano, AlexandraDua...","United States, Ghana, Burkina Faso, United Kin...","[Dramas, IndependentMovies, InternationalMovies]","[On, a, photo, shoot, in, Ghana,, an, American..."
9,The Starling,[TheodoreMelfi],"[MelissaMcCarthy, ChrisO'Dowd, KevinKline, Tim...",United States,"[Comedies, Dramas]","[A, woman, adjusting, to, life, after, a, loss..."


In [43]:
shows['tags'] = shows['description'] + shows['listed_in'] + shows['cast']
movies['tags'] = movies['description'] + movies['listed_in'] + movies['director'] + movies['cast']

In [44]:
movies.head()

Unnamed: 0,title,director,cast,country,listed_in,description,tags
7,Sankofa,[HaileGerima],"[KofiGhanaba, OyafunmikeOgunlano, AlexandraDua...","United States, Ghana, Burkina Faso, United Kin...","[Dramas, IndependentMovies, InternationalMovies]","[On, a, photo, shoot, in, Ghana,, an, American...","[On, a, photo, shoot, in, Ghana,, an, American..."
9,The Starling,[TheodoreMelfi],"[MelissaMcCarthy, ChrisO'Dowd, KevinKline, Tim...",United States,"[Comedies, Dramas]","[A, woman, adjusting, to, life, after, a, loss...","[A, woman, adjusting, to, life, after, a, loss..."
12,Je Suis Karl,[ChristianSchwochow],"[LunaWedler, JannisNiewöhner, MilanPeschel, Ed...","Germany, Czech Republic","[Dramas, InternationalMovies]","[After, most, of, her, family, is, murdered, i...","[After, most, of, her, family, is, murdered, i..."
24,Jeans,[S.Shankar],"[Prashanth, AishwaryaRaiBachchan, SriLakshmi, ...",India,"[Comedies, InternationalMovies, RomanticMovies]","[When, the, father, of, the, man, she, loves, ...","[When, the, father, of, the, man, she, loves, ..."
27,Grown Ups,[DennisDugan],"[AdamSandler, KevinJames, ChrisRock, DavidSpad...",United States,[Comedies],"[Mourning, the, loss, of, their, beloved, juni...","[Mourning, the, loss, of, their, beloved, juni..."


In [45]:
shows['tags'] = shows['tags'].apply(lambda x : " ".join(x))

In [46]:
shows['tags']

1       After crossing paths at a party, a Cape Town t...
4       In a city of coaching centers known to train I...
8       A talented batch of amateur bakers face off in...
15      Students of color navigate the daily slights a...
17      Strangers Diego and Isabel flee their home in ...
                              ...                        
8780    Nate frees a mythical being trapped in a magic...
8795    Now that he's discovered the Pendulum Summonin...
8796    During the Mongol invasions, Yunus Emre leaves...
8797    Teen surfer Zak Storm is mysteriously transpor...
8800    Strong-willed, middle-class Kashaf and carefre...
Name: tags, Length: 2024, dtype: object

In [47]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [48]:
movies['tags']

7       On a photo shoot in Ghana, an American model s...
9       A woman adjusting to life after a loss contend...
12      After most of her family is murdered in a terr...
24      When the father of the man she loves insists t...
27      Mourning the loss of their beloved junior high...
                              ...                        
8801    Recovering alcoholic Talal wakes up inside a s...
8802    A political cartoonist, a crime reporter and a...
8804    Looking to survive in a world taken over by zo...
8805    Dragged from civilian life, a former superhero...
8806    A scrappy but poor boy worms his way into a ty...
Name: tags, Length: 5189, dtype: object

In [49]:
shows_p = shows[['title', 'tags']]

In [50]:
shows_p.head(2)

Unnamed: 0,title,tags
1,Blood & Water,"After crossing paths at a party, a Cape Town t..."
4,Kota Factory,In a city of coaching centers known to train I...


In [51]:
movies_p = movies[['title', 'tags']]

In [52]:
movies_p.head(2)

Unnamed: 0,title,tags
7,Sankofa,"On a photo shoot in Ghana, an American model s..."
9,The Starling,A woman adjusting to life after a loss contend...


In [53]:
movies_p.iloc[2,1]

'After most of her family is murdered in a terrorist bombing, a young woman is unknowingly lured into joining the very group that killed them. Dramas InternationalMovies ChristianSchwochow LunaWedler JannisNiewöhner MilanPeschel EdinHasanović AnnaFialová'

In [54]:
shows_p['tags'] = shows_p['tags'].apply(lambda x:x.lower())
movies_p['tags'] = movies_p['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shows_p['tags'] = shows_p['tags'].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_p['tags'] = movies_p['tags'].apply(lambda x:x.lower())


In [55]:
shows_p.head(2)

Unnamed: 0,title,tags
1,Blood & Water,"after crossing paths at a party, a cape town t..."
4,Kota Factory,in a city of coaching centers known to train i...


## Stemming and Vectorization

In [56]:
import nltk

In [57]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [58]:
def stem(text):
    y = []
    for i in text.split(): 
        y.append(ps.stem(i))
    return " ".join(y)

In [59]:
print(stem("loving loved lovely unique"))

love love love uniqu


In [60]:
shows_p['tags'] = shows_p['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shows_p['tags'] = shows_p['tags'].apply(stem)


In [61]:
movies_p['tags'] = movies_p['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_p['tags'] = movies_p['tags'].apply(stem)


In [62]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [63]:
cv.fit_transform(shows_p['tags']).toarray().shape

(2024, 5000)

In [64]:
shows_vectors = cv.fit_transform(shows_p['tags']).toarray()
movies_vectors = cv.fit_transform(movies_p['tags']).toarray()

In [65]:
movies_vectors[45]

array([0, 0, 0, ..., 0, 0, 0], shape=(5000,))

In [66]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zoyaakhtar', 'álvarocervant',
       'özgeborak'], shape=(5000,), dtype=object)

## Similarity between vectors - Cosine Similarity

In [67]:
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
cosine_similarity(shows_vectors).shape

(2024, 2024)

In [69]:
cosine_similarity(movies_vectors).shape

(5189, 5189)

In [70]:
shows_similarity = cosine_similarity(shows_vectors)
movies_similarity = cosine_similarity(movies_vectors)

In [71]:
shows_similarity.shape

(2024, 2024)

## Shows Recommendation

In [72]:
shows_p = shows_p.reset_index(drop=True)

In [73]:
shows_p.index = shows_p.index + 1 

In [74]:
shows_p

Unnamed: 0,title,tags
1,Blood & Water,"after cross path at a party, a cape town teen ..."
2,Kota Factory,in a citi of coach center known to train india...
3,The Great British Baking Show,a talent batch of amateur baker face off in a ...
4,Dear White People,student of color navig the daili slight and sl...
5,Falsa identidad,stranger diego and isabel flee their home in m...
...,...,...
2020,Yo-Kai Watch,"nate free a mythic be trap in a magic capsule,..."
2021,Yu-Gi-Oh! Arc-V,now that he' discov the pendulum summon techni...
2022,Yunus Emre,"dure the mongol invasions, yunu emr leav hi ho..."
2023,Zak Storm,teen surfer zak storm is mysteri transport to ...


In [75]:
shows_p[shows_p['title'] == 'Zak Storm'].index

Index([2023], dtype='int64')

In [76]:
sorted(list(enumerate(shows_similarity[2])), reverse=True, key=lambda x: x[1])[:6]

[(2, np.float64(1.0000000000000002)),
 (1114, np.float64(0.2564945880212885)),
 (1193, np.float64(0.22360679774997896)),
 (693, np.float64(0.21693045781865616)),
 (1959, np.float64(0.21693045781865616)),
 (1661, np.float64(0.2051956704170308))]

In [77]:
sorted(list(enumerate(shows_similarity[2])), reverse=True, key=lambda x: x[1])[:6][2][0]

1193

In [78]:
shows_p.iloc[1193]['title']

'The Final Table'

In [79]:
def recommend(show_name):
    show_index = shows_p[shows_p['title'] == show_name].index[0]
    distances_array = shows_similarity[show_index]
    similar_5_shows = sorted(list(enumerate(distances_array)), reverse=True, key=lambda x: x[1])[:6]
    for idx in similar_5_shows:
        print(shows_p.iloc[idx[0]]['title'])

In [80]:
recommend("Zak Storm")

Zindagi Gulzar Hai
Love Me As I Am
Humsafar
Eternal Love
Intersection
Meteor Garden


## Movie Recommendation

In [81]:
movies_p = movies_p.reset_index(drop=True)
movies_p.index = movies_p.index + 1

In [82]:
movies_p

Unnamed: 0,title,tags
1,Sankofa,"on a photo shoot in ghana, an american model s..."
2,The Starling,a woman adjust to life after a loss contend wi...
3,Je Suis Karl,after most of her famili is murder in a terror...
4,Jeans,when the father of the man she love insist tha...
5,Grown Ups,mourn the loss of their belov junior high bask...
...,...,...
5185,Zinzana,recov alcohol talal wake up insid a small-town...
5186,Zodiac,"a polit cartoonist, a crime report and a pair ..."
5187,Zombieland,look to surviv in a world taken over by zombie...
5188,Zoom,"drag from civilian life, a former superhero mu..."


In [83]:
movies_p[movies_p['title'] == 'Zoom'].index[0]

np.int64(5188)

In [85]:
def recommend(movie_name):
    movie_index = movies_p[movies_p['title'] == movie_name].index[0]
    distances_array = movies_similarity[movie_index]
    similar_5_movies = sorted(list(enumerate(distances_array)), reverse= True, key=lambda x:x[1])[1:6]
    for idx in similar_5_movies:
        print(movies_p.iloc[idx[0]]['title'])

In [87]:
recommend("Kuch Kuch Hota Hai")

The Player
Night of Knots
Just Say Yes
Christmas Wedding Planner
Baby Dolls
