In [486]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [487]:
data = pd.read_csv('netflix_titles.csv')
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [488]:
data.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [489]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [490]:
def clean_data(data):
    data.drop(['date_added'], axis=1, inplace=True)
    data.drop(['release_year'], axis=1, inplace=True)
    data.drop(['duration'], axis=1, inplace=True)
    data.drop(['show_id'],axis=1, inplace=True)

In [491]:
clean_data(data)

In [492]:
data.isna().sum()

type              0
title             0
director       2634
cast            825
country         831
rating            4
listed_in         0
description       0
dtype: int64

In [493]:
# Let's make a recommendation system based on the plot of the movie which is saved in data['description']

In [494]:
# Import TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [495]:
#remove stopwords like a and the
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['description'])
tfidf_matrix.shape

(8807, 18895)

In [496]:
#Use cosine similarity to make recommendations
from sklearn.metrics.pairwise import linear_kernel

In [497]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [498]:
indices = pd.Series(data.index, index=data['title']).drop_duplicates()

In [499]:
indices

title
Dick Johnson Is Dead        0
Blood & Water               1
Ganglands                   2
Jailbirds New Orleans       3
Kota Factory                4
                         ... 
Zodiac                   8802
Zombie Dumb              8803
Zombieland               8804
Zoom                     8805
Zubaan                   8806
Length: 8807, dtype: int64

In [500]:
def get_recommendations(title, cosine_sim=cosine_sim):
    #get index from title
    idx = indices[title]
    
    #get similarity scores from title to other movies titles
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    #sort list based on sim_score
    sim_scores = sorted(sim_scores,key = lambda x: x[1],reverse=True)
    
    #just use first 10 scores
    sim_scores = sim_scores[1:11]
    print(sim_scores)
    
    movie_indices = [i[0] for i in sim_scores]
    
    return data.title.iloc[movie_indices]

In [501]:
get_recommendations('Stranger Things')

[(4733, 0.3177805309398409), (1240, 0.23161914688421315), (1487, 0.20254828101828218), (8198, 0.20251565797382817), (2419, 0.19699498020746717), (6518, 0.1763899880381364), (6760, 0.16949571995688034), (8026, 0.15240044635798838), (4201, 0.15223118720194836), (1270, 0.14984708684090647)]


4733               Rowdy Rathore
1240                  Safe Haven
1487             Sakho & Mangane
8198     The Autopsy of Jane Doe
2419               Big Stone Gap
6518            Come and Find Me
6760                   FirstBorn
8026             Sinister Circle
4201                 Hardy Bucks
1270    Sin senos sí hay paraíso
Name: title, dtype: object

In [502]:
#Let's make a recommendation system based on the cast, director, listed_in,

In [503]:
#We need to convert cast and listed_in into useable structure

In [504]:
def get_list(x):
    list = []
    for i in x.split(','):
         list.append(i)
    return list

In [505]:
data['listed_in'] = data['listed_in'].apply(get_list)

In [506]:
#drop movies which dont have a cast
data.dropna(subset=['cast'], inplace=True)

In [507]:
#just use top three actores of each movie for the recommendation
def get_top3items(x):
    actors = []
    for i in x.split(','):
        actors.append(i)
        if len(actors) == 3:
            return actors
    return []

In [508]:
data['cast'] = data['cast'].apply(get_top3items)

In [509]:
def remove_spaces(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ","")) for i in x]
    

In [510]:
features = ['cast', 'listed_in']

for feature in features:
    data[feature] = data[feature].apply(remove_spaces)

In [511]:
data['director'] = data['director'].replace(np.nan, ' ')

In [512]:
data

Unnamed: 0,type,title,director,cast,country,rating,listed_in,description
1,TV Show,Blood & Water,,"[amaqamata, khosingema, gailmabalane]",South Africa,TV-MA,"[internationaltvshows, tvdramas, tvmysteries]","After crossing paths at a party, a Cape Town t..."
2,TV Show,Ganglands,Julien Leclercq,"[samibouajila, tracygotoas, samueljouy]",,TV-MA,"[crimetvshows, internationaltvshows, tvaction&...",To protect his family from a powerful drug lor...
4,TV Show,Kota Factory,,"[mayurmore, jitendrakumar, ranjanraj]",India,TV-MA,"[internationaltvshows, romantictvshows, tvcome...",In a city of coaching centers known to train I...
5,TV Show,Midnight Mass,Mike Flanagan,"[katesiegel, zachgilford, hamishlinklater]",,TV-MA,"[tvdramas, tvhorror, tvmysteries]",The arrival of a charismatic young priest brin...
6,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","[vanessahudgens, kimikoglenn, jamesmarsden]",,PG,[children&familymovies],Equestria's divided. But a bright-eyed hero be...
...,...,...,...,...,...,...,...,...
8801,Movie,Zinzana,Majid Al Ansari,"[alisuliman, salehbakri, yasa]","United Arab Emirates, Jordan",TV-MA,"[dramas, internationalmovies, thrillers]",Recovering alcoholic Talal wakes up inside a s...
8802,Movie,Zodiac,David Fincher,"[markruffalo, jakegyllenhaal, robertdowneyjr.]",United States,R,"[cultmovies, dramas, thrillers]","A political cartoonist, a crime reporter and a..."
8804,Movie,Zombieland,Ruben Fleischer,"[jesseeisenberg, woodyharrelson, emmastone]",United States,R,"[comedies, horrormovies]",Looking to survive in a world taken over by zo...
8805,Movie,Zoom,Peter Hewitt,"[timallen, courteneycox, chevychase]",United States,PG,"[children&familymovies, comedies]","Dragged from civilian life, a former superhero..."


In [513]:
# make soup out of variables which get used in system
def create_soup(x):
    return '' + x['director'] + ' '.join(x['cast']) + ' ' + ' '.join(x['listed_in'])

In [514]:
data['soup'] = data.apply(create_soup, axis=1)

In [515]:
#same procedure like before only using CountVectorizer now
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
matrix = count.fit_transform(data['soup'])

from sklearn.metrics.pairwise import cosine_similarity

In [516]:
cosine_sim2 = cosine_similarity(matrix, matrix)

In [517]:
data = data.reset_index()
indices = pd.Series(data.index, index=data['title'])

In [518]:
get_recommendations('Stranger Things',cosine_sim2)

[(1323, 0.6249999999999999), (6322, 0.6249999999999999), (2888, 0.5892556509887895), (214, 0.4999999999999999), (3604, 0.4999999999999999), (4799, 0.4999999999999999), (5397, 0.4999999999999999), (7643, 0.4999999999999999), (7747, 0.47434164902525683), (793, 0.4714045207910316)]


1323         Chilling Adventures of Sabrina
6322                                  Helix
2888                            Nightflyers
214                                Manifest
3604                                 The OA
4799                    The Vampire Diaries
5397                               The 4400
7643                         The Messengers
7747    The Twilight Zone (Original Series)
793                    Love, Death & Robots
Name: title, dtype: object