In [17]:
# import librairies
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
plt.rcParams['figure.figsize'] = [14,14]

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

netflix_shows = pd.read_csv('netflix_shows.csv')


In [30]:
netflix_shows

Unnamed: 0,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,...,country_Ukraine,country_UnitedArabEmirates,country_UnitedKingdom,country_UnitedStates,country_Unknown,country_Uruguay,country_WestGermany,date_added_weekday,first_release_year,time_first_release_to_netflix
0,Blood & Water,['Unknown'],"['Ama Qamata', 'Khosi Ngema', 'Gail Mabalane',...",['South Africa'],2021-09-24,2021,TV-MA,2,"['International TV Shows', 'TV Dramas', 'TV My...","After crossing paths at a party, a Cape Town t...",...,0,0,0,0,0,0,0,4,2019,2
1,Ganglands,['Julien Leclercq'],"['Sami Bouajila', 'Tracy Gotoas', 'Samuel Jouy...",['Unknown'],2021-09-24,2021,TV-MA,1,"['Crime TV Shows', 'International TV Shows', '...",To protect his family from a powerful drug lor...,...,0,0,0,0,1,0,0,4,2020,1
2,Jailbirds New Orleans,['Unknown'],['Unknown'],['Unknown'],2021-09-24,2021,TV-MA,1,"['Docuseries', 'Reality TV']","Feuds, flirtations and toilet talk go down amo...",...,0,0,0,0,1,0,0,4,2020,1
3,Kota Factory,['Unknown'],"['Mayur More', 'Jitendra Kumar', 'Ranjan Raj',...",['India'],2021-09-24,2021,TV-MA,2,"['International TV Shows', 'Romantic TV Shows'...",In a city of coaching centers known to train I...,...,0,0,0,0,0,0,0,4,2019,2
4,Midnight Mass,['Mike Flanagan'],"['Kate Siegel', 'Zach Gilford', 'Hamish Linkla...",['Unknown'],2021-09-24,2021,TV-MA,1,"['TV Dramas', 'TV Horror', 'TV Mysteries']",The arrival of a charismatic young priest brin...,...,0,0,0,0,1,0,0,4,2020,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2671,Yu-Gi-Oh! Arc-V,['Unknown'],"['Mike Liscio', 'Emily Bauer', 'Billy Bob Thom...","['Japan', 'Canada']",2018-05-01,2015,TV-Y7,2,"['Anime Series', ""Kids' TV""]",Now that he's discovered the Pendulum Summonin...,...,0,0,0,0,0,0,0,1,2013,5
2672,Yunus Emre,['Unknown'],"['Gökhan Atalay', 'Payidar Tüfekçioglu', 'Bara...",['Turkey'],2017-01-17,2016,TV-PG,2,"['International TV Shows', 'TV Dramas']","During the Mongol invasions, Yunus Emre leaves...",...,0,0,0,0,0,0,0,1,2014,3
2673,Zak Storm,['Unknown'],"['Michael Johnston', 'Jessica Gee-George', 'Ch...","['United States', 'France', 'South Korea', 'In...",2018-09-13,2016,TV-Y7,3,"[""Kids' TV""]",Teen surfer Zak Storm is mysteriously transpor...,...,0,0,0,1,0,0,0,3,2013,5
2674,Zindagi Gulzar Hai,['Unknown'],"['Sanam Saeed', 'Fawad Khan', 'Ayesha Omer', '...",['Pakistan'],2016-12-15,2012,TV-PG,1,"['International TV Shows', 'Romantic TV Shows'...","Strong-willed, middle-class Kashaf and carefre...",...,0,0,0,0,0,0,0,3,2011,5


In [59]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

In [60]:
#Replace NaN with an empty string
netflix_shows['description'] = netflix_shows['description'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(netflix_shows['description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(2676, 10136)

In [61]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

#Construct a reverse map of indices and Tv Show titles
indices = pd.Series(netflix_shows.index, index=netflix_shows['title']).drop_duplicates()

In [62]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]

    # Get the pairwsie similarity scores of all TV shows with that TV show
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the TV shows based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar TV shows
    sim_scores = sim_scores[1:11]

    # Get the TV shows indices
    TVshow_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar TV shows
    return netflix_shows['title'].iloc[TVshow_indices], sim_scores

In [None]:
indices['Stranger Things']

In [64]:
get_recommendations('Stranger Things')

(512              Sakho & Mangane
 1538                 Hardy Bucks
 427     Sin senos sí hay paraíso
 2532    Sin Senos sí Hay Paraíso
 1778                     Requiem
 2156              Bitter Daisies
 1837                 Broadchurch
 1786                  Collateral
 1401                 The Society
 98                 Gone for Good
 Name: title, dtype: object,
 [(512, 0.19767943379351433),
  (1538, 0.16705163973984966),
  (427, 0.16358954369966425),
  (2532, 0.16358954369966425),
  (1778, 0.16260396464233956),
  (2156, 0.16189863264772653),
  (1837, 0.15108253469369096),
  (1786, 0.15010955854125457),
  (1401, 0.1489922867728381),
  (98, 0.14505647333688387)])

In [65]:
get_recommendations("Grey's Anatomy")

(836                                Lenox Hill
 253                         Hospital Playlist
 1490                              Secret City
 1058                           Medical Police
 1063                           Anne with an E
 1405                               Cinta Iris
 1517    Littlest Pet Shop: A World of Our Own
 963                          The English Game
 1295                                  Misaeng
 173                                   Between
 Name: title, dtype: object,
 [(836, 0.17277126968906004),
  (253, 0.17069943080660077),
  (1490, 0.11296557548687375),
  (1058, 0.10809629190553063),
  (1063, 0.10640678462149762),
  (1405, 0.10084514407506212),
  (1517, 0.10016066524402008),
  (963, 0.09997954202666945),
  (1295, 0.09568113427323582),
  (173, 0.09559241754279788)])

In [66]:
features=['listed_in','director','cast','description','title']
filters = netflix_shows[features]

In [67]:
#Cleaning the data by making all the words in lower case.
def clean_data(x):
        return str.lower(x.replace(" ", ""))

In [None]:
for feature in features:
    filters[feature] = filters[feature].apply(clean_data)
    
filters.head()

In [69]:
def create_soup(x):
    return x['director'] + ' ' + x['cast'] + ' ' +x['listed_in']+' '+ x['description']

In [70]:
filters['soup'] = filters.apply(create_soup, axis=1)

In [71]:
#create the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(filters['soup'])

# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
filters

In [72]:
# Reset index of our main DataFrame and construct reverse mapping as before
filters=filters.reset_index()
indices = pd.Series(filters.index, index=filters['title'])

In [73]:
def get_recommendations_new(title, cosine_sim=cosine_sim):
    title=title.replace(' ','').lower()
    idx = indices[title]

    # Get the pairwsie similarity scores of all TV Shows with that TV Show
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the TV Shows based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar TV Shows
    sim_scores = sim_scores[1:11]

    # Get the TV Show indices
    TVshow_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar TV Shows
    return netflix_shows['title'].iloc[TVshow_indices], sim_scores

In [74]:
get_recommendations_new('Stranger Things', cosine_sim2)

(1860                 Beyond Stranger Things
 770                    The Umbrella Academy
 1343                           Motown Magic
 2300                                  Helix
 1116                            Nightflyers
 2627    The Twilight Zone (Original Series)
 85                                 Manifest
 2605                         The Messengers
 805                             Warrior Nun
 2099                               The 4400
 Name: title, dtype: object,
 [(1860, 0.6266795614405122),
  (770, 0.3207134902949093),
  (1343, 0.2886751345948129),
  (2300, 0.2752988806446741),
  (1116, 0.2683281572999748),
  (2627, 0.26666666666666666),
  (85, 0.25),
  (2605, 0.25),
  (805, 0.24253562503633297),
  (2099, 0.23570226039551587)])

In [75]:
get_recommendations_new("Grey's Anatomy", cosine_sim2)

(380                  The Wedding Coach
 2566                      The Bachelor
 2494               Pyaar Tune Kya Kiya
 2536                  Sotus The Series
 658                     Emily in Paris
 29      Stories by Rabindranath Tagore
 361                     Ajaibnya Cinta
 365                 Indian Matchmaking
 414               Marriage or Mortgage
 566                   The Bachelorette
 Name: title, dtype: object,
 [(380, 0.24743582965269675),
  (2566, 0.24743582965269675),
  (2494, 0.23145502494313785),
  (2536, 0.23145502494313785),
  (658, 0.2182178902359924),
  (29, 0.21821789023599236),
  (361, 0.21821789023599236),
  (365, 0.21821789023599236),
  (414, 0.21821789023599236),
  (566, 0.21821789023599236)])