## Importing Dependencies and Reading the Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [7]:
netflix = pd.read_csv("D:\\gs\\data\\datasets\\netflix_titles.csv")

In [8]:
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [9]:
netflix.shape

(7787, 12)

## Multi-feature recommendation system.
### Features used - type, listed_in, cast(only 5), rating and description

In [10]:
features = ['type', 'listed_in', 'cast', 'rating', 'description']

In [11]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 395.5+ KB


In [12]:
# We will drop country, date_added, duration, director release_year and replace NaN director by ' '
netflix = netflix.drop(['country', 'date_added','director', 'duration', 'release_year'], axis = 1)

In [13]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   show_id      7787 non-null   object
 1   type         7787 non-null   object
 2   title        7787 non-null   object
 3   cast         7069 non-null   object
 4   rating       7780 non-null   object
 5   listed_in    7787 non-null   object
 6   description  7787 non-null   object
dtypes: object(7)
memory usage: 213.0+ KB


In [14]:
netflix = netflix.dropna(axis = 0)

In [15]:
netflix = pd.DataFrame(netflix)

In [16]:
netflix.head()

Unnamed: 0,show_id,type,title,cast,rating,listed_in,description
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",TV-MA,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",TV-MA,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",R,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",PG-13,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",PG-13,Dramas,A brilliant group of students become card-coun...


In [17]:
netflix.shape

(7062, 7)

In [18]:
netflix['cast'][0]

'João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, Celso Frateschi'

In [19]:
def to_list(cast):
    cast = cast.split(", ")
    return cast

In [20]:
netflix['cast'] = netflix['cast'].apply(to_list)
netflix['listed_in'] = netflix['listed_in'].apply(to_list)

In [21]:
netflix.head()

Unnamed: 0,show_id,type,title,cast,rating,listed_in,description
0,s1,TV Show,3%,"[João Miguel, Bianca Comparato, Michel Gomes, ...",TV-MA,"[International TV Shows, TV Dramas, TV Sci-Fi ...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,"[Demián Bichir, Héctor Bonilla, Oscar Serrano,...",TV-MA,"[Dramas, International Movies]",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"[Tedd Chan, Stella Chung, Henley Hii, Lawrence...",R,"[Horror Movies, International Movies]","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"[Elijah Wood, John C. Reilly, Jennifer Connell...",PG-13,"[Action & Adventure, Independent Movies, Sci-F...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"[Jim Sturgess, Kevin Spacey, Kate Bosworth, Aa...",PG-13,[Dramas],A brilliant group of students become card-coun...


In [22]:
def get_list(x):
    if isinstance(x, list):
        if len(x) > 5:
            x = x[:5]
        return x
    return []

In [23]:
netflix.head()

Unnamed: 0,show_id,type,title,cast,rating,listed_in,description
0,s1,TV Show,3%,"[João Miguel, Bianca Comparato, Michel Gomes, ...",TV-MA,"[International TV Shows, TV Dramas, TV Sci-Fi ...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,"[Demián Bichir, Héctor Bonilla, Oscar Serrano,...",TV-MA,"[Dramas, International Movies]",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"[Tedd Chan, Stella Chung, Henley Hii, Lawrence...",R,"[Horror Movies, International Movies]","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"[Elijah Wood, John C. Reilly, Jennifer Connell...",PG-13,"[Action & Adventure, Independent Movies, Sci-F...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"[Jim Sturgess, Kevin Spacey, Kate Bosworth, Aa...",PG-13,[Dramas],A brilliant group of students become card-coun...


In [24]:
netflix['listed_in'] = netflix['listed_in'].apply(get_list)
netflix['cast'] = netflix['cast'].apply(get_list)

In [25]:
netflix['cast'][0]

['João Miguel',
 'Bianca Comparato',
 'Michel Gomes',
 'Rodolfo Valente',
 'Vaneza Oliveira']

In [26]:
netflix['description'][0]

'In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.'

#### Preprocessing done

In [27]:
# Cleaning of data
def clean_data(x):
    if isinstance(x,list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            #print(x)
            return x.lower().replace(" ","")
        else:
            return ''

In [28]:
for i in features:
    netflix[i] = netflix[i].apply(clean_data)

In [29]:
netflix.head()

Unnamed: 0,show_id,type,title,cast,rating,listed_in,description
0,s1,tvshow,3%,"[joãomiguel, biancacomparato, michelgomes, rod...",tv-ma,"[internationaltvshows, tvdramas, tvsci-fi&fant...",inafuturewheretheeliteinhabitanislandparadisef...
1,s2,movie,7:19,"[demiánbichir, héctorbonilla, oscarserrano, az...",tv-ma,"[dramas, internationalmovies]","afteradevastatingearthquakehitsmexicocity,trap..."
2,s3,movie,23:59,"[teddchan, stellachung, henleyhii, lawrencekoh...",r,"[horrormovies, internationalmovies]","whenanarmyrecruitisfounddead,hisfellowsoldiers..."
3,s4,movie,9,"[elijahwood, johnc.reilly, jenniferconnelly, c...",pg-13,"[action&adventure, independentmovies, sci-fi&f...","inapostapocalypticworld,rag-dollrobotshideinfe..."
4,s5,movie,21,"[jimsturgess, kevinspacey, katebosworth, aaron...",pg-13,[dramas],abrilliantgroupofstudentsbecomecard-countingex...


In [30]:
def create_soup(x):
    return x['type'] + ' ' + x['description'] + ' ' + ' '.join(x['cast']) + ' ' + x['rating'] + ' ' + ' '.join(x['listed_in'])

In [31]:
netflix['soup'] = netflix.apply(create_soup, axis = 1)

In [32]:
netflix['soup'][0]

'tvshow inafuturewheretheeliteinhabitanislandparadisefarfromthecrowdedslums,yougetonechancetojointhe3%savedfromsqualor. joãomiguel biancacomparato michelgomes rodolfovalente vanezaoliveira tv-ma internationaltvshows tvdramas tvsci-fi&fantasy'

In [34]:
indices = pd.Series(netflix.index, index = netflix['title'])

In [35]:
indices

title
3%                          0
7:19                        1
23:59                       2
9                           3
21                          4
                         ... 
Zoom                     7781
Zozo                     7782
Zubaan                   7783
Zulu Man in Japan        7784
Zumbo's Just Desserts    7785
Length: 7062, dtype: int64

In [30]:
# No duplicates

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
count = CountVectorizer(stop_words = 'english')
count_matrix = count.fit_transform(netflix['soup'])

In [38]:
count_matrix

<7062x39780 sparse matrix of type '<class 'numpy.int64'>'
	with 91633 stored elements in Compressed Sparse Row format>

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

MemoryError: Unable to allocate 339. MiB for an array with shape (44395008,) and data type float64

In [41]:
cosine_sim

NameError: name 'cosine_sim' is not defined

In [42]:
def get_5_rec(title, cosine_sim = cosine_sim):
    
    # Extracting index of the title of the movie based on which we want to recommend
    idx = indices[title]
    
    # Extracting similarity scores for title
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sorting the list based on similarity scores
    sim_scores = sorted(sim_scores, key = lambda x:x[1], reverse = True)
    
    # Extracting the index of top 5 similar shows/movies
    sim_scores = sim_scores[1:6]
    top_5_idx = [i[0] for i in sim_scores]
    
    # Printing the 5 titles of movies from our dataframe
    print('The top 5 recommendations based on {0} are : '.format(title))
    for i in range(0,5):
        print('{0} : {1}'.format(i + 1, netflix['title'].iloc[top_5_idx[i]]))

NameError: name 'cosine_sim' is not defined

In [38]:
get_5_rec('3%')

The top 5 recommendations based on 3% are : 
1 : Biohackers
2 : Omniscient
3 : The Magicians
4 : Bountiful Blessings
5 : Handsome Siblings


In [39]:
get_5_rec('A Love So Beautiful')

The top 5 recommendations based on A Love So Beautiful are : 
1 : The Mistress
2 : First Love
3 : Nila
4 : That Thing Called Tadhana
5 : The Lighthouse of the Orcas


In [40]:
get_5_rec('Apaharan')

The top 5 recommendations based on Apaharan are : 
1 : I Fine... Thank You... Love You
2 : Deliha
3 : Ek Main Aur Ekk Tu
4 : Hadi İnşallah
5 : Wrong No.
