## Importing the libraries

In [8]:
import numpy as np 
import pandas as pd 

## Importing the dataset

In [9]:
dataset = pd.read_csv('dataset/netflix_dataset.csv')
dataset

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


This dataset has got a lot of columns. We need to first clean this dataset. We only need the `type`, `title` and `description` columns.

In [10]:
df = dataset[['type', 'title', 'description']].copy()
df

Unnamed: 0,type,title,description
0,Movie,Dick Johnson Is Dead,"As her father nears the end of his life, filmm..."
1,TV Show,Blood & Water,"After crossing paths at a party, a Cape Town t..."
2,TV Show,Ganglands,To protect his family from a powerful drug lor...
3,TV Show,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo..."
4,TV Show,Kota Factory,In a city of coaching centers known to train I...
...,...,...,...
8802,Movie,Zodiac,"A political cartoonist, a crime reporter and a..."
8803,TV Show,Zombie Dumb,"While living alone in a spooky town, a young g..."
8804,Movie,Zombieland,Looking to survive in a world taken over by zo...
8805,Movie,Zoom,"Dragged from civilian life, a former superhero..."


Checking if there is any Nan values in any of the columns.

In [11]:
check_type = df['type'].isnull().values.any()
check_title = df['title'].isnull().values.any()
check_description = df['description'].isnull().values.any()
print(check_type, ' ', check_title, ' ', check_type)    #there's none

False   False   False


We shall be using the `TfidfVectorizer` class of scikit-learn library to calculate the tf-idf values for the `description` column.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
description_matrix = vectorizer.fit_transform(df['description'])

In [13]:
description_matrix.toarray() #sparse matrix
print(description_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 121374 stored elements and shape (8807, 18895)>
  Coords	Values
  (0, 6209)	0.16254396363689053
  (0, 11313)	0.30519046546477174
  (0, 5596)	0.2239952783710916
  (0, 9766)	0.12590836562639762
  (0, 6376)	0.2292676890369026
  (0, 9347)	0.3521743123074296
  (0, 9031)	0.3178095805166836
  (0, 15937)	0.27919454940575633
  (0, 4344)	0.18158280908846863
  (0, 8758)	0.3109717601819953
  (0, 3464)	0.3109717601819953
  (0, 18310)	0.22720271728772545
  (0, 7801)	0.16463147802014286
  (0, 6082)	0.20054299311773363
  (0, 8490)	0.33696767624101076
  (1, 4063)	0.3275581833729518
  (1, 12241)	0.24866029168628911
  (1, 12200)	0.24119888352666585
  (1, 2659)	0.3467928201102296
  (1, 17177)	0.19248886760946635
  (1, 16741)	0.19305859063422645
  (1, 15002)	0.21314343523578164
  (1, 13140)	0.24719854905058733
  (1, 12993)	0.2671575311220847
  (1, 14720)	0.17558231498361754
  :	:
  (8805, 17226)	0.2336330622377386
  (8805, 1279)	0.22112243551834

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarities = cosine_similarity(description_matrix)
cosine_similarities = pd.DataFrame(cosine_similarities)

In [15]:
similarity_df = df[['type', 'title']].copy()
similarity_df = pd.concat([similarity_df, cosine_similarities], axis=1)
similarity_df

Unnamed: 0,type,title,0,1,2,3,4,5,6,7,...,8797,8798,8799,8800,8801,8802,8803,8804,8805,8806
0,Movie,Dick Johnson Is Dead,1.000000,0.000,0.000000,0.000000,0.015222,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.015383,0.000000
1,TV Show,Blood & Water,0.000000,1.000,0.000000,0.000000,0.000000,0.031227,0.05166,0.000000,...,0.035331,0.032784,0.116936,0.000000,0.032609,0.000000,0.042,0.000000,0.000000,0.000000
2,TV Show,Ganglands,0.000000,0.000,1.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.000000,0.022301
3,TV Show,Jailbirds New Orleans,0.000000,0.000,0.000000,1.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.016386,0.000000
4,TV Show,Kota Factory,0.015222,0.000,0.000000,0.000000,1.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.035196,0.068784,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,Movie,Zodiac,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.045875,0.052108,0.000000,0.000000,1.000000,0.000,0.060511,0.000000,0.000000
8803,TV Show,Zombie Dumb,0.000000,0.042,0.000000,0.000000,0.000000,0.054920,0.00000,0.000000,...,0.000000,0.037161,0.000000,0.000000,0.036964,0.000000,1.000,0.000000,0.000000,0.000000
8804,Movie,Zombieland,0.000000,0.000,0.000000,0.000000,0.035196,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.042132,0.000000,0.060511,0.000,1.000000,0.000000,0.000000
8805,Movie,Zoom,0.015383,0.000,0.000000,0.016386,0.068784,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,1.000000,0.000000


In [102]:
name = "Thambi"
test = similarity_df[similarity_df["title"] == name].iloc[:, 2:]
index = test.index[0]

print(index.shape)
test = test.squeeze()
similar_5 = test.nlargest(6).index
print(similar_5.shape)
selected_movies = similar_5[similar_5 != index]
print(selected_movies)


'''largest_5 = test.nlargest(5).index
similar = []
similar = largest_5[largest_5 != 340]
similar.shape'''


()
(6,)
Index([1584, 2697, 542, 3618, 438], dtype='int64')


'largest_5 = test.nlargest(5).index\nsimilar = []\nsimilar = largest_5[largest_5 != 340]\nsimilar.shape'

### Model 

In [16]:
class MovieRecommender:
    def __init__(self, data):
        self.similarity_df = data
    
    def recommend(self, name:str):
        movie = similarity_df[similarity_df['title'] == name].iloc[:, 2:]
        movie_index = movie.index[0]
        movie = movie.squeeze()
        similar_5 = movie.nlargest(6).index
        selected_movies = similar_5[similar_5 != movie_index] 
        return selected_movies
    
       

        


In [17]:
recommender = MovieRecommender(similarity_df)
similar_movies = recommender.recommend("Thambi")
for i in similar_movies:
    print(similarity_df.iloc[i, 1])

Ava
Tigertail
Ujala
Crazy Beautiful You
2 Weeks in Lagos
