In [1]:
import numpy as np
import pandas as pd
import warnings

In [2]:
# Ignoring warnings for clearer output.
# NOTE: with no category given, this silences every warning category for the
# rest of the session, so deprecation and chained-assignment warnings raised by
# later cells will not be shown.
warnings.filterwarnings("ignore")

In [3]:
# Read the Netflix titles CSV (expected next to this notebook) into a DataFrame.
data = pd.read_csv("netflix_titles.csv")

In [4]:
#displaying the first 5 rows of the dataset
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [5]:
data.shape

(8807, 12)

In [6]:
data.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

Keeping only the relevant columns needed for the recommendation system

In [7]:
# Keep only the text columns used by the recommender. The explicit .copy()
# makes `data` an independent frame, so the later fillna and "tags" assignment
# modify it directly instead of operating on a selection of the loaded frame
# (which is what triggers pandas' SettingWithCopyWarning).
data = data[["title", "director", "cast", "listed_in", "description"]].copy()

Checking for the null values

In [8]:
data.isnull().sum()

title             0
director       2634
cast            825
listed_in         0
description       0
dtype: int64

Filling the missing values with empty strings 

In [9]:
# Replace missing director / cast values with empty strings so the columns can
# be concatenated as text. Rebinding the returned frame (rather than using
# inplace=True) avoids mutating a frame derived from a column selection and
# keeps the cell safe to re-run.
data = data.fillna("")

In [10]:
data.isnull().sum()

title          0
director       0
cast           0
listed_in      0
description    0
dtype: int64

Creating a new column "tags" that combines the important text features.
The "tags" column concatenates the director, cast, listed_in and description columns.

In [11]:
# Join the four descriptive text columns into one space-separated string per row.
data["tags"] = data["director"].str.cat(
    data[["cast", "listed_in", "description"]],
    sep=" ",
)

In [12]:
data["tags"].head()

0    Kirsten Johnson  Documentaries As her father n...
1     Ama Qamata, Khosi Ngema, Gail Mabalane, Thaba...
2    Julien Leclercq Sami Bouajila, Tracy Gotoas, S...
3      Docuseries, Reality TV Feuds, flirtations an...
4     Mayur More, Jitendra Kumar, Ranjan Raj, Alam ...
Name: tags, dtype: object

We'll use only the 'title' and 'tags' columns for building the content-based recommender

In [13]:
essential_columns=["title","tags"]

In [14]:
# Frame used by the recommender: every row, restricted to the essential columns.
movies = data.loc[:, essential_columns]

In [15]:
movies.head()

Unnamed: 0,title,tags
0,Dick Johnson Is Dead,Kirsten Johnson Documentaries As her father n...
1,Blood & Water,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaba..."
2,Ganglands,"Julien Leclercq Sami Bouajila, Tracy Gotoas, S..."
3,Jailbirds New Orleans,"Docuseries, Reality TV Feuds, flirtations an..."
4,Kota Factory,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam ..."


In [16]:
movies.columns

Index(['title', 'tags'], dtype='object')

In [17]:
movies.isnull().sum()

title    0
tags     0
dtype: int64

The TF-IDF vectorizer converts textual data into numerical features.
Here the text in the "tags" column is converted into numerical values.

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

This removes English stop words and gives more weight to the terms that are distinctive for each title

In [19]:
# Vectorizer configured to drop scikit-learn's built-in English stop-word list.
tfidf=TfidfVectorizer(stop_words="english")
# Learn the vocabulary and IDF weights from the "tags" text and encode it:
# one row per entry of `movies`, one column per vocabulary term.
tfidf_matrix=tfidf.fit_transform(movies["tags"])  # returns a sparse matrix

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

Computing the cosine similarity between all movie vectors

In [21]:
# Pairwise cosine similarity between every pair of rows of `tfidf_matrix`.
# With a single argument the result is a square array in which entry [i, j] is
# the similarity between row i and row j.
# NOTE(review): by default the result is dense, so memory grows quadratically
# with the number of titles (8807 rows here) - confirm this is acceptable
# before pickling it.
similarity=cosine_similarity(tfidf_matrix)

Define a recommendation function that takes a movie title as input and prints the 5 most similar titles, ranked by cosine similarity

In [22]:
def recommend(movie, top_n=5):
    """Print the titles most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies["title"]``. If the title occurs
        more than once, the first occurrence is used.
    top_n : int, default 5
        Maximum number of recommendations to print.

    Notes
    -----
    Reads the module-level ``movies`` DataFrame and ``similarity`` matrix.
    Every other row is ranked by its score in the queried title's row of
    ``similarity`` (highest first) and the best ``top_n`` titles are printed,
    one per line. When the title is not present a short message is printed
    instead of raising ``IndexError``. Returns ``None``.
    """
    # Work with row *positions*, not index labels: `similarity` is indexed
    # positionally, so this stays correct even when `movies` does not carry
    # the default 0..n-1 index.
    matches = np.flatnonzero((movies["title"] == movie).to_numpy())
    if matches.size == 0:
        print(f"'{movie}' was not found in the dataset.")
        return
    movie_pos = int(matches[0])

    # Similarity of the queried title against every row.
    scores = np.asarray(similarity[movie_pos])
    # A stable sort on the negated scores yields descending order while keeping
    # tied rows in their original order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the queried title explicitly instead of assuming it sorts first: a
    # row with an identical score could otherwise push it into the results.
    ranked = ranked[ranked != movie_pos][:top_n]

    # Print the recommended titles, best match first.
    for pos in ranked:
        print(movies["title"].iloc[pos])

Testing the function

In [23]:
recommend("Kota Factory")

Yeh Meri Family
Girls Hostel
Chaman Bahaar
Betaal
The Creative Indians


Importing pickle to save the necessary data

In [24]:
import pickle

In [25]:
# NOTE(review): `filename` is not used by the dumps below (they write to
# "movies.pkl" and "similarity.pkl"); kept in case another cell refers to it.
filename="movie_recommendation_system.pkl"
# Context managers make sure each file is flushed and closed as soon as it has
# been written, instead of leaving the handles open until garbage collection.
with open("movies.pkl","wb") as movies_file:
    pickle.dump(movies,movies_file)  #Saves the 'movies' DataFrame
with open("similarity.pkl","wb") as similarity_file:
    pickle.dump(similarity,similarity_file)  #Saves the similarity matrix

In [26]:
# Reload the saved artifacts to check that they round-trip.
# SECURITY: pickle.load can execute arbitrary code, so only load files that
# were written by this notebook or come from a trusted source.
# The with-blocks close each file handle as soon as it has been read.
with open("movies.pkl","rb") as movies_file:
    movies = pickle.load(movies_file)
with open("similarity.pkl","rb") as similarity_file:
    similarity = pickle.load(similarity_file)