In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Read netflixData.csv from the working directory into a DataFrame
data = pd.read_csv("netflixData.csv")
# Preview the first five rows to check that the file parsed as expected
print(data.head(n=5))

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [8]:
# Count the missing values per column to see which columns,
# if any, would need handling before they can be used
print(data.isna().sum())


Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [9]:
# Keep only the columns the recommender works with:
# the title, its description, the content type and the genre list
data = data[
    [
        "Title",
        "Description",
        "Content Type",
        "Genres",
    ]
]

# Preview the reduced frame to confirm the selection
print(data.head(n=5))


                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [11]:
# Drop every row that has a missing value in any of the selected columns
data = data.dropna(axis=0, how="any")

In [15]:
# Text-processing dependencies: regex and punctuation helpers from the
# standard library, plus NLTK for stop words and stemming
import re
import string

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (NLTK reports when it is already up to date)
nltk.download('stopwords')

# English Snowball stemmer used by the cleaning function
stemmer = nltk.SnowballStemmer("english")

# English stop words, held in a set for fast membership tests
stopword = set(stopwords.words('english'))

# Define a function to clean the text data
def clean(text):
    """Normalise a piece of text into lowercase, stemmed, stop-word-free words.

    Steps, in order: lowercase; strip square-bracketed spans, URLs and HTML
    tags; strip ASCII punctuation; drop tokens that contain a digit; remove
    English stop words; stem each remaining word with the module-level
    Snowball stemmer.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        input (for example NaN) is accepted.

    Returns
    -------
    str
        The surviving stemmed words joined by single spaces, with no leading,
        trailing or repeated spaces. May be empty if nothing survives.
    """
    # Convert the text to lowercase
    text = str(text).lower()

    # Remove text within square brackets
    # (raw strings keep the regex escapes from being read as string escapes)
    text = re.sub(r'\[.*?\]', '', text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>+', '', text)

    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Turn newlines into spaces so words on adjacent lines are not glued together
    text = text.replace('\n', ' ')

    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    # Split on any run of whitespace so that removed tokens do not leave empty
    # "words" behind, then drop stop words and stem what is left in one pass
    words = [stemmer.stem(word) for word in text.split() if word not in stopword]

    # Return the cleaned text
    return " ".join(words)

# Replace every title with its cleaned form (the original text is overwritten)
data["Title"] = data["Title"].map(clean)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gheorgheandrei.vaduva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Show ten randomly chosen cleaned titles as a quick sanity check
print(data["Title"].sample(n=10))

3572            planet
1457        el cartel 
5571     unauthor live
2990     marvel defend
1719                 b
3557    ordinari peopl
372         angri bird
4042            saawan
2067        hire woman
286           american
Name: Title, dtype: object


In [18]:
# Genre strings, one per row of the dataset, form the corpus for the vectorizer
feature = data["Genres"].astype(str).to_list()

# TF-IDF weighting over the genre words, ignoring English stop words
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)

# Pairwise cosine similarity between the TF-IDF vectors of all rows
similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
# Map each cleaned title to its positional row number in `data`.
# The similarity matrix is positional (row i belongs to the i-th row of
# `data`), so positions are stored rather than index labels, which are not
# guaranteed to run 0..n-1 once rows have been dropped.
indices = pd.Series(range(len(data)), index=data['Title'])

# Keep only the first row for a title that occurs more than once, so that
# indices[title] always yields a single integer instead of a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [20]:
# Define a function to get Netflix recommendations based on the title
def netFlix_recommendation(title, similarity = similarity, top_n = 10):
    """Return the titles whose genres are most similar to those of ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` Series. Titles are
        stored in cleaned form, so if the value is not found as given it is
        passed through ``clean`` and looked up again.
    similarity : array-like
        Square matrix of pairwise similarity scores; row ``i`` is read as the
        scores of the ``i``-th row of ``data`` against every row. Defaults to
        the matrix computed from the genre TF-IDF vectors.
    top_n : int
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``data['Title']``, highest score first,
        excluding the queried row itself.

    Raises
    ------
    KeyError
        If the title is not present, either as given or after cleaning.
    """
    # Fall back to the cleaned form only when the literal value is unknown, so
    # lookups that already use a stored title behave exactly as before
    if title not in indices.index:
        title = clean(title)
    if title not in indices.index:
        raise KeyError(f"Title not found in the dataset: {title!r}")

    # A title that appears more than once yields a Series; use its first row
    index = indices[title]
    if isinstance(index, pd.Series):
        index = index.iloc[0]
    index = int(index)

    # Rank all rows by descending score; the stable sort keeps tied rows in
    # their original order
    scores = np.asarray(similarity[index], dtype=float)
    ranked = np.argsort(-scores, kind="stable")

    # Never recommend the queried row itself, then keep the best top_n rows
    ranked = ranked[ranked != index][:max(int(top_n), 0)]

    # Return the titles of the recommended movies
    return data['Title'].iloc[ranked]

# Show the recommendations for the cleaned title "girlfriend"
print(netFlix_recommendation(title="girlfriend"))

3                          blackaf
285                     washington
417                 arrest develop
434     astronomi club sketch show
451    aunti donna big ol hous fun
656                      big mouth
752                bojack horseman
805                   brew brother
935                       champion
937                  chappell show
Name: Title, dtype: object
