In [54]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    folder_paths = [os.path.join(folder, file_name) for file_name in file_names]
    for input_path in folder_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [55]:
#Libraries used
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [56]:
# Load the Netflix titles dataset from the Kaggle input directory
NETFLIX_CSV_PATH = "../input/netflixshows/netflix_titles.csv"
netflix_data = pd.read_csv(NETFLIX_CSV_PATH)

# Preview the first rows of the loaded data
netflix_data.head()

In [57]:
# Summarise the columns, their non-null counts and dtypes
netflix_data.info()

In [58]:
# Split the catalogue by its 'type' column into TV shows and movies
content_type = netflix_data['type']
netflix_shows = netflix_data[content_type.eq('TV Show')]
netflix_movies = netflix_data[content_type.eq('Movie')]

In [59]:
#Plot a bar graph of type of show versus count
sns.set(style="darkgrid")
ax = sns.countplot(x="type", data=netflix_data, palette="Accent")

#Title and axis labels so the figure can be understood on its own
ax.set_title("Number of titles by type")
ax.set_xlabel("Type")
ax.set_ylabel("Number of titles")
plt.show()

In [60]:
#Heatmap of how many TV shows were added per month and year.
#Only netflix_shows is used here, so movies are not part of these counts.

#Drop the rows without an added date and trim surrounding whitespace so that
#the year and the month are parsed the same way for padded and unpadded values
nf_date = netflix_shows[['date_added']].dropna()
date_text = nf_date['date_added'].str.strip()

#Store the year and month in two separate columns:
#the year is the text after the last ', ', the month is the first word
nf_date = nf_date.assign(
    year=date_text.str.split(', ').str[-1],
    month=date_text.str.split(' ').str[0],
)

#List the order of the months (reversed so January ends up at the top of the plot)
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'][::-1]

#Count titles per (year, month). reindex guarantees one column per month even
#when a month never occurs in the data, where plain [month_order] would raise KeyError.
data_f = (
    nf_date.groupby('year')['month']
    .value_counts()
    .unstack()
    .fillna(0)
    .reindex(columns=month_order, fill_value=0)
    .T
)

#Plot the month x year counts as a heatmap
plt.figure(figsize=(10, 7), dpi=200)
plt.pcolor(data_f, cmap='Purples', edgecolors='white', linewidths=2) # heatmap
#Ticks sit at the centre of each cell, hence the 0.5 offset
plt.xticks(np.arange(0.5, len(data_f.columns), 1), data_f.columns, fontsize=7, fontfamily='serif')
plt.yticks(np.arange(0.5, len(data_f.index), 1), data_f.index, fontsize=7, fontfamily='serif')
plt.xlabel('Year added', fontsize=8, fontfamily='serif')
plt.ylabel('Month added', fontsize=8, fontfamily='serif')

#Use the same 'serif' family as the tick labels; 'calibri' is not installed on
#most non-Windows systems and makes matplotlib fall back with a font warning
plt.title('Contents Update', fontsize=12, fontfamily='serif', fontweight='bold', position=(0.20, 1.0+0.02))
cbar = plt.colorbar()

cbar.ax.tick_params(labelsize=8) 
cbar.ax.minorticks_on()
plt.show()

In [61]:
#In this plot, the darker the color, the more TV shows were added in that month.
#What we get from this plot is that the number of added TV shows was at its highest in June and
#July 2021

In [62]:
#Ratings

plt.figure(figsize=(12,10))
sns.set(style="whitegrid")
#Show only the 15 most frequent ratings, most common first
top_ratings = netflix_data['rating'].value_counts().index[0:15]
ax = sns.countplot(x="rating", data=netflix_data, palette="gist_earth_r", order=top_ratings)

#Title and axis labels so the figure can be understood on its own
ax.set_title("Number of titles by rating (15 most common ratings)")
ax.set_xlabel("Rating")
ax.set_ylabel("Number of titles")
plt.show()

In [63]:
#This countplot of ratings shows that TV-MA (Mature Audiences)
#is the most common rating. It is followed, in order, by TV-14
#(material that parents or adult guardians may find unsuitable for children under the
#age of 14) and TV-PG (Parental Guidance).

In [64]:
#Year

plt.figure(figsize=(12,10))
sns.set(style="whitegrid")
#Show only the 15 release years with the most titles, largest first
top_release_years = netflix_data['release_year'].value_counts().index[0:15]
ax = sns.countplot(y="release_year", data=netflix_data, palette="ocean_r", order=top_release_years)

#Title and axis labels so the figure can be understood on its own
ax.set_title("Number of titles by release year (15 most common years)")
ax.set_xlabel("Number of titles")
ax.set_ylabel("Release year")
plt.show()

In [66]:
#This plot shows that most of the content was produced in 2018 and 2017.
#An interesting fact is that in 2019, 2020 and 2021 the number of produced titles is lower.
#This is mostly because of the Covid-19 pandemic

In [67]:
#RECOMMENDATION SYSTEM

In [68]:
#The TF-IDF (Term Frequency-Inverse Document Frequency) score is the
#frequency of a word occurring in a document, down-weighted by the number of documents
#in which it occurs. This is done to reduce the importance of words that occur frequently
#across plot descriptions and, therefore, their weight in computing the final similarity score.

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [80]:
#Pre-processing of the description text

#Replace missing descriptions with an empty string so the vectorizer only receives text
netflix_data['description'] = netflix_data['description'].fillna('')

#TF-IDF vectorizer that ignores English stop words
tf_idf = TfidfVectorizer(stop_words='english')

#Learn the vocabulary from the descriptions and build the TF-IDF matrix in one step
tf_idf_matrix = tf_idf.fit_transform(netflix_data['description'])

#Show the shape of the TF-IDF matrix
tf_idf_matrix.shape

In [81]:
#Cosine similarity is a measure of similarity, often used to measure document 
#similarity in text analysis.

In [82]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
# linear_kernel is a plain dot product; it equals cosine similarity here
# provided the TF-IDF rows are L2-normalised (TfidfVectorizer's documented
# default norm='l2', which the vectorizer created above does not override).
cosine_sim = linear_kernel(tf_idf_matrix, tf_idf_matrix)

In [83]:
#Map each title to its row label, keeping only the first row per title.
#Series.drop_duplicates() compares the values (the row labels, which are all
#different), so it never removes a repeated title; filter on the index instead.
indices = pd.Series(netflix_data.index, index=netflix_data['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [84]:
#Function to get recommendations
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 entries most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value from ``netflix_data['title']``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row ``i`` holds the similarities of
        row position ``i`` of ``netflix_data`` to every other row. Defaults to
        the object bound to ``cosine_sim`` when this function was defined.

    Returns
    -------
    pandas.Series
        Up to 10 titles from ``netflix_data``, most similar first. The row
        that was looked up is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``netflix_data['title']``.
    """
    # Look the title up in netflix_data directly rather than through the
    # module-level ``indices``: that name is rebound further down in this
    # notebook to space-stripped, lower-cased titles, which would make a
    # lookup by the original title fail once that cell has run.
    matches = np.flatnonzero(netflix_data['title'].eq(title).to_numpy())
    if matches.size == 0:
        raise KeyError(title)
    # When a title occurs more than once, use its first row
    index_title = int(matches[0])

    # Similarity of every row to the requested one
    similarity_scores = np.asarray(cosine_sim[index_title]).ravel()
    # Descending order; the stable sort keeps ties in their original row order
    ranked = np.argsort(-similarity_scores, kind='stable')
    # Drop the requested row explicitly instead of assuming it is ranked first
    # (another row with an equal score could precede it), then keep the top 10
    movie_indices = ranked[ranked != index_title][:10]
    # Return the titles of the most similar rows
    return netflix_data['title'].iloc[movie_indices]

In [85]:
#At this point our recommendation function is ready to use.
#Now that it is defined, we can use it to get recommendations.
#All we need to do is pass the content name as an argument.

In [87]:
# Example: the 10 titles ranked closest to 'Our Godfather', using the function's default similarity matrix
get_recommendations('Our Godfather')

In [88]:
# Example: the 10 titles ranked closest to 'Candyflip', using the function's default similarity matrix
get_recommendations('Candyflip')

In [None]:
#The description-only model gives plausible results, but it is not very accurate.
#Therefore, more features are added to the model to improve its recommendations.

#Content-based filtering on multiple features

#Content-based filtering on the following factors:
#Title, Cast, Director, Listed in (genres), Plot (description)

In [89]:
#Filling null values with empty string
# fillna is called without inplace, so it returns a new frame and netflix_data is left untouched
filledna=netflix_data.fillna('')
# Preview the first two rows of the filled frame
filledna.head(2)

In [90]:
#Cleaning the data - making all the words lower case
def clean_data(x):
    """Return ``x`` with every space character removed and all letters lower-cased."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()

In [92]:
#Identifying features on which the model is to be filtered
features=['title','director','cast','listed_in','description']
#Take an explicit copy: columns of this frame are assigned to afterwards, and
#assigning into a column selection of another frame triggers pandas'
#SettingWithCopyWarning
filledna=filledna[features].copy()

In [93]:
#Remove spaces only from the name-like features, so that for example a full
#person name becomes a single token. The description is free text: removing its
#spaces would glue whole sentences into tokens that cannot match any other
#title, so it is only lower-cased and its words stay separate.
for feature in features:
    if feature == 'description':
        filledna[feature] = filledna[feature].str.lower()
    else:
        filledna[feature] = filledna[feature].apply(clean_data)

filledna.head(2)

In [94]:
def create_soup(x):
    """Join the title, director, cast, listed_in and description entries of ``x`` into one space-separated string."""
    soup_fields = ('title', 'director', 'cast', 'listed_in', 'description')
    return ' '.join(x[field] for field in soup_fields)

In [95]:
# Build one combined text field per row (axis=1 passes each row to create_soup)
filledna['soup'] = filledna.apply(create_soup, axis=1)

In [99]:
#From here the code is essentially the same as the model above, except that a count vectorizer is used instead of TF-IDF

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Word-count vectors over the combined text of each row, ignoring English stop words
count = CountVectorizer(stop_words='english')
soup_text = filledna['soup']
count_matrix = count.fit_transform(soup_text)

#Pairwise cosine similarity between all rows of the count matrix
cosine_similarity_2 = cosine_similarity(count_matrix, count_matrix)

In [100]:
#Renumber the rows 0..n-1. drop=True discards the old index instead of inserting
#it as a new column, so running this cell again does not keep adding columns.
filledna=filledna.reset_index(drop=True)
#Map each cleaned title to its row position. When several rows share the same
#cleaned title only the first is kept, so a lookup always yields one position.
indices = pd.Series(filledna.index, index=filledna['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [101]:
def get_recommendations_new(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 entries most similar to ``title``.

    The title is matched after removing its spaces and lower-casing it, which
    is the form of the keys stored in the module-level ``indices``.

    Parameters
    ----------
    title : str
        Title to look up; spacing and letter case are ignored.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose row ``i`` holds the similarities of
        row position ``i`` to every other row. Defaults to the object bound to
        ``cosine_sim`` when this function was defined; pass
        ``cosine_similarity_2`` to rank by the combined-feature model.

    Returns
    -------
    pandas.Series
        Up to 10 titles from ``netflix_data``, most similar first. The row
        that was looked up is never part of the result.

    Raises
    ------
    KeyError
        If the cleaned title is not a key of ``indices``.
    """
    title=title.replace(' ','').lower()
    # indices[title] is a scalar for a unique key but a Series when several
    # rows share the cleaned title; use the first row in either case
    positions = np.atleast_1d(indices[title])
    index_title = int(positions[0])

    # Similarity of every row to the requested one
    similarity_scores = np.asarray(cosine_sim[index_title]).ravel()
    # Descending order; the stable sort keeps ties in their original row order
    ranked = np.argsort(-similarity_scores, kind='stable')
    # Drop the requested row explicitly instead of assuming it is ranked first
    # (another row with an equal score could precede it), then keep the top 10
    movie_indices = ranked[ranked != index_title][:10]
    # Return the titles of the most similar rows
    return netflix_data['title'].iloc[movie_indices]

In [104]:
# Example: rank against 'Young Wallander' using the combined-feature similarity matrix
get_recommendations_new('Young Wallander', cosine_similarity_2)

In [105]:
# Example: rank against 'The Murder Detectives' using the combined-feature similarity matrix
get_recommendations_new('The Murder Detectives', cosine_similarity_2)

In [31]:
# Example: rank against 'Black Mirror' using the combined-feature similarity matrix
get_recommendations_new('Black Mirror', cosine_similarity_2)