In [2]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

# Read netflixData.csv (path relative to the working directory) into a
# DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='netflixData.csv')
data.head(n=5)

Unnamed: 0,Show Id,Title,Description,Director,Genres,Cast,Production Country,Release Date,Rating,Duration,Imdb Score,Content Type,Date Added
0,cc1b6ed9-cf9e-4057-8303-34577fb54477,(Un)Well,This docuseries takes a deep dive into the luc...,,Reality TV,,United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
1,e2ef4e91-fb25-42ab-b485-be8e3b23dedb,#Alive,"As a grisly virus rampages a city, a lone man ...",Cho Il,"Horror Movies, International Movies, Thrillers","Yoo Ah-in, Park Shin-hye",South Korea,2020.0,TV-MA,99 min,6.2/10,Movie,"September 8, 2020"
2,b01b73b7-81f6-47a7-86d8-acb63080d525,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Sabina Fedeli, Anna Migotto","Documentaries, International Movies","Helen Mirren, Gengher Gatti",Italy,2019.0,TV-14,95 min,6.4/10,Movie,"July 1, 2020"
3,b6611af0-f53c-4a08-9ffa-9716dc57eb9c,#blackAF,Kenya Barris and his family navigate relations...,,TV Comedies,"Kenya Barris, Rashida Jones, Iman Benson, Genn...",United States,2020.0,TV-MA,1 Season,6.6/10,TV Show,
4,7f2d4170-bab8-4d75-adc2-197f7124c070,#cats_the_mewvie,This pawesome documentary explores how our fel...,Michael Margolis,"Documentaries, International Movies",,Canada,2020.0,TV-14,90 min,5.1/10,Movie,"February 5, 2020"


In [3]:
# Number of missing values in each column (isna is the alias of isnull).
data.isna().sum()

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64

In [4]:
# Keep only the columns used further down and drop rows with a missing value
# in any of them. reset_index(drop=True) renumbers the rows 0..n-1 so that a
# row's label always equals its position: the similarity matrix built from
# this frame is addressed by position, so labels must not drift if dropna()
# ever removes rows. The chain also yields a frame independent of `data`.
df = (
    data[['Title', 'Description', 'Genres', 'Content Type']]
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Title,Description,Genres,Content Type
0,(Un)Well,This docuseries takes a deep dive into the luc...,Reality TV,TV Show
1,#Alive,"As a grisly virus rampages a city, a lone man ...","Horror Movies, International Movies, Thrillers",Movie
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...","Documentaries, International Movies",Movie
3,#blackAF,Kenya Barris and his family navigate relations...,TV Comedies,TV Show
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,"Documentaries, International Movies",Movie


In [5]:
# DATA CLEAN #
# Prerequisites for clean_text: regex and punctuation helpers from the
# standard library, plus NLTK's English stop-word list and Snowball stemmer.
import re
import string

import nltk
from nltk.corpus import stopwords

# Make sure the stop-word corpus is available locally before it is read.
nltk.download('stopwords')

stopword = set(stopwords.words('english'))
stemmer = nltk.SnowballStemmer('english')

def clean_text(text):
    """Normalise a piece of text into lower-case, stemmed keywords.

    Steps, in order: lower-case the input; remove [bracketed] spans, URLs,
    <tags>, punctuation, newlines and every word that contains a digit; drop
    words found in the module-level ``stopword`` set; stem what is left with
    the module-level ``stemmer``.

    Args:
        text: Value to clean. It is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        str: The surviving stemmed words joined by single spaces, with no
        leading or trailing whitespace. Empty if nothing survives.
    """
    text = str(text).lower()
    # Raw string literals: sequences such as "\[", "\S", "\w" and "\d" are
    # invalid escapes in ordinary literals and trigger warnings.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so the removals above cannot leave empty
    # tokens behind (which previously produced keys like "unit " or words
    # separated by two spaces).
    words = [stemmer.stem(word) for word in text.split() if word not in stopword]
    return " ".join(words)

# Replace every title with its cleaned form; these cleaned strings are what
# titles are looked up by later on.
df["Title"] = df["Title"].map(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Spot-check five randomly chosen cleaned titles (unseeded, so the selection
# differs from run to run).
df["Title"].sample(n=5)

1855    godzilla planet eater
5583                uncut gem
2829                love cuff
2870             luna petunia
5606                    unit 
Name: Title, dtype: object

In [7]:
# Turn each row's genre string into a TF-IDF vector and compute the cosine
# similarity between every pair of rows. Row/column i of `similarity`
# corresponds to the i-th row of df.
genre_features = df.Genres.tolist()
# `input` only accepts 'filename', 'file' or 'content'; it describes how the
# items handed to fit_transform are to be read, it is not the data itself.
# The genres are passed as in-memory strings, hence 'content'.
tfidf = text.TfidfVectorizer(input='content', stop_words='english')
tfidf_matrix = tfidf.fit_transform(genre_features)
similarity = cosine_similarity(tfidf_matrix)

In [8]:
# Lookup table: cleaned title -> row position in df (and therefore row of the
# similarity matrix). Positions are generated from df itself rather than taken
# from another frame's index, so the mapping cannot fall out of step with df.
indices = pd.Series(range(len(df)), index=df['Title'])
# Several titles can clean to the same string; keep the first occurrence so a
# lookup always yields one integer instead of a Series.
indices = indices[~indices.index.duplicated(keep='first')]
indices

Title
unwel                           0
aliv                            1
annefrank  parallel stori       2
blackaf                         3
catsthemewvi                    4
                             ... 
الف مبروك                    5962
دفعة القاهرة                 5963
海的儿子                         5964
반드시 잡는다                      5965
최강전사 미니특공대  영웅의 탄생           5966
Length: 5967, dtype: int64

In [25]:
# RECOMMEND MOVIES & OTHERS #
def netFlix_recommendation(title, similarity = similarity, top_n = 10):
    """Return the titles whose genres are most similar to ``title``.

    Args:
        title: Title to look up in the module-level ``indices`` Series. It is
            tried as given first; if it is not a key there, it is passed
            through ``clean_text`` and tried again.
        similarity: Square matrix of pairwise similarity scores, indexed by
            row position of ``df``. Defaults to the module-level matrix.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        pandas.Series: Up to ``top_n`` entries of ``df['Title']``, ordered
        from most to least similar. The queried row itself is never included.

    Raises:
        KeyError: If the title cannot be found in ``indices``.
    """
    if title not in indices.index:
        # Callers may pass a raw title; the stored keys are cleaned ones.
        title = clean_text(title)
    position = indices[title]
    if isinstance(position, pd.Series):
        # A duplicated key yields several positions; use the first one.
        position = position.iloc[0]

    scores = np.asarray(similarity[position]).ravel()
    # Stable sort on the negated scores: highest first, ties keep row order.
    ranked = np.argsort(-scores, kind='stable')
    # A row is always maximally similar to itself; drop it from the results.
    ranked = ranked[ranked != position][:top_n]
    return df['Title'].iloc[ranked]

keyword = 'naruto' #input your search keyword
try:
    results = netFlix_recommendation(keyword)
except KeyError:
    # Only an unknown title is reported as "no matches"; any other exception
    # is a real error and is allowed to surface instead of being hidden.
    print(f'No matches for {keyword}')
else:
    print("Recommendations for %s" %keyword.capitalize())
    print(results)

Recommendations for Naruto
79                             
171                        aico
223                  aggretsuko
241              ajin demihuman
365                  angel beat
444                attack titan
502    back street girl gokudol
522                        baki
569                     beastar
689                black butler
Name: Title, dtype: object


In [None]:
# END OF THE PROJECT #