In [1]:
import numpy as np
import pandas as pd
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
data = "../DataSets/Recomendation System/netflix_titles.csv"
df = pd.read_csv(data)
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [5]:
df.dropna(subset = ['cast','title','description','listed_in'], inplace = True, axis = 0)
df = df.reset_index(drop = True)

In [6]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [8]:
#clean text data and combine them
df['listed_in'] = [re.sub(r'[^\w\s]', '', t) for t in df['listed_in']]
df['cast'] = [re.sub(r'[^\w\s]', '', t) for t in df['cast']]
df['description'] = [re.sub(r'[^\w\s]', '', t) for t in df['description']]
df['title'] = [re.sub(r'[^\w\s]', '', t) for t in df['title']]

In [10]:
df['combined'] = df['listed_in'] + '' + df['cast'] + '' + df['title'] + '' + df['description']
df.drop(['listed_in', 'cast', 'description'], axis = 1, inplace = True)
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,combined
0,81145628,Movie,Norm of the North King Sized Adventure,"Richard Finn, Tim Maltby","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,Children Family Movies ComediesAlan Marriott ...
1,80117401,Movie,Jandino Whatever it Takes,,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,StandUp ComedyJandino AsporaatJandino Whatever...
2,70234439,TV Show,Transformers Prime,,United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids TVPeter Cullen Sumalee Montano Frank Welk...
3,80058654,TV Show,Transformers Robots in Disguise,,United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids TVWill Friedle Darren Criss Constance Zim...
4,80125979,Movie,realityhigh,Fernando Lebrija,United States,"September 8, 2017",2017,TV-14,99 min,ComediesNesta Cooper Kate Walsh John Michael H...


In [11]:
#Content Similarity
vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(df['combined'])
cosine_similarities = linear_kernel(matrix, matrix)
movie_title = df['title']
indices = pd.Series(df.index, index = df['title'])

In [12]:
#recommendation model
def content_recommender(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return movie_title.iloc[movie_indices]

In [13]:
content_recommender('The Crown')

369     Witches A Century of Murder
5474                       Lovesick
5308                     Versailles
5411                   Intelligence
1753                     Collateral
5613                     Hinterland
4913              Gangs of Hassepur
2277                  Kiss Me First
5542                        Flowers
5088                  The Blacklist
2980             Black Earth Rising
3969                 Watership Down
5276               Call the Midwife
5639                         Cuckoo
4785                 Trio and a Bed
1829                     London Spy
2918                        Dracula
1354    Sommore The Reign Continues
5292                      Dads Army
3071                 La Viuda Negra
5624    Roman Empire Reign of Blood
1245                   Gosford Park
3915    The Real Football Factories
5060                  Ripper Street
1866                       Traitors
5185      The End of the Fing World
3979           Single Ladies Senior
157                         

In [14]:
title = 'The Crown'
suggestions = content_recommender(title)

suggestions_df = pd.DataFrame(data=suggestions)
suggestions_df.to_csv('suggestions_based_on_%s.csv'%title,index=False,header=False)