## Content-Based Recommender

In [761]:
import numpy as np
import pandas as pd

import re
import string

import nltk
from rake_nltk import Rake #for extracting significant keywords in the entire text available.
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abdas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abdas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abdas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [762]:
# Load the movie metadata CSV; the path is relative to the notebook's working directory.
netflixData = pd.read_csv('data/netflix dataset.csv')
# Display the last rows as a quick check of what was loaded.
netflixData.tail()

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
245,246,The Lost Weekend,1945,NOT RATED,01 Jan 1946,101 min,"Drama, Film-Noir",Billy Wilder,"Charles R. Jackson (from the novel by), Charle...","Ray Milland, Jane Wyman, Phillip Terry, Howard...",...,,,,,http://www.rottentomatoes.com/m/lost_weekend/,06 Feb 2001,,Paramount Pictures,,True
246,247,Short Term 12,2013,R,23 Aug 2013,96 min,Drama,Destin Daniel Cretton,Destin Daniel Cretton,"Brie Larson, John Gallagher Jr., Stephanie Bea...",...,,,,,http://www.rottentomatoes.com/m/short_term_12_...,14 Jan 2014,,Cinedigm,http://shortterm12.com,True
247,248,His Girl Friday,1940,APPROVED,18 Jan 1940,92 min,"Comedy, Drama, Romance",Howard Hawks,"Charles Lederer (screen play), Ben Hecht (from...","Cary Grant, Rosalind Russell, Ralph Bellamy, G...",...,,,,,http://www.rottentomatoes.com/m/his_girl_friday/,28 Dec 2004,,Columbia Pictures,,True
248,249,The Straight Story,1999,G,03 Nov 1999,112 min,"Biography, Drama",David Lynch,"John Roach, Mary Sweeney","Sissy Spacek, Jane Galloway Heitz, Joseph A. C...",...,,,,,http://www.rottentomatoes.com/m/straight_story/,07 Nov 2000,,Buena Vista Pictures,http://disney.go.com/DisneyPictures/straightst...,True
249,250,Slumdog Millionaire,2008,R,25 Dec 2008,120 min,Drama,"Danny Boyle, Loveleen Tandan","Simon Beaufoy (screenplay), Vikas Swarup (novel)","Dev Patel, Saurabh Shukla, Anil Kapoor, Raj Zu...",...,,,,,http://www.rottentomatoes.com/m/slumdog_millio...,31 Mar 2009,"$141,243,551",Fox Searchlight Pictures,http://www.foxsearchlight.com/slumdogmillionaire/,True


In [763]:
# List every column present in the loaded data.
netflixData.columns

Index(['Unnamed: 0', 'Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre',
       'Director', 'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards',
       'Poster', 'Ratings.Source', 'Ratings.Value', 'Metascore', 'imdbRating',
       'imdbVotes', 'imdbID', 'Type', 'tomatoMeter', 'tomatoImage',
       'tomatoRating', 'tomatoReviews', 'tomatoFresh', 'tomatoRotten',
       'tomatoConsensus', 'tomatoUserMeter', 'tomatoUserRating',
       'tomatoUserReviews', 'tomatoURL', 'DVD', 'BoxOffice', 'Production',
       'Website', 'Response'],
      dtype='object')

In [764]:
# Keep only the text columns that feed the bag of words.
# .copy() makes this an independent frame, so the column assignments in later
# cells write to it directly instead of to a slice of the loaded data
# (which is what triggers pandas' SettingWithCopyWarning).
netflixData = netflixData[['Title', 'Genre', 'Director', 'Actors', 'Plot']].copy()
netflixData.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


## Cleaning Data

In [765]:
# Count missing values per column.
netflixData.isna().sum()

Title       0
Genre       0
Director    0
Actors      0
Plot        0
dtype: int64

### PreProcessing Data
##### Clean each column and join them into a single bag of words per movie, to be used for cosine similarity

In [766]:
def preprocess(data: pd.Series) -> pd.Series:
    """Normalise a comma-separated text column into space-separated tokens.

    Each value is lower-cased, then every character other than ASCII letters,
    digits and commas is removed, so the words of one entry fuse into a single
    token (e.g. "Frank Darabont" -> "frankdarabont"). Finally the commas are
    replaced by spaces. Non-ASCII letters are removed too ("Léon" -> "lon").

    Parameters
    ----------
    data : pd.Series
        Column of strings such as "Crime, Drama". Missing values are allowed.

    Returns
    -------
    pd.Series
        Cleaned strings with the same index as ``data``; missing values
        become the empty string instead of raising.
    """
    # fillna first: calling .lower() on NaN/None is what used to crash.
    text = data.fillna('').astype(str).str.lower()
    # Commas survive this step because they still separate the entries.
    text = text.str.replace('[^0-9a-zA-Z,]+', '', regex=True)
    # Replacing commas is a no-op when there are none, so no guard is needed.
    return text.str.replace(',', ' ', regex=False)

In [767]:
# Clean the genre column in place of the raw text, then display it to check the result.
netflixData['Genre'] = preprocess(netflixData['Genre'])
netflixData['Genre']

0               crime drama
1               crime drama
2               crime drama
3        action crime drama
4               crime drama
               ...         
245          drama filmnoir
246                   drama
247    comedy drama romance
248         biography drama
249                   drama
Name: Genre, Length: 250, dtype: object

In [768]:
# Clean the director column in place of the raw text, then display it to check the result.
netflixData['Director'] = preprocess(netflixData['Director'])
netflixData['Director']

0                  frankdarabont
1             francisfordcoppola
2             francisfordcoppola
3               christophernolan
4                    sidneylumet
                 ...            
245                  billywilder
246          destindanielcretton
247                  howardhawks
248                   davidlynch
249    dannyboyle loveleentandan
Name: Director, Length: 250, dtype: object

In [769]:
# Clean the actors column in place of the raw text, then display it to check the result.
netflixData['Actors'] = preprocess(netflixData['Actors'])
netflixData['Actors']

0       timrobbins morganfreeman bobgunton williamsadler
1      marlonbrando alpacino jamescaan richardscastel...
2         alpacino robertduvall dianekeaton robertdeniro
3      christianbale heathledger aaroneckhart michael...
4           martinbalsam johnfiedler leejcobb egmarshall
                             ...                        
245      raymilland janewyman phillipterry howarddasilva
246    brielarson johngallagherjr stephaniebeatriz ra...
247    carygrant rosalindrussell ralphbellamy geneloc...
248    sissyspacek janegallowayheitz josephacarpenter...
249          devpatel saurabhshukla anilkapoor rajzutshi
Name: Actors, Length: 250, dtype: object

In [770]:
def preprocessStemmer(words):
    """Return a list with the English Snowball stem of each item in ``words``, in order."""
    stem = SnowballStemmer("english").stem
    return list(map(stem, words))
    

In [771]:
def preprocessLemmatisation(words):
    """Return a list with the WordNet lemma of each item in ``words``, in order."""
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words))

In [772]:
def removePunctuations(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

In [773]:
def preProcessPlot(words):
    """Stem ``words``, lemmatise the stems, and return them joined by single spaces."""
    return " ".join(preprocessLemmatisation(preprocessStemmer(words)))

In [774]:
# Build the Keywords_Plot column from the Plot text, then drop the raw Plot column.
stop_words = set(stopwords.words('english'))

# Collect the results in a list and assign the column once. Writing to `row`
# inside DataFrame.iterrows() is not guaranteed to reach the frame, because
# iterrows may hand back copies of each row.
keywords_plot = []
for plot in netflixData['Plot']:
    r = Rake(stopwords=stop_words)
    r.extract_keywords_from_text(removePunctuations(plot))

    # get_word_degrees() maps each keyword to its degree; only the words are kept.
    # (An alternative source of words is r.get_ranked_phrases(), split on whitespace.)
    key_words = list(r.get_word_degrees().keys())
    keywords_plot.append(preProcessPlot(key_words))

# assign/drop return a new frame, so nothing is mutated in place.
netflixData = netflixData.assign(Keywords_Plot=keywords_plot).drop(columns=['Plot'])

In [775]:
# Check the frame now that Keywords_Plot has replaced Plot.
netflixData.head()

Unnamed: 0,Title,Genre,Director,Actors,Keywords_Plot
0,The Shawshank Redemption,crime drama,frankdarabont,timrobbins morganfreeman bobgunton williamsadler,two imprison men bond number year find solac e...
1,The Godfather,crime drama,francisfordcoppola,marlonbrando alpacino jamescaan richardscastel...,age patriarch organ crime dynasti transfer con...
2,The Godfather: Part II,crime drama,francisfordcoppola,alpacino robertduvall dianekeaton robertdeniro,earli life career vito corleon 1920s new york ...
3,The Dark Knight,action crime drama,christophernolan,christianbale heathledger aaroneckhart michael...,menac known joker emerg mysteri past wreak hav...
4,12 Angry Men,crime drama,sidneylumet,martinbalsam johnfiedler leejcobb egmarshall,juri holdout attempt prevent miscarriag justic...


In [776]:
# Use the movie title as the row label so that movies can be looked up by name.
netflixData = netflixData.set_index('Title')
netflixData.head()

Unnamed: 0_level_0,Genre,Director,Actors,Keywords_Plot
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,crime drama,frankdarabont,timrobbins morganfreeman bobgunton williamsadler,two imprison men bond number year find solac e...
The Godfather,crime drama,francisfordcoppola,marlonbrando alpacino jamescaan richardscastel...,age patriarch organ crime dynasti transfer con...
The Godfather: Part II,crime drama,francisfordcoppola,alpacino robertduvall dianekeaton robertdeniro,earli life career vito corleon 1920s new york ...
The Dark Knight,action crime drama,christophernolan,christianbale heathledger aaroneckhart michael...,menac known joker emerg mysteri past wreak hav...
12 Angry Men,crime drama,sidneylumet,martinbalsam johnfiedler leejcobb egmarshall,juri holdout attempt prevent miscarriag justic...


In [777]:
# Join every column of each row into one space-separated string and keep
# only that string, in a single 'Bag_of_words' column.
netflixData = (
    netflixData
    .astype(str)
    .apply(' '.join, axis=1)
    .to_frame(name='Bag_of_words')
)

netflixData.head()

Unnamed: 0_level_0,Bag_of_words
Title,Unnamed: 1_level_1
The Shawshank Redemption,crime drama frankdarabont timrobbins morganfre...
The Godfather,crime drama francisfordcoppola marlonbrando al...
The Godfather: Part II,crime drama francisfordcoppola alpacino robert...
The Dark Knight,action crime drama christophernolan christianb...
12 Angry Men,crime drama sidneylumet martinbalsam johnfiedl...


## Creating the recommender

### 1. Compute TF-IDF matrix

In [778]:
# Learn the vocabulary from the bags of words and build the TF-IDF matrix
# (one row per entry of netflixData['Bag_of_words'], default vectorizer settings).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(netflixData['Bag_of_words'])

### 2. Compute pairwise similarity (cosine similarity as an example)

In [779]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
cosine_similarities = cosine_similarity(tfidf_matrix)

In [780]:
# Display the similarity matrix.
cosine_similarities

array([[1.        , 0.02867054, 0.02528571, ..., 0.00375638, 0.00393635,
        0.00360676],
       [0.02867054, 1.        , 0.18356229, ..., 0.00402013, 0.00421274,
        0.00386001],
       [0.02528571, 0.18356229, 1.        , ..., 0.00354552, 0.00371539,
        0.0034043 ],
       ...,
       [0.00375638, 0.00402013, 0.00354552, ..., 1.        , 0.00395914,
        0.00362764],
       [0.00393635, 0.00421274, 0.00371539, ..., 0.00395914, 1.        ,
        0.00380145],
       [0.00360676, 0.00386001, 0.0034043 , ..., 0.00362764, 0.00380145,
        1.        ]])

In [781]:
# With metric='cosine' the model ranks neighbours by cosine distance, i.e.
# 1 - cosine similarity, so the nearest neighbours are the rows with the
# highest cosine similarity.
model_knn = NearestNeighbors( algorithm='auto', metric='cosine', n_jobs=-1)
model_knn.fit(tfidf_matrix)

### 3. Generate recommendations

In [782]:
def get_recommendations(title, model, data, k=10, features=None):
    """Return the titles of up to ``k`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Label to look up in ``data.index``.
    model : fitted neighbours model
        Object exposing ``kneighbors(X, n_neighbors=...)`` that returns
        ``(distances, indices)``; it must have been fitted on ``features``.
    data : pd.DataFrame
        Frame whose index holds the titles, in the same row order as ``features``.
    k : int, default 10
        Maximum number of recommendations to return.
    features : matrix, optional
        Row-per-movie feature matrix the model was fitted on. When omitted,
        the notebook-level ``tfidf_matrix`` is used, as before.

    Returns
    -------
    list
        At most ``k`` titles, nearest first, never including the queried row.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``data.index``.
    """
    if features is None:
        features = tfidf_matrix

    loc = data.index.get_loc(title)
    if isinstance(loc, (int, np.integer)):
        idx = int(loc)
    else:
        # Duplicate titles make get_loc return a slice or boolean mask;
        # use the first matching row.
        idx = int(np.arange(len(data.index))[loc][0])

    # Ask for one extra neighbour because the query row is normally its own
    # nearest neighbour, but never for more rows than exist.
    n_neighbors = min(k + 1, len(data.index))
    # features[[idx]] keeps the query two-dimensional for sparse and dense input alike.
    _, indices = model.kneighbors(features[[idx]], n_neighbors=n_neighbors)

    neighbours = [i for i in indices.flatten() if i != idx]
    # If ties pushed the query row out of the result, all k + 1 neighbours
    # survive the filter above, so trim to at most k.
    return [data.index[i] for i in neighbours[:k]]

In [783]:
# Example: the 10 nearest neighbours of 'The Godfather' according to the KNN model.
recommendations = get_recommendations('The Godfather', model_knn, netflixData, 10)
recommendations

[[0.         0.81643771 0.89797027 0.90973238 0.91756473 0.93010623
  0.93138775 0.93359952 0.93360037 0.93616752 0.94187685]]


['The Godfather: Part II',
 'Apocalypse Now',
 'On the Waterfront',
 'Scarface',
 'Heat',
 'Casino',
 'Léon: The Professional',
 'The Night of the Hunter',
 'Dogville',
 'Guardians of the Galaxy']

### 4. Fine-tuning the recommender

#### Using stemming and lemmatization to reduce words to their base forms