In [None]:
# install required packages

#!pip install --upgrade pip
#!pip install ydata_profiling

In [16]:
# Importing libraries
import pandas as pd
from ydata_profiling import ProfileReport

import nltk
import re
import numpy as np
nltk.download('stopwords')
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ajaykumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ajaykumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Dataset - https://www.kaggle.com/datasets/shivamb/amazon-prime-movies-and-tv-shows
# load the dataset
df = pd.read_csv('amazon_primevideo.csv')

In [None]:
# ydata_profiling
profile = ProfileReport(df, title="Report")
profile

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       9668 non-null   object
 1   type          9668 non-null   object
 2   title         9668 non-null   object
 3   director      7586 non-null   object
 4   cast          8435 non-null   object
 5   country       672 non-null    object
 6   date_added    155 non-null    object
 7   release_year  9668 non-null   int64 
 8   rating        9331 non-null   object
 9   duration      9668 non-null   object
 10  listed_in     9668 non-null   object
 11  description   9668 non-null   object
dtypes: int64(1), object(11)
memory usage: 906.5+ KB


In [4]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [41]:
df = df[['title', 'director', 'cast', 'listed_in', 'description', 'type']]
df['title_description'] = df['title'].map(str) + ' ' + df['description']+ ' ' + df['cast']
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6952 entries, 0 to 9667
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              6952 non-null   object
 1   director           6952 non-null   object
 2   cast               6952 non-null   object
 3   listed_in          6952 non-null   object
 4   description        6952 non-null   object
 5   type               6952 non-null   object
 6   title_description  6952 non-null   object
dtypes: object(7)
memory usage: 434.5+ KB


In [42]:
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

norm_corpus = normalize_corpus(list(df['title_description']))

len(norm_corpus)

6952

In [51]:
norm_corpus

array(['grand seduction small fishing village must procure local doctor secure lucrative business contract unlikely candidate big city doctor paul lewis lands lap trial residence townsfolk rally together charm staying doctors time village winds close acting mayor murray french choice pull stops brendan gleeson taylor kitsch gordon pinsent',
       'take care good night metro family decides fight cyber criminal threatening stability pride mahesh manjrekar abhay mahajan sachin khedekar',
       'secrets deception man discovers wife cheating neighborhood kid goes furious path selfdestruction tom sizemore lorenzo lamas robert lasardo richard jones yancey arias noel gugliemi',
       ...,
       'pride bowery new york city street principles get east side kid trouble civilian conservation corps camp leo gorcey bobby jordan',
       'outpost wartorn eastern europe worldweary group mercenaries discover longhidden secret abandoned wwii bunker ray stevenson julian wadham richard brake michael sm

In [43]:
# Extract TF-IDF Features

tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
tfidf_matrix = tf.fit_transform(norm_corpus)
tfidf_matrix.shape

(6952, 38170)

In [44]:
# Compute Pairwise Document Similarity

doc_sim = cosine_similarity(tfidf_matrix)
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6942,6943,6944,6945,6946,6947,6948,6949,6950,6951
0,1.0,0.0,0.0,0.021401,0.0,0.0,0.010557,0.005684,0.0,0.0,...,0.0,0.0,0.0,0.0,0.00755,0.0,0.0,0.010847,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.020262,0.015103,0.0,0.0,0.0,0.0,0.0,0.048734,0.0,0.0
2,0.0,0.0,1.0,0.0,0.006337,0.0,0.0,0.0,0.0,0.0,...,0.0,0.017409,0.0,0.022784,0.0,0.014781,0.023822,0.024565,0.012271,0.0
3,0.021401,0.0,0.0,1.0,0.0,0.004548,0.0,0.011435,0.0,0.0,...,0.0,0.028143,0.0,0.0,0.007387,0.0,0.0,0.016028,0.0,0.004158
4,0.0,0.0,0.006337,0.0,1.0,0.010594,0.007988,0.0,0.0,0.0,...,0.0,0.0,0.01162,0.0,0.0,0.012879,0.01688,0.0,0.0,0.046157


In [45]:
# Get List of Movie Titles

movies_list = df['title'].values
movies_list, movies_list.shape

(array(['The Grand Seduction', 'Take Care Good Night',
        'Secrets of Deception', ..., 'Pride Of The Bowery', 'Outpost',
        'Harry Brown'], dtype=object),
 (6952,))

In [46]:
# Find Top Similar Movies for a Sample Movie

# Find movie ID
movie_idx = np.where(movies_list == 'Monster Maker')[0][0]
movie_idx

4

In [47]:
# Get movie similarities
movie_similarities = doc_sim_df.iloc[movie_idx].values
movie_similarities

array([0.        , 0.        , 0.00633672, ..., 0.        , 0.        ,
       0.04615725])

In [48]:
# Get top 5 similar movie IDs
similar_movie_idxs = np.argsort(-movie_similarities)[1:6]
similar_movies = movies_list[similar_movie_idxs]
similar_movies

array(['The Night is Young', 'Matt Besser: Pot Humor', 'Gamera the Brave',
       'Some Freaks', 'Matt Braunger: Finally Live In Portland'],
      dtype=object)

In [49]:
for movie in movies_list:
    if 'Robocop' in movie:
        print(movie)

Robocop: Prime Directives-Resurrection
Robocop: Prime Directives-Crash and Burn
Robocop: Prime Directives-Dark Justice
Robocop: Prime Directives-Meltdown


In [50]:
def movie_recommender(movie_title, movies=movies_list, doc_sims=doc_sim_df):
    # find movie id
    movie_idx = np.where(movies == movie_title)[0][0]
    # get movie similarities
    movie_similarities = doc_sims.iloc[movie_idx].values
    # get top 5 similar movie IDs
    similar_movie_idxs = np.argsort(-movie_similarities)[1:6]
    # get top 5 movies
    similar_movies = movies[similar_movie_idxs]
    # return the top 5 movies
    return similar_movies

movie_recommender(movie_title='Robocop: Prime Directives-Meltdown')

array(['Robocop: Prime Directives-Resurrection',
       'Robocop: Prime Directives-Crash and Burn',
       'Robocop: Prime Directives-Dark Justice',
       'Morgan Murphy: Irish Goodbye', 'Coming 2 America'], dtype=object)