In [2]:
import pandas as pd
import numpy as np

# Load the cleaned movie metadata CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/metadata_clean.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995


 Our plot description-based recommender will take in a movie title as an argument and output a list of the movies whose plots are most similar to it.
#### STEPS
 - Obtain required data
 - Create TF-IDF vectors for each movie's plot description (a word's weight in a document is higher the more often it occurs in that document and the fewer documents it appears in)
 - Compute the pairwise cosine similarity score for every pair of movies (the higher the cosine score, the more similar the two documents are to each other)
 - Write the recommender function that takes in a movie title as an argument and outputs movies most similar to it based on the plot


In [4]:
# Preparing the data
# Load the original metadata file. low_memory=False makes pandas read the
# file in a single pass instead of chunks, which avoids mixed-type inference.
orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)

# Bring the plot overview and the movie id across to the cleaned frame.
# Column assignment aligns on the row index.
# NOTE(review): assumes df and orig_df share the same row index - confirm.
df['overview'] = orig_df['overview']
df['id'] = orig_df['id']

# Preview the enriched frame
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [5]:
# Creating the TF-IDF matrix
# Import TfidfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing overviews (NaN) with an empty string so the vectorizer
# is given text for those rows
df['overview'] = df['overview'].fillna('')

# Build a TF-IDF vectorizer that discards common English stop words
# (e.g. "this", "it", "the", "that")
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and transform them into the
# TF-IDF matrix (one row per overview, one column per vocabulary term)
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Display the dimensions of the TF-IDF matrix
tfidf_matrix.shape

(45466, 75827)

In [6]:
# Computing the cosine similarity score
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# TfidfVectorizer L2-normalises every row by default, so the plain dot
# product between two rows already equals their cosine similarity.
# The result is a dense square matrix with one row and one column per
# overview, so memory use grows quadratically with the number of movies.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [7]:
# Building the recommender function
# Construct a reverse mapping from movie title to row index.
indices = pd.Series(df.index, index=df['title'])

# Drop repeated titles, keeping the first occurrence. Series.drop_duplicates()
# would compare the *values* (the row indices, which are already unique) and
# therefore remove nothing; de-duplicating on the index guarantees that
# indices[title] yields a single row index rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]