In [1]:
import pandas as pd
import numpy as np

# import data from the clean file
df = pd.read_csv('../data/metadata_clean.csv')
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995


 Our plot description-based recommender will take in a movie title as an argument and output a list of movies that are most similar based on their plots
#### STEPS
 - Obtain required data
 - Create TF-IDF vectors (weight of a word in a document is greater if it occurs more frequently in that document and is present in fewer documents) for plot description
 - Compute the pairwise cosine similarity score of every movie (The higher the cosine score, the more similar the documents are to each other)
 - Write the recommender function that takes in a movie title as an argument and outputs movies most similar to it based on the plot


In [2]:
# Preparing the data
# import the original data
orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)

# add the useful features into the cleaned df
df['overview'], df['id'] = orig_df['overview'], orig_df['id']

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [3]:
# creating tf-idf matrix
# Import TfIdfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF vectorizer object. remove all the english stop words(common words like this, it, the,that etc)
tfidf = TfidfVectorizer(stop_words='english')

# replace NaN with an emoty string
df['overview'] = df['overview'].fillna('')

# construct the required TF-IDF matrix by applying the fit_transformation method on the overview feature
tfidf_matrix = tfidf.fit_transform(df['overview'])

# output the shape of the tfidf_matrix.shape
tfidf_matrix.shape

(45466, 75827)

In [4]:
#  Computing the cosine summary score
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [5]:
# Building the recommender function
# construct a reverse mapping of indices and movie titles and drop duplicate title if any
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

### Steps to build our recommender function
 - Declare the title of the movie as an argument
 - Obtain movie index from indices reverse mapping
 - Get the list of cosine similarity scores using cosine_sim. Convert this into a list of tuples where the first element is the position and the second is the similarity score
 - Sort this list of tuples on the basis of the cosine similarity scores
 - Get the top 10 elements of this list. Ignore the first element as it is the movie itself
 - Return the titles corresponding to the indices of the top 10 elements excluding the first

In [6]:
# Function that takes in movie title as input and gives recommendations
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    # Obtain the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwise similarity score of all movies with that movie
    # and convert it into a list of tuples
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the movies indices
    movies_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return df['title'].iloc[movies_indices]

In [7]:
# recommendations for the lion king
content_recommender('The Lion King')

359                                    The Lion King
34682    How the Lion Cub and the Turtle Sang a Song
9353                                The Lion King 1½
9115                  The Lion King 2: Simba's Pride
42829                                           Prey
                            ...                     
45461                                         Subdue
45462                            Century of Birthing
45463                                       Betrayal
45464                               Satan Triumphant
45465                                       Queerama
Name: title, Length: 45466, dtype: object

In [8]:
# recommendations for Toy Story
content_recommender('Toy Story')

0                     Toy Story
15348               Toy Story 3
2997                Toy Story 2
10301    The 40 Year Old Virgin
24523                 Small Fry
                  ...          
45460                Robin Hood
45461                    Subdue
45462       Century of Birthing
45463                  Betrayal
45465                  Queerama
Name: title, Length: 45466, dtype: object

There we go!! First content-based recommender🥳

## Metadata-based recommender

We will use the following metadata
 - The genre
 - The director
 - The 3 main characters
 - Sub-genres/keywords

In [9]:
# load the keyword and credits files
cred_df = pd.read_csv('../data/credits.csv')
key_df = pd.read_csv('../data/keywords.csv')

cred_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [10]:
# keywords dataframe
key_df.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
