In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [6]:
# Load the two TMDB CSV files from the current working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')    # one row of metadata per movie
credits = pd.read_csv('tmdb_5000_credits.csv')  # cast / crew, keyed by movie id

In [7]:
# Show the column names of the credits table (note the key is called 'movie_id' here).
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [8]:
# Show the column names of the movies table (its key column is 'id').
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [9]:
# Rename the credits key ('movie_id' -> 'id') so both tables share the join column.
credits.columns = ['id', 'title', 'cast', 'crew']

# Attach cast and crew to every movie.
# - Only 'id', 'cast' and 'crew' are taken from credits: both tables carry a
#   'title' column, and merging it as well would turn movies['title'] into
#   'title_x' / 'title_y'.
# - how='left' keeps every movie row, in its original order.
# - Dropping any existing 'cast' / 'crew' first makes the cell safe to re-run.
movies = (
    movies
    .drop(columns=['cast', 'crew'], errors='ignore')
    .merge(credits[['id', 'cast', 'crew']], on='id', how='left')
)

In [10]:
# Preview the first five plot descriptions (head() returns 5 rows by default).
movies['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [11]:
# Movies without a description get an empty string instead of NaN,
# so later text processing only ever sees strings in this column.
movies.loc[movies['overview'].isna(), 'overview'] = ''

In [12]:
def create_soup(x):
    """Combine a movie's keywords, genres and overview into one text string.

    The result (the "soup") is the document that gets vectorised to compare
    movies with each other.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide the keys 'keywords', 'genres' and 'overview'. Each value
        may be a string (used as is), a list/tuple/set of items (joined with
        spaces), or a missing value (None / NaN, treated as empty).

    Returns
    -------
    str
        The non-empty fields separated by single spaces, in the order
        keywords, genres, overview.
    """
    def _as_text(value):
        # Strings are used verbatim: ' '.join() on a str would insert a space
        # between every character.
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple, set)):
            return ' '.join(str(item) for item in value)
        # None / NaN (NaN is the only float that differs from itself).
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    parts = [_as_text(x[field]) for field in ('keywords', 'genres', 'overview')]
    # A space between fields keeps the last word of one field from fusing
    # with the first word of the next.
    return ' '.join(part for part in parts if part)

# Call create_soup once per row (axis='columns' hands each row to the function)
# and store the resulting text in a new 'soup' column.
movies['soup'] = movies.apply(func=create_soup, axis='columns')

In [13]:
# stop_words='english' removes common English stop words ("the", "and", ...)
# before weighting; it does not filter out text written in other languages.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the 'soup' column and turn every movie into a
# TF-IDF vector (one row per movie, one column per vocabulary term).
tfidf_matrix = tfidf.fit_transform(raw_documents=movies['soup'])

# (number of movies, number of distinct terms)
tfidf_matrix.shape

(4803, 32768)

In [14]:
# Pairwise similarity between all movies. TfidfVectorizer L2-normalises each
# row by default, so the plain dot product computed by linear_kernel is
# already the cosine similarity.
consine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [15]:
# Reverse lookup: movie title -> row label in `movies`.
indices = pd.Series(movies.index, index=movies['title'])
# Keep only the first row for each title. Series.drop_duplicates() would not
# do this: it de-duplicates the *values* (the row labels, which are already
# unique), leaving repeated titles in the index so that indices[title] would
# return several rows instead of a single label.
indices = indices[~indices.index.duplicated(keep='first')]

In [20]:
def get_recommendation(title, consine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title, looked up in the module-level ``indices`` Series.
    consine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie, in the same order as the rows of ``movies``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of ``movies['title']`` for the ``top_n`` highest-scoring
        movies, most similar first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"No movie titled {title!r} in the dataset")

    # Row position of the movie that matches the title.
    idx = indices[title]
    # If several movies share the title the lookup yields a Series;
    # use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie with its similarity score. The queried movie is
    # excluded by position rather than by assuming it sorts first: with tied
    # scores (or an all-zero vector) it is not guaranteed to be at the top.
    sim_scores = [
        (position, score)
        for position, score in enumerate(consine_sim[idx])
        if position != idx
    ]
    # Highest score first; the sort is stable, so ties keep dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    return movies['title'].iloc[movie_indices]

In [21]:
# Example: the ten movies whose text is most similar to "The Avengers".
get_recommendation(title='The Avengers', consine_sim=consine_sim)

7                  Avengers: Age of Ultron
26              Captain America: Civil War
64                       X-Men: Apocalypse
242                         Fantastic Four
511                                  X-Men
79                              Iron Man 2
85     Captain America: The Winter Soldier
169     Captain America: The First Avenger
182                                Ant-Man
68                                Iron Man
Name: title, dtype: object