In [26]:
import ast
import spacy
import numpy as np
import pandas as pd

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [6]:
# merging dataframe
print(f'Credits shape : {credits.shape}, Movies shape : {movies.shape}')
print(f'MergedDF shape : {movies.merge(credits, on="title").shape}')
movies = movies.merge(credits, on='title')

Credits shape : (4803, 4), Movies shape : (4803, 20)
MergedDF shape : (4809, 23)


In [7]:
# required columns
required_col_list = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = movies[required_col_list]

In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


In [9]:
# missing data
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [10]:
# drop the missing data
movies.dropna(inplace=True)

In [11]:
# check for duplicated rows
movies.duplicated().sum()

0

In [15]:
def convert(obj):
    L = []
    for dct in ast.literal_eval(obj):
        L.append(dct["name"])
    return L


def convert_cast(obj):
    L = []
    for dct in ast.literal_eval(obj)[:3]:
        L.append(dct["name"])
    return L


def fetch_director(obj):
    L = []
    for dct in ast.literal_eval(obj):
        if dct["job"] == "Director":
            L.append(dct["name"])
    return L

In [16]:
movies['genres list'] = movies.genres.apply(convert)

In [17]:
movies['kwds list'] = movies.keywords.apply(convert)

In [18]:
movies['top cast'] = movies.cast.apply(convert_cast)

In [19]:
movies['director'] = movies.crew.apply(fetch_director)

In [20]:
movies['overview list'] = movies.overview.apply(lambda x : x.split())

In [21]:
movies['title list'] = movies.title.apply(lambda x : x.split())

In [22]:
# remove space between words
rem_space = lambda x : [i.replace(' ', '') for i in x]

In [23]:
# remove spaces from these columns
movies['top cast'] = movies['top cast'].apply(rem_space)
movies.director = movies.director.apply(rem_space)
movies['genres list'] = movies['genres list'].apply(rem_space)
movies['kwds list'] = movies['kwds list'].apply(rem_space)

In [25]:
# stemming the words
nlp = spacy.load('en_core_web_sm')

In [27]:
def stem(text):
    y = []
    text = ' '.join(text)
    doc = nlp(text)
    for token in doc:
        y.append(token.lemma_)
    
    return y

In [28]:
import time
start_time = time.time()
movies['half tags'] = movies['overview list'] + movies['genres list'] + movies['kwds list']
movies['half tags'] = movies['half tags'].apply(stem)
end_time = time.time()
print(f'Time : {end_time - start_time}')

Time : 118.82698154449463


In [29]:
# make new column 'tags'
movies['tags'] = movies['title list'] + movies['top cast'] + movies.director + movies['half tags']

In [30]:
df = movies.loc[:, ['movie_id', 'title', 'tags']]

In [31]:
# make lower case strings from list
df['tags'] = df['tags'].apply(lambda x : ' '.join(x).lower())

In [34]:
df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,avatar samworthington zoesaldana sigourneyweav...
1,285,Pirates of the Caribbean: At World's End,pirates of the caribbean: at world's end johnn...
2,206647,Spectre,spectre danielcraig christophwaltz léaseydoux ...
3,49026,The Dark Knight Rises,the dark knight rises christianbale michaelcai...
4,49529,John Carter,john carter taylorkitsch lynncollins samantham...


In [35]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(df.tags).toarray()
sim_mat = cosine_similarity(vectors)

In [44]:
def recommend(movie):
    idx = df[df['title'] == movie].index[0]
    dist = sim_mat[idx]
    sorted_idx = np.argsort(dist) # asc order
    rec_idx = sorted_idx[-6:] # last 5 indices
    for i in rec_idx[::-1]: # reverse to get the top 5
        print(df.title.iloc[i])

In [45]:
recommend("Harry Potter and the Philosopher's Stone")

Harry Potter and the Philosopher's Stone
Harry Potter and the Chamber of Secrets
Harry Potter and the Goblet of Fire
Harry Potter and the Prisoner of Azkaban
Harry Potter and the Order of the Phoenix
Harry Potter and the Half-Blood Prince


In [46]:
recommend('Inception')

Inception
Minority Report
Duplex
The Helix... Loaded
Transformers: Revenge of the Fallen
Looper


In [39]:
# import pickle
# pickle.dump(sim_mat, open('sim_mat_spacy.pkl', 'wb'))