In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import ast # syntax processing
from sklearn.feature_extraction.text import CountVectorizer # text vectorization
from sklearn.metrics.pairwise import cosine_similarity # vector similarity
import os # navigate directory
from sklearn.neighbors import NearestNeighbors # proximity based classification
import joblib # save objects to disk

### Getting Data

In [2]:
# load the TMDB 5000 movies and credits CSV files; both paths are relative,
# so they resolve against the kernel's current working directory
movies = pd.read_csv('../datasets/tmdb_5000_movies.csv')
credits = pd.read_csv('../datasets/tmdb_5000_credits.csv')

In [3]:
# preview movies data
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
# check shape of movie dataset
movies.shape

(4803, 20)

In [5]:
# check columns in movies dataset
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [6]:
# preview credits dataset
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [7]:
# check shape of credits dataset
credits.shape

(4803, 4)

In [8]:
# check columns in credits dataset
credits.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [9]:
# give the movies key the same name the credits table uses (id -> movie_id)
movies = movies.rename(columns={'id': 'movie_id'})
# join movies with credits on the shared id and title columns
merged_data = pd.merge(movies, credits, on=['movie_id', 'title'])
merged_data.shape

(4803, 22)

In [10]:
# check data columns
merged_data.columns

Index(['budget', 'genres', 'homepage', 'movie_id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

In [11]:
merged_data.head(2)

Unnamed: 0,budget,genres,homepage,movie_id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [12]:
# columns kept for the recommender: identifiers plus descriptive metadata
selected_cols = [
    'movie_id', 'title', 'genres', 'overview',
    'keywords', 'tagline', 'cast', 'crew',
]

# restrict the merged frame to those columns
sel_data = merged_data.loc[:, selected_cols]

# preview the reduced frame
sel_data.head(2)

Unnamed: 0,movie_id,title,genres,overview,keywords,tagline,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Enter the World of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","At the end of the world, the adventure begins.","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


###  Data Cleaning

In [13]:
data = sel_data.copy()

In [14]:
# check data info
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   genres    4803 non-null   object
 3   overview  4800 non-null   object
 4   keywords  4803 non-null   object
 5   tagline   3959 non-null   object
 6   cast      4803 non-null   object
 7   crew      4803 non-null   object
dtypes: int64(1), object(7)
memory usage: 337.7+ KB


In [15]:
# check for duplicates
data.duplicated().any()

False

In [16]:
# check for missing values
data.isna().sum()

movie_id      0
title         0
genres        0
overview      3
keywords      0
tagline     844
cast          0
crew          0
dtype: int64

In [17]:
# drop the 'tagline' column (844 missing values) and then every row that still
# has a missing value -- after the column drop only 'overview' has any (3 rows).
# Written without inplace=True and with errors='ignore' so that re-running this
# cell does not raise KeyError once 'tagline' is already gone.
data = (
    data
    .drop(columns='tagline', errors='ignore')
    .dropna()
)

In [18]:
# # check value of genres
# sel_data['genres'][0]

In [19]:
# extracts values with 'name' key
def extract_name(text):
    """Return the 'name' value of every record in a stringified list of dicts.

    `text` is parsed with ast.literal_eval, so it must be a string holding a
    literal list whose items are dicts that each contain a 'name' key.
    """
    records = ast.literal_eval(text)
    return [record['name'] for record in records]

In [20]:
# # test extract_name function
# extract_name(sel_data['genres'][0])

In [21]:
# extract genre names
data['genres'] = data['genres'].apply(extract_name)
data.head(1)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [22]:
# # check value of keywords
# data['keywords'][0]

In [23]:
# extract keyword names
data['keywords'] = data['keywords'].apply(extract_name)
data.head(1)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [24]:
# # check value of cast
# data['cast'][0]

In [25]:
# get the top casts
def get_top_casts(text, num_casts=5):
    """Return the 'name' of the first `num_casts` records of a stringified cast list.

    `text` is parsed with ast.literal_eval; the records are taken in the order
    they appear in the list, and fewer than `num_casts` names are returned when
    the list is shorter.
    """
    leading = ast.literal_eval(text)[:num_casts]
    return [member['name'] for member in leading]

In [26]:
# # test get_top_casts
# get_top_casts(data['cast'][0])

In [27]:
# get top casts from cast data
data['cast'] = data['cast'].apply(get_top_casts)
data.head(1)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [28]:
# get the names of crew members whose 'job' is 'Director'
def get_director(x):
    """Return the 'name' of every crew record whose 'job' equals 'Director'.

    `x` is a stringified list of dicts (parsed with ast.literal_eval); each
    dict must provide 'job' and 'name' keys. Order of appearance is preserved.
    """
    directors = []
    for member in ast.literal_eval(x):
        if member['job'] == 'Director':
            directors.append(member['name'])
    return directors

In [29]:
# # test get_director
# get_director(data['crew'][0])

In [30]:
# get director from crew data
data['crew'] = data['crew'].apply(get_director)
data.head(1)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]


In [31]:
# remove spaces from every string in a list
def remove_space(x):
    """Return a new list in which each item of `x` has all ' ' characters removed."""
    return [item.replace(" ", "") for item in x]

In [32]:
# # test remove_space function
# remove_space(data['keywords'][0])

In [33]:
# strip the spaces inside every entry of the list-valued columns
for column in ('genres', 'keywords', 'cast', 'crew'):
    data[column] = data[column].apply(remove_space)

In [34]:
data.head(1)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","In the 22nd century, a paraplegic Marine is di...","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]


In [35]:
# join all features into one token list per movie. The overview is split into
# words on the fly instead of overwriting data['overview'], so this cell can be
# re-run without failing on an already-tokenised column.
data['tags'] = (
    data['overview'].apply(str.split)
    + data['genres']
    + data['keywords']
    + data['cast']
    + data['crew']
)

In [36]:
# # show created tags
# data['tags'][:3]

In [37]:
# get cleaned data
cleaned_data = data.drop(columns=['overview','genres','keywords','cast','crew'])
cleaned_data.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [38]:
# turn each token list into one space-separated string. Values that are already
# strings are left untouched: " ".join on a string would insert a space between
# every character, silently corrupting the tags if this cell is run twice.
cleaned_data['tags'] = cleaned_data['tags'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)
cleaned_data.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."


### Data Preprocessing

In [39]:
# get final dataset
final = cleaned_data.copy()

In [40]:
# define vectorizer: word-count features capped at 5000 vocabulary terms
# (max_features), with scikit-learn's built-in English stop-word list removed
vectorizer = CountVectorizer(max_features=5000,stop_words='english')

In [41]:
# get text as vectors: fit the vocabulary on the tags and count terms per movie;
# .toarray() converts the sparse matrix returned by fit_transform to a dense array
vectors = vectorizer.fit_transform(cleaned_data['tags']).toarray()

In [42]:
# check vector shape
vectors.shape

(4800, 5000)

### Model Building

In [43]:
# similarity = cosine_similarity(vectors)
# # check similarity
# similarity[0]

In [44]:
# define and fit model: brute-force nearest-neighbour search with the cosine
# metric; n_neighbors=20 is only the default used when a query does not pass
# its own n_neighbors
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
model.fit(vectors)

### Model Testing

In [45]:
# recommend the most similar vectors
def recommend(movie, n_recommendations=5):
    """Print the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Title to look up; it must match a value of final['title'] exactly.
        If several rows share the title, the first one is used.
    n_recommendations : int, default 5
        Maximum number of titles to print.

    Raises
    ------
    IndexError
        If no row of `final` has the given title.
    """
    matches = final.index[final['title'] == movie]
    if len(matches) == 0:
        raise IndexError(f"movie {movie!r} not found in the dataset")
    # translate the index label into a row position: rows were dropped without
    # resetting the index, so labels and positions in `vectors` can differ
    loc = final.index.get_loc(matches[0])
    # ask for one extra neighbour because the query movie is normally among the
    # results, but never for more rows than exist
    k = min(n_recommendations + 1, len(final))
    neighbors = model.kneighbors(vectors[loc].reshape(1, -1), n_neighbors=k)[1]
    # remove the query by position instead of assuming it comes back first:
    # movies with identical vectors tie at distance 0 and may precede it
    similar = [i for i in neighbors.flatten() if i != loc][:n_recommendations]
    for i in similar:
        print(final.iloc[i]['title'])

In [46]:
# test model
recommend('Spider-Man 3') # recommend science fiction-adventure movies

Spider-Man 2
Spider-Man
The Amazing Spider-Man 2
The Amazing Spider-Man
Arachnophobia


### Save Model Artifacts

In [47]:
# get movie_id and title
movies_data = final[['movie_id', 'title']]
movies_data.head()

Unnamed: 0,movie_id,title
0,19995,Avatar
1,285,Pirates of the Caribbean: At World's End
2,206647,Spectre
3,49026,The Dark Knight Rises
4,49529,John Carter


In [48]:
# create the directories the artifacts are written to. The names must match the
# dump paths ('../model/...', '../features/...') exactly, including case: on a
# case-sensitive filesystem '../Model' and '../model' are different directories.
paths = ['../model', '../features']
for path in paths:
    # exist_ok=True makes the cell safe to re-run
    os.makedirs(path, exist_ok=True)

In [49]:
# save artifacts; every file is opened in a `with` block so its handle is
# flushed and closed as soon as the dump finishes instead of being left open
with open('../model/model', 'wb') as model_file:
    joblib.dump(model, model_file, compress=1)
with open('../features/vectors', 'wb') as vectors_file:
    joblib.dump(vectors, vectors_file, compress=1)
with open('../features/data', 'wb') as data_file:
    joblib.dump(movies_data, data_file)