In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Notebook-wide display defaults: show up to 100 columns/rows and 3 decimal places.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('display.precision', 3)

# Plot styling shared by every figure in the notebook.
sns.set_style('darkgrid')
plt.rcParams.update({
    'figure.figsize': [7, 5],
    'figure.titlesize': 15,
})

%matplotlib inline

# Loading The Files

In [3]:
# Load 'Movies' file
movies = pd.read_csv(
    'ml-latest/movies_metadata.csv.zip',
    usecols=['budget', 'genres', 'id', 'original_language', 'original_title',
             'popularity', 'production_companies', 'production_countries',
             'release_date', 'runtime'],
    low_memory=False,
)

# 'id' is read as text; replacing '-' with '0' lets every value be cast to int.
# NOTE(review): this turns date-like strings into large bogus ids (values such as
# 1997008020 show up further down) -- presumably malformed rows; confirm and
# consider dropping them instead.
movies['id'] = movies['id'].str.replace('-', '0').astype('int')

# Unparseable release dates become NaT, so 'year'/'month' are NaN for those rows.
release_date = pd.to_datetime(movies['release_date'], errors='coerce')
movies['year'] = release_date.dt.year
movies['month'] = release_date.dt.month

# Keep the first row seen for every id.
movies = movies.drop(columns=['release_date']).drop_duplicates(subset='id')

# Load 'Credits' file
credits = pd.read_csv('ml-latest/credits.csv.zip', usecols=['id', 'crew', 'cast'])
credits = credits.drop_duplicates(subset='id')

# Load 'keywords' file
keywords = pd.read_csv('ml-latest/keywords.csv')
keywords = keywords.drop_duplicates(subset='id')

# Load 'tags' file
tags = pd.read_csv('ml-latest/tags.csv', usecols=['movieId', 'tag'])

# Load 'links' file
links = pd.read_csv('ml-latest/links.csv')
# Rows without a tmdbId cannot be joined to the metadata. The explicit copy makes
# the following column assignment operate on an independent frame, which avoids
# pandas' SettingWithCopyWarning.
links = links.loc[links['tmdbId'].notnull()].copy()
links['tmdbId'] = links['tmdbId'].astype('int')

In [4]:
# Keep only the descriptive columns; the numeric/date columns are dropped here.
clean_movies = movies.drop(columns=['budget', 'popularity', 'runtime', 'year', 'month'])

# dropna=False counts a missing id as one value, exactly like len(unique()).
print(f"Number of unique movies: {clean_movies['id'].nunique(dropna=False)}")
clean_movies.head()

Number of unique movies: 45436


Unnamed: 0,genres,id,original_language,original_title,production_companies,production_countries
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,Toy Story,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,Jumanji,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o..."
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,Grumpier Old Men,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o..."
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,Waiting to Exhale,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o..."
4,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Father of the Bride Part II,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o..."


In [5]:
# Distinct ids in the credits table (a missing id would count once), then a preview.
print(f"Number of unique movies: {credits['id'].nunique(dropna=False)}")
credits.head()

Number of unique movies: 45432


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [6]:
# Ids present in the movies table that have no row in the credits table.
# A plain set difference is used: the symmetric difference (^) would also list
# ids that exist only in 'credits', which contradicts the printed message.
missing_credit_ids = set(clean_movies['id']) - set(credits['id'])

print("Movies appearing in 'clean_movies', but not in 'credits': ")
print(missing_credit_ids)

Movies appears in 'celan_movies', but not in 'credits': 
{401840, 2012009029, 2014001001, 1997008020}


# Merging

In [7]:
# Inner-join the movie descriptors with credits and keywords on the shared 'id'.
metadata = clean_movies.merge(credits, on='id').merge(keywords, on='id')

# Match 'id' against links.tmdbId; this brings the 'movieId' column into the frame.
metadata_links = metadata.merge(links, left_on='id', right_on='tmdbId')

# Collapse the tags table to one row per movieId holding the list of all its tags.
tags_per_movie = tags.groupby('movieId')['tag'].apply(list).reset_index()

# Attach the tag lists, then discard the id columns that came from the links table.
metadata_all = (
    metadata_links
    .merge(tags_per_movie, on='movieId')
    .drop(columns=['tmdbId', 'movieId', 'imdbId'])
)

# Data Cleaning

In [8]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : iterable of dict
        Crew records. Every inspected record must have a 'job' key, and the
        matching record must have a 'name' key.

    Returns
    -------
    The 'name' of the first record with ``job == 'Director'``, or ``np.nan``
    when no record matches.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

In [9]:
def get_list(x):
    """Return the 'name' of at most the first three records in ``x``.

    Parameters
    ----------
    x : object
        Expected to be a list of dicts that each have a 'name' key.

    Returns
    -------
    list
        Up to three names in their original order, or an empty list when ``x``
        is not a list (missing/malformed data).
    """
    if not isinstance(x, list):
        return []

    # Names are collected for every record first and truncated afterwards.
    return [entry['name'] for entry in x][:3]

In [35]:
def create_soup(x, dir_weight=0):
    """Build the space-separated text document ("soup") for one movie row.

    The list-valued fields are joined in a fixed order (keywords, cast, tag,
    genres, production_companies, production_countries) and the director
    string is appended five times.

    Parameters
    ----------
    x : mapping
        Row with the six list-of-str fields above and a str 'director' field.
    dir_weight : int, default 0
        NOTE(review): accepted but not used -- the director repetition count is
        fixed at five regardless of this value.

    Returns
    -------
    str
    """
    list_fields = ('keywords', 'cast', 'tag', 'genres',
                   'production_companies', 'production_countries')

    parts = [' '.join(x[field]) for field in list_fields]
    # The director is repeated so it carries more weight than any single token.
    parts.extend([x['director']] * 5)

    return ' '.join(parts)

## Extracting Lists

In [11]:
features_to_clean = ['genres', 'production_companies', 'production_countries', 'crew', 'cast', 'keywords']

for feature in features_to_clean:
    # Missing cells become the text '[]' so that every cell parses to a Python list.
    metadata_all[feature] = metadata_all[feature].fillna('[]').apply(literal_eval)

    if feature != 'crew':
        # Reduce the list of records to (at most three) plain names.
        metadata_all[feature] = metadata_all[feature].apply(get_list)
        continue

    # 'crew' is only used to find the director; the raw column is dropped afterwards.
    metadata_all['director'] = metadata_all[feature].apply(get_director)
    metadata_all = metadata_all.drop(columns=[feature])

## Remove Spaces & Convert Strings to Lowercase

In [12]:
def clean_data(x):
    """Strip spaces and lowercase a string or every string in a list.

    Parameters
    ----------
    x : list of str, str, or anything else

    Returns
    -------
    list of str when ``x`` is a list, str when ``x`` is a str, and the empty
    string for any other value (e.g. a missing director).
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [13]:
def clean_tags(tags_list):
    """De-duplicate a movie's tags, then strip spaces and lowercase each one.

    Parameters
    ----------
    tags_list : iterable of hashable
        Raw tags; non-string entries (e.g. NaN) are mapped to ''.

    Returns
    -------
    list of str
        One cleaned entry per distinct raw tag, in first-seen order.
    """
    # dict.fromkeys de-duplicates exactly like set() but keeps insertion order,
    # so the result no longer depends on string hash randomization between runs.
    # NOTE(review): de-duplication happens before normalisation, so raw tags that
    # differ only in case or spacing still yield repeated cleaned entries -- confirm
    # whether that extra weighting is intended.
    unique_tags = dict.fromkeys(tags_list)

    return [
        tag.replace(' ', '').lower() if isinstance(tag, str) else ''
        for tag in unique_tags
    ]

In [14]:
features_to_clean = ['genres', 'production_companies', 'production_countries',
                     'cast', 'keywords', 'director', 'tag']

for feature in features_to_clean:
    # Tags get their own cleaner (it also de-duplicates); every other column
    # goes through clean_data.
    cleaner = clean_tags if feature == 'tag' else clean_data
    metadata_all[feature] = metadata_all[feature].apply(cleaner)

## Create Bag-of-Words

In [36]:
# Apply create_soup to every row (axis=1) and store the resulting text in the
# new 'bow' column; this is the document the vectorizer is fitted on later.
metadata_all['bow'] = metadata_all.apply(create_soup, axis=1)

# Recommend

In [17]:
def get_recommendations(title, ind_movies, cosine_sim, top_n=10):
    """Return the ``top_n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``ind_movies``.
    ind_movies : pandas.Series
        Indexed by title; each value is the integer row position of that movie
        in ``cosine_sim``. It must be in the same row order as ``cosine_sim``,
        because recommended titles are read back from ``ind_movies.index`` by
        position.
    cosine_sim : 2-D array-like
        Square similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    top_n : int, default 10
        Maximum number of recommendations.

    Returns
    -------
    dict
        ``{title: similarity rounded to 3 decimals}`` in descending score order.
        Recommended movies that share a title collapse into a single key.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``ind_movies``.
    """
    # Get the index of the movie that matches the title
    idx = ind_movies[title]
    if np.ndim(idx) != 0:
        # A duplicated title returns several positions; take the first one
        # positionally rather than by label.
        print("There is more then one movie with this title")
        print("Choosing only the first one...")
        idx = np.asarray(idx).ravel()[0]
    idx = int(idx)

    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Descending order; the stable sort keeps tied movies in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly: with ties or an all-zero row it is not
    # guaranteed to be ranked first, so skipping position 0 is not reliable.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    rec_movies = ind_movies.index[ranked]
    similarity_score = [round(score, 3) for score in scores[ranked]]

    return dict(zip(rec_movies, similarity_score))

## Build Similarity Matrix

In [37]:
# Token-count features for every movie's 'bow' text, with English stop words removed.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata_all['bow'])

# Cosine similarity between every pair of rows of the count matrix
# (one row and one column per movie).
cosine_sim = cosine_similarity(count_matrix, count_matrix)

## Final Output

In [38]:
# Lookup Series: index = original_title, values = the row labels of metadata_all.
# NOTE(review): get_recommendations uses these values as positional row numbers
# into cosine_sim, which presumes metadata_all has a default 0..n-1 index -- confirm.
indices = pd.Series(metadata_all.index, index=metadata_all['original_title'])

# Example query; the returned dict is displayed as the cell output.
get_recommendations('Inception', indices, cosine_sim, top_n=10)

{'Batman Begins': 0.331,
 'Doodlebug': 0.348,
 'Dunkirk': 0.384,
 'Following': 0.34,
 'Insomnia': 0.322,
 'Interstellar': 0.401,
 'Memento': 0.316,
 'The Dark Knight': 0.366,
 'The Dark Knight Rises': 0.309,
 'The Prestige': 0.38}