In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
pd.options.display.precision = 3
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = [7, 5]
plt.rcParams['figure.titlesize'] = 15

%matplotlib inline

# Loading The Files

In [3]:
# Load 'Movies' file
movies = pd.read_csv('movies_metadata.csv',)

movies.id = movies.id.str.replace('-', '0').astype('int')
movies.release_date = pd.to_datetime(movies.release_date, errors='coerce')
# Load 'Credits' file
credits = pd.read_csv('credits.csv')
credits = credits[~credits.id.duplicated()]

# Load 'keywords' file
keywords = pd.read_csv('keywords.csv')
keywords = keywords[~keywords.id.duplicated()]

# Load 'tags' file
tags = pd.read_csv('tags.csv')

# Load 'links' file
links = pd.read_csv('links.csv')
links = links.loc[links.tmdbId.notnull()]
links.tmdbId = links.tmdbId.astype('int')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
clean_movies = movies.drop(['adult','belongs_to_collection','video','homepage','poster_path',
                           'tagline', 'vote_average','overview'], axis=1)
clean_movies.head()

Unnamed: 0,budget,genres,id,imdb_id,original_language,original_title,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,vote_count
0,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,en,Toy Story,21.9,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373600000.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Toy Story,5415.0
1,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,en,Jumanji,17.0,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262800000.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Jumanji,2413.0
2,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,en,Grumpier Old Men,11.7,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Grumpier Old Men,92.0
3,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,en,Waiting to Exhale,3.86,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81450000.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Waiting to Exhale,34.0
4,0,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,en,Father of the Bride Part II,8.39,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76580000.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Father of the Bride Part II,173.0


In [5]:
clean_movies.dropna(inplace=True)

In [6]:
clean_movies = clean_movies[clean_movies['status'].isin(['Released'])]

In [7]:
clean_movies.drop(columns=['status'], axis=1, inplace=True)

In [8]:
clean_movies = clean_movies[clean_movies['original_language'].isin(['en'])]

In [9]:
clean_movies.drop(columns=['original_language'], axis=1, inplace=True)

In [10]:
clean_movies['release_date'] = clean_movies['release_date'].astype(str)

In [11]:
clean_movies[['release_year', 'release_month', 'release_day']] = clean_movies['release_date'].str.split('-', expand=True)

In [12]:
clean_movies['release_year'] = clean_movies['release_year'].astype(int)

In [13]:
clean_movies = clean_movies.drop(clean_movies[clean_movies['release_year'] < 1950].index)

In [14]:
clean_movies.drop(columns=['release_date','release_day','release_month'], axis=1, inplace=True)

In [15]:
print(f"Number of unique movies: {len(credits.id.unique())}")
credits.head()

Number of unique movies: 45432


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [16]:
print("Movies appears in 'celan_movies', but not in 'credits': ")
print(set(clean_movies.id.unique())^set(credits.id.unique()))

Movies appears in 'celan_movies', but not in 'credits': 
{2, 3, 15, 19, 26, 30, 262176, 131116, 55, 64, 79, 81, 262227, 86, 91, 262234, 99, 101, 102, 104, 108, 109, 110, 112, 113, 262260, 124, 127, 128, 129, 136, 138, 139, 140, 143, 144, 145, 146, 147, 149, 156, 158, 159, 166, 171, 175, 393407, 194, 195, 198, 204, 211, 212, 216, 219, 222, 223, 224, 225, 228, 229, 393441, 234, 244, 250, 246, 255, 258, 259, 260, 256, 265, 266, 267, 269, 271, 273, 276, 286, 289, 131360, 300, 303, 393519, 393521, 307, 309, 312, 316, 317, 327, 131398, 335, 336, 337, 338, 342, 344, 346, 347, 349, 393562, 393559, 131433, 363, 262514, 262517, 131457, 131455, 387, 390, 391, 262536, 405, 406, 407, 408, 410, 131476, 131478, 262551, 422, 427, 428, 429, 434, 436, 439, 442, 131507, 445, 452, 457, 458, 459, 460, 464, 465, 480, 481, 393695, 26147, 490, 499, 511, 393729, 393731, 523, 537, 542, 543, 548, 552, 553, 554, 393764, 560, 570, 572, 575, 582, 131653, 593, 596, 598, 613, 614, 618, 626, 630, 631, 633, 637, 643, 6

# Merging

In [17]:
metadata = pd.merge(left=clean_movies, right=credits, on='id')

metadata = metadata.merge(keywords, on='id')

metadata_links = pd.merge(left=metadata, right=links, left_on='id', right_on='tmdbId')

tags_per_movie = tags.groupby('movieId')['tag'].apply(list).reset_index(drop=False)

metadata_all = pd.merge(left=metadata_links, right=tags_per_movie, on='movieId')
metadata_all = metadata_all.drop(['tmdbId', 'movieId', 'imdbId'], axis=1)

# Data Cleaning

In [18]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [19]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [20]:
def create_soup(x, dir_weight=0):
    
    return ' '.join(x['keywords']) + ' ' + \
           ' '.join(x['cast']) + ' ' + \
           ' '.join(x['tag']) + ' ' + \
           ' '.join(x['genres']) + ' ' + \
            ' '.join(x['production_companies']) + ' ' + \
            ' '.join(x['production_countries']) + ' ' + \
            x['director'] + (' ' + x['director'])*4

## Extracting Lists

In [21]:
features_to_clean = ['genres', 'production_companies', 'production_countries', 'crew', 'cast', 'keywords']

for feature in features_to_clean:
    metadata_all[feature] = metadata_all[feature].fillna('[]')
    metadata_all[feature] = metadata_all[feature].apply(literal_eval)
    if feature == 'crew':
        metadata_all['director'] = metadata_all[feature].apply(get_director)
        metadata_all = metadata_all.drop([feature], axis=1)
    else:
        metadata_all[feature] = metadata_all[feature].apply(get_list)

## Remove Spaces & Make Strings Lower

In [22]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [23]:
def clean_tags(tags_list):
    
    tags_list = list(set(tags_list))
    res = []
    for tag in tags_list:
        try:
            temp = tag.replace(' ', '').lower()
        except AttributeError:
            temp = ''
        
        res.append(temp)
    
    return res

In [24]:
features_to_clean = ['genres', 'production_companies',
                     'production_countries', 'cast', 
                     'keywords', 'director', 'tag']

for feature in features_to_clean:
    
    if feature == 'tag':
        metadata_all[feature] = metadata_all[feature].apply(clean_tags)
    else:
        metadata_all[feature] = metadata_all[feature].apply(clean_data)

## Create Bag-of-Words

In [25]:
metadata_all['bow'] = metadata_all.apply(create_soup, axis=1)

In [26]:
metadata_all

Unnamed: 0,budget,genres,id,imdb_id,original_title,popularity,production_companies,production_countries,revenue,runtime,spoken_languages,title,vote_count,release_year,cast,keywords,tag,director,bow
0,30000000,"[animation, comedy, family]",862,tt0114709,Toy Story,21.9,[pixaranimationstudios],[unitedstatesofamerica],3.736e+08,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Toy Story,5415.0,1995,"[tomhanks, timallen, donrickles]","[jealousy, toy, boy]","[fun, pixar]",johnlasseter,jealousy toy boy tomhanks timallen donrickles ...
1,65000000,"[adventure, fantasy, family]",8844,tt0113497,Jumanji,17,"[tristarpictures, teitlerfilm, interscopecommu...",[unitedstatesofamerica],2.628e+08,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Jumanji,2413.0,1995,"[robinwilliams, jonathanhyde, kirstendunst]","[boardgame, disappearance, basedonchildren'sbook]","[fantasy, game, magicboardgame, robinwilliams]",joejohnston,boardgame disappearance basedonchildren'sbook ...
2,0,"[romance, comedy]",15602,tt0113228,Grumpier Old Men,11.7,"[warnerbros., lancastergate]",[unitedstatesofamerica],0.000e+00,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Grumpier Old Men,92.0,1995,"[waltermatthau, jacklemmon, ann-margret]","[fishing, bestfriend, duringcreditsstinger]","[old, moldy]",howarddeutch,fishing bestfriend duringcreditsstinger walter...
3,0,[comedy],11862,tt0113041,Father of the Bride Part II,8.39,"[sandollarproductions, touchstonepictures]",[unitedstatesofamerica],7.658e+07,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Father of the Bride Part II,173.0,1995,"[stevemartin, dianekeaton, martinshort]","[baby, midlifecrisis, confidence]","[pregnancy, remake]",charlesshyer,baby midlifecrisis confidence stevemartin dian...
4,58000000,"[comedy, romance]",11860,tt0114319,Sabrina,6.68,"[paramountpictures, scottrudinproductions, mir...","[germany, unitedstatesofamerica]",0.000e+00,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Sabrina,141.0,1995,"[harrisonford, juliaormond, gregkinnear]","[paris, brotherbrotherrelationship, chauffeur]",[remake],sydneypollack,paris brotherbrotherrelationship chauffeur har...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1261,40000000,"[thriller, action, crime]",324552,tt4425200,John Wick: Chapter 2,49.247505,"[thunderroadpictures, lionsgate, 87eleven]",[unitedstatesofamerica],1.715e+08,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",John Wick: Chapter 2,2924.0,2017,"[keanureeves, common, laurencefishburne]","[italy, gun, roof]","[keanureeves, hitman, secretsociety, darkhero,...",chadstahelski,italy gun roof keanureeves common laurencefish...
1262,97000000,"[action, drama, sciencefiction]",263115,tt3315342,Logan,54.581997,"[twentiethcenturyfoxfilmcorporation, donners'c...",[unitedstatesofamerica],6.168e+08,137.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Logan,6310.0,2017,"[hughjackman, patrickstewart, dafnekeen]","[cyborg, experiment, self-destruction]","[heartbreaking, gritty, emotional, predictible...",jamesmangold,cyborg experiment self-destruction hughjackman...
1263,5000000,"[horror, mystery, thriller]",418078,tt4695012,It Comes at Night,20.504587,"[animalkingdom, a24]",[unitedstatesofamerica],0.000e+00,91.0,"[{'iso_639_1': 'en', 'name': 'English'}]",It Comes at Night,357.0,2017,"[joeledgerton, christopherabbott, carmenejogo]","[fire, nihilism, homicide]","[suspenseful, paranoia]",treyedwardshults,fire nihilism homicide joeledgerton christophe...
1264,0,"[drama, horror, mystery]",374430,tt3973198,Black Mirror: White Christmas,24.910782,[],[],0.000e+00,74.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Black Mirror: White Christmas,211.0,2014,"[jonhamm, rafespall, oonachaplin]","[artificialintelligence, confession, technology]","[thoughtprovoking, jonhamm, dystopia, future, ...",carltibbetts,artificialintelligence confession technology j...


# Recommend

In [27]:
def get_recommendations(title, ind_movies, cosine_sim, top_n=10):
    # Get the index of the movie that matches the title
    idx = ind_movies[title]
    if not isinstance(idx, np.int64):
        print("There is more then one movie with this title")
        print("Choosing only the first one...")
        idx = idx[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top_n most similar movies
    sim_scores = sim_scores[1:top_n+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    similarity_score = [round(i[1], 3) for i in sim_scores]

    # Return the top top_n most similar movies
    rec_movies = metadata_all['original_title'].iloc[movie_indices].values
    return dict(zip(rec_movies, similarity_score))
#     return metadata_all['original_title'].iloc[movie_indices], sim_scores, movie_indices

## Build Similarity Matrix

In [28]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata_all['bow'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [32]:
indices = pd.Series(metadata_all.index, index=metadata_all['original_title'])

get_recommendations('Amadeus', indices, cosine_sim, top_n=10)

{'Dirty Dancing': 0.101,
 'Evita': 0.104,
 'Man on the Moon': 0.72,
 "Mr. Holland's Opus": 0.125,
 'My Fair Lady': 0.107,
 "One Flew Over the Cuckoo's Nest": 0.701,
 'Serpico': 0.103,
 'Tender Mercies': 0.108,
 'The People vs. Larry Flynt': 0.74,
 'The Right Stuff': 0.105}

## Final Output

In [31]:
indices = pd.Series(metadata_all.index, index=metadata_all['original_title'])

get_recommendations('Amadeus', indices, cosine_sim, top_n=10)

{'Dirty Dancing': 0.101,
 'Evita': 0.104,
 'Man on the Moon': 0.72,
 "Mr. Holland's Opus": 0.125,
 'My Fair Lady': 0.107,
 "One Flew Over the Cuckoo's Nest": 0.701,
 'Serpico': 0.103,
 'Tender Mercies': 0.108,
 'The People vs. Larry Flynt': 0.74,
 'The Right Stuff': 0.105}