In [1]:
import numpy as np
import pandas as pd

# Load the movie metadata table from the local ./data directory.
# NOTE(review): low_memory=False is presumably there to avoid mixed-dtype
# inference on this file — confirm against the pandas read_csv docs.
metadata = pd.read_csv("./data/movies_metadata.csv", low_memory=False)
# Preview the first five rows.
metadata.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


# Simple Recommenders

In [459]:
# C: mean of vote_average over all movies; bound as the default `C` of weighted_rating.
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


### We will only take into account movies that have more reviews than 90% of all films

In [462]:
# m: 0.90 quantile of vote_count; used as the minimum-votes cut-off for q_movies
# and bound as the default `m` of weighted_rating.
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [464]:
# Keep only movies with at least m votes. Filter first and copy the (smaller)
# result: q_movies is still an independent frame that can safely receive new
# columns, but the whole metadata table is no longer duplicated beforehand.
q_movies = metadata.loc[metadata['vote_count'] >= m].copy()

In [466]:
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own rating with a reference rating, weighted by vote count.

    x: object indexable by 'vote_count' and 'vote_average'.
    m: weight (in votes) given to the reference rating C.
    C: reference rating the score is pulled towards when votes are few.

    Returns v/(v+m) * R + m/(v+m) * C, where v is the vote count and R the
    vote average (described in the notebook as the IMDB formula).
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    own_share = votes / total
    reference_share = m / total
    return (own_share * rating) + (reference_share * C)

In [468]:
# weighted_rating only performs column-wise arithmetic on 'vote_count' and
# 'vote_average', so it can be evaluated on the whole frame in one vectorized
# call instead of row by row via apply(axis=1).
q_movies['score'] = weighted_rating(q_movies)

In [470]:
# Rank the qualifying movies from highest to lowest weighted score.
q_movies = q_movies.sort_values('score', ascending=False)
# Show the top 20 together with the inputs that produced each score.
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


# Content-Based Recommender

## Overview-based Recommender

#### Computing how important each word is in every film overview

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configured with the built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# Replace missing overviews with empty strings so every row can be vectorized.
metadata['overview'] = metadata['overview'].fillna('')

# Fit the vocabulary on all overviews and transform them into the TF-IDF matrix.
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

In [5]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise similarity between every pair of film overviews (rows of tfidf_matrix).
# NOTE(review): this presumably materialises a dense (n_films x n_films) array —
# check memory use before running on the full dataset.
similarity_score= linear_kernel(tfidf_matrix, tfidf_matrix)

#### I'm going to use film_id to find similar films, since the title column does not have unique values

In [43]:
def get_recommendations(film_id, similarity_score):
    """Return the titles of the 10 films most similar to the film with id ``film_id``.

    film_id: id of the query film, as an int or a string. Ids are compared as
        strings, so the lookup works whether the ``id`` column of ``metadata``
        holds strings (object dtype) or integers.
    similarity_score: square matrix where entry [i, j] is the similarity
        between the films at row positions i and j of ``metadata``.

    Returns a Series of titles ordered from most to least similar; the query
    film itself is excluded.
    Raises IndexError if no film with that id exists.
    """
    # Locate the row *position* of the film: the similarity matrix is
    # positional, so an index label must not be used to address it.
    id_matches = (metadata['id'].astype(str) == str(film_id)).to_numpy()
    positions = np.flatnonzero(id_matches)
    if positions.size == 0:
        raise IndexError(f"no film with id {film_id!r} found in metadata")
    # A single integer is required here; indexing the matrix with a whole
    # Index object yields a 2-D slice and the ranking below would be empty.
    idx = int(positions[0])

    # Rank every film by its similarity to the query film, best first.
    sim_scores = sorted(
        enumerate(similarity_score[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Drop the query film by position (it is not guaranteed to sort first when
    # other rows tie with it) and keep the 10 best remaining matches.
    movie_indices = [pos for pos, _ in sim_scores if pos != idx][:10]

    return metadata['title'].iloc[movie_indices]

In [10]:
# Look up the id of "The Dark Knight Rises" so it can be passed to the recommender.
metadata.loc[metadata['title']=='The Dark Knight Rises', ['title', 'id']]

Unnamed: 0,title,id
18252,The Dark Knight Rises,49026


In [12]:
# Overview-based recommendations for "The Dark Knight Rises" (id 49026).
get_recommendations(49026, similarity_score)

12481                                      The Dark Knight
150                                         Batman Forever
1328                                        Batman Returns
15511                           Batman: Under the Red Hood
585                                                 Batman
21194    Batman Unmasked: The Psychology of the Dark Kn...
9230                    Batman Beyond: Return of the Joker
18035                                     Batman: Year One
19792              Batman: The Dark Knight Returns, Part 1
3095                          Batman: Mask of the Phantasm
Name: title, dtype: object

In [14]:
# Look up the id of "The Godfather" so it can be passed to the recommender.
metadata.loc[metadata['title']=='The Godfather', ['title', 'id']]

Unnamed: 0,title,id
834,The Godfather,238


In [16]:
# Overview-based recommendations for "The Godfather" (id 238).
get_recommendations(238, similarity_score)

1178               The Godfather: Part II
44030    The Godfather Trilogy: 1972-1990
1914              The Godfather: Part III
23126                          Blood Ties
11297                    Household Saints
34717                   Start Liquidation
10821                            Election
38030            A Mother Should Be Loved
17729                   Short Sharp Shock
26293                  Beck 28 - Familjen
Name: title, dtype: object

## Credits, Genres, and Keywords based Recommender

In [3]:
credits = pd.read_csv('./data/credits.csv')
keywords = pd.read_csv('./data/keywords.csv')

# The id column must be cast to int for the merges below, which fails on any
# row whose id is not a number. Remove those rows by inspecting the data
# instead of by hard-coded row labels, so the reason is explicit and the step
# does not drop unrelated rows if the frame's index has changed.
# NOTE(review): this assumes the rows previously dropped by label
# (19730, 29503, 35587) are exactly the ones with non-numeric ids — verify.
has_numeric_id = pd.to_numeric(metadata['id'], errors='coerce').notna()
metadata = metadata.loc[has_numeric_id].copy()

keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

# Attach cast/crew and keywords to each movie by id.
# NOTE(review): the recommender output further down lists the same title on
# adjacent rows, which suggests these merges duplicate movies when an id
# repeats in credits/keywords — confirm and de-duplicate if so.
metadata = metadata.merge(credits, on='id')
metadata = metadata.merge(keywords, on='id')

In [5]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

# NOTE(review): after this cell the columns no longer hold strings, so running
# it a second time on the same frame will likely fail — reload the data first.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [6]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    x: iterable of crew dicts, each with 'job' and 'name' keys.
    Returns np.nan when no entry has the job 'Director'.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

In [7]:
def get_list(x):
    """Return the 'name' of up to the first three entries of ``x``.

    x: list of dicts with a 'name' key; anything that is not a list
       (missing or malformed data) yields an empty list.
    """
    # Missing/malformed data: nothing to extract.
    if not isinstance(x, list):
        return []

    # Collect every name, then keep at most the first three.
    every_name = [entry['name'] for entry in x]
    return every_name[:3]

In [11]:
# Define new director, cast, genres and keywords features that are in a suitable form.
# The director is read from the parsed crew list (NaN when get_director finds none).
metadata['director'] = metadata['crew'].apply(get_director)

# Reduce each list-of-dicts column to a list of at most three names via get_list.
features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

# Display the derived columns to check the result.
metadata[['director', 'cast', 'keywords', 'genres']]

Unnamed: 0,director,cast,keywords,genres
0,John Lasseter,"[Tom Hanks, Tim Allen, Don Rickles]","[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Joe Johnston,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Howard Deutch,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,Forest Whitaker,"[Whitney Houston, Angela Bassett, Loretta Devine]","[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,Charles Shyer,"[Steve Martin, Diane Keaton, Martin Short]","[baby, midlife crisis, confidence]",[Comedy]
...,...,...,...,...
46623,Hamid Nematollah,"[Leila Hatami, Kourosh Tahami, Elham Korda]",[tragic love],"[Drama, Family]"
46624,Lav Diaz,"[Angel Aquino, Perry Dizon, Hazel Orencio]","[artist, play, pinoy]",[Drama]
46625,Mark L. Lester,"[Erika Eleniak, Adam Baldwin, Julie du Page]",[],"[Action, Drama, Thriller]"
46626,Yakov Protazanov,"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...",[],[]


In [13]:
# Normalise names: lower-case them and remove the spaces inside them.
def clean_data(x):
    """Lower-case ``x`` and strip its spaces.

    x: a string, a list of strings, or anything else (e.g. a missing director).
    Returns the cleaned string, a list of cleaned strings, or '' for any
    other kind of value.
    """
    # Single value (the director column).
    if isinstance(x, str):
        return x.replace(" ", "").lower()

    # List-valued columns: clean every element.
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]

    # Neither a string nor a list: treat as missing.
    return ''

In [15]:
# Apply clean_data function to your features.
# 'director' is a single value per row, the other three are lists; clean_data handles both.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

# Display the cleaned columns to check the result.
metadata[['director', 'cast', 'keywords', 'genres']]

Unnamed: 0,director,cast,keywords,genres
0,johnlasseter,"[tomhanks, timallen, donrickles]","[jealousy, toy, boy]","[animation, comedy, family]"
1,joejohnston,"[robinwilliams, jonathanhyde, kirstendunst]","[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]"
2,howarddeutch,"[waltermatthau, jacklemmon, ann-margret]","[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]"
3,forestwhitaker,"[whitneyhouston, angelabassett, lorettadevine]","[basedonnovel, interracialrelationship, single...","[comedy, drama, romance]"
4,charlesshyer,"[stevemartin, dianekeaton, martinshort]","[baby, midlifecrisis, confidence]",[comedy]
...,...,...,...,...
46623,hamidnematollah,"[leilahatami, kouroshtahami, elhamkorda]",[tragiclove],"[drama, family]"
46624,lavdiaz,"[angelaquino, perrydizon, hazelorencio]","[artist, play, pinoy]",[drama]
46625,markl.lester,"[erikaeleniak, adambaldwin, juliedupage]",[],"[action, drama, thriller]"
46626,yakovprotazanov,"[iwanmosschuchin, nathalielissenko, pavelpavlov]",[],[]


In [17]:
def create_soup(x):
    """Build one space-separated text from a row's keywords, cast, director and genres.

    x: object indexable by 'keywords', 'cast' and 'genres' (lists of strings)
       and 'director' (a string).
    """
    sections = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(sections)

In [19]:
# Create a new soup feature
# One string per movie, built row by row from keywords, cast, director and genres.
metadata['soup'] = metadata.apply(create_soup, axis=1)

# Display the new column to check the result.
metadata[['soup']]

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger walter...
3,basedonnovel interracialrelationship singlemot...
4,baby midlifecrisis confidence stevemartin dian...
...,...
46623,tragiclove leilahatami kouroshtahami elhamkord...
46624,artist play pinoy angelaquino perrydizon hazel...
46625,erikaeleniak adambaldwin juliedupage markl.le...
46626,iwanmosschuchin nathalielissenko pavelpavlov ...


In [21]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Token counts over the soup strings, using the built-in English stop-word list.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

In [23]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity between every pair of movies (rows of count_matrix).
# NOTE(review): presumably a dense (n_movies x n_movies) array — check memory use.
similarity_score2 = cosine_similarity(count_matrix, count_matrix)

In [61]:
def get_recommendations2(film_id, similarity_score):
    """Return the titles of the 10 films most similar to the film with id ``film_id``.

    film_id: id of the query film, compared with ``==`` against the ``id``
        column of ``metadata``.
    similarity_score: square matrix where entry [i, j] is the similarity
        between the films at row positions i and j of ``metadata``.

    Returns a Series of titles ordered from most to least similar; the query
    film itself is excluded.
    Raises IndexError if no film with that id exists.
    """
    # Locate the row *position* of the film: the similarity matrix is
    # positional, so an index label must not be used to address it.
    positions = np.flatnonzero((metadata['id'] == film_id).to_numpy())
    if positions.size == 0:
        raise IndexError(f"no film with id {film_id!r} found in metadata")
    idx = int(positions[0])

    # Rank every film by its similarity to the query film, best first.
    sim_scores = sorted(
        enumerate(similarity_score[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Drop the query film by position rather than assuming it is ranked first:
    # another row with an equal score (e.g. a duplicate entry) can sort ahead
    # of it. Then keep the 10 best remaining matches.
    movie_indices = [pos for pos, _ in sim_scores if pos != idx][:10]

    return metadata['title'].iloc[movie_indices]

In [67]:
# Credits/genres/keywords-based recommendations for "The Dark Knight Rises" (id 49026).
get_recommendations2(49026, similarity_score2)

12541      The Dark Knight
10170        Batman Begins
9271                Shiner
9834       Amongst Friends
7732              Mitchell
516      Romeo Is Bleeding
11411         The Prestige
24040            Quicksand
24984             Deadfall
41043                 Sara
Name: title, dtype: object

In [69]:
# Credits/genres/keywords-based recommendations for "The Godfather" (id 238).
get_recommendations2(238, similarity_score2)

1926            The Godfather: Part III
1187             The Godfather: Part II
15534                   The Rain People
18866                         Last Exit
34458                              Rege
35772            Manuscripts Don't Burn
35773            Manuscripts Don't Burn
7961     The Night of the Following Day
18187                 The Son of No One
28637            In the Name of the Law
Name: title, dtype: object