In [19]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval

import warnings;
warnings.simplefilter('ignore')

In [20]:
# Basic recommender: as a first attempt, list the 10 best-rated movies,
# restricted to movies that received more than 100 votes.
def _read_csv_with_int_id(csv_path):
    """Load a CSV, discard rows whose `id` is not numeric, and store `id` as int."""
    table = pd.read_csv(csv_path)
    # Unparseable ids become NaN so that they can be dropped on the next line.
    table = table.assign(id=pd.to_numeric(table['id'], errors="coerce"))
    table = table.dropna(subset=['id'])
    return table.assign(id=table['id'].astype('int'))


d_frame = _read_csv_with_int_id('./data/movies_metadata.csv')
keywords = _read_csv_with_int_id('./data/keywords.csv')

# Attach the keywords to each movie through the shared integer `id`.
# NOTE(review): an inner join repeats rows when an id occurs more than once on
# either side — verify that ids are unique in both files.
d_frame = d_frame.merge(keywords, on='id')

# Rank the movies with more than 100 votes from highest to lowest average rating.
d_frame_byratings = (
    d_frame.loc[d_frame['vote_count'] > 100]
    .sort_values(by='vote_average', ascending=False)
)
d_frame_byratings.head(10)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
10345,False,,13200000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,19404,tt0112870,hi,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",...,100000000.0,190.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,Come... Fall In Love,Dilwale Dulhania Le Jayenge,False,9.1,661.0,"[{'id': 4344, 'name': 'musical'}]"
40107,False,,0,"[{'id': 99, 'name': 'Documentary'}]",,192040,tt0795176,en,Planet Earth,A documentary miniseries described by its make...,...,0.0,550.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,planet earth as you've never seen it before,Planet Earth,False,8.8,176.0,"[{'id': 11162, 'name': 'miniseries'}, {'id': 2..."
314,False,,25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,278,tt0111161,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,...,28341470.0,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,False,8.5,8358.0,"[{'id': 378, 'name': 'prison'}, {'id': 417, 'n..."
41272,False,,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 16, ...",https://www.funimationfilms.com/movie/yourname/,372058,tt5311514,ja,君の名は。,High schoolers Mitsuha and Taki are complete s...,...,355298300.0,106.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Your Name.,False,8.5,1030.0,"[{'id': 6152, 'name': 'supernatural'}, {'id': ..."
837,False,"{'id': 230, 'name': 'The Godfather Collection'...",6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",http://www.thegodfather.com/,238,tt0068646,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",...,245066400.0,175.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,An offer you can't refuse.,The Godfather,False,8.5,6024.0,"[{'id': 131, 'name': 'italy'}, {'id': 699, 'na..."
13301,False,,0,"[{'id': 80, 'name': 'Crime'}, {'id': 99, 'name...",,15584,tt1152758,en,Dear Zachary: A Letter to a Son About His Father,"In 2001, Andrew Bagby, a medical resident, is ...",...,0.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Dear Zachary: A Letter to a Son About His Father,False,8.4,146.0,"[{'id': 1157, 'name': 'wife husband relationsh..."
1185,False,"{'id': 119674, 'name': 'Psycho Collection', 'p...",806948,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",,539,tt0054215,en,Psycho,When larcenous real estate clerk Marion Crane ...,...,32000000.0,109.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The master of suspense moves his cameras into ...,Psycho,False,8.3,2405.0,"[{'id': 612, 'name': 'hotel'}, {'id': 1443, 'n..."
1187,False,"{'id': 230, 'name': 'The Godfather Collection'...",13000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,240,tt0071562,en,The Godfather: Part II,In the continuing saga of the Corleone crime f...,...,47542840.0,200.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"I don't feel I have to wipe everybody out, Tom...",The Godfather: Part II,False,8.3,3418.0,"[{'id': 700, 'name': 'italo-american'}, {'id':..."
1193,False,,30000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,311,tt0087843,en,Once Upon a Time in America,A former Prohibition-era Jewish gangster retur...,...,0.0,229.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"Crime, passion and lust for power - Sergio Leo...",Once Upon a Time in America,False,8.3,1104.0,"[{'id': 314, 'name': 'life and death'}, {'id':..."
12525,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",185000000,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",http://thedarkknight.warnerbros.com/dvdsite/,155,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,...,1004558000.0,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Why So Serious?,The Dark Knight,False,8.3,12269.0,"[{'id': 849, 'name': 'dc comics'}, {'id': 853,..."


In [21]:
# Only movies whose vote count reaches the 75% quantile of all vote counts are considered.
v_counts = d_frame['vote_count'].dropna().astype('int')
# NOTE(review): the int cast truncates fractional ratings (e.g. 8.5 -> 8). This
# series is not used anywhere in this cell — confirm the truncation is intended
# before relying on it elsewhere.
v_averages = d_frame['vote_average'].dropna().astype('int')
v_75q = v_counts.quantile(0.75)

# Filter the movies by vote count. A missing vote_count compares as False,
# so rows without a count are excluded by the same condition.
movies_v75q = d_frame[d_frame['vote_count'] >= v_75q]

# Base data frame for the rankings. The explicit .copy() makes base_df an
# independent frame, so the column assignments below are not chained
# assignments on a slice of d_frame (no SettingWithCopyWarning).
base_df = movies_v75q[
    ['title', 'original_title', 'genres', 'release_date', 'vote_average',
     'vote_count', 'tagline', 'runtime', 'keywords']
].copy()


def _names_from_literal_column(column):
    """Parse a column of stringified lists of dicts and keep each dict's 'name'.

    Missing entries are treated as '[]'; a parsed value that is not a list
    yields an empty list.
    """
    parsed = column.fillna('[]').apply(literal_eval)
    return parsed.apply(
        lambda items: [item['name'] for item in items] if isinstance(items, list) else []
    )


# Reduce the genres and keywords columns to plain lists of names / tags.
base_df['genres'] = _names_from_literal_column(base_df['genres'])
base_df['keywords'] = _names_from_literal_column(base_df['keywords'])

# Top movies by average rating.
df_by_va = base_df.sort_values('vote_average', ascending=False)

# Top movies by number of votes.
df_by_nv = base_df.sort_values('vote_count', ascending=False)

# Median rating of the filtered movies.
median_rating = base_df['vote_average'].median()

# Prior for the Bayesian average, which combines user rating and popularity:
#   score = (rating * votes + global_c) / (average_counts + votes)
# with global_c = average_counts * overall_average_rating, i.e. every movie is
# treated as if it had `average_counts` extra votes at the overall mean rating.
overall_average_rating = base_df['vote_average'].mean()

average_counts = base_df['vote_count'].mean()

global_c = average_counts * overall_average_rating

def bayesian_average(mrow):
    """Blend one movie's own rating with the global prior.

    Reads `vote_average` and `vote_count` from `mrow` and returns
    (vote_average * vote_count + global_c) / (average_counts + vote_count),
    where `global_c` and `average_counts` are module-level values.
    """
    votes = mrow['vote_count']
    weighted_sum = (mrow['vote_average'] * votes) + global_c
    return weighted_sum / (average_counts + votes)

# Score every movie row by row, then rank from highest to lowest Bayesian average.
base_df['bavg_rating'] = base_df.apply(bayesian_average, axis='columns')

df_by_bavg = base_df.sort_values(by='bavg_rating', ascending=False)

In [22]:
# First 10 rows of the frame sorted by average rating.
print("Top 10 movies from all genres by average ratings:")
df_by_va.iloc[:10]

Top 10 movies from all genres by average ratings:


Unnamed: 0,title,original_title,genres,release_date,vote_average,vote_count,tagline,runtime,keywords
44339,Planet Earth II,Planet Earth II,[Documentary],2016-11-06,9.5,50.0,,300.0,"[mountain, island, earth, jungle, miniseries, ..."
10345,Dilwale Dulhania Le Jayenge,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]",1995-10-20,9.1,661.0,Come... Fall In Love,190.0,[musical]
44542,Cosmos,Cosmos,[],,9.1,41.0,,60.0,[]
40107,Planet Earth,Planet Earth,[Documentary],2006-12-10,8.8,176.0,planet earth as you've never seen it before,550.0,"[miniseries, great cinematpgraphy]"
43511,Lemonade,Lemonade,[Music],2016-04-23,8.8,45.0,,65.0,"[poetry, southern usa, pop culture, empowermen..."
7675,Sansho the Bailiff,山椒大夫,[Drama],1954-03-31,8.7,68.0,A film of unparalleled beauty by the great Jap...,124.0,"[japan, courtesan, song, exile, banishment, go..."
2754,Stop Making Sense,Stop Making Sense,"[Documentary, Music]",1984-11-16,8.7,47.0,Why stop making sense? Why a movie? Why a big ...,88.0,[concert]
6776,Shoah,Shoah,[Documentary],1985-11-01,8.7,36.0,,566.0,[]
29254,The Jinx: The Life and Deaths of Robert Durst,The Jinx: The Life and Deaths of Robert Durst,[Documentary],2015-02-08,8.6,85.0,Four Decades. Three Murders. And One Very Rich...,240.0,"[murder, crime, real life]"
33491,Human,Human,[Documentary],2015-09-12,8.6,98.0,Accepting your true identity is accepting who ...,263.0,"[society, documentary, world, culture, human, ..."


In [23]:
# First 10 rows of the frame sorted by number of votes.
print("Top 10 movies from all genres by number of ratings (popularity)")
df_by_nv.iloc[:10]

Top 10 movies from all genres by number of ratings (popularity)


Unnamed: 0,title,original_title,genres,release_date,vote_average,vote_count,tagline,runtime,keywords
15547,Inception,Inception,"[Action, Thriller, Science Fiction, Mystery, A...",2010-07-14,8.1,14075.0,Your mind is the scene of the crime.,148.0,"[loss of lover, dream, kidnapping, sleep, subc..."
12525,The Dark Knight,The Dark Knight,"[Drama, Action, Crime, Thriller]",2008-07-16,8.3,12269.0,Why So Serious?,152.0,"[dc comics, crime fighter, secret identity, sc..."
14619,Avatar,Avatar,"[Action, Adventure, Fantasy, Science Fiction]",2009-12-10,7.2,12114.0,Enter the World of Pandora.,162.0,"[culture clash, future, space war, space colon..."
17892,The Avengers,The Avengers,"[Science Fiction, Action, Adventure]",2012-04-25,7.4,12000.0,Some assembly required.,143.0,"[new york, shield, marvel comic, superhero, ba..."
26637,Deadpool,Deadpool,"[Action, Adventure, Comedy]",2016-02-09,7.4,11444.0,Witness the beginning of a happy ending,108.0,"[anti hero, mercenary, marvel comic, superhero..."
22952,Interstellar,Interstellar,"[Adventure, Drama, Science Fiction]",2014-11-05,8.1,11187.0,Mankind was born on Earth. It was never meant ...,169.0,"[saving the world, artificial intelligence, fa..."
20130,Django Unchained,Django Unchained,"[Drama, Western]",2012-12-25,7.8,10297.0,"Life, liberty and the pursuit of vengeance.",165.0,"[bounty hunter, hero, plantation, society, fri..."
23824,Guardians of the Galaxy,Guardians of the Galaxy,"[Action, Science Fiction, Adventure]",2014-07-30,7.9,10014.0,All heroes start somewhere.,121.0,"[marvel comic, spaceship, space, outer space, ..."
2854,Fight Club,Fight Club,[Drama],1999-10-15,8.3,9678.0,Mischief. Mayhem. Soap.,139.0,"[support group, dual identity, nihilism, rage ..."
18318,The Hunger Games,The Hunger Games,"[Science Fiction, Adventure, Fantasy]",2012-03-12,6.9,9634.0,May The Odds Be Ever In Your Favor.,142.0,"[hallucination, dystopia, female protagonist, ..."


In [24]:
# First 10 rows of the frame sorted by Bayesian average (rating blended with popularity).
print("Top 10 movies from all genres by the Bayesian average of popularity and ratings ")
df_by_bavg.iloc[:10]

Top 10 movies from all genres by the Bayesian average of popularity and ratings 


Unnamed: 0,title,original_title,genres,release_date,vote_average,vote_count,tagline,runtime,keywords,bavg_rating
314,The Shawshank Redemption,The Shawshank Redemption,"[Drama, Crime]",1994-09-23,8.5,8358.0,Fear can hold you prisoner. Hope can set you f...,142.0,"[prison, corruption, police brutality, prison ...",8.398594
837,The Godfather,The Godfather,"[Drama, Crime]",1972-03-14,8.5,6024.0,An offer you can't refuse.,175.0,"[italy, love at first sight, loss of father, p...",8.361788
12525,The Dark Knight,The Dark Knight,"[Drama, Action, Crime, Thriller]",2008-07-16,8.3,12269.0,Why So Serious?,152.0,"[dc comics, crime fighter, secret identity, sc...",8.236298
2854,Fight Club,Fight Club,[Drama],1999-10-15,8.3,9678.0,Mischief. Mayhem. Soap.,139.0,"[support group, dual identity, nihilism, rage ...",8.219932
292,Pulp Fiction,Pulp Fiction,"[Thriller, Crime]",1994-09-10,8.3,8670.0,Just because you are a character doesn't mean ...,154.0,"[transporter, brothel, drug dealer, boxer, mas...",8.21104
522,Schindler's List,Schindler's List,"[Drama, History, War]",1993-11-29,8.3,4436.0,"Whoever saves one life, saves the world entire.",195.0,"[factory, concentration camp, hero, holocaust,...",8.133261
23744,Whiplash,Whiplash,[Drama],2014-10-10,8.3,4376.0,The road to greatness can take you to the edge.,105.0,"[jazz, obsession, conservatory, music teacher,...",8.131169
5501,Spirited Away,千と千尋の神隠し,"[Fantasy, Adventure, Animation, Family]",2001-07-20,8.3,3968.0,The tunnel led Chihiro to a mysterious town...,125.0,"[witch, parent child relationship, magic, dark...",8.115424
351,Forrest Gump,Forrest Gump,"[Comedy, Drama, Romance]",1994-07-06,8.2,8147.0,"The world will never be the same, once you've ...",142.0,"[vietnam veteran, hippie, mentally disabled, r...",8.110355
2219,Life Is Beautiful,La vita è bella,"[Comedy, Drama]",1997-12-20,8.3,3643.0,"An unforgettable fable that proves love, famil...",116.0,"[italy, riddle, bookshop, self sacrifice, mass...",8.100611


In [25]:
# Recommend the top movies of a single genre.
def movies_by_genre(genre, query_type):
    """Return the movies of one genre, ordered by the requested ranking.

    Args:
        genre: Genre name to look for in each row's `genres` entry (e.g. 'Romance').
        query_type: Ranking to use: "popularity" (df_by_nv), "ratings" (df_by_va)
            or "bayesian" (df_by_bavg).

    Returns:
        The rows of the selected pre-sorted frame whose `genres` contain `genre`,
        in that frame's existing order.

    Raises:
        ValueError: If `query_type` is not one of the supported rankings.
    """
    # Each ranking has its own pre-sorted frame; the filter is always built from
    # the frame that is returned, never from a different one.
    ranked_frames = {
        "popularity": df_by_nv,
        "ratings": df_by_va,
        "bayesian": df_by_bavg,
    }
    if query_type not in ranked_frames:
        raise ValueError(
            f"unknown query_type {query_type!r}; expected one of {sorted(ranked_frames)}"
        )
    ranked = ranked_frames[query_type]

    def _has_genre(names):
        # Entries that do not support `in` (e.g. NaN / None) simply do not match.
        try:
            return genre in names
        except TypeError:
            return False

    # astype(bool) keeps the mask boolean even when the frame is empty.
    return ranked[ranked["genres"].apply(_has_genre).astype(bool)]

# Show the 10 best Romance movies according to the Bayesian average.
movies_by_genre(genre='Romance', query_type='bayesian').head(n=10)

Unnamed: 0,title,original_title,genres,release_date,vote_average,vote_count,tagline,runtime,keywords,bavg_rating
351,Forrest Gump,Forrest Gump,"[Comedy, Drama, Romance]",1994-07-06,8.2,8147.0,"The world will never be the same, once you've ...",142.0,"[vietnam veteran, hippie, mentally disabled, r...",8.110355
10345,Dilwale Dulhania Le Jayenge,Dilwale Dulhania Le Jayenge,"[Comedy, Drama, Romance]",1995-10-20,9.1,661.0,Come... Fall In Love,190.0,[musical],8.039
41272,Your Name.,君の名は。,"[Romance, Animation, Drama]",2016-08-26,8.5,1030.0,,106.0,"[supernatural, romance, school, star crossed l...",7.881349
41902,La La Land,La La Land,"[Comedy, Drama, Music, Romance]",2016-11-29,7.9,4745.0,Here's to the fools who dream.,128.0,"[jazz, dance, passion, musical, casting, los a...",7.774838
22241,Her,Her,"[Romance, Science Fiction, Drama]",2013-12-18,7.9,4215.0,A Spike Jonze Love Story,126.0,"[artificial intelligence, computer, love, lone...",7.760484
7237,Eternal Sunshine of the Spotless Mind,Eternal Sunshine of the Spotless Mind,"[Science Fiction, Drama, Romance]",2004-03-19,7.9,3758.0,You can erase someone from your mind. Getting ...,108.0,"[deja vu, regret, jealousy, amnesia, dream, op...",7.745175
4860,Amélie,Le fabuleux destin d'Amélie Poulain,"[Comedy, Romance]",2001-04-25,7.8,3403.0,One person can change your life forever.,122.0,"[paris, love triangle, ghost train, sex-shop, ...",7.641422
25055,The Theory of Everything,The Theory of Everything,"[Drama, Romance]",2014-11-26,7.8,3403.0,His Mind Changed Our World. Her Love Changed His.,123.0,"[wife husband relationship, biography, physici...",7.641422
1141,Cinema Paradiso,Nuovo Cinema Paradiso,"[Drama, Romance]",1988-11-17,8.2,834.0,"A celebration of youth, friendship, and the ev...",124.0,"[sicily, cinema, film director, kiss, coming o...",7.581948
15597,Mr. Nobody,Mr. Nobody,"[Science Fiction, Drama, Romance, Fantasy]",2009-09-11,7.9,1616.0,"Nothing is real, everything is possible.",156.0,[surrealism],7.581215
