In [26]:
import pandas as pd
import numpy as np
from imdb import Cinemagoer
from tqdm.auto import tqdm
import re

# Read movie dataset

In [45]:
df = pd.read_csv("Movies to watch - Ratings.csv")
df = df.iloc[1:,[1,2,3,4,5,6,7,8,9]]
df = df.rename(columns={'Unnamed: 1': "Film", "Unnamed: 11": "Average rating"})
df = df.replace("DNF", np.nan)
df

Unnamed: 0,Film,Seb,Jos,Coen,Stijn,Merle,Twan,Annick,Guest (gemiddelde)
1,Inception,8.0,8.25,,8.3,8,,,
2,Good Will Hunting,9.0,9.00,,9.1,,,,
3,Cabin in the Woods,7.0,7.00,6.5,4.6,8.3,,,
4,Independence Day,7.0,7.20,7,3.7,6.9,,,
5,Spotlight,10.0,8.80,8.5,7.7,,,,
...,...,...,...,...,...,...,...,...,...
99,Austin Powers,7.5,6.00,,9.13,7.16,6.014,,
100,Jagten,8.5,9.00,,8.43,,7.801,7.730009,6.90
101,Avatar: The Way of Water,7.0,5.00,5,6.31,6.99,8.213,7.800929,8.17
102,No Country For Old Men,9.0,9.50,,,,,6.105980,


# Download information from IMDB

In [46]:
# select interesting information from the following list:
# ['akas', 'animation department', 'art department', 'art direction', 'aspect ratio', 'assistant director', 'box office', 'camera and electrical department', 'canonical title', 'cast', 
#  'casting department', 'casting director', 'certificates', 'cinematographer', 'color info', 'composer', 'costume department', 'costume designer', 'countries', 'country codes', 
#  'cover url', 'director', 'distributors', 'editor', 'editorial department', 'full-size cover url', 'genres', 'imdbID', 'kind', 'language codes', 'languages', 'localized title', 
#  'location management', 'long imdb canonical title', 'long imdb title', 'make up', 'miscellaneous crew', 'music department', 'original air date', 'original title', 'other companies', 
#  'plot', 'plot outline', 'producer', 'production companies', 'production design', 'production manager', 'rating', 'runtimes', 'script department', 'set decoration', 'smart canonical title', 
#  'smart long imdb canonical title', 'sound crew', 'sound mix', 'special effects', 'special effects companies', 'stunt performer', 'synopsis', 'title', 'transportation department', 'visual effects', 
#  'votes', 'writer', 'year']

interesting_stats = ['aspect ratio', 'box office', 'director', 'genres', 'year', 'rating', 'votes', 'plot']

In [31]:

def retrieve_movie_stats(movie, interesting_stats, ia):    
    res = {}
    # searching the name 
    search = ia.search_movie(movie)
    
    # getting the id
    id = search[0].movieID
    
    # get a movie's info
    movie_info = ia.get_movie(id)
    
    for key in interesting_stats:
        try:
            if key in movie_info:
                if key == "box office":
                    res["Budget"] = movie_info[key]["Budget"]
                    if "Cumulative Worldwide Gross" in movie_info[key]: res["Cumulative Worldwide Gross"] = movie_info[key]["Cumulative Worldwide Gross"]
                elif key == "synopsis":
                    res[key] = movie_info[key][0]
                else:                
                #     if isinstance(movie_info[key], list):
                #         res[key] = ",".join(movie_info[key])
                #     else:                    
                    res[key] = str(movie_info[key])
            else:
                res[key] = np.nan
        except:
            res[key] = np.nan
        
    return res

In [47]:
# create an instance of the Cinemagoer class
cnm = Cinemagoer()

# list interesting columns
columns_to_keep = ['aspect ratio', 'box office', 'director', 'genres', 'year', 'rating', 'votes', 'synopsis']
movie_stats = {}
for movie in df.Film.values:
    movie_stats[movie] = retrieve_movie_stats(movie, columns_to_keep, cnm)
    
movie_stats = pd.DataFrame.from_dict(movie_stats, orient="index")
movie_stats.to_csv("movie_stats.csv")
movie_stats

Unnamed: 0,aspect ratio,Budget,Cumulative Worldwide Gross,director,genres,year,rating,votes,synopsis,box office
Inception,2.39 : 1,"$160,000,000 (estimated)","$825,532,764, 06 Jan 2011",[<Person id:0634240[http] name:_Christopher No...,"['Action', 'Adventure', 'Sci-Fi', 'Thriller']",2010,8.8,2367898,,
Good Will Hunting,1.85 : 1,"$10,000,000 (estimated)","$225,933,435",[<Person id:0001814[http] name:_Gus Van Sant_>],"['Drama', 'Romance']",1997,8.3,980684,,
Cabin in the Woods,2.40 : 1,"$30,000,000 (estimated)","$70,768,144",[<Person id:1206844[http] name:_Drew Goddard_>],"['Horror', 'Mystery', 'Thriller']",2011,7.0,424053,,
Independence Day,2.20 : 1 (70 mm print),"$75,000,000 (estimated)",,[<Person id:0000386[http] name:_Roland Emmeric...,"['Action', 'Adventure', 'Sci-Fi']",1996,7.0,579929,,
Spotlight,1.85 : 1,"$20,000,000 (estimated)","$98,275,238",[<Person id:0565336[http] name:_Tom McCarthy_>],"['Biography', 'Crime', 'Drama']",2015,8.1,474195,,
...,...,...,...,...,...,...,...,...,...,...
Austin Powers,2.39 : 1,"$17,000,000 (estimated)","$67,683,989",[<Person id:0005366[http] name:_Jay Roach_>],"['Adventure', 'Comedy']",1997,7.0,244169,,
Jagten,2.35 : 1,"$3,800,000 (estimated)","$18,309,793",[<Person id:0899121[http] name:_Thomas Vinterb...,['Drama'],2012,8.3,334636,,
Avatar: The Way of Water,1.85 : 1 (3-D version),"$350,000,000 (estimated)",,[<Person id:0000116[http] name:_James Cameron_>],"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",2022,7.8,275381,,
No Country For Old Men,2.39 : 1,"$25,000,000 (estimated)","$171,627,166","[<Person id:0001053[http] name:_Ethan Coen_>, ...","['Crime', 'Drama', 'Thriller']",2007,8.2,982132,,


# Process data

## Budget

In [48]:
movie_stats = pd.read_csv("movie_stats.csv")

In [49]:
def process_budget(x):
    # print(x["Budget"])
    x = re.findall("\d{1,3},{0,1}\d{1,3},{0,1}\d{1,3}", str(x))
    if x == []: return float(np.nan)
    else: return float(x[0].replace(",", ""))
    
movie_stats["Budget"] = movie_stats["Budget"].apply(process_budget)

## Gross

In [50]:
movie_stats["Cumulative Worldwide Gross"] = movie_stats["Cumulative Worldwide Gross"].apply(process_budget)

## Genres

### Genre table

In [51]:
# Function which uses re.findall method to convert string to list character wise 
def convert_string_to_list(string):
    return re.findall('[\'\'\"]([a-zA-Z]{0,})[\'\'\"]', string)

In [52]:
# def genre_array_to_table(genres):
#     res = []
    
#     # Loop through the movies and the genres
#     for movie, genre_list in genres.items():
#         genre_list = convert_string_to_list(genre_list)
        
#         # for each genre add a row to the result list
#         for i in range(len(genre_list)):
#             res.append((movie, genre_list[i], i+1))
            
#     return pd.DataFrame(res, columns = ["Film", "Genre", "Genre_order"])
        
# genre_table = genre_array_to_table(movie_stats["genres"])
# genre_table.to_csv("genre_table.csv")
# genre_table

Unnamed: 0,Film,Genre,Genre_order
0,0,Action,1
1,0,Adventure,2
2,0,Thriller,3
3,1,Drama,1
4,1,Romance,2
...,...,...,...
278,101,Thriller,3
279,102,Comedy,1
280,102,Drama,2
281,102,Music,3


### Genre recommender table

In [53]:
# first get a list of all unique categories and also keep the list of categories per movie
all_movie_cats = []
unique_categories = []
for s in movie_stats["genres"].values:
    s = s.replace(" ", "")
    s = s.replace("'", "")
    s = s.replace("[", "")
    s = s.replace("]", "")
    movie_cats = s.split(",")
    all_movie_cats.append(movie_cats)
    for i in movie_cats:
        if i not in unique_categories:
            unique_categories.append(i)
unique_categories

['Action',
 'Adventure',
 'Sci-Fi',
 'Thriller',
 'Drama',
 'Romance',
 'Horror',
 'Mystery',
 'Biography',
 'Crime',
 'History',
 'War',
 'Western',
 'Comedy',
 'Music',
 'Animation',
 'Family',
 'Fantasy',
 'Sport',
 'Musical',
 'Documentary']

In [55]:
genre_matrix = {}
# now create a column for every genre with a True of False depending on wheter a movie has a certain genre
for i in range(len(all_movie_cats)):
    movie = movie_stats.iloc[:,0].values[i]
    d = {}
    for genre in unique_categories:
        if genre in all_movie_cats[i]:
            d[genre] = True 
        else:
            d[genre] = False 
    genre_matrix[movie] = d
    #movie_stats.iloc[:,0]
genre_matrix = pd.DataFrame.from_dict(genre_matrix, orient="index").reset_index()
# genre["idnex"] = 
genre_matrix

Unnamed: 0,index,Action,Adventure,Sci-Fi,Thriller,Drama,Romance,Horror,Mystery,Biography,...,War,Western,Comedy,Music,Animation,Family,Fantasy,Sport,Musical,Documentary
0,Inception,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Good Will Hunting,False,False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Cabin in the Woods,False,False,False,True,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,Independence Day,True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Spotlight,False,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Austin Powers,False,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
99,Jagten,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100,Avatar: The Way of Water,True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
101,No Country For Old Men,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [56]:
movie_stats

Unnamed: 0.1,Unnamed: 0,aspect ratio,Budget,Cumulative Worldwide Gross,director,genres,year,rating,votes,synopsis,box office
0,Inception,2.39 : 1,160000000.0,825532764.0,[<Person id:0634240[http] name:_Christopher No...,"['Action', 'Adventure', 'Sci-Fi', 'Thriller']",2010,8.8,2367898.0,,
1,Good Will Hunting,1.85 : 1,10000000.0,225933435.0,[<Person id:0001814[http] name:_Gus Van Sant_>],"['Drama', 'Romance']",1997,8.3,980684.0,,
2,Cabin in the Woods,2.40 : 1,30000000.0,70768144.0,[<Person id:1206844[http] name:_Drew Goddard_>],"['Horror', 'Mystery', 'Thriller']",2011,7.0,424053.0,,
3,Independence Day,2.20 : 1 (70 mm print),75000000.0,,[<Person id:0000386[http] name:_Roland Emmeric...,"['Action', 'Adventure', 'Sci-Fi']",1996,7.0,579929.0,,
4,Spotlight,1.85 : 1,20000000.0,98275238.0,[<Person id:0565336[http] name:_Tom McCarthy_>],"['Biography', 'Crime', 'Drama']",2015,8.1,474195.0,,
...,...,...,...,...,...,...,...,...,...,...,...
98,Austin Powers,2.39 : 1,17000000.0,67683989.0,[<Person id:0005366[http] name:_Jay Roach_>],"['Adventure', 'Comedy']",1997,7.0,244169.0,,
99,Jagten,2.35 : 1,3800000.0,18309793.0,[<Person id:0899121[http] name:_Thomas Vinterb...,['Drama'],2012,8.3,334636.0,,
100,Avatar: The Way of Water,1.85 : 1 (3-D version),350000000.0,,[<Person id:0000116[http] name:_James Cameron_>],"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",2022,7.8,275381.0,,
101,No Country For Old Men,2.39 : 1,25000000.0,171627166.0,"[<Person id:0001053[http] name:_Ethan Coen_>, ...","['Crime', 'Drama', 'Thriller']",2007,8.2,982132.0,,


In [57]:
columns_to_keep = ["Unnamed: 0", "Budget", "Cumulative Worldwide Gross", "year", "rating", "votes", "synopsis" ]

clean_df = pd.merge(df, movie_stats[columns_to_keep], left_on="Film", right_on="Unnamed: 0").drop("Unnamed: 0", axis=1)
clean_df = pd.merge(clean_df, genre_matrix, left_on="Film", right_on="index").drop("index", axis=1)
clean_df.index = clean_df.index + 1
clean_df

Unnamed: 0,Film,Seb,Jos,Coen,Stijn,Merle,Twan,Annick,Guest (gemiddelde),Budget,...,War,Western,Comedy,Music,Animation,Family,Fantasy,Sport,Musical,Documentary
1,Inception,8.0,8.25,,8.3,8,,,,160000000.0,...,False,False,False,False,False,False,False,False,False,False
2,Good Will Hunting,9.0,9.00,,9.1,,,,,10000000.0,...,False,False,False,False,False,False,False,False,False,False
3,Cabin in the Woods,7.0,7.00,6.5,4.6,8.3,,,,30000000.0,...,False,False,False,False,False,False,False,False,False,False
4,Independence Day,7.0,7.20,7,3.7,6.9,,,,75000000.0,...,False,False,False,False,False,False,False,False,False,False
5,Spotlight,10.0,8.80,8.5,7.7,,,,,20000000.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,Austin Powers,7.5,6.00,,9.13,7.16,6.014,,,17000000.0,...,False,False,True,False,False,False,False,False,False,False
100,Jagten,8.5,9.00,,8.43,,7.801,7.730009,6.90,3800000.0,...,False,False,False,False,False,False,False,False,False,False
101,Avatar: The Way of Water,7.0,5.00,5,6.31,6.99,8.213,7.800929,8.17,350000000.0,...,False,False,False,False,False,False,True,False,False,False
102,No Country For Old Men,9.0,9.50,,,,,6.105980,,25000000.0,...,False,False,False,False,False,False,False,False,False,False


In [68]:
rating_columns = ["Film", "Seb", "Jos", "Coen", "Stijn", "Merle", "Twan", "Annick", "Guest (gemiddelde)"]
movie_information_columns = ["Film"] + [x for x in list(clean_df.columns) if x not in rating_columns]

clean_df[rating_columns].to_csv("ratings.csv")
clean_df[movie_information_columns].to_csv("movie_information.csv")

# clean_df[[]]