In [1]:
import pandas as pd
import numpy as np
from imdb import Cinemagoer
from tqdm.auto import tqdm
import re

  from .autonotebook import tqdm as notebook_tqdm


# Read movie dataset

In [2]:
# Load the exported ratings sheet: skip its first data row, keep only the
# columns at positions 1-9, and show the result.
df = (
    pd.read_csv("Movies to watch - Ratings.csv")
    .iloc[1:, list(range(1, 10))]
    # Give the unnamed spreadsheet columns readable names.
    .rename(columns={'Unnamed: 1': "Film", "Unnamed: 11": "Average rating"})
    # "DNF" entries are stored as missing values.
    .replace("DNF", np.nan)
)
df

Unnamed: 0,Film,Seb,Jos,Coen,Stijn,Merle,Twan,Annick,Guest (gemiddelde)
1,Inception,8.0,8.25,,8.3,8,,,
2,Good Will Hunting,9.0,9.00,,9.1,,,,
3,Cabin in the Woods,7.0,7.00,6.5,4.6,8.3,,,
4,Independence Day,7.0,7.20,7,3.7,6.9,,,
5,Spotlight,10.0,8.80,8.5,7.7,,,,
...,...,...,...,...,...,...,...,...,...
99,Austin Powers,7.5,6.00,,9.13,7.16,6.014,,
100,Jagten,8.5,9.00,,8.43,,7.801,7.730009,6.90
101,Avatar: The Way of Water,7.0,5.00,5,6.31,6.99,8.213,7.800929,8.17
102,No Country For Old Men,9.0,9.50,,,,,6.105980,


# Download information from IMDb

In [12]:
# select interesting information from the following list:
# ['akas', 'animation department', 'art department', 'art direction', 'aspect ratio', 'assistant director', 'box office', 'camera and electrical department', 'canonical title', 'cast', 
#  'casting department', 'casting director', 'certificates', 'cinematographer', 'color info', 'composer', 'costume department', 'costume designer', 'countries', 'country codes', 
#  'cover url', 'director', 'distributors', 'editor', 'editorial department', 'full-size cover url', 'genres', 'imdbID', 'kind', 'language codes', 'languages', 'localized title', 
#  'location management', 'long imdb canonical title', 'long imdb title', 'make up', 'miscellaneous crew', 'music department', 'original air date', 'original title', 'other companies', 
#  'plot', 'plot outline', 'producer', 'production companies', 'production design', 'production manager', 'rating', 'runtimes', 'script department', 'set decoration', 'smart canonical title', 
#  'smart long imdb canonical title', 'sound crew', 'sound mix', 'special effects', 'special effects companies', 'stunt performer', 'synopsis', 'title', 'transportation department', 'visual effects', 
#  'votes', 'writer', 'year']

# Subset of the movie-record keys listed above that are of interest.
interesting_stats = [
    'aspect ratio',
    'box office',
    'director',
    'genres',
    'year',
    'rating',
    'votes',
    'plot outline',
]

# Genre names that become the True/False columns of the genre matrix.
genre_columns = [
    'Action', 'Adventure', 'Sci-Fi', 'Thriller', 'Drama', 'Romance', 'Horror',
    'Mystery', 'Biography', 'Crime', 'History', 'War', 'Western', 'Comedy',
    'Music', 'Animation', 'Family', 'Fantasy', 'Sport', 'Musical',
    'Documentary',
]

In [4]:
def process_budget(budget):
    """Extract the first monetary amount found in ``budget`` as a float.

    ``budget`` is converted with ``str()`` and searched for the first number,
    written either with thousands separators (``"$160,000,000 (estimated)"``)
    or as a plain digit run (``"160000000"``). Any currency prefix or trailing
    text is ignored.

    Unlike the previous three-group pattern, the number may have any length,
    so amounts of a billion or more are no longer truncated and amounts below
    100 are no longer rejected.

    Returns:
        The amount as a ``float``, or ``nan`` when ``budget`` contains no digits
        (e.g. ``None`` or ``nan``).
    """
    # First alternative: comma-grouped number of any length; second: bare digits.
    amounts = re.findall(r"\d{1,3}(?:,\d{3})+|\d+", str(budget))
    if not amounts:
        return float(np.nan)
    return float(amounts[0].replace(",", ""))
    
def process_director(director):
    """Return the first person name embedded in ``str(director)``.

    The text is searched for the marker ``name:_<name>_`` and everything
    between the two underscores of the first occurrence is returned. This is
    presumably the repr format of the person objects Cinemagoer returns --
    TODO confirm against the library.

    The name may contain any character except an underscore, so accented
    letters, hyphens, periods and apostrophes are kept instead of cutting the
    name short. When several people are present only the first is returned.

    Returns:
        The name as a ``str``, or ``nan`` (a float) when no marker is found.
    """
    names = re.findall(r"name:_([^_]*)", str(director))
    if not names:
        return float(np.nan)
    return names[0]

In [31]:

# def retrieve_movie_stats(movie, interesting_stats, cnm):    
#     res = {}
#     # searching the name 
#     search = cnm.search_movie(movie)
    
#     # getting the id
#     id = search[0].movieID
    
#     # get a movie's info
#     movie_info = cnm.get_movie(id)
    
#     for key in interesting_stats:
#         try:
#             if key in movie_info:
                
#                 if key == "box office":
#                     res["Budget"] = movie_info[key]["Budget"]
#                     if "Cumulative Worldwide Gross" in movie_info[key]: 
#                         res["Cumulative Worldwide Gross"] = movie_info[key]["Cumulative Worldwide Gross"]
                        
#                 elif key == "synopsis":
#                     res[key] = movie_info[key][0]
                    
#                 elif key == "director":
#                     res[key] = movie_info[key].split("_")
                    
#                 elif key == "rating":
#                     res[key] = float(movie_info[key])
                    
#                 else:                
#                 #     if isinstance(movie_info[key], list):
#                 #         res[key] = ",".join(movie_info[key])
#                 #     else:                    
#                     res[key] = str(movie_info[key])
#             else:
#                 res[key] = np.nan
#         except:
#             res[key] = np.nan
        
#     return res

In [13]:
def retrieve_movie_stats(movie, interesting_stats, cnm):
    """Look up ``movie`` through ``cnm`` and collect the requested fields.

    Args:
        movie: Title passed to ``cnm.search_movie``.
        interesting_stats: Iterable of movie-record keys to extract.
        cnm: Client object providing ``search_movie(title)`` and
            ``get_movie(movie_id)`` (a ``Cinemagoer`` instance in this notebook).

    Returns:
        A dict with one or more entries per requested key:

        * ``"box office"`` -> ``"Budget"`` and, when present,
          ``"Cumulative Worldwide Gross"``, both parsed by ``process_budget``.
        * ``"plot outline"`` -> stored under ``"synopsis"``.
        * ``"director"`` -> parsed by ``process_director``.
        * ``"rating"`` -> converted to ``float``.
        * any other key -> ``str()`` of the value.

        A key that is absent from the movie record, or whose processing raises,
        is stored under its own name as ``nan``. When the search returns no
        result at all, every requested key is ``nan``.
    """
    search = cnm.search_movie(movie)
    if not search:
        # No hit: report every field as missing instead of raising IndexError
        # on search[0], which would abort the caller's loop over all films.
        return {key: np.nan for key in interesting_stats}

    # The first search hit is taken to be the intended film.
    movie_id = search[0].movieID
    movie_info = cnm.get_movie(movie_id)

    res = {}
    for key in interesting_stats:
        try:
            if key not in movie_info:
                res[key] = np.nan

            elif key == "box office":
                box_office = movie_info[key]
                # A box-office entry without "Budget" raises KeyError here and
                # falls through to the handler below (res["box office"] = nan).
                res["Budget"] = process_budget(box_office["Budget"])

                if "Cumulative Worldwide Gross" in box_office:
                    res["Cumulative Worldwide Gross"] = process_budget(box_office["Cumulative Worldwide Gross"])

            elif key == "plot outline":
                res["synopsis"] = movie_info[key]

            elif key == "director":
                res[key] = process_director(movie_info[key])

            elif key == "rating":
                res[key] = float(movie_info[key])

            else:
                res[key] = str(movie_info[key])
        except Exception:
            # Best effort: a field that cannot be processed is recorded as
            # missing. KeyboardInterrupt/SystemExit are deliberately not caught.
            res[key] = np.nan

    return res

In [14]:
# create an instance of the Cinemagoer class
cnm = Cinemagoer()

# list interesting columns
columns_to_keep = ['director', 'box office', 'year', 'rating', 'votes', 'plot outline', 'genres']

# One lookup per film title; the progress bar shows how far the download is.
movie_stats = {}
for movie in tqdm(df.Film.values, desc="IMDb lookups"):
    movie_stats[movie] = retrieve_movie_stats(movie, columns_to_keep, cnm)

movie_stats = (
    pd.DataFrame.from_dict(movie_stats, orient="index")
    # "plot outline" / "box office" columns only exist when at least one lookup
    # stored that key as missing, so their absence must not raise.
    .drop(columns=["plot outline", "box office"], errors="ignore")
    .reset_index()
)
# Number the films from 1 and name the title column.
movie_stats.index = movie_stats.index + 1
movie_stats = movie_stats.rename({"index": "Film"}, axis=1)
# Save the result so later cells can reload it from disk.
movie_stats.to_csv("movie_stats.csv")
movie_stats

Unnamed: 0,director,Budget,Cumulative Worldwide Gross,year,rating,votes,synopsis,genres
Inception,Christopher Nolan,160000000.0,825532764.0,2010,8.8,2370719,"Dom Cobb is a skilled thief, the absolute best...","['Action', 'Adventure', 'Sci-Fi', 'Thriller']"
Good Will Hunting,Gus Van Sant,10000000.0,225933435.0,1997,8.3,981725,A touching tale of a wayward young man who str...,"['Drama', 'Romance']"
Cabin in the Woods,Drew Goddard,30000000.0,70768144.0,2011,7.0,424652,Five teenagers head off for a weekend at a sec...,"['Horror', 'Mystery', 'Thriller']"
Independence Day,Roland Emmerich,75000000.0,,1996,7.0,580406,"On July 2nd, communications systems worldwide ...","['Action', 'Adventure', 'Sci-Fi']"
Spotlight,Tom McCarthy,20000000.0,98275238.0,2015,8.1,474697,"When the Boston Globe's tenacious ""Spotlight"" ...","['Biography', 'Crime', 'Drama']"
...,...,...,...,...,...,...,...,...
Austin Powers,Jay Roach,17000000.0,67683989.0,1997,7.0,244435,Austin Powers is a 60's spy who is cryogenical...,"['Adventure', 'Comedy']"
Jagten,Thomas Vinterberg,3800000.0,18309793.0,2012,8.3,334566,Lucas is a Kindergarten teacher who takes grea...,['Drama']
Avatar: The Way of Water,James Cameron,350000000.0,,2022,7.8,287081,,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"
No Country For Old Men,Ethan Coen,25000000.0,171627166.0,2007,8.2,983162,"In rural Texas, welder and hunter Llewelyn Mos...","['Crime', 'Drama', 'Thriller']"


# Process data

In [36]:
# Reload the saved statistics from disk, using the CSV's "Unnamed: 0" column
# as the index, then display them.
movie_stats = pd.read_csv("movie_stats.csv", index_col="Unnamed: 0")
movie_stats

Unnamed: 0,Film,director,Budget,Cumulative Worldwide Gross,year,rating,votes,synopsis,genres
1,Inception,Christopher Nolan,160000000.0,825532764.0,2010,8.8,2370719.0,"Dom Cobb is a skilled thief, the absolute best...","['Action', 'Adventure', 'Sci-Fi', 'Thriller']"
2,Good Will Hunting,Gus Van Sant,10000000.0,225933435.0,1997,8.3,981725.0,A touching tale of a wayward young man who str...,"['Drama', 'Romance']"
3,Cabin in the Woods,Drew Goddard,30000000.0,70768144.0,2011,7.0,424652.0,Five teenagers head off for a weekend at a sec...,"['Horror', 'Mystery', 'Thriller']"
4,Independence Day,Roland Emmerich,75000000.0,,1996,7.0,580406.0,"On July 2nd, communications systems worldwide ...","['Action', 'Adventure', 'Sci-Fi']"
5,Spotlight,Tom McCarthy,20000000.0,98275238.0,2015,8.1,474697.0,"When the Boston Globe's tenacious ""Spotlight"" ...","['Biography', 'Crime', 'Drama']"
...,...,...,...,...,...,...,...,...,...
99,Austin Powers,Jay Roach,17000000.0,67683989.0,1997,7.0,244435.0,Austin Powers is a 60's spy who is cryogenical...,"['Adventure', 'Comedy']"
100,Jagten,Thomas Vinterberg,3800000.0,18309793.0,2012,8.3,334566.0,Lucas is a Kindergarten teacher who takes grea...,['Drama']
101,Avatar: The Way of Water,James Cameron,350000000.0,,2022,7.8,287081.0,,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"
102,No Country For Old Men,Ethan Coen,25000000.0,171627166.0,2007,8.2,983162.0,"In rural Texas, welder and hunter Llewelyn Mos...","['Crime', 'Drama', 'Thriller']"


## Budget

In [49]:
# def process_budget(x):
#     # print(x["Budget"])
#     x = re.findall("\d{1,3},{0,1}\d{1,3},{0,1}\d{1,3}", str(x))
#     if x == []: return float(np.nan)
#     else: return float(x[0].replace(",", ""))
    
# movie_stats["Budget"] = movie_stats["Budget"].apply(process_budget)

## Gross

In [50]:
# movie_stats["Cumulative Worldwide Gross"] = movie_stats["Cumulative Worldwide Gross"].apply(process_budget)

## Genres

### Genre table

In [51]:
# # Function which uses re.findall method to convert string to list character wise 
# def convert_string_to_list(string):
#     return re.findall('[\'\'\"]([a-zA-Z]{0,})[\'\'\"]', string)

In [52]:
# def genre_array_to_table(genres):
#     res = []
    
#     # Loop through the movies and the genres
#     for movie, genre_list in genres.items():
#         genre_list = convert_string_to_list(genre_list)
        
#         # for each genre add a row to the result list
#         for i in range(len(genre_list)):
#             res.append((movie, genre_list[i], i+1))
            
#     return pd.DataFrame(res, columns = ["Film", "Genre", "Genre_order"])
        
# genre_table = genre_array_to_table(movie_stats["genres"])
# genre_table.to_csv("genre_table.csv")
# genre_table

Unnamed: 0,Film,Genre,Genre_order
0,0,Action,1
1,0,Adventure,2
2,0,Thriller,3
3,1,Drama,1
4,1,Romance,2
...,...,...,...
278,101,Thriller,3
279,102,Comedy,1
280,102,Drama,2
281,102,Music,3


### Genre recommender table

In [37]:
# Parse the stringified genre list of every movie.
#   all_movie_cats    - one list of genres per row of `movie_stats`, in row order
#   unique_categories - every genre once, in order of first appearance
all_movie_cats = []
unique_categories = []
for raw_genres in movie_stats["genres"].values:
    if isinstance(raw_genres, str):
        # Strip the list punctuation: "['Crime', 'Drama']" -> "Crime,Drama".
        stripped = re.sub(r"[ '\[\]]", "", raw_genres)
        # Drop empty fragments so "[]" yields no genres rather than [""].
        movie_cats = [name for name in stripped.split(",") if name]
    else:
        # Missing genres (e.g. NaN): no genres, but still append an entry so
        # positions stay aligned with the rows of `movie_stats`.
        movie_cats = []
    all_movie_cats.append(movie_cats)
    for category in movie_cats:
        if category not in unique_categories:
            unique_categories.append(category)
unique_categories

['Action',
 'Adventure',
 'Sci-Fi',
 'Thriller',
 'Drama',
 'Romance',
 'Horror',
 'Mystery',
 'Biography',
 'Crime',
 'History',
 'War',
 'Western',
 'Comedy',
 'Music',
 'Animation',
 'Family',
 'Fantasy',
 'Sport',
 'Musical']

In [52]:
# Build one row per movie with a True/False flag for every genre in
# `genre_columns`, telling whether the movie has that genre.
genre_matrix = {}
for i, cats in enumerate(all_movie_cats):
    # The first column of `movie_stats` provides the row label.
    movie = movie_stats.iloc[:, 0].values[i]
    genre_matrix[movie] = {genre: (genre in cats) for genre in genre_columns}

# Turn the nested dict into a table; the row labels end up in an "index" column.
genre_matrix = pd.DataFrame.from_dict(genre_matrix, orient="index").reset_index()
genre_matrix

Unnamed: 0,index,Action,Adventure,Sci-Fi,Thriller,Drama,Romance,Horror,Mystery,Biography,...,War,Western,Comedy,Music,Animation,Family,Fantasy,Sport,Musical,Documentary
0,Inception,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Good Will Hunting,False,False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Cabin in the Woods,False,False,False,True,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,Independence Day,True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Spotlight,False,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Austin Powers,False,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
99,Jagten,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100,Avatar: The Way of Water,True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
101,No Country For Old Men,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [39]:
# Show the movie statistics table again for reference before merging.
movie_stats

Unnamed: 0,Film,director,Budget,Cumulative Worldwide Gross,year,rating,votes,synopsis,genres
1,Inception,Christopher Nolan,160000000.0,825532764.0,2010,8.8,2370719.0,"Dom Cobb is a skilled thief, the absolute best...","['Action', 'Adventure', 'Sci-Fi', 'Thriller']"
2,Good Will Hunting,Gus Van Sant,10000000.0,225933435.0,1997,8.3,981725.0,A touching tale of a wayward young man who str...,"['Drama', 'Romance']"
3,Cabin in the Woods,Drew Goddard,30000000.0,70768144.0,2011,7.0,424652.0,Five teenagers head off for a weekend at a sec...,"['Horror', 'Mystery', 'Thriller']"
4,Independence Day,Roland Emmerich,75000000.0,,1996,7.0,580406.0,"On July 2nd, communications systems worldwide ...","['Action', 'Adventure', 'Sci-Fi']"
5,Spotlight,Tom McCarthy,20000000.0,98275238.0,2015,8.1,474697.0,"When the Boston Globe's tenacious ""Spotlight"" ...","['Biography', 'Crime', 'Drama']"
...,...,...,...,...,...,...,...,...,...
99,Austin Powers,Jay Roach,17000000.0,67683989.0,1997,7.0,244435.0,Austin Powers is a 60's spy who is cryogenical...,"['Adventure', 'Comedy']"
100,Jagten,Thomas Vinterberg,3800000.0,18309793.0,2012,8.3,334566.0,Lucas is a Kindergarten teacher who takes grea...,['Drama']
101,Avatar: The Way of Water,James Cameron,350000000.0,,2022,7.8,287081.0,,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']"
102,No Country For Old Men,Ethan Coen,25000000.0,171627166.0,2007,8.2,983162.0,"In rural Texas, welder and hunter Llewelyn Mos...","['Crime', 'Drama', 'Thriller']"


In [46]:
# Show the ratings table again for reference before merging.
df

Unnamed: 0,Film,Seb,Jos,Coen,Stijn,Merle,Twan,Annick,Guest (gemiddelde)
1,Inception,8.0,8.25,,8.3,8,,,
2,Good Will Hunting,9.0,9.00,,9.1,,,,
3,Cabin in the Woods,7.0,7.00,6.5,4.6,8.3,,,
4,Independence Day,7.0,7.20,7,3.7,6.9,,,
5,Spotlight,10.0,8.80,8.5,7.7,,,,
...,...,...,...,...,...,...,...,...,...
99,Austin Powers,7.5,6.00,,9.13,7.16,6.014,,
100,Jagten,8.5,9.00,,8.43,,7.801,7.730009,6.90
101,Avatar: The Way of Water,7.0,5.00,5,6.31,6.99,8.213,7.800929,8.17
102,No Country For Old Men,9.0,9.50,,,,,6.105980,


In [53]:
# Columns of `movie_stats` that go into the merged table (the raw "genres"
# string is left out; the genre flags come from `genre_matrix`).
columns_to_keep = [
    "Film",
    "director",
    "Budget",
    "Cumulative Worldwide Gross",
    "year",
    "rating",
    "votes",
    "synopsis",
]

# Join ratings, movie statistics and genre flags on the film title.
clean_df = (
    df.merge(movie_stats[columns_to_keep], on="Film")
    .merge(genre_matrix, left_on="Film", right_on="index")
    .drop(columns="index")
)
# Number the rows from 1.
clean_df.index = clean_df.index + 1
clean_df

Unnamed: 0,Film,Seb,Jos,Coen,Stijn,Merle,Twan,Annick,Guest (gemiddelde),director,...,War,Western,Comedy,Music,Animation,Family,Fantasy,Sport,Musical,Documentary
1,Inception,8.0,8.25,,8.3,8,,,,Christopher Nolan,...,False,False,False,False,False,False,False,False,False,False
2,Good Will Hunting,9.0,9.00,,9.1,,,,,Gus Van Sant,...,False,False,False,False,False,False,False,False,False,False
3,Cabin in the Woods,7.0,7.00,6.5,4.6,8.3,,,,Drew Goddard,...,False,False,False,False,False,False,False,False,False,False
4,Independence Day,7.0,7.20,7,3.7,6.9,,,,Roland Emmerich,...,False,False,False,False,False,False,False,False,False,False
5,Spotlight,10.0,8.80,8.5,7.7,,,,,Tom McCarthy,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,Austin Powers,7.5,6.00,,9.13,7.16,6.014,,,Jay Roach,...,False,False,True,False,False,False,False,False,False,False
100,Jagten,8.5,9.00,,8.43,,7.801,7.730009,6.90,Thomas Vinterberg,...,False,False,False,False,False,False,False,False,False,False
101,Avatar: The Way of Water,7.0,5.00,5,6.31,6.99,8.213,7.800929,8.17,James Cameron,...,False,False,False,False,False,False,True,False,False,False
102,No Country For Old Men,9.0,9.50,,,,,6.105980,,Ethan Coen,...,False,False,False,False,False,False,False,False,False,False


In [54]:
# List the columns of the merged table.
clean_df.columns

Index(['Film', 'Seb', 'Jos', 'Coen', 'Stijn', 'Merle', 'Twan', 'Annick',
       'Guest (gemiddelde)', 'director', 'Budget',
       'Cumulative Worldwide Gross', 'year', 'rating', 'votes', 'synopsis',
       'Action', 'Adventure', 'Sci-Fi', 'Thriller', 'Drama', 'Romance',
       'Horror', 'Mystery', 'Biography', 'Crime', 'History', 'War', 'Western',
       'Comedy', 'Music', 'Animation', 'Family', 'Fantasy', 'Sport', 'Musical',
       'Documentary'],
      dtype='object')

In [55]:
# Columns that hold ratings, with the film title as key.
rating_columns = [
    "Film",
    "Seb",
    "Jos",
    "Coen",
    "Stijn",
    "Merle",
    "Twan",
    "Annick",
    "Guest (gemiddelde)",
]

# Every remaining column describes the movie itself; "Film" is repeated as key.
movie_information_columns = ["Film"]
movie_information_columns.extend(
    column for column in clean_df.columns if column not in rating_columns
)

# Write the two halves of the merged table to separate CSV files.
clean_df[rating_columns].to_csv("ratings.csv")
clean_df[movie_information_columns].to_csv("movie_information.csv")