In [None]:
'''
Download
IMDb Movie Dataset: All Movies by Genre
from
https://www.kaggle.com/datasets/rajugc/imdb-movies-dataset-based-on-genre
and
Ultimate Film Statistics Dataset with Production Budget and Domestic (US) and Worldwide Gross
from
https://www.kaggle.com/datasets/alessandrolobello/the-ultimate-film-statistics-dataset-for-ml
'''

In [1]:
# first df: IMDb Movie Dataset

# joining all 16 seperate datasets into 1

import pandas as pd

# file location
directory = 'data/IMDb Movie Dataset/' 

# list of CSV files
csv_files = [
    'action.csv', 'adventure.csv', 'animation.csv', 'biography.csv', 'crime.csv',
    'family.csv', 'fantasy.csv', 'film-noir.csv', 'history.csv', 'horror.csv',
    'mystery.csv', 'romance.csv', 'scifi.csv', 'sports.csv', 'thriller.csv', 'war.csv'
]

# empty list to store dfs
dataframes = []

# loop through the list of files and read each one
for file in csv_files:
    file_path = directory + file
    df = pd.read_csv(file_path)
    dataframes.append(df)

# concatenate all dfs into 1
combined_df = pd.concat(dataframes, ignore_index=True)

In [2]:
combined_df.head()

Unnamed: 0,movie_id,movie_name,year,certificate,runtime,genre,rating,description,director,director_id,star,star_id,votes,gross(in $)
0,tt9114286,Black Panther: Wakanda Forever,2022,PG-13,161 min,"Action, Adventure, Drama",6.9,The people of Wakanda fight to protect their h...,Ryan Coogler,/name/nm3363032/,"Letitia Wright, \nLupita Nyong'o, \nDanai Guri...","/name/nm4004793/,/name/nm2143282/,/name/nm1775...",204835.0,
1,tt1630029,Avatar: The Way of Water,2022,PG-13,192 min,"Action, Adventure, Fantasy",7.8,Jake Sully lives with his newfound family form...,James Cameron,/name/nm0000116/,"Sam Worthington, \nZoe Saldana, \nSigourney We...","/name/nm0941777/,/name/nm0757855/,/name/nm0000...",295119.0,
2,tt5884796,Plane,2023,R,107 min,"Action, Thriller",6.5,A pilot finds himself caught in a war zone aft...,Jean-François Richet,/name/nm0724938/,"Gerard Butler, \nMike Colter, \nTony Goldwyn, ...","/name/nm0124930/,/name/nm1591496/,/name/nm0001...",26220.0,
3,tt6710474,Everything Everywhere All at Once,2022,R,139 min,"Action, Adventure, Comedy",8.0,A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert",/name/nm3453283/,"Michelle Yeoh, \nStephanie Hsu, \nJamie Lee Cu...","/name/nm3215397/,/name/nm0000706/,/name/nm3513...",327858.0,
4,tt5433140,Fast X,2023,,,"Action, Crime, Mystery",,Dom Toretto and his family are targeted by the...,Louis Leterrier,/name/nm0504642/,"Vin Diesel, \nJordana Brewster, \nTyrese Gibso...","/name/nm0004874/,/name/nm0108287/,/name/nm0879...",,


In [3]:
combined_df.shape

(368300, 14)

In [4]:
combined_df.certificate.unique()

array(['PG-13', 'R', nan, 'Not Rated', 'TV-MA', 'PG', 'TV-14', '18+',
       'Approved', '13+', 'G', 'Unrated', 'Passed', 'M/PG', 'M', 'TV-PG',
       'TV-Y7', 'GP', 'TV-G', 'NC-17', '16+', 'X', 'TV-Y7-FV', 'TV-Y',
       '12', 'MA-13', 'E', 'T', 'E10+', 'Open', 'AO', 'TV-13', 'F', 'A',
       'U', 'MA-17', '18', 'UA 7+', 'UA', 'UA 16+', '13', '16', '7',
       'UA 13+', 'All'], dtype=object)

In [5]:
# filter movies with US age ratings for theatrical releases

certificates = ['G', 'PG', 'PG-13', 'R', 'NC-17', 'M', 'X']
df = combined_df[combined_df['certificate'].isin(certificates)]
df.shape

(45501, 14)

In [6]:
# drop unnecessary columns

# list of columns to drop
columns_to_drop = ['director_id', 'star', 'star_id', 'votes']

# drop the columns
df = df.drop(columns=columns_to_drop)

In [7]:
# rename columns
df = df.rename(columns={'movie_id': 'imdbid', 'movie_name': 'title', 'certificate': 'age_rating'})

In [8]:
df.head()

Unnamed: 0,imdbid,title,year,age_rating,runtime,genre,rating,description,director,gross(in $)
0,tt9114286,Black Panther: Wakanda Forever,2022,PG-13,161 min,"Action, Adventure, Drama",6.9,The people of Wakanda fight to protect their h...,Ryan Coogler,
1,tt1630029,Avatar: The Way of Water,2022,PG-13,192 min,"Action, Adventure, Fantasy",7.8,Jake Sully lives with his newfound family form...,James Cameron,
2,tt5884796,Plane,2023,R,107 min,"Action, Thriller",6.5,A pilot finds himself caught in a war zone aft...,Jean-François Richet,
3,tt6710474,Everything Everywhere All at Once,2022,R,139 min,"Action, Adventure, Comedy",8.0,A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert",
5,tt10954600,Ant-Man and the Wasp: Quantumania,2023,PG-13,125 min,"Action, Adventure, Comedy",6.6,"Scott Lang and Hope Van Dyne, along with Hank ...",Peyton Reed,


In [9]:
# second df: Ultimate_Film_Statistics_Dataset

df_uf = pd.read_csv('data/Ultimate_Film_Statistics_Dataset.csv')

# drop unnecessary columns

# list of columns to drop
columns_to_drop = ['production_date', 'genres', 'director_name', 'director_professions', 'director_birthYear', 'director_deathYear', 'movie_averageRating', 'movie_numerOfVotes', 'approval_Index']

# drop columns
df_uf = df_uf.drop(columns=columns_to_drop)

# rename columns
df_uf = df_uf.rename(columns={'movie_title': 'title', 'Production budget $': 'production_budget', 'Domestic gross $': 'domestic_gross', 'Worldwide gross $': 'worldwide_gross'})
print(df_uf.shape)

(4380, 5)


In [10]:
df_uf.head()

Unnamed: 0,title,runtime_minutes,production_budget,domestic_gross,worldwide_gross
0,Avatar: The Way of Water,192.0,460000000,667830256,2265935552
1,Avengers: Endgame,181.0,400000000,858373000,2794731755
2,Pirates of the Caribbean: On Stranger Tides,137.0,379000000,241071802,1045713802
3,Avengers: Age of Ultron,141.0,365000000,459005868,1395316979
4,Avengers: Infinity War,149.0,300000000,678815482,2048359754


In [11]:
# merge dfs
merged_df = pd.merge(df, df_uf, on='title', how='left')

In [12]:
merged_df

Unnamed: 0,imdbid,title,year,age_rating,runtime,genre,rating,description,director,gross(in $),runtime_minutes,production_budget,domestic_gross,worldwide_gross
0,tt9114286,Black Panther: Wakanda Forever,2022,PG-13,161 min,"Action, Adventure, Drama",6.9,The people of Wakanda fight to protect their h...,Ryan Coogler,,,,,
1,tt1630029,Avatar: The Way of Water,2022,PG-13,192 min,"Action, Adventure, Fantasy",7.8,Jake Sully lives with his newfound family form...,James Cameron,,192.0,460000000.0,667830256.0,2.265936e+09
2,tt5884796,Plane,2023,R,107 min,"Action, Thriller",6.5,A pilot finds himself caught in a war zone aft...,Jean-François Richet,,,,,
3,tt6710474,Everything Everywhere All at Once,2022,R,139 min,"Action, Adventure, Comedy",8.0,A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert",,,,,
4,tt10954600,Ant-Man and the Wasp: Quantumania,2023,PG-13,125 min,"Action, Adventure, Comedy",6.6,"Scott Lang and Hope Van Dyne, along with Hank ...",Peyton Reed,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46173,tt0168210,A Taste of Hell,1973,PG,90 min,"Action, War, Drama",5.0,THEY FOUGHT SAVAGELY - for love of their count...,"Basil Bradbury, \nNeil Yarema",,,,,
46174,tt0102850,Schacko Klak,1989,R,90 min,"Drama, War",6.2,"The title of the film, set in Luxembourg in 19...","Frank Hoffmann, \nPaul Kieffer",,,,,
46175,tt0453313,Wo de mei li xiang chou,2002,PG-13,120 min,"Drama, War",4.9,Taylor Parks decides to talk four friends into...,Zhong Yu,,,,,
46176,tt0123817,Edge,1997,R,100 min,"Action, Drama, Thriller",6.7,Edge is the story of Edward 'Edge' Jones (Edge...,Brian Harty,,,,,


In [13]:
merged_df.worldwide_gross.isna().sum()

35909

In [14]:
df = merged_df.dropna(subset=['worldwide_gross'])
df.shape

(10269, 14)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10269 entries, 1 to 46171
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   imdbid             10269 non-null  object 
 1   title              10269 non-null  object 
 2   year               10263 non-null  object 
 3   age_rating         10269 non-null  object 
 4   runtime            10252 non-null  object 
 5   genre              10269 non-null  object 
 6   rating             10239 non-null  float64
 7   description        10269 non-null  object 
 8   director           10261 non-null  object 
 9   gross(in $)        9156 non-null   float64
 10  runtime_minutes    10269 non-null  float64
 11  production_budget  10269 non-null  float64
 12  domestic_gross     10269 non-null  float64
 13  worldwide_gross    10269 non-null  float64
dtypes: float64(6), object(8)
memory usage: 1.2+ MB


In [16]:
df[df['age_rating'].isna()]

Unnamed: 0,imdbid,title,year,age_rating,runtime,genre,rating,description,director,gross(in $),runtime_minutes,production_budget,domestic_gross,worldwide_gross


In [17]:
df.columns

Index(['imdbid', 'title', 'year', 'age_rating', 'runtime', 'genre', 'rating',
       'description', 'director', 'gross(in $)', 'runtime_minutes',
       'production_budget', 'domestic_gross', 'worldwide_gross'],
      dtype='object')

In [18]:
# drop duplicate rows based on 'imdbid', keep only the first occurrence
df_cleaned = df.drop_duplicates(subset=['imdbid'], keep='first')

In [19]:
df_cleaned.shape

(4051, 14)

In [20]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4051 entries, 1 to 46020
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   imdbid             4051 non-null   object 
 1   title              4051 non-null   object 
 2   year               4048 non-null   object 
 3   age_rating         4051 non-null   object 
 4   runtime            4040 non-null   object 
 5   genre              4051 non-null   object 
 6   rating             4032 non-null   float64
 7   description        4051 non-null   object 
 8   director           4048 non-null   object 
 9   gross(in $)        3597 non-null   float64
 10  runtime_minutes    4051 non-null   float64
 11  production_budget  4051 non-null   float64
 12  domestic_gross     4051 non-null   float64
 13  worldwide_gross    4051 non-null   float64
dtypes: float64(6), object(8)
memory usage: 474.7+ KB


In [21]:
# drop unnecessary columns

# list of columns to drop
columns_to_drop = ['runtime', 'rating', 'gross(in $)',]

# drop columns
df = df_cleaned.drop(columns=columns_to_drop)

In [22]:
# update missing or wrong values
df.loc[df['imdbid'] == 'tt13400336', 'year'] = 2023
df.loc[df['imdbid'] == 'tt13400336', 'runtime_minutes'] = 133
df.loc[df['imdbid'] == 'tt2531030', 'year'] = 1985
df.loc[df['imdbid'] == 'tt2531030', 'director'] = 'Barry Levinson'
df.loc[df['imdbid'] == 'tt4338664', 'year'] = 2001
df.loc[df['imdbid'] == 'tt4338664', 'director'] = 'Barry Levinson'

In [23]:
df.age_rating.unique()

array(['PG-13', 'R', 'PG', 'G', 'X', 'M'], dtype=object)

In [24]:
# map numbers to the respective age ratings

# age rating to numerical value
age_rating_mapping = {
    'G': 0,
    'PG': 6,
    'PG-13': 13,
    'R': 17,
    'NC-17': 18,
    'M': 17,
    'X': 17
}

# create new column 'age_rating_number' based on mapping
df['age_rating_number'] = df['age_rating'].map(age_rating_mapping)

In [None]:
# split genre classification into seperate columns

genre_split = df['genre'].str.split(', ', expand=True)

# rename new columns
genre_split.columns = ['genre_1', 'genre_2', 'genre_3']

# join new columns back to original df
df = df.join(genre_split)

In [25]:
# remove tt at beginning of 'imdbid' and convert to type int
df['imdbid'] = df['imdbid'].str.replace('tt', '').astype(int)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4051 entries, 1 to 46020
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   imdbid             4051 non-null   int32  
 1   title              4051 non-null   object 
 2   year               4051 non-null   object 
 3   age_rating         4051 non-null   object 
 4   genre              4051 non-null   object 
 5   description        4051 non-null   object 
 6   director           4050 non-null   object 
 7   runtime_minutes    4051 non-null   float64
 8   production_budget  4051 non-null   float64
 9   domestic_gross     4051 non-null   float64
 10  worldwide_gross    4051 non-null   float64
 11  age_rating_number  4051 non-null   int64  
dtypes: float64(4), int32(1), int64(1), object(6)
memory usage: 395.6+ KB


In [27]:
# create dummies for genres

# generate dummy variables with column prefix
genre_dummies = df['genre'].str.get_dummies(sep=', ').rename(lambda x: 'genre_' + x.lower(), axis=1).astype(int)

# concatenate genre dummies with original DataFrame
df = pd.concat([df, genre_dummies], axis=1)

In [28]:
# new feature: 'financial_success' = 'worldwide_gross' > 'production_budget'*2
df['financial_success'] = (df['worldwide_gross'] > df['production_budget'] * 2).astype(int)

In [29]:
# new feature 'ROI' (return on investment)
df['ROI'] = df['worldwide_gross'] / df['production_budget']

In [30]:
df.head()

Unnamed: 0,imdbid,title,year,age_rating,genre,description,director,runtime_minutes,production_budget,domestic_gross,...,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_sport,genre_thriller,genre_war,genre_western,financial_success,ROI
1,1630029,Avatar: The Way of Water,2022,PG-13,"Action, Adventure, Fantasy",Jake Sully lives with his newfound family form...,James Cameron,192.0,460000000.0,667830256.0,...,0,0,0,0,0,0,0,0,1,4.925947
6,12593682,Bullet Train,2022,R,"Action, Comedy, Thriller",Five assassins aboard a swiftly-moving bullet ...,David Leitch,127.0,85900000.0,103368602.0,...,0,0,0,0,0,1,0,0,1,2.780856
8,1745960,Top Gun: Maverick,2022,PG-13,"Action, Drama","After thirty years, Maverick is still pushing ...",Joseph Kosinski,130.0,170000000.0,718732821.0,...,0,0,0,0,0,0,0,0,1,8.733913
9,1825683,Black Panther,2018,PG-13,"Action, Adventure, Sci-Fi","T'Challa, heir to the hidden but advanced king...",Ryan Coogler,134.0,200000000.0,700059566.0,...,0,0,0,1,0,0,0,0,1,6.682472
10,1877830,The Batman,2022,PG-13,"Action, Crime, Drama",When a sadistic serial killer begins murdering...,Matt Reeves,176.0,200000000.0,369345583.0,...,0,0,0,0,0,0,0,0,1,3.837659


In [31]:
df.columns

Index(['imdbid', 'title', 'year', 'age_rating', 'genre', 'description',
       'director', 'runtime_minutes', 'production_budget', 'domestic_gross',
       'worldwide_gross', 'age_rating_number', 'genre_action',
       'genre_adventure', 'genre_animation', 'genre_biography', 'genre_comedy',
       'genre_crime', 'genre_drama', 'genre_family', 'genre_fantasy',
       'genre_film-noir', 'genre_history', 'genre_horror', 'genre_music',
       'genre_musical', 'genre_mystery', 'genre_romance', 'genre_sci-fi',
       'genre_sport', 'genre_thriller', 'genre_war', 'genre_western',
       'financial_success', 'ROI'],
      dtype='object')

In [32]:
print((df['financial_success'] == 0).sum())
print((df['financial_success'] == 1).sum())

2005
2046


In [33]:
# output df to csv
output_file = 'data/movie_metadata_master.csv'
df.to_csv(output_file, index=False)