In [1]:
'''
Download
[1] IMDb Movie Dataset: All Movies by Genre
from
https://www.kaggle.com/datasets/rajugc/imdb-movies-dataset-based-on-genre
and unzip it into data/IMDb Movie Dataset/

and
[2] Ultimate Film Statistics Dataset with Production Budget and Domestic (US) and Worldwide Gross
from
https://www.kaggle.com/datasets/alessandrolobello/the-ultimate-film-statistics-dataset-for-ml
and place movie_statistic_dataset.csv directly in data/
'''

'\nDownload\n[1] IMDb Movie Dataset: All Movies by Genre\nfrom\nhttps://www.kaggle.com/datasets/rajugc/imdb-movies-dataset-based-on-genre\nand\n[2] Ultimate Film Statistics Dataset with Production Budget and Domestic (US) and Worldwide Gross\nfrom\nhttps://www.kaggle.com/datasets/alessandrolobello/the-ultimate-film-statistics-dataset-for-ml\n'

In [3]:
# first df: IMDb Movie Dataset

# the dataset comes as 16 separate per-genre CSV files; stack them into one frame

import pandas as pd

# folder that holds the per-genre files
directory = 'data/IMDb Movie Dataset/'

# one CSV file per genre
csv_files = [
    'action.csv',
    'adventure.csv',
    'animation.csv',
    'biography.csv',
    'crime.csv',
    'family.csv',
    'fantasy.csv',
    'film-noir.csv',
    'history.csv',
    'horror.csv',
    'mystery.csv',
    'romance.csv',
    'scifi.csv',
    'sports.csv',
    'thriller.csv',
    'war.csv',
]

# read every genre file into its own df, in list order
dataframes = [pd.read_csv(directory + csv_name) for csv_name in csv_files]

# stack the per-genre dfs on top of each other and renumber the rows from 0
combined_df = pd.concat(dataframes, ignore_index=True)

In [4]:
# preview the first rows of the concatenated genre files
combined_df.head()

Unnamed: 0,movie_id,movie_name,year,certificate,runtime,genre,rating,description,director,director_id,star,star_id,votes,gross(in $)
0,tt9114286,Black Panther: Wakanda Forever,2022,PG-13,161 min,"Action, Adventure, Drama",6.9,The people of Wakanda fight to protect their h...,Ryan Coogler,/name/nm3363032/,"Letitia Wright, \nLupita Nyong'o, \nDanai Guri...","/name/nm4004793/,/name/nm2143282/,/name/nm1775...",204835.0,
1,tt1630029,Avatar: The Way of Water,2022,PG-13,192 min,"Action, Adventure, Fantasy",7.8,Jake Sully lives with his newfound family form...,James Cameron,/name/nm0000116/,"Sam Worthington, \nZoe Saldana, \nSigourney We...","/name/nm0941777/,/name/nm0757855/,/name/nm0000...",295119.0,
2,tt5884796,Plane,2023,R,107 min,"Action, Thriller",6.5,A pilot finds himself caught in a war zone aft...,Jean-François Richet,/name/nm0724938/,"Gerard Butler, \nMike Colter, \nTony Goldwyn, ...","/name/nm0124930/,/name/nm1591496/,/name/nm0001...",26220.0,
3,tt6710474,Everything Everywhere All at Once,2022,R,139 min,"Action, Adventure, Comedy",8.0,A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert",/name/nm3453283/,"Michelle Yeoh, \nStephanie Hsu, \nJamie Lee Cu...","/name/nm3215397/,/name/nm0000706/,/name/nm3513...",327858.0,
4,tt5433140,Fast X,2023,,,"Action, Crime, Mystery",,Dom Toretto and his family are targeted by the...,Louis Leterrier,/name/nm0504642/,"Vin Diesel, \nJordana Brewster, \nTyrese Gibso...","/name/nm0004874/,/name/nm0108287/,/name/nm0879...",,


In [5]:
# (rows, columns) of the concatenated frame
combined_df.shape

(368300, 14)

In [6]:
# discard the columns that are not needed downstream
columns_to_drop = [
    'director_id', 'star', 'star_id', 'votes',
    'runtime', 'rating', 'description', 'gross(in $)',
]

# drop() returns a new frame, so combined_df itself stays untouched
df = combined_df.drop(columns_to_drop, axis='columns')

In [7]:
# give the id, title and certificate columns shorter names
column_renames = {
    'movie_id': 'imdbid',
    'movie_name': 'title',
    'certificate': 'age_rating',
}
df = df.rename(columns=column_renames)

In [8]:
# preview the frame after dropping and renaming columns
df.head()

Unnamed: 0,imdbid,title,year,age_rating,genre,director
0,tt9114286,Black Panther: Wakanda Forever,2022,PG-13,"Action, Adventure, Drama",Ryan Coogler
1,tt1630029,Avatar: The Way of Water,2022,PG-13,"Action, Adventure, Fantasy",James Cameron
2,tt5884796,Plane,2023,R,"Action, Thriller",Jean-François Richet
3,tt6710474,Everything Everywhere All at Once,2022,R,"Action, Adventure, Comedy","Dan Kwan, \nDaniel Scheinert"
4,tt5433140,Fast X,2023,,"Action, Crime, Mystery",Louis Leterrier


In [9]:
# column names that remain in df
df.columns

Index(['imdbid', 'title', 'year', 'age_rating', 'genre', 'director'], dtype='object')

In [10]:
# flag imdbid values that are padded with a space on either side
imdbid_text = df['imdbid'].str
padded_imdbid = imdbid_text.startswith(' ') | imdbid_text.endswith(' ')

# show the offending rows; an empty frame means no id is padded
print("Rows with leading or trailing spaces in 'imdbid':")
print(df[padded_imdbid])

Rows with leading or trailing spaces in 'imdbid':
Empty DataFrame
Columns: [imdbid, title, year, age_rating, genre, director]
Index: []


In [11]:
# Turn ids such as 'tt9114286' into the integer 9114286.

# Remove only the leading 'tt' prefix (anchored with ^) instead of every 'tt'
# occurrence. astype(str) keeps this cell re-runnable once the column is
# already numeric, because the .str accessor only works on string data.
imdbid_digits = df['imdbid'].astype(str).str.replace(r'^tt', '', regex=True)

# anything that is not purely numeric becomes NaN here ...
imdbid_numeric = pd.to_numeric(imdbid_digits, errors='coerce')

# ... and is reported explicitly: int64 cannot hold NaN, so casting without
# this check would fail with an opaque "cannot convert non-finite values" error
invalid_imdbid = imdbid_numeric.isna()
if invalid_imdbid.any():
    examples = df.loc[invalid_imdbid, 'imdbid'].head().tolist()
    raise ValueError(
        f"{int(invalid_imdbid.sum())} imdbid value(s) are not of the form 'tt<digits>', e.g. {examples}"
    )

# convert to integer
df['imdbid'] = imdbid_numeric.astype('int64')

In [12]:
# how many imdbid values are missing
print(df['imdbid'].isna().sum())

0


In [13]:
# how many rows repeat an imdbid that already appeared in an earlier row
print(df.duplicated(subset=['imdbid']).sum())

125103


In [14]:
# keep the first row seen for every imdbid and discard the later repeats
repeated_imdbid = df['imdbid'].duplicated(keep='first')
df = df[~repeated_imdbid]

In [15]:
# (rows, columns) once duplicate ids have been removed
df.shape

(243197, 6)

In [16]:
# dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 243197 entries, 0 to 368299
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   imdbid      243197 non-null  int64 
 1   title       243195 non-null  object
 2   year        204290 non-null  object
 3   age_rating  55170 non-null   object
 4   genre       243197 non-null  object
 5   director    222358 non-null  object
dtypes: int64(1), object(5)
memory usage: 13.0+ MB


In [18]:
# second df: Ultimate_Film_Statistics_Dataset

# Use a forward slash in the path: it works on every OS, whereas the backslash
# form only resolves on Windows and embeds the invalid escape sequence '\m'
# in the string literal (a warning in current Python versions).
df_uf = pd.read_csv('data/movie_statistic_dataset.csv')

# drop unnecessary columns

# list of columns to drop
columns_to_drop = [
    'production_date', 'genres', 'director_name', 'director_professions',
    'director_birthYear', 'director_deathYear', 'movie_averageRating',
    'movie_numerOfVotes', 'approval_Index',
]

# drop columns
df_uf = df_uf.drop(columns=columns_to_drop)

# rename columns so 'title' matches the IMDb frame and the money columns are identifier-friendly
df_uf = df_uf.rename(columns={
    'movie_title': 'title',
    'Production budget $': 'production_budget',
    'Domestic gross $': 'domestic_gross',
    'Worldwide gross $': 'worldwide_gross',
})

In [19]:
# preview the film statistics frame after dropping and renaming columns
df_uf.head()

Unnamed: 0,title,runtime_minutes,production_budget,domestic_gross,worldwide_gross
0,Avatar: The Way of Water,192.0,460000000,667830256,2265935552
1,Avengers: Endgame,181.0,400000000,858373000,2794731755
2,Pirates of the Caribbean: On Stranger Tides,137.0,379000000,241071802,1045713802
3,Avengers: Age of Ultron,141.0,365000000,459005868,1395316979
4,Avengers: Infinity War,149.0,300000000,678815482,2048359754


In [20]:
# column names that remain in df_uf
df_uf.columns

Index(['title', 'runtime_minutes', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')

In [21]:
# how many df_uf rows repeat a title that already appeared in an earlier row
print(df_uf.duplicated(subset=['title']).sum())

139


In [22]:
# (rows, columns) of the film statistics frame
df_uf.shape

(4380, 5)

In [23]:
# left-join the runtime / budget / gross columns of df_uf onto df, matching rows by 'title'
# (every df row is kept; rows without a matching title get NaN in the df_uf columns)
# NOTE(review): 'title' is not unique in df_uf, so one df row can match several
# df_uf rows and then appears more than once in the result — confirm this is intended
merged_df = df.merge(df_uf, how='left', on='title')

In [24]:
# continue under the name df; df and merged_df now refer to the same frame
df = merged_df

In [25]:
# (rows, columns) after the merge
df.shape

(243943, 10)

In [26]:
# number of rows whose imdbid already appeared in an earlier row of the merged frame
df.duplicated(subset=['imdbid']).sum()

746

In [27]:
# column names after the merge
df.columns

Index(['imdbid', 'title', 'year', 'age_rating', 'genre', 'director',
       'runtime_minutes', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')

In [28]:
# update missing or wrong values

# imdbid is an int64 column, so the ids below must be ints: comparing the
# column with a string such as '13400336' matches no row, which silently
# skips every correction.
corrections = {
    13400336: {'year': 2023, 'runtime_minutes': 133},
    2531030: {'year': 1985, 'director': 'Barry Levinson'},
    4338664: {'year': 2001, 'director': 'Barry Levinson'},
}

for target_id, fixes in corrections.items():
    # select every row carrying this id (there may be more than one after the merge)
    target_rows = df['imdbid'] == target_id
    for column, value in fixes.items():
        df.loc[target_rows, column] = value

In [29]:
# rows that have no director value
df[df['director'].isna()]

Unnamed: 0,imdbid,title,year,age_rating,genre,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross
1158,5950044,Superman: Legacy,2025,,"Action, Adventure, Fantasy",,,,,
1763,5782232,Twisters,,,"Action, Adventure",,,,,
2297,22084516,Karate Kid,2024,,Action,,,,,
3437,6334364,Deadshot,,,"Action, Adventure, Comedy",,,,,
3461,990406,Gears of War,I,,"Action, Adventure, Horror",,,,,
...,...,...,...,...,...,...,...,...,...,...
243897,5434856,March of Death,,,War,,,,,
243905,7686388,Quest for a Star,,,War,,,,,
243914,15239338,Stars of the Pacific,III,,"Drama, War",,,,,
243920,6615198,Chosonui pyol 1,,,"Drama, War",,,,,


In [30]:
# preview the first rows of the merged frame
df.head()

Unnamed: 0,imdbid,title,year,age_rating,genre,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross
0,9114286,Black Panther: Wakanda Forever,2022,PG-13,"Action, Adventure, Drama",Ryan Coogler,,,,
1,1630029,Avatar: The Way of Water,2022,PG-13,"Action, Adventure, Fantasy",James Cameron,192.0,460000000.0,667830256.0,2265936000.0
2,5884796,Plane,2023,R,"Action, Thriller",Jean-François Richet,,,,
3,6710474,Everything Everywhere All at Once,2022,R,"Action, Adventure, Comedy","Dan Kwan, \nDaniel Scheinert",,,,
4,5433140,Fast X,2023,,"Action, Crime, Mystery",Louis Leterrier,,,,


In [31]:
# (rows, columns) of the merged frame
df.shape

(243943, 10)

In [32]:
# dtype and non-null count of every column in the merged frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243943 entries, 0 to 243942
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   imdbid             243943 non-null  int64  
 1   title              243941 non-null  object 
 2   year               205021 non-null  object 
 3   age_rating         55523 non-null   object 
 4   genre              243943 non-null  object 
 5   director           223049 non-null  object 
 6   runtime_minutes    7632 non-null    float64
 7   production_budget  7632 non-null    float64
 8   domestic_gross     7632 non-null    float64
 9   worldwide_gross    7632 non-null    float64
dtypes: float64(4), int64(1), object(5)
memory usage: 18.6+ MB


In [33]:
# number of missing imdbid values in the merged frame
df.imdbid.isna().sum()

0

In [34]:
# convert imdbid to integer
# NOTE(review): an earlier df.info() output already lists imdbid as int64, so
# this cast looks redundant — confirm before removing it
df['imdbid'] = df['imdbid'].astype('int64')

In [35]:
df.columns

Index(['imdbid', 'title', 'year', 'age_rating', 'genre', 'director',
       'runtime_minutes', 'production_budget', 'domestic_gross',
       'worldwide_gross'],
      dtype='object')

In [36]:
# keep only the rows that have a value in both 'runtime_minutes' and 'imdbid'
required_columns = ['runtime_minutes', 'imdbid']
df = df.dropna(subset=required_columns)

# number of rows that survived the filter
count = df.shape[0]

print(f"Number of rows with values in both 'runtime_minutes' and 'imdbid': {count}")

Number of rows with values in both 'runtime_minutes' and 'imdbid': 7632


In [37]:
# write the final frame to disk as CSV, leaving the row index out of the file
output_file = 'data/movie_metadata_set_01.csv'
df.to_csv(path_or_buf=output_file, index=False)