In [1]:
import pandas as pd

# load the base movie metadata table from CSV into a DataFrame
df = pd.read_csv('data/movie_meta_data.csv')

In [2]:
# (rows, columns) of the freshly loaded base table
df.shape

(2858, 25)

In [3]:
# list the column labels; several contain spaces, which a later cell replaces with underscores
df.columns

Index(['imdbid', 'title', 'akas', 'year', 'metascore', 'imdb user rating',
       'number of imdb user votes', 'awards', 'opening weekend', 'producers',
       'budget', 'script department', 'production companies', 'writers',
       'directors', 'casting directors', 'cast', 'countries', 'age restrict',
       'plot', 'plot outline', 'keywords', 'genres', 'taglines', 'synopsis'],
      dtype='object')

In [4]:
# per-column non-null counts and dtypes, to see which fields have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   imdbid                     2858 non-null   int64 
 1   title                      2858 non-null   object
 2   akas                       2652 non-null   object
 3   year                       2858 non-null   int64 
 4   metascore                  2858 non-null   int64 
 5   imdb user rating           2858 non-null   int64 
 6   number of imdb user votes  2858 non-null   int64 
 7   awards                     2243 non-null   object
 8   opening weekend            1739 non-null   object
 9   producers                  2640 non-null   object
 10  budget                     1624 non-null   object
 11  script department          2220 non-null   object
 12  production companies       2682 non-null   object
 13  writers                    2696 non-null   object
 14  director

In [5]:
# number of rows whose imdbid repeats an earlier row (0 means imdbid is unique in df)
df['imdbid'].duplicated().sum()

0

In [6]:
# count rows with no value in 'age restrict' (the label still contains a space here; the rename happens in a later cell)
df['age restrict'].isna().sum()

332

In [7]:
# normalise column labels: every space becomes an underscore
df.columns = [label.replace(' ', '_') for label in df.columns]

In [8]:
# strip the base table down to its identifying columns

# every column that is removed from the base table
columns_to_drop = [
    'akas', 'metascore', 'imdb_user_rating', 'number_of_imdb_user_votes',
    'awards', 'producers', 'script_department', 'writers',
    'casting_directors', 'plot', 'plot_outline', 'keywords',
    'taglines', 'synopsis', 'budget', 'year',
    'opening_weekend', 'production_companies', 'directors', 'cast',
    'countries', 'age_restrict', 'genres',
]

# remove them along the column axis
df = df.drop(columns_to_drop, axis=1)

In [9]:
# preview the reduced base table
df.head()

Unnamed: 0,imdbid,title
0,120770,A Night at the Roxbury
1,132512,At First Sight
2,118661,The Avengers
3,215545,Bamboozled
4,118715,The Big Lebowski


In [10]:
# load the second metadata source from CSV
df2 = pd.read_csv('data/movie_metadata_set_01.csv')
# preview its first rows
df2.head()

Unnamed: 0,imdbid,title,year,age_rating,genre,description,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross
0,9114286,Black Panther: Wakanda Forever,2022,PG-13,"Action, Adventure, Drama",The people of Wakanda fight to protect their h...,Ryan Coogler,,,,
1,1630029,Avatar: The Way of Water,2022,PG-13,"Action, Adventure, Fantasy",Jake Sully lives with his newfound family form...,James Cameron,192.0,460000000.0,667830256.0,2265936000.0
2,5884796,Plane,2023,R,"Action, Thriller",A pilot finds himself caught in a war zone aft...,Jean-François Richet,,,,
3,6710474,Everything Everywhere All at Once,2022,R,"Action, Adventure, Comedy",A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert",,,,
4,5433140,Fast X,2023,,"Action, Crime, Mystery",Dom Toretto and his family are targeted by the...,Louis Leterrier,,,,


In [11]:
# (rows, columns) of the second source
df2.shape

(369726, 11)

In [12]:
# number of df2 rows whose imdbid repeats an earlier row; a non-zero count means imdbid is not a unique key in df2
df2['imdbid'].duplicated().sum()

126529

In [13]:
# left-join the second source onto the base table by imdbid.
# every base row is kept; an imdbid that occurs several times in df2 comes back as several merged rows.
merged_df = df.merge(df2, how='left', on='imdbid')

In [14]:
# preview the merge result; both inputs have a 'title' column, so they appear as title_x / title_y
merged_df.head()

Unnamed: 0,imdbid,title_x,title_y,year,age_rating,genre,description,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross
0,120770,A Night at the Roxbury,A Night at the Roxbury,1998,PG-13,"Comedy, Music, Romance",Two dim-witted brothers dream of owning their ...,John Fortenberry,82.0,17000000.0,30331165.0,30331160.0
1,132512,At First Sight,At First Sight,1999,PG-13,"Drama, Romance",A blind man has an operation to regain his sig...,Irwin Winkler,128.0,40000000.0,22365133.0,22365130.0
2,118661,The Avengers,The Avengers,1998,PG-13,"Action, Adventure, Sci-Fi",Two British Agents team up to stop Sir August ...,Jeremiah S. Chechik,143.0,225000000.0,623357910.0,1515100000.0
3,118661,The Avengers,The Avengers,1998,PG-13,"Action, Adventure, Sci-Fi",Two British Agents team up to stop Sir August ...,Jeremiah S. Chechik,89.0,60000000.0,23385416.0,48585420.0
4,118661,The Avengers,The Avengers,1998,PG-13,"Action, Adventure, Sci-Fi",Two British Agents team up to stop Sir August ...,Jeremiah S. Chechik,143.0,225000000.0,623357910.0,1515100000.0


In [15]:
# (rows, columns) after the merge; compare the row count with the base table to spot join fan-out
merged_df.shape

(5736, 12)

In [16]:
# number of merged rows whose imdbid repeats an earlier row
merged_df['imdbid'].duplicated().sum()

2878

In [17]:
# drop duplicate rows based on 'imdbid', keeping exactly one row per movie.
#
# duplicates of the same imdbid are not identical: they can carry different
# runtime/budget/gross values. rows without a production budget are dropped in
# a later cell, so blindly keeping the first duplicate can lose a movie whose
# budget is only present on a later duplicate. prefer a duplicate that has a
# budget; among equally good candidates the original first row still wins.

# row labels ordered so that rows with a budget come first (False sorts before
# True); mergesort is stable, so the original order is preserved within each group
budget_first = merged_df['production_budget'].isna().sort_values(kind='mergesort').index

df = (
    merged_df.loc[budget_first]
    .drop_duplicates(subset=['imdbid'], keep='first')
    .sort_index()  # restore the original row order
)

In [18]:
# (rows, columns) after de-duplicating on imdbid
df.shape

(2858, 12)

In [19]:
# how many movies have no production budget
df['production_budget'].isna().sum()

1620

In [20]:
# drop rows that lack either figure the financial features are built from:
# production budget or worldwide gross. a missing gross would otherwise compare
# as False (financial_success = 0) and give a NaN ROI instead of being excluded.
# the explicit copy makes the filtered frame independent, so the column
# assignments in later cells do not write to a slice of the previous frame
# (some pandas versions emit SettingWithCopyWarning for that).
df = df.dropna(subset=['production_budget', 'worldwide_gross']).copy()

In [21]:
# (rows, columns) remaining after dropping rows without a production budget
df.shape

(1238, 12)

In [22]:
# new feature 'financial_success': 1 when worldwide gross is strictly greater
# than twice the production budget, otherwise 0
success_threshold = df['production_budget'].mul(2)
df['financial_success'] = df['worldwide_gross'].gt(success_threshold).astype(int)

In [23]:
# new feature 'ROI': worldwide gross expressed as a multiple of the production budget
# (gross / budget, so 1.0 means the gross equals the budget)
df['ROI'] = df['worldwide_gross'].div(df['production_budget'])

In [24]:
# list the distinct age_rating labels to see which ones need cleaning
df['age_rating'].unique()

array(['PG-13', 'R', 'PG', nan, 'G', 'TV-MA', '18+', 'Unrated',
       'Not Rated', 'Passed', 'U', 'UA', 'GP', 'Approved', 'A', '13+'],
      dtype=object)

In [25]:
# show the rows whose age_rating is one of the labels treated as "no usable rating"
age_ratings_missing = ['Unrated', 'Not Rated', 'Passed', 'Approved']
df.loc[df['age_rating'].isin(age_ratings_missing)]

Unnamed: 0,imdbid,title_x,title_y,year,age_rating,genre,description,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross,financial_success,ROI
762,77402,Dawn of the Dead,Dawn of the Dead,1978,Unrated,"Horror, Thriller","During an escalating zombie epidemic, two Phil...",George A. Romero,101.0,28000000.0,58990765.0,103452900.0,1,3.694746
764,88993,Day of the Dead,Day of the Dead,1985,Not Rated,"Horror, Thriller","As the world is overrun by zombies, a group of...",George A. Romero,101.0,3500000.0,5804262.0,34004260.0,1,9.715503
976,45793,From Here to Eternity,From Here to Eternity,1953,Passed,"Drama, Romance, War","At a U.S. Army base in 1941 Hawaii, a private ...",Fred Zinnemann,118.0,1650000.0,30500000.0,30500000.0,1,18.484848
1297,25316,It Happened One Night,It Happened One Night,1934,Passed,"Comedy, Romance",A renegade reporter trailing a young runaway h...,Frank Capra,105.0,325000.0,2500000.0,2500000.0,1,7.692308
2641,47437,Sabrina,Sabrina,1954,Passed,"Comedy, Drama, Romance",A playboy becomes interested in the daughter o...,Billy Wilder,127.0,58000000.0,53458319.0,87100000.0,0,1.501724
2799,16220,The Phantom of the Opera,The Phantom of the Opera,1925,Passed,Horror,"A mad, disfigured composer seeks love with a l...","Rupert Julian, \nLon Chaney, \nErnst Laemmle, ...",143.0,55000000.0,51225796.0,153770100.0,1,2.795819
2973,2321549,The Babadook,The Babadook,2014,Not Rated,"Horror, Mystery",A single mother and her child fall into a deep...,Jennifer Kent,94.0,2000000.0,950792.0,7482387.0,1,3.741194
3075,53604,The Apartment,The Apartment,1960,Approved,"Comedy, Drama, Romance",A Manhattan insurance clerk tries to rise in h...,Billy Wilder,125.0,3000000.0,18600000.0,24600000.0,1,8.199999
3096,38348,Beauty and the Beast,Beauty and the Beast,1946,Not Rated,"Drama, Fantasy, Romance",A beautiful young woman takes her father's pla...,"Jean Cocteau, \nRené Clément",129.0,160000000.0,504014165.0,1268697000.0,1,7.929359
3155,34587,Cat People,Cat People,1942,Not Rated,"Fantasy, Horror, Thriller",An American man marries a Serbian immigrant wh...,Jacques Tourneur,118.0,12500000.0,7000000.0,21000000.0,0,1.68


In [26]:
# show the rows that have no age_rating at all
df.loc[df['age_rating'].isna()]

Unnamed: 0,imdbid,title_x,title_y,year,age_rating,genre,description,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross,financial_success,ROI
168,32976,Rebecca,Rebecca,1940.0,,"Drama, Film-Noir, Mystery",A self-conscious woman juggles adjusting to he...,Alfred Hitchcock,130.0,1288000.0,6000000.0,6002370.0,1,4.660225
299,41113,All the King's Men,All the King's Men,1949.0,,"Drama, Film-Noir","The rise and fall of a corrupt politician, who...",Robert Rossen,128.0,55000000.0,7221458.0,9521458.0,0,0.173117
2807,164167,Sidewalks of New York,Sidewalks of New York,1923.0,,"Drama, Sport",Add a Plot,Lester Park,108.0,1000000.0,2402459.0,3575308.0,1,3.575308
2919,2901014,A Beautiful Day in the Neighborhood - IMDb,A Beautiful Day in the Neighborhood,,,Biography,A look at the life and early work of legendary...,Marielle Heller,109.0,45000000.0,61696436.0,68590003.0,0,1.524222
3500,37884,The Lost Weekend,The Lost Weekend,1945.0,,"Drama, Film-Noir",The desperate life of a chronic alcoholic is f...,Billy Wilder,101.0,1250000.0,11000000.0,11000000.0,1,8.8
3620,1729217,Playback,Playback,1962.0,,"Crime, Drama, Thriller",A young policeman becomes involved with a glam...,Quentin Lawrence,98.0,1500000.0,264.0,54945.0,0,0.03663
3813,58648,The Thin Red Line,The Thin Red Line,1964.0,,"Drama, War","In Guadalcanal during World War II, a private ...",Andrew Marton,170.0,52000000.0,36400491.0,97709034.0,0,1.87902


In [27]:
# update missing age ratings
#
# manual corrections, keyed by imdbid. kept as one lookup table so a correction
# is added or changed in a single place instead of one .loc statement per movie.
age_rating_fixes = {
    77402: 'X',
    88993: 'X',
    45793: '13+',
    25316: 'PG',
    47437: 'PG',
    16220: 'PG',
    2321549: 'PG-13',
    53604: 'PG-13',
    38348: 'PG',
    34587: 'PG-13',
    61811: 'PG-13',
    56172: 'PG-13',
    54135: 'PG-13',
    47296: 'PG-13',
    63522: 'M',
    31381: 'PG',
    24216: 'PG',
    48281: 'PG',
    5074352: 'PG-13',
    49513: 'PG-13',
    55824: 'X',
    63350: 'X',
    57115: 'PG',
    32976: 'PG-13',
    41113: 'PG-13',
    164167: 'G',
    2901014: 'PG',
    37884: 'PG-13',
    1729217: 'PG-13',
    58648: 'R',
}

# look up a correction for every row (NaN where the imdbid has none) and fall
# back to the existing rating; ids in the table that are absent from df have no effect
df['age_rating'] = df['imdbid'].map(age_rating_fixes).fillna(df['age_rating'])

# fill in the year for this title as well
df.loc[df['imdbid'] == 2901014, 'year'] = 2019

In [28]:
# map numbers to the respective age ratings

# age rating label -> numerical value used as a feature
age_rating_mapping = {
    'G': 0,
    'PG': 6,
    'PG-13': 13,
    'R': 17,
    'NC-17': 18,
    'M': 17,
    'X': 17,
    'TV-MA': 17,
    '18+': 18,
    'U': 0,
    'UA': 13,
    'GP': 6,
    'A': 18,
    '13+': 13
}

# create new column 'age_rating_number' based on mapping
df['age_rating_number'] = df['age_rating'].map(age_rating_mapping)

# Series.map yields NaN for any label that is not a key of the dict, so a new or
# uncorrected rating would silently become a missing feature value; surface it here
unmapped_ratings = df.loc[df['age_rating_number'].isna(), 'age_rating'].unique()
assert len(unmapped_ratings) == 0, (
    f'age ratings without a numeric mapping: {list(unmapped_ratings)}'
)

In [29]:
# missing-value count per column, to confirm nothing is left unfilled
df.isna().sum()

imdbid               0
title_x              0
title_y              0
year                 0
age_rating           0
genre                0
description          0
director             0
runtime_minutes      0
production_budget    0
domestic_gross       0
worldwide_gross      0
financial_success    0
ROI                  0
age_rating_number    0
dtype: int64

In [30]:
# one-hot encode the genre column

# split each genre string on ', ' into 0/1 indicator columns
genre_dummies = df['genre'].str.get_dummies(sep=', ')
# label every indicator 'genre_<lower-cased genre name>'
genre_dummies.columns = ['genre_' + name.lower() for name in genre_dummies.columns]
genre_dummies = genre_dummies.astype(int)

# append the indicator columns to the right of the existing columns
df = pd.concat([df, genre_dummies], axis=1)

In [31]:
# list the columns to verify the genre_* indicator columns were appended
df.columns

Index(['imdbid', 'title_x', 'title_y', 'year', 'age_rating', 'genre',
       'description', 'director', 'runtime_minutes', 'production_budget',
       'domestic_gross', 'worldwide_gross', 'financial_success', 'ROI',
       'age_rating_number', 'genre_action', 'genre_adventure',
       'genre_animation', 'genre_biography', 'genre_comedy', 'genre_crime',
       'genre_drama', 'genre_family', 'genre_fantasy', 'genre_film-noir',
       'genre_history', 'genre_horror', 'genre_music', 'genre_musical',
       'genre_mystery', 'genre_romance', 'genre_sci-fi', 'genre_sport',
       'genre_thriller', 'genre_war', 'genre_western'],
      dtype='object')

In [32]:
# keep a single title column: discard title_y and strip the merge suffix from title_x
df = df.drop(columns=['title_y']).rename(columns={'title_x': 'title'})

In [33]:
# write the final feature table to CSV, leaving out the row index
output_file = 'data/movie_metadata_final.csv'
df.to_csv(path_or_buf=output_file, index=False)