In [2]:
import pandas as pd

# Load the raw movie metadata table; the path is relative to the working directory.
df = pd.read_csv('data/movie_metadata/movie_meta_data.csv')

In [2]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(2858, 25)

In [3]:
# List the column labels; the raw names contain spaces (e.g. 'imdb user rating').
df.columns

Index(['imdbid', 'title', 'akas', 'year', 'metascore', 'imdb user rating',
       'number of imdb user votes', 'awards', 'opening weekend', 'producers',
       'budget', 'script department', 'production companies', 'writers',
       'directors', 'casting directors', 'cast', 'countries', 'age restrict',
       'plot', 'plot outline', 'keywords', 'genres', 'taglines', 'synopsis'],
      dtype='object')

In [4]:
# Per-column non-null counts and dtypes, to see which fields are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   imdbid                     2858 non-null   int64 
 1   title                      2858 non-null   object
 2   akas                       2652 non-null   object
 3   year                       2858 non-null   int64 
 4   metascore                  2858 non-null   int64 
 5   imdb user rating           2858 non-null   int64 
 6   number of imdb user votes  2858 non-null   int64 
 7   awards                     2243 non-null   object
 8   opening weekend            1739 non-null   object
 9   producers                  2640 non-null   object
 10  budget                     1624 non-null   object
 11  script department          2220 non-null   object
 12  production companies       2682 non-null   object
 13  writers                    2696 non-null   object
 14  director

In [5]:
# Count rows whose 'imdbid' repeats an earlier row (first occurrences are not counted).
df['imdbid'].duplicated().sum()

0

In [4]:
# Number of rows with no 'age restrict' value.
df['age restrict'].isnull().sum()

332

In [6]:
# Swap every space in the column labels for an underscore.
df.columns = [label.replace(' ', '_') for label in df.columns]

In [7]:
# Discard every metadata column except 'imdbid' and 'title'.
columns_to_drop = [
    'akas',
    'metascore',
    'imdb_user_rating',
    'number_of_imdb_user_votes',
    'awards',
    'producers',
    'script_department',
    'writers',
    'casting_directors',
    'plot',
    'plot_outline',
    'keywords',
    'taglines',
    'synopsis',
    'budget',
    'year',
    'opening_weekend',
    'production_companies',
    'directors',
    'cast',
    'countries',
    'age_restrict',
    'genres',
]

# Remove the listed labels along the column axis.
df = df.drop(columns_to_drop, axis=1)

In [8]:
df.head()

Unnamed: 0,imdbid,title
0,120770,A Night at the Roxbury
1,132512,At First Sight
2,118661,The Avengers
3,215545,Bamboozled
4,118715,The Big Lebowski


In [9]:
# Load the second metadata set and preview its first rows.
df2 = pd.read_csv('data/movie_metadata_set_01.csv')
df2.head()

Unnamed: 0,imdbid,title,year,age_rating,genre,description,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross
0,9114286,Black Panther: Wakanda Forever,2022,PG-13,"Action, Adventure, Drama",The people of Wakanda fight to protect their h...,Ryan Coogler,,,,
1,1630029,Avatar: The Way of Water,2022,PG-13,"Action, Adventure, Fantasy",Jake Sully lives with his newfound family form...,James Cameron,192.0,460000000.0,667830256.0,2265936000.0
2,5884796,Plane,2023,R,"Action, Thriller",A pilot finds himself caught in a war zone aft...,Jean-François Richet,,,,
3,6710474,Everything Everywhere All at Once,2022,R,"Action, Adventure, Comedy",A middle-aged Chinese immigrant is swept up in...,"Dan Kwan, \nDaniel Scheinert",,,,
4,5433140,Fast X,2023,,"Action, Crime, Mystery",Dom Toretto and his family are targeted by the...,Louis Leterrier,,,,


In [10]:
# Dimensions of the second metadata set as a (rows, columns) tuple.
df2.shape

(369726, 11)

In [11]:
# Count rows in the second set whose 'imdbid' repeats an earlier row.
df2['imdbid'].duplicated().sum()

126529

In [12]:
# Left-join the second metadata set onto df by 'imdbid'.
# Every row of df is kept; an id that occurs several times in df2 yields several
# output rows, and the 'title' column present in both frames comes back as
# title_x / title_y.
merged_df = df.merge(df2, how='left', on='imdbid')

In [13]:
# Preview the merged frame's first rows.
merged_df.head()

Unnamed: 0,imdbid,title_x,title_y,year,age_rating,genre,description,director,runtime_minutes,production_budget,domestic_gross,worldwide_gross
0,120770,A Night at the Roxbury,A Night at the Roxbury,1998,PG-13,"Comedy, Music, Romance",Two dim-witted brothers dream of owning their ...,John Fortenberry,82.0,17000000.0,30331165.0,30331160.0
1,132512,At First Sight,At First Sight,1999,PG-13,"Drama, Romance",A blind man has an operation to regain his sig...,Irwin Winkler,128.0,40000000.0,22365133.0,22365130.0
2,118661,The Avengers,The Avengers,1998,PG-13,"Action, Adventure, Sci-Fi",Two British Agents team up to stop Sir August ...,Jeremiah S. Chechik,143.0,225000000.0,623357910.0,1515100000.0
3,118661,The Avengers,The Avengers,1998,PG-13,"Action, Adventure, Sci-Fi",Two British Agents team up to stop Sir August ...,Jeremiah S. Chechik,89.0,60000000.0,23385416.0,48585420.0
4,118661,The Avengers,The Avengers,1998,PG-13,"Action, Adventure, Sci-Fi",Two British Agents team up to stop Sir August ...,Jeremiah S. Chechik,143.0,225000000.0,623357910.0,1515100000.0


In [14]:
# Dimensions of the merged frame as a (rows, columns) tuple.
merged_df.shape

(5736, 12)

In [15]:
# Count merged rows whose 'imdbid' repeats an earlier row.
merged_df['imdbid'].duplicated().sum()

2878

In [16]:
# Reduce the merged frame to one row per 'imdbid', keeping the first occurrence.
# NOTE(review): rows sharing an imdbid can carry different runtime/budget/gross
# figures, so the first occurrence is not guaranteed to be the correct record
# for the title -- confirm how duplicates should be resolved.
# .copy() makes the result an independent frame, so adding columns to `df`
# afterwards does not raise pandas' SettingWithCopyWarning.
df = merged_df.drop_duplicates(subset=['imdbid'], keep='first').copy()

In [17]:
df.shape

(2858, 12)

In [None]:
# New feature 'financial_success': 1 when 'worldwide_gross' exceeds twice the
# 'production_budget', 0 when it does not, and <NA> when either figure is missing.
# A bare comparison evaluates NaN > x as False, which would label every film with
# unknown financials as a failure; those rows are masked out instead, and the
# column uses the nullable 'Int64' dtype so it can hold <NA>.
has_financials = df['worldwide_gross'].notna() & df['production_budget'].notna()
df['financial_success'] = (
    (df['worldwide_gross'] > df['production_budget'] * 2)
    .astype('Int64')
    .where(has_financials)
)

In [None]:
# New feature 'ROI': worldwide gross expressed as a multiple of the production
# budget (a gross-to-budget ratio, not (gross - budget) / budget).
# Budgets that are zero or negative are treated as missing so the division cannot
# produce inf; missing inputs still propagate as NaN.
valid_budget = df['production_budget'].where(df['production_budget'] > 0)
df['ROI'] = df['worldwide_gross'] / valid_budget

In [None]:
# One-hot encode the ', '-separated 'genre' strings into 'genre_<name>' columns.

# Build the 0/1 indicator columns with lower-cased, prefixed labels.
genre_dummies = (
    df['genre']
    .str.get_dummies(sep=', ')
    .rename(columns=lambda name: 'genre_' + name.lower())
    .astype(int)
)

# Drop indicator columns left over from an earlier run of this cell before
# concatenating, so re-running it does not create duplicate column labels.
df = pd.concat(
    [df.drop(columns=genre_dummies.columns, errors='ignore'), genre_dummies],
    axis=1,
)

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# List the current column labels.
df.columns

In [18]:
# Number of rows with no production budget.
df['production_budget'].isna().sum()

1620

In [None]:
# Number of rows with no genre value.
df['genre'].isna().sum()