In [None]:
'''
Download
[3] Movie Scripts Corpus Dataset
from
https://www.kaggle.com/datasets/gufukuro/movie-scripts-corpus
'''

In [None]:
import pandas as pd

# Load the raw movie metadata; the path is relative to the notebook's working directory.
df = pd.read_csv('data/movie_metadata/movie_meta_data.csv')

In [None]:
# (rows, columns) of the raw metadata frame
df.shape

In [None]:
# column labels as read from the CSV (before any renaming)
df.columns

In [None]:
# per-column dtype and non-null count
df.info()

In [None]:
# number of rows whose imdbid already appeared in an earlier row
df['imdbid'].duplicated().sum()

In [None]:
# make column labels identifier-friendly: every space becomes an underscore
df.columns = [label.replace(' ', '_') for label in df.columns]

In [None]:
# Remove the metadata columns that are not used in the rest of this notebook.
# Every label listed must exist in df, otherwise drop() raises KeyError.
columns_to_drop = [
    'akas',
    'metascore',
    'imdb_user_rating',
    'number_of_imdb_user_votes',
    'awards',
    'producers',
    'script_department',
    'writers',
    'casting_directors',
    'plot',
    'plot_outline',
    'keywords',
    'taglines',
    'synopsis',
    'budget',
    'year',
    'opening_weekend',
    'production_companies',
    'directors',
    'cast',
    'countries',
    'age_restrict',
    'genres',
]

df = df.drop(columns_to_drop, axis=1)

In [None]:
# preview the first rows of the columns that remain
df.head()

In [None]:
# Load the second metadata set (joined to df on 'imdbid' further down) and preview it.
df2 = pd.read_csv('data/movie_metadata_set_01.csv')
df2.head()

In [None]:
# (rows, columns) of the second metadata set
df2.shape

In [None]:
# repeated imdbid values in df2 would multiply rows in a merge on that key
df2['imdbid'].duplicated().sum()

In [None]:
# Left join: keep every row of df and attach the df2 columns that share its imdbid.
# Column names present in both frames get pandas' default '_x' / '_y' suffixes.
merged_df = df.merge(df2, how='left', on='imdbid')

In [None]:
# preview the joined frame
merged_df.head()

In [None]:
# (rows, columns) after the left join
merged_df.shape

In [None]:
# rows of the joined frame that repeat an imdbid seen in an earlier row
merged_df['imdbid'].duplicated().sum()

In [None]:
# keep one row per imdbid: the first occurrence survives, later repeats are filtered out
df = merged_df[~merged_df['imdbid'].duplicated(keep='first')]

In [None]:
# (rows, columns) after removing repeated imdbid rows
df.shape

In [None]:
# number of rows with no production_budget value
df['production_budget'].isna().sum()

In [None]:
# Drop rows that lack either financial figure. Both production_budget and
# worldwide_gross feed the derived columns computed later in this notebook;
# a missing gross would otherwise compare as False in
# `worldwide_gross > production_budget * 2` and be scored as a failure
# instead of as unknown.
# .copy() makes df an independent frame, so the column assignments that
# follow do not trigger SettingWithCopyWarning.
df = df.dropna(subset=['production_budget', 'worldwide_gross']).copy()

In [None]:
# (rows, columns) after dropping rows with missing values
df.shape

In [None]:
# Binary target: 1 when worldwide_gross is strictly greater than twice the
# production_budget, otherwise 0.
break_even_threshold = df['production_budget'].mul(2)
df['financial_success'] = df['worldwide_gross'].gt(break_even_threshold).astype(int)

In [None]:
# Return on investment: profit (gross minus budget) expressed as a fraction of the budget.
df['ROI'] = df['worldwide_gross'].sub(df['production_budget']).div(df['production_budget'])

In [None]:
# list the distinct labels that occur in the age_rating column
df['age_rating'].unique()

In [None]:
# Rows whose age_rating is one of these labels are treated as having no usable rating.
age_ratings_missing = ['Unrated', 'Not Rated', 'Passed', 'Approved']
df.loc[df['age_rating'].isin(age_ratings_missing)]

In [None]:
# rows where age_rating is null
df.loc[df['age_rating'].isna()]

In [None]:
# Manual corrections: one wrong title/imdbid/year and the age ratings of individual films.

# Repair the title first, then the imdbid. The order matters because the
# second statement selects the row by the title_x value the first one wrote.
beautiful_day_title = 'A Beautiful Day in the Neighborhood'
df.loc[df['title_y'] == beautiful_day_title, 'title_x'] = beautiful_day_title
df.loc[df['title_x'] == beautiful_day_title, 'imdbid'] = 3224458

# imdbid -> age rating to write for that film
age_rating_fixes = {
    77402: 'X',
    88993: 'X',
    45793: '13+',
    25316: 'PG',
    47437: 'PG',
    16220: 'PG',
    2321549: 'PG-13',
    53604: 'PG-13',
    38348: 'PG',
    34587: 'PG-13',
    61811: 'PG-13',
    56172: 'PG-13',
    54135: 'PG-13',
    47296: 'PG-13',
    63522: 'M',
    31381: 'PG',
    24216: 'PG',
    48281: 'PG',
    5074352: 'PG-13',
    49513: 'PG-13',
    55824: 'X',
    63350: 'X',
    57115: 'PG',
    32976: 'PG-13',
    41113: 'PG-13',
    164167: 'G',
    3224458: 'PG',
    37884: 'PG-13',
    1729217: 'PG-13',
    58648: 'R',
}
# An imdbid that is not present in df matches no row, so its entry is a no-op.
for movie_id, rating in age_rating_fixes.items():
    df.loc[df['imdbid'] == movie_id, 'age_rating'] = rating

df.loc[df['imdbid'] == 3224458, 'year'] = 2019

In [None]:
# Convert each age-rating label into a number.

# Labels grouped by the number they are mapped to.
# NOTE(review): 'M' was the early MPAA label later replaced by GP/PG — confirm
# that 17 (rather than 6) is the intended value.
labels_by_number = {
    0: ('G', 'U'),
    6: ('PG', 'GP'),
    13: ('PG-13', 'UA', '13+'),
    17: ('R', 'M', 'X', 'TV-MA'),
    18: ('NC-17', '18+', 'A'),
}

# flat lookup: label -> number
age_rating_mapping = {
    label: number
    for number, labels in labels_by_number.items()
    for label in labels
}

# Any label that is not a key of the lookup becomes NaN in the new column.
df['age_rating_number'] = df['age_rating'].map(age_rating_mapping)

In [None]:
# null count per column
df.isnull().sum()

In [None]:
# One-hot encode the 'genre' column, whose values are split on ', '.

# one 0/1 indicator column per distinct genre token
genre_dummies = df['genre'].str.get_dummies(sep=', ')

# prefix and lower-case the labels, e.g. 'Drama' -> 'genre_drama'
genre_dummies.columns = ['genre_' + name.lower() for name in genre_dummies.columns]
genre_dummies = genre_dummies.astype(int)

# append the indicator columns to the right of the existing ones (rows align on the index)
df = pd.concat([df, genre_dummies], axis=1)

In [None]:
# column labels after the genre indicator columns were appended
df.columns

In [None]:
# keep a single title column: discard title_y and give title_x the plain name 'title'
df = df.drop('title_y', axis=1).rename(columns={'title_x': 'title'})

In [None]:
# Write the prepared frame to disk; index=False leaves the row index out of the file.
output_file = 'data/01_movie_metadata.csv'
df.to_csv(path_or_buf=output_file, index=False)