In [None]:
'''
Download
[1] IMDb Movie Dataset: All Movies by Genre
from
https://www.kaggle.com/datasets/rajugc/imdb-movies-dataset-based-on-genre
and
[2] Ultimate Film Statistics Dataset with Production Budget and Domestic (US) and Worldwide Gross
from
https://www.kaggle.com/datasets/alessandrolobello/the-ultimate-film-statistics-dataset-for-ml
'''

In [None]:
# first df: IMDb Movie Dataset

# the dataset ships as 16 separate per-genre CSV files; stack them into one frame

import pandas as pd

# folder that holds the per-genre files
directory = 'data/IMDb Movie Dataset/'

# one CSV per genre
csv_files = [
    'action.csv',
    'adventure.csv',
    'animation.csv',
    'biography.csv',
    'crime.csv',
    'family.csv',
    'fantasy.csv',
    'film-noir.csv',
    'history.csv',
    'horror.csv',
    'mystery.csv',
    'romance.csv',
    'scifi.csv',
    'sports.csv',
    'thriller.csv',
    'war.csv',
]

# read every genre file, in list order
dataframes = [pd.read_csv(f'{directory}{name}') for name in csv_files]

# stack the frames and renumber the rows from 0
combined_df = pd.concat(dataframes, ignore_index=True)

In [None]:
# preview the first rows of the concatenated IMDb frame
combined_df.head()

In [None]:
# (rows, columns) of the concatenated IMDb frame
combined_df.shape

In [None]:
# trim the IMDb frame down to the columns that are carried forward

# columns that are discarded
columns_to_drop = [
    'director_id', 'star', 'star_id', 'votes',
    'runtime', 'rating', 'description', 'gross(in $)',
]

# new frame without those columns; combined_df itself is not modified
df = combined_df.drop(columns_to_drop, axis='columns')

In [None]:
# give the id, title and certificate columns their final names
column_renames = {
    'movie_id': 'imdbid',
    'movie_name': 'title',
    'certificate': 'age_rating',
}
df = df.rename(columns=column_renames)

In [None]:
# preview the frame after the drop and rename steps
df.head()

In [None]:
# column labels that remain after the drop and rename steps
df.columns

In [None]:
# flag 'imdbid' values that begin or end with a space character
imdbid_text = df['imdbid'].str
has_leading_or_trailing_space = imdbid_text.startswith(' ') | imdbid_text.endswith(' ')

# show the offending rows (an empty frame means the ids are clean)
print("Rows with leading or trailing spaces in 'imdbid':")
print(df.loc[has_leading_or_trailing_space])

In [None]:
# Convert 'imdbid' from strings such as 'tt0123456' to plain int64 values.

# strip surrounding whitespace and the leading 'tt' prefix (anchored, so only
# the prefix is removed), then parse; anything unparseable becomes NaN
imdbid_numeric = pd.to_numeric(
    df['imdbid'].str.strip().str.replace(r'^tt', '', regex=True),
    errors='coerce',
)

# int64 cannot represent NaN, so casting a column that still contains
# unparseable ids would raise; remove those rows first and report the count
unparseable = imdbid_numeric.isna()
if unparseable.any():
    print(f"Dropping {unparseable.sum()} rows with a missing or non-numeric 'imdbid'")

# .copy() so the column assignment below writes to an independent frame
df = df.loc[~unparseable].copy()

# convert to integer
df['imdbid'] = imdbid_numeric.loc[~unparseable].astype('int64')

In [None]:
# how many 'imdbid' values are missing
print(df['imdbid'].isna().sum())

In [None]:
# how many rows repeat an 'imdbid' that already appeared in an earlier row
print(df.duplicated(subset=['imdbid']).sum())

In [None]:
# keep a single row per 'imdbid' (the first one encountered, which is the default)
df = df.drop_duplicates(subset='imdbid')

In [None]:
# (rows, columns) after de-duplicating on 'imdbid'
df.shape

In [None]:
# per-column dtypes and non-null counts of the cleaned IMDb frame
df.info()

In [None]:
# second df: Ultimate_Film_Statistics_Dataset

# columns of the film-statistics file that are discarded
columns_to_drop = [
    'production_date',
    'genres',
    'director_name',
    'director_professions',
    'director_birthYear',
    'director_deathYear',
    'movie_averageRating',
    'movie_numerOfVotes',
    'approval_Index',
]

# load the file, discard those columns and give the remaining ones snake_case names
df_uf = (
    pd.read_csv('data/Ultimate_Film_Statistics_Dataset.csv')
    .drop(columns=columns_to_drop)
    .rename(columns={
        'movie_title': 'title',
        'Production budget $': 'production_budget',
        'Domestic gross $': 'domestic_gross',
        'Worldwide gross $': 'worldwide_gross',
    })
)

In [None]:
# preview the trimmed and renamed film-statistics frame
df_uf.head()

In [None]:
# column labels of the film-statistics frame after the drop and rename steps
df_uf.columns

In [None]:
# how many film-statistics rows repeat a title that already appeared in an earlier row
print(df_uf.duplicated(subset=['title']).sum())

In [None]:
# (rows, columns) of the film-statistics frame
df_uf.shape

In [None]:
# Left-join the film statistics (df_uf) onto the IMDb frame (df), matching on 'title'.
# NOTE(review): the join key is the title, not 'imdbid' — presumably because df_uf
# has no imdb id column; confirm. Different films that share a title will receive
# the same statistics.

# A title that occurs more than once in df_uf would make the left join emit one
# output row per match, duplicating IMDb rows (and their 'imdbid'). Keep only the
# first statistics row per title so every IMDb row appears exactly once.
film_stats = df_uf.drop_duplicates(subset='title', keep='first')

# validate='many_to_one' makes pandas raise if the right-hand titles are not unique
merged_df = pd.merge(df, film_stats, on='title', how='left', validate='many_to_one')

In [None]:
# continue working under the name df; this binds a second name to the same
# object (no copy), so later in-place changes to df also show up in merged_df
df = merged_df

In [None]:
# (rows, columns) of the merged frame
df.shape

In [None]:
# number of rows whose 'imdbid' repeats an earlier row; the IMDb frame was
# de-duplicated on 'imdbid' before the merge, so a non-zero count here means the
# title-based left join matched one IMDb row to several df_uf rows
df.duplicated(subset=['imdbid']).sum()

In [None]:
# column labels of the merged frame
df.columns

In [None]:
# update missing or wrong values

# Corrections keyed by numeric imdbid -> {column: corrected value}.
# 'imdbid' was cast to int64 in an earlier cell, so the keys must be integers:
# comparing the integer column against string literals such as '13400336'
# matches no row, and the corrections would silently never be applied.
manual_corrections = {
    13400336: {'year': 2023, 'runtime_minutes': 133},
    2531030: {'year': 1985, 'director': 'Barry Levinson'},
    4338664: {'year': 2001, 'director': 'Barry Levinson'},
}

for movie_id, corrected_fields in manual_corrections.items():
    # boolean mask selecting the row(s) of this movie
    is_movie = df['imdbid'] == movie_id
    for column, value in corrected_fields.items():
        df.loc[is_movie, column] = value

In [None]:
# rows that have no 'director' value
df[df['director'].isna()]

In [None]:
# preview the merged frame after the manual corrections cell
df.head()

In [None]:
# (rows, columns) of the merged frame
df.shape

In [None]:
# per-column dtypes and non-null counts of the merged frame
df.info()

In [None]:
# number of rows with a missing 'imdbid'
df.imdbid.isna().sum()

In [None]:
# convert imdbid to integer
# NOTE(review): astype('int64') raises if the column holds NaN, and the dropna
# on 'imdbid' only happens in a later cell — if missing ids are possible here,
# the dropna has to come first
df['imdbid'] = df['imdbid'].astype('int64')

In [None]:
# column labels of the merged frame
df.columns

In [None]:
# keep only the rows that have a value in both 'runtime_minutes' and 'imdbid'
required_columns = ['runtime_minutes', 'imdbid']
df = df.dropna(subset=required_columns)

# number of rows that survived the filter
count = df.shape[0]

print(f"Number of rows with values in both 'runtime_minutes' and 'imdbid': {count}")

In [None]:
# write the final frame to disk, leaving out the row index
output_file = 'data/movie_metadata_set_01.csv'
df.to_csv(path_or_buf=output_file, index=False)