# Data Pre-Processing

In [98]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

In [99]:
# Input (raw JSON Lines dump) and output (processed CSV) locations,
# both relative to the directory the notebook is run from.
data_path = Path("../data/raw.jsonl")
processed_data_path = Path("../data/processed.csv")

# lines=True makes pandas parse the file as one JSON record per line.
df = pd.read_json(path_or_buf=data_path, lines=True)

In [100]:
# Preview the first five records of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,movie_id,title,release_date,budget,revenue_worldwide,runtime,genres,imdb_id,franchise,cast_popularity_mean,cast_popularity_max,director_popularity,original_language,imdb_rating,imdb_votes,mpaa_rating,domestic_box_office,rotten_tomatoes_score,metacritic_score,awards_text
0,983044,The Arctic Convoy,2023-12-25,6600000,3637940,109,"[War, Drama]",tt27724113,False,1.1104,2.1157,0.756,no,6.5,4266.0,,2869.0,100.0,,3 wins & 6 nominations total
1,851976,Small World,2021-09-10,0,0,117,"[Crime, Action, Drama]",tt14319068,False,1.366133,2.1604,0.6415,pl,5.6,927.0,,,,,1 win
2,12,Finding Nemo,2003-05-30,94000000,940335536,100,"[Animation, Family]",tt0266543,True,2.125667,2.9015,2.0785,en,8.2,1185437.0,G,380843261.0,99.0,90.0,Won 1 Oscar. 49 wins & 63 nominations total
3,16,Dancer in the Dark,2000-09-01,12500000,40061153,140,"[Drama, Crime]",tt0168629,False,2.4853,3.2558,3.4113,en,7.9,122419.0,R,4184036.0,69.0,63.0,Nominated for 1 Oscar. 34 wins & 48 nomination...
4,17,The Dark,2005-09-28,0,6593579,87,"[Horror, Thriller, Mystery]",tt0411267,False,3.6373,6.2285,0.7014,en,5.3,11802.0,R,,40.0,,1 nomination total


In [101]:
# Show the column labels of the loaded frame as a quick schema check.
df.columns

Index(['movie_id', 'title', 'release_date', 'budget', 'revenue_worldwide',
       'runtime', 'genres', 'imdb_id', 'franchise', 'cast_popularity_mean',
       'cast_popularity_max', 'director_popularity', 'original_language',
       'imdb_rating', 'imdb_votes', 'mpaa_rating', 'domestic_box_office',
       'rotten_tomatoes_score', 'metacritic_score', 'awards_text'],
      dtype='object')

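## Counting and Removing Duplicates

**TODO:** No duplicate check is implemented in this section yet; records are currently passed through without deduplication.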

## Converting Numerical Values From Strings

In [102]:
# Columns treated as numeric features: they are coerced with pd.to_numeric
# and later rescaled together. Each column must appear exactly once so it is
# not converted or scaled redundantly.
numerical_columns = [
    "budget",
    "revenue_worldwide",
    "runtime",
    "imdb_rating",
    "cast_popularity_mean",
    "cast_popularity_max",
    "director_popularity",
    "imdb_votes",
    "domestic_box_office",
    "rotten_tomatoes_score",
    "metacritic_score",
]

In [103]:
# Build the numeric version of every listed column first, then write each one
# back. errors="coerce" turns any value that cannot be parsed into NaN
# instead of raising.
coerced_columns = {
    name: pd.to_numeric(df[name], errors="coerce") for name in numerical_columns
}
for name, numeric_values in coerced_columns.items():
    df[name] = numeric_values

## Dropping Records With Missing Key Values

In [104]:
# Columns that must be present (non-NaN) for a record to be kept.
# Each column is listed once; duplicates in `subset` add nothing.
required_columns = [
    "budget",
    "revenue_worldwide",
    "imdb_rating",
    "cast_popularity_mean",
    "cast_popularity_max",
    "director_popularity",
]

# Reassign rather than mutate in place so the step reads as a plain
# transformation and behaves the same when the cell is re-run.
# NOTE(review): only NaN values are dropped here; rows whose budget or
# revenue is 0 are kept — confirm that zeros are real values and not
# placeholders for "unknown".
df = df.dropna(subset=required_columns)

## Scaling Values to [0, 1] (Min-Max Normalization)

In [105]:
# Rescale every numeric feature column with min-max scaling. The scaler is
# fitted on the same frame it transforms, and the scaled values overwrite the
# original columns.
scaler = MinMaxScaler()
numeric_features = df[numerical_columns]
df[numerical_columns] = scaler.fit(numeric_features).transform(numeric_features)

# Preview the rescaled frame.
df.head()

Unnamed: 0,movie_id,title,release_date,budget,revenue_worldwide,runtime,genres,imdb_id,franchise,cast_popularity_mean,cast_popularity_max,director_popularity,original_language,imdb_rating,imdb_votes,mpaa_rating,domestic_box_office,rotten_tomatoes_score,metacritic_score,awards_text
0,983044,The Arctic Convoy,2023-12-25,0.013472,0.001759,0.186325,"[War, Drama]",tt27724113,False,0.057893,0.05408,0.045147,no,0.666667,0.001341,,3e-06,1.0,,3 wins & 6 nominations total
1,851976,Small World,2021-09-10,0.0,0.0,0.2,"[Crime, Action, Drama]",tt14319068,False,0.071943,0.055254,0.038178,pl,0.551282,0.000265,,,,,1 win
2,12,Finding Nemo,2003-05-30,0.191876,0.454659,0.17094,"[Animation, Family]",tt0266543,True,0.113672,0.074722,0.125648,en,0.884615,0.381986,G,0.406596,0.99,0.913043,Won 1 Oscar. 49 wins & 63 nominations total
3,16,Dancer in the Dark,2000-09-01,0.025515,0.01937,0.239316,"[Drama, Crime]",tt0168629,False,0.13343,0.084029,0.206776,en,0.846154,0.039417,R,0.004466,0.69,0.619565,Nominated for 1 Oscar. 34 wins & 48 nomination...
4,17,The Dark,2005-09-28,0.0,0.003188,0.148718,"[Horror, Thriller, Mystery]",tt0411267,False,0.196721,0.162118,0.041824,en,0.512821,0.003769,R,,0.4,,1 nomination total


## Extracting Release Month and Season

In [106]:
# Convert release_date to pandas datetimes so the .dt accessor can be used on it.
parsed_release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = parsed_release_dates

In [107]:
# Calendar month number taken from each parsed release date.
release_dates = df['release_date']
df['month'] = release_dates.dt.month

In [108]:
def assign_season(month):
    """Map a calendar month number to a coarse release-season label.

    Args:
        month: Month number to classify.

    Returns:
        'Summer' for months 6-8, 'Winter' for months 12, 1 and 2, and
        'Other' for every other value.
    """
    summer_months = (6, 7, 8)
    winter_months = (12, 1, 2)

    if month in summer_months:
        season = 'Summer'
    elif month in winter_months:
        season = 'Winter'
    else:
        season = 'Other'
    return season

In [109]:
# Label every record with its release season by mapping the month number
# through assign_season element by element.
df['season'] = df['month'].map(assign_season)

## Deriving Critics Score and Audience Score

In [110]:
# critics_score is the row-wise mean of the two critic columns; with the
# default skipna behaviour a row with only one of the two scores uses that
# score alone, and a row with neither stays NaN.
critic_columns = ['rotten_tomatoes_score', 'metacritic_score']
df['critics_score'] = df[critic_columns].mean(axis='columns')

# audience_score is a straight copy of the IMDb rating column.
df['audience_score'] = df['imdb_rating']

## Saving the Processed Data

In [111]:
# Write the processed frame to disk as CSV.
# NOTE(review): with the default settings the DataFrame index is written as
# an extra, unnamed first column — confirm downstream readers expect that.
df.to_csv(path_or_buf=processed_data_path)

---