# Data Pre-Processing

In [None]:
import pandas as pd
from pathlib import Path

In [39]:
# Input and output files live side by side in the same data directory,
# addressed relative to the notebook's working directory.
data_dir = Path("../data")
data_path = data_dir / "raw.jsonl"
processed_data_path = data_dir / "processed.csv"

# lines=True: the raw file is read as one JSON object per line (JSON Lines).
df = pd.read_json(path_or_buf=data_path, lines=True)

In [40]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,movie_id,title,release_date,budget,revenue_worldwide,runtime,genres,imdb_id,franchise,cast_popularity_mean,cast_popularity_max,director_popularity,original_language,imdb_rating,imdb_votes,mpaa_rating,domestic_box_office,rotten_tomatoes_score,metacritic_score,awards_text
0,8193,Napoleon Dynamite,2004-06-11,400000,46118097,95,[Comedy],tt0374900,False,1.358067,2.479,1.3126,en,7.0,251071.0,PG,44540956.0,72.0,64.0,10 wins & 23 nominations total
1,663558,New Gods: Nezha Reborn,2021-02-06,0,70000000,117,"[Animation, Fantasy, Action]",tt13269670,True,0.8655,1.3269,0.5921,zh,6.8,4451.0,TV-14,,80.0,,3 wins & 3 nominations total
2,614409,To All the Boys: Always and Forever,2021-02-12,0,0,115,"[Romance, Comedy, Drama]",tt10676012,True,3.196233,5.6727,0.2037,en,6.3,37385.0,TV-14,,79.0,65.0,2 wins & 2 nominations total
3,12,Finding Nemo,2003-05-30,94000000,940335536,100,"[Animation, Family]",tt0266543,True,1.427567,1.8011,1.2339,en,8.2,1185437.0,G,380843261.0,99.0,90.0,Won 1 Oscar. 49 wins & 63 nominations total
4,16,Dancer in the Dark,2000-09-01,12500000,40061153,140,"[Drama, Crime]",tt0168629,False,1.903567,2.5962,2.2434,en,7.9,122419.0,R,4184036.0,69.0,63.0,Nominated for 1 Oscar. 34 wins & 48 nomination...


In [41]:
# Display the column labels of the loaded frame.
df.columns

Index(['movie_id', 'title', 'release_date', 'budget', 'revenue_worldwide',
       'runtime', 'genres', 'imdb_id', 'franchise', 'cast_popularity_mean',
       'cast_popularity_max', 'director_popularity', 'original_language',
       'imdb_rating', 'imdb_votes', 'mpaa_rating', 'domestic_box_office',
       'rotten_tomatoes_score', 'metacritic_score', 'awards_text'],
      dtype='object')

## Converting Numerical Values From Strings

In [None]:
# Columns that the next cell passes through pd.to_numeric.
numerical_columns = [
    "budget", "revenue_worldwide", "imdb_rating",
    "cast_popularity_mean", "cast_popularity_max", "director_popularity",
]

In [43]:
# Coerce every listed column to a numeric dtype in one pass. With
# errors="coerce", values that cannot be parsed become NaN instead of raising.
df = df.assign(
    **{
        name: pd.to_numeric(df[name], errors="coerce")
        for name in numerical_columns
    }
)

## Dropping Records With Missing Values

In [44]:
# Keep only the rows that have a value (non-NaN) in every column listed below.
# Each column appears once: repeating a name in `subset` does not change the
# result, it only obscures which columns are actually required.
#
# `df` is rebound instead of using inplace=True; the resulting frame is the
# same, but the statement reads as a plain "old frame -> new frame" step.
#
# NOTE(review): rows whose budget or revenue_worldwide is 0 (two such rows are
# visible in the df.head() preview) are not NaN and therefore survive this
# step -- confirm whether 0 is a placeholder for "unknown" in the raw data.
df = df.dropna(
    subset=[
        "budget",
        "revenue_worldwide",
        "imdb_rating",
        "cast_popularity_mean",
        "cast_popularity_max",
        "director_popularity",
    ]
)

## Normalizing Values

In [45]:
# Rescale imdb_rating by dividing by 10 (a previewed value of 7.0 becomes 0.70).
# NOTE: this overwrites the column, so running the cell a second time without
# reloading the data divides by 10 again.
df["imdb_rating"] = df["imdb_rating"].div(10)

# Standardise the columns below to z-scores: subtract the column mean, then
# divide by the column standard deviation (pandas' default, ddof=1).
for name in ("cast_popularity_mean", "budget"):
    column = df[name]
    df[name] = column.sub(column.mean()).div(column.std())

## Extracting Release Month and Season

In [46]:
# Parse release_date into pandas datetimes so the .dt accessor can be used on it.
df = df.assign(release_date=pd.to_datetime(df["release_date"]))

In [47]:
# Add a `month` column holding the calendar month number of each release date.
df = df.assign(month=df["release_date"].dt.month)

In [48]:
def assign_season(month):
    """Map a calendar month number to a coarse season label.

    Returns 'Summer' for months 6, 7 and 8, 'Winter' for months 12, 1 and 2,
    and 'Other' for every other value.
    """
    summer_months = (6, 7, 8)
    winter_months = (12, 1, 2)

    if month in summer_months:
        season = 'Summer'
    elif month in winter_months:
        season = 'Winter'
    else:
        season = 'Other'
    return season

In [49]:
# Label every row with its season by passing each month through assign_season.
df["season"] = df["month"].map(assign_season)

## Rating Scores

In [50]:
# Put both scores on the same 0-1 scale.
#
# critics_score: row-wise mean of the two critic scores. The previewed values
# (e.g. 72.0 and 64.0) indicate a 0-100 scale, so the mean is divided by 100.
# Dividing by 10 would leave it on 0-10, ten times larger than audience_score.
# mean(axis=1) skips NaN by default, so a row with only one of the two scores
# uses that score alone, and a row with neither gets NaN.
df['critics_score'] = df[['rotten_tomatoes_score', 'metacritic_score']].mean(axis=1) / 100

# audience_score: imdb_rating was already divided by 10 in the
# "Normalizing Values" cell, so it is copied as-is.
df['audience_score'] = df['imdb_rating']

## Saving the Processed Data

In [51]:
# Write the processed frame to disk. index=True is the pandas default: the row
# index is written as the first, unnamed column of the CSV.
# NOTE(review): consider index=False if downstream readers do not need it.
df.to_csv(processed_data_path, index=True)

---