In [99]:
import json
import pandas as pd

# Directory holding the raw TMDB dumps, relative to the notebook's working directory.
RAW_DATA_DIR = "../data/raw"


def load_json(filename):
    """Parse one JSON file located in ``RAW_DATA_DIR`` and return its content."""
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere (e.g. cp1252 on Windows).
    with open(f"{RAW_DATA_DIR}/{filename}", encoding="utf-8") as f:
        return json.load(f)


def collect_results(pages):
    """Flatten the ``"results"`` list of every page into a single list.

    ``pages`` is a mapping whose values are dicts; a page without a
    ``"results"`` key contributes nothing. The keys are ignored
    (presumably page numbers -- confirm against the download script).
    """
    return [movie for page in pages.values() for movie in page.get("results", [])]


# Raw payloads, one per source file.
pop_data = load_json("tmdb_popular.json")
topr_data = load_json("tmdb_top_rated.json")
upc_data = load_json("tmdb_upcoming.json")
genres_data = load_json("genres.json")

# One flat list of movie records per listing.
pop_movies = collect_results(pop_data)
topr_movies = collect_results(topr_data)
upc_movies = collect_results(upc_data)

In [100]:
# One DataFrame per movie listing; each record (dict) becomes a row.
df_pop, df_topr, df_upc = (
    pd.DataFrame(records) for records in (pop_movies, topr_movies, upc_movies)
)

# The genre lookup table lives under the "genres" key of its payload.
df_genres = pd.DataFrame(genres_data["genres"])

In [101]:
def optimize_df(df, parse_dates=None, verbose=True):
    """Reduce a DataFrame's memory footprint by narrowing column dtypes.

    The frame is modified **in place** and also returned, so both
    ``optimize_df(df)`` and ``df = optimize_df(df)`` work.

    Conversion rules, applied column by column:

    * ``int64`` / ``int32``: cast to the smallest of ``int8``, ``int16``,
      ``int32`` that can hold the column's min and max (left unchanged if
      none fits).
    * ``float64``: cast to ``float32`` (lossy beyond float32 precision).
    * ``object``:
        - columns containing ``list`` / ``dict`` / ``set`` values are left
          untouched (such values are unhashable, so they can be neither
          counted with ``nunique`` nor stored as categories);
        - columns named in ``parse_dates`` are converted with
          ``pd.to_datetime(..., errors="coerce")``; unparseable values
          become ``NaT``;
        - any other column becomes ``category`` when fewer than half of its
          values are distinct, otherwise ``string``.
    * Every other dtype (bool, datetime, category, ...) is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize; mutated in place.
    parse_dates : str or iterable of str, optional
        Name(s) of object columns to convert to datetime. A single column
        may be given as a plain string.
    verbose : bool, default True
        Print the memory usage before/after and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    import numpy as np
    import pandas as pd

    # Normalize to a set of exact column names. Testing `col in parse_dates`
    # against a bare string would be a *substring* test, so with
    # parse_dates="release_date" a column called "date" would match too.
    if parse_dates is None:
        date_cols = set()
    elif isinstance(parse_dates, str):
        date_cols = {parse_dates}
    else:
        date_cols = set(parse_dates)

    start_mem = df.memory_usage(deep=True).sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type in ["int64", "int32"]:
            # Compute the bounds once; on an empty column they are NaN, every
            # comparison below is False and the column is left unchanged.
            col_min, col_max = df[col].min(), df[col].max()
            for candidate in ("int8", "int16", "int32"):
                limits = np.iinfo(candidate)
                if col_min >= limits.min and col_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

        elif col_type == "float64":
            df[col] = df[col].astype("float32")

        elif col_type == "object":
            # Skip columns holding unhashable containers (like genre_ids).
            # Only object columns can hold them, so the per-element scan is
            # not needed for any other dtype.
            if df[col].apply(lambda x: isinstance(x, (list, dict, set))).any():
                continue

            if col in date_cols:
                df[col] = pd.to_datetime(df[col], errors="coerce")
            else:
                num_unique = df[col].nunique()
                num_total = len(df[col])
                # The `num_total > 0` guard avoids a ZeroDivisionError on an
                # empty frame; an empty column falls through to "string".
                if num_total > 0 and num_unique / num_total < 0.5:
                    df[col] = df[col].astype("category")
                else:
                    df[col] = df[col].astype("string")

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        # Guard against a zero starting size so the report cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            f"🔧 Memory usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB "
            f"({reduction:.1f}% reduction)"
        )

    return df

In [102]:
# Narrow the dtypes of the genre lookup table (no date columns to parse).
df_genres = optimize_df(df=df_genres)

🔧 Memory usage reduced from 0.00 MB to 0.00 MB (7.6% reduction)


In [103]:
# Inspect the column dtypes of the genres table after optimize_df.
df_genres.dtypes

id               int16
name    string[python]
dtype: object

In [104]:
# Name the date column in a list so it is matched as an exact column name;
# a bare string is open to substring matching when tested with `in`.
df_pop = optimize_df(df_pop, parse_dates=["release_date"])

🔧 Memory usage reduced from 9.17 MB to 8.50 MB (7.3% reduction)


In [105]:
# Inspect the column dtypes of the popular-movies table after optimize_df.
df_pop.dtypes

adult                          bool
backdrop_path        string[python]
genre_ids                    object
id                            int32
original_language          category
original_title       string[python]
overview             string[python]
popularity                  float32
poster_path          string[python]
release_date         datetime64[ns]
title                string[python]
video                          bool
vote_average                float32
vote_count                    int32
dtype: object

In [106]:
# Name the date column in a list so it is matched as an exact column name;
# a bare string is open to substring matching when tested with `in`.
df_upc = optimize_df(df_upc, parse_dates=["release_date"])

🔧 Memory usage reduced from 0.97 MB to 0.94 MB (2.8% reduction)


In [107]:
# Name the date column in a list so it is matched as an exact column name;
# a bare string is open to substring matching when tested with `in`.
df_topr = optimize_df(df_topr, parse_dates=["release_date"])

🔧 Memory usage reduced from 9.09 MB to 8.34 MB (8.3% reduction)
