In [11]:
import ast
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, StandardScaler

In [12]:
# Columns whose CSV cells hold Python-literal text (parsed below with
# ast.literal_eval rather than used as raw strings).
list_cols = ["genres", "privacy_types", "data_categories", "data_types"]

df = pd.read_csv("../data/data.csv")

# Parse every list column in one pass; all other columns are left untouched.
df = df.assign(
    **{name: df[name].apply(ast.literal_eval) for name in list_cols}
)

In [13]:
# Derived numeric features, appended to df in a single step:
#   log_price -> log(1 + price)
#   has_iap   -> has_in_app_purchases cast to int
df = df.assign(
    log_price=np.log1p(df["price"]),
    has_iap=df["has_in_app_purchases"].astype(int),
)

In [14]:
# Multi-hot encode the per-row genre lists: one indicator column per class
# learned by the binarizer, each named "genre_<class>".
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(df["genres"])

genre_column_names = [f"genre_{label}" for label in mlb_genres.classes_]
genres_df = pd.DataFrame(data=genres_encoded, columns=genre_column_names)

In [15]:
# One-hot encode the two single-valued categorical columns. The encoder is
# kept in `ohe` because it is pickled later alongside the other encoders.
ohe = OneHotEncoder(sparse_output=False)
categorical_features = df[["country_code", "content_rating"]]
ohe_features = ohe.fit_transform(categorical_features)
ohe_df = pd.DataFrame(
    ohe_features,
    columns=ohe.get_feature_names_out(["country_code", "content_rating"]),
    # Carry df's row labels: pd.concat(axis=1) aligns on the index, so a
    # fresh RangeIndex here would only line up while df is also 0..n-1.
    index=df.index,
)

# Assemble the feature table column-wise: numeric features, genre
# indicators, then one-hot categoricals. The encoded blocks are positional
# (row i of each block belongs to row i of df), so every part is given df's
# index before concatenating to make the label alignment a no-op.
metadata = pd.concat([
    df[["log_price", "has_iap"]],
    genres_df.set_axis(df.index, axis=0),
    ohe_df,
], axis=1)

# A label mismatch would make concat outer-join and add rows; fail loudly.
assert len(metadata) == len(df), "metadata rows are misaligned with df"

In [16]:
# Build the three multi-label target matrices. Each target gets its own
# binarizer so its learned classes can be saved and reused independently.

# Privacy types
mlb_privacy = MultiLabelBinarizer()
y_privacy = mlb_privacy.fit_transform(df["privacy_types"])

# Data categories
mlb_categories = MultiLabelBinarizer()
y_categories = mlb_categories.fit_transform(df["data_categories"])

# Data types
mlb_types = MultiLabelBinarizer()
y_types = mlb_types.fit_transform(df["data_types"])

In [17]:
# Persist the feature matrix, the label matrices and the fitted encoders
# under ../processed (created on first run, reused afterwards).
os.makedirs("../processed", exist_ok=True)

# Arrays are written with np.save as ../processed/<name>.npy.
arrays_to_save = {
    "X_metadata": metadata.to_numpy(),
    "y_privacy": y_privacy,
    "y_categories": y_categories,
    "y_types": y_types,
}
for name, array in arrays_to_save.items():
    np.save(f"../processed/{name}.npy", array)

# Save encoders
# Each fitted encoder is pickled to ../processed/<name>.pkl. `pickle` is
# imported in the notebook's import cell with the other dependencies.
encoders_to_save = {
    "mlb_privacy": mlb_privacy,
    "mlb_categories": mlb_categories,
    "mlb_types": mlb_types,
    "mlb_genres": mlb_genres,
    "ohe_meta": ohe,
}
for name, encoder in encoders_to_save.items():
    with open(f"../processed/{name}.pkl", "wb") as f:
        pickle.dump(encoder, f)

In [18]:
# Report the (rows, columns) shape of the feature table and of each label
# matrix, one line per artifact.
shape_report = (
    ("Metadata shape:", metadata),
    ("Privacy labels:", y_privacy),
    ("Category labels:", y_categories),
    ("Data types labels:", y_types),
)
for label, table in shape_report:
    print(label, table.shape)

Metadata shape: (10000, 62)
Privacy labels: (10000, 4)
Category labels: (10000, 16)
Data types labels: (10000, 34)
