In [1]:
import os
import pandas as pd

#### 8.1. Load Latest Feature-Selected Datasets

We load the datasets from the previous step (step 7), where model-based feature importances were calculated.  
These datasets already contain only relevant features selected based on the ExtraTreesClassifier analysis.

In [2]:
# Read the step-7 parquet files; the tuple order (train, test, val) matches
# the unpacking targets on the left-hand side.
df_train, df_test, df_val = map(
    pd.read_parquet,
    (
        "./data/7/df_train.parquet",
        "./data/7/df_test.parquet",
        "./data/7/df_val.parquet",
    ),
)

#### 8.2. Select Top Features for Modeling

We manually select a subset of features based on their importance scores and relevance to fraud detection.  
This step reduces dimensionality and focuses the model on the most informative predictors.

In [3]:
# Columns kept for modeling, listed in the order they will appear in the
# reduced frames. "isFraud" is retained alongside the predictors
# (presumably the target label -- confirm against the modeling step).
selected_columns = [
    "step",
    "hour",
    "oldbalanceOrg",
    "amount_over_total",
    "amount_over_old",
    "day",
    "oldbalanceDest",
    "newbalanceOrig",
    "abs_delta_sender",
    "abs_balance_sender_error",
    "amount_over_new",
    "delta_sender",
    "balance_sender_error",
    "isFraud",
]

# Apply the identical column subset to every split; a missing column raises
# KeyError from pandas, exactly as with three separate indexing statements.
df_train, df_test, df_val = (
    split[selected_columns] for split in (df_train, df_test, df_val)
)

#### 8.3. Save Reduced Datasets

We save the train, validation, and test sets containing only the selected top features into a new directory (`./data/8/`).  
This ensures that all downstream modeling uses the same curated set of important features.

In [4]:
# Ensure the step-8 output directory exists before writing anything.
os.makedirs("./data/8/", exist_ok=True)

# Write each reduced split (train, then val, then test) without the index.
for frame, destination in (
    (df_train, "./data/8/df_train.parquet"),
    (df_val, "./data/8/df_val.parquet"),
    (df_test, "./data/8/df_test.parquet"),
):
    frame.to_parquet(destination, index=False)