In [4]:
import os
import pandas as pd
from sklearn.preprocessing import RobustScaler

#### 9.1. Load Feature-Selected Datasets

We load the datasets from step 8, which contain only the top features selected based on importance analysis.  
At this point, the datasets are reduced in dimensionality and ready for scaling and modeling.

In [5]:
# Read the three feature-selected splits written by step 8 (train, test, val order).
df_train, df_test, df_val = map(
    pd.read_parquet,
    (
        "./data/8/df_train.parquet",
        "./data/8/df_test.parquet",
        "./data/8/df_val.parquet",
    ),
)

#### 9.2. Separate Features and Target

We split each dataset into:
- **X**: the feature matrix containing predictors  
- **y**: the target column `isFraud`  

This separation is required before applying scaling and feeding the data into models.

In [6]:
# Every column of the training frame except the target is a predictor; the same
# column list is applied to val/test so all three feature matrices share one layout.
feature_cols = df_train.columns.drop("isFraud")

X_train = df_train[feature_cols]
y_train = df_train["isFraud"]

X_val = df_val[feature_cols]
y_val = df_val["isFraud"]

X_test = df_test[feature_cols]
y_test = df_test["isFraud"]

#### 9.3. Scale Features Using RobustScaler

We apply `RobustScaler` to normalize the features:
- Robust to outliers, which is important for financial datasets with high-value transactions
- Scales the data using median and interquartile range instead of mean and standard deviation

We fit the scaler on the training set only and reuse that fitted scaler to transform the validation and test sets, so no statistics from held-out data leak into preprocessing.

In [7]:
# Fit the scaler on the training features only, then reuse the fitted scaler for
# val/test. `feature_cols` is selected explicitly so this cell can be re-run even
# after `isFraud` has been attached to the X_* frames further down.
scaler = RobustScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train[feature_cols]),
    columns=feature_cols,
    index=X_train.index,  # keep row labels so the scaled frame stays aligned with y_train
)
X_val_scaled = pd.DataFrame(
    scaler.transform(X_val[feature_cols]),
    columns=feature_cols,
    index=X_val.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test[feature_cols]),
    columns=feature_cols,
    index=X_test.index,
)

# Attach the target via `assign` (returns a new frame) instead of writing into a
# frame that was sliced out of df_*; this avoids SettingWithCopyWarning.
# NOTE(review): this attaches the target to the *unscaled* X_* frames; the
# X_*_scaled frames built above do not carry `isFraud`.
X_train = X_train.assign(isFraud=y_train)
X_test = X_test.assign(isFraud=y_test)
X_val = X_val.assign(isFraud=y_val)

#### 9.4. Add Target Column Back and Save Scaled Datasets

After scaling, we add the `isFraud` column back to each scaled dataset so that every split is saved as a complete DataFrame.  
Finally, the scaled datasets are saved into `./data/9/` for consistent use in downstream modeling.

In [8]:
os.makedirs("./data/9/", exist_ok=True)

# Write the *scaled* feature frames. Saving X_train / X_val / X_test here would
# persist the unscaled features and silently discard the RobustScaler output.
# The target is attached positionally (`to_numpy()`): the scaled frames are rebuilt
# from the scaler's output array, so their index is not guaranteed to match the
# target Series' index, and an index-aligned assignment could produce NaN labels.
# Row order is unchanged by the scaler, so positional attachment is correct.
assert len(X_train_scaled) == len(y_train)
assert len(X_val_scaled) == len(y_val)
assert len(X_test_scaled) == len(y_test)

X_train_scaled.assign(isFraud=y_train.to_numpy()).to_parquet("./data/9/df_train.parquet", index=False)
X_val_scaled.assign(isFraud=y_val.to_numpy()).to_parquet("./data/9/df_val.parquet", index=False)
X_test_scaled.assign(isFraud=y_test.to_numpy()).to_parquet("./data/9/df_test.parquet", index=False)