# 02 - Data Preprocessing

Goal: create clean train/test splits with proper scaling while preventing data leakage. No resampling here; imbalance is handled later.

Steps:
- Load raw data
- Stratified train-test split (`random_state=42`)
- Scale numeric features with `StandardScaler` and one-hot encode categorical features with `OneHotEncoder` (both fit on train only)
- Save processed splits and the fitted preprocessor to `data/processed`

Outputs:
- X_train_scaled.csv, X_test_scaled.csv
- y_train.csv, y_test.csv
- preprocessor.joblib (the fitted `ColumnTransformer`)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from pathlib import Path
import joblib

# Paths are relative to the current working directory, so this cell expects to
# be run from the notebooks folder (one level below the project root).
RAW_PATH = Path('../data/raw/credit_card_fraud_dataset.csv')
PROCESSED_DIR = Path('../data/processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)


def _to_dense_frame(matrix, columns):
    """Return *matrix* as a DataFrame whose columns are labelled *columns*.

    The transformer output may be a SciPy sparse matrix (anything exposing
    ``toarray``); it is densified first so it can be written out as CSV.
    """
    dense = matrix.toarray() if hasattr(matrix, "toarray") else matrix
    return pd.DataFrame(dense, columns=columns)


# Load data
# Target column in this dataset is 'IsFraud'
data = pd.read_csv(RAW_PATH)
X = data.drop(columns=['IsFraud'])
y = data['IsFraud']

# Stratified split: keeps the class ratio of y the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Identify numeric and categorical columns.
# Every column must land in exactly one group: ColumnTransformer drops any
# column that is not listed (remainder="drop" is its default), so selecting
# only int64/float64/object would silently lose e.g. int32, bool or category
# columns. All numeric dtypes are scaled; everything else is one-hot encoded.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

# NOTE(review): a high-cardinality non-numeric column (an ID or a raw timestamp
# string, for instance) yields one one-hot column per distinct value, and the
# dense CSV export below grows accordingly -- confirm against the raw schema.

# Preprocessor: scale numeric, one-hot encode categorical (fit on train only)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        # handle_unknown="ignore": categories seen only in the test split are
        # encoded as all zeros instead of raising at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Fit on the training split only, then apply the fitted statistics to test,
# so nothing about the test data leaks into the scaler or the encoder.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Get feature names for saving
feature_names = preprocessor.get_feature_names_out()

# Persist processed data and preprocessor.
# index=False everywhere: X and y files stay aligned by row position.
_to_dense_frame(X_train_processed, feature_names).to_csv(
    PROCESSED_DIR / "X_train_scaled.csv", index=False
)
_to_dense_frame(X_test_processed, feature_names).to_csv(
    PROCESSED_DIR / "X_test_scaled.csv", index=False
)

y_train.to_csv(PROCESSED_DIR / "y_train.csv", index=False)
y_test.to_csv(PROCESSED_DIR / "y_test.csv", index=False)
joblib.dump(preprocessor, PROCESSED_DIR / "preprocessor.joblib")

# Notebook display: shapes of the raw (pre-transform) splits.
X_train.shape, X_test.shape

((80000, 6), (20000, 6))

Note: Resampling to address imbalance is deferred to `03_imbalance_handling.ipynb` to keep data leakage risks low.