# 02 - Data Preprocessing

Goal: create clean train/test splits with proper scaling while preventing data leakage. No resampling here; imbalance is handled later.

Steps:
- Load raw data
- Stratified train-test split (`random_state=42`)
- Scale numeric features with `StandardScaler` and one-hot encode categorical features with `OneHotEncoder` (both fit on train only)
- Save processed splits to `data/processed`

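Outputs:
- X_train_scaled.csv, X_test_scaled.csv
- y_train.csv, y_test.csv
- preprocessor.joblib (the fitted `ColumnTransformer`, saved alongside the splits in `data/processed`)
- 02_preprocessing_metadata.json (saved to `results/metrics`)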

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from pathlib import Path
import joblib
import re
import json

# Paths are relative to this notebook (../data/...), so the cell works when it
# is run from the notebooks folder.
RAW_PATH = Path('../data/raw/credit_card_fraud_dataset.csv')
PROCESSED_DIR = Path('../data/processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Results directories (preprocessing metadata is written under metrics/).
RESULTS_DIR = Path('../results')
METRICS_DIR = RESULTS_DIR / 'metrics'
METRICS_DIR.mkdir(parents=True, exist_ok=True)

# Load data; the target column in this dataset is 'IsFraud'.
data = pd.read_csv(RAW_PATH)
X = data.drop(columns=['IsFraud'])
y = data['IsFraud']

# Stratified 80/20 split: `stratify=y` keeps the class ratio the same in both
# splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Assign EVERY feature column to exactly one transformer.
# ColumnTransformer drops columns that are not listed (remainder='drop' is the
# default), so selecting only int64/float64/object dtypes would silently lose
# any column with another dtype (e.g. bool, int32, category, string dtypes).
# Numeric columns are scaled; all remaining columns are treated as categorical.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.columns[~X.columns.isin(numeric_features)]

# NOTE(review): the recorded output of this cell lists `num__TransactionID` and
# `cat__TransactionDate_*` among 375 features, i.e. an identifier column is
# scaled as a numeric feature and raw timestamp strings are one-hot encoded.
# These look like candidates for dropping / feature engineering - confirm with
# the downstream modelling notebooks before changing the feature set.

# Preprocessor: scale numeric, one-hot encode categorical.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising when the test split is transformed.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Fit on the training split only, then apply the fitted transform to the test
# split, so no test-set statistics leak into the preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Get feature names and clean them for ML library compatibility (especially LightGBM)
feature_names = preprocessor.get_feature_names_out()

# Clean column names: replace special characters with underscores.
# This fixes issues with LightGBM not accepting characters like ':' in
# feature names.
def clean_column_names(columns):
    """Return ML-library-safe, unique versions of the given column names.

    Every character that is not a word character (letter, digit or
    underscore) is replaced with an underscore. Because distinct raw names
    can collapse to the same cleaned name (e.g. ``'a:b'`` and ``'a.b'`` both
    become ``'a_b'``), later duplicates get a numeric suffix (``'a_b_1'``,
    ``'a_b_2'``, ...) so the result never contains repeated names. Names
    that do not collide are returned exactly as plain substitution would
    produce them.

    Args:
        columns: Iterable of column names; each item is converted with
            ``str()`` before cleaning.

    Returns:
        A list of cleaned names, in the same order and of the same length
        as ``columns``.
    """
    seen = set()
    cleaned = []
    for col in columns:
        base = re.sub(r'[^\w]', '_', str(col))
        name = base
        suffix = 1
        # Keep bumping the suffix until the name is unused; this also covers
        # the case where a suffixed name clashes with another real column.
        while name in seen:
            name = f"{base}_{suffix}"
            suffix += 1
        seen.add(name)
        cleaned.append(name)
    return cleaned

feature_names_clean = clean_column_names(feature_names)
_n_features = len(feature_names_clean)

print(f"Original feature names (first 5): {feature_names[:5]}")
print(f"Cleaned feature names (first 5): {feature_names_clean[:5]}")
print(f"Total features: {_n_features}")


def _save_feature_matrix(matrix, file_name):
    """Write a processed feature matrix to PROCESSED_DIR as a CSV file.

    The transformer output is densified first when it exposes `toarray`
    (i.e. it is a sparse matrix); otherwise it is written as-is.
    """
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    frame = pd.DataFrame(matrix, columns=feature_names_clean)
    frame.to_csv(PROCESSED_DIR / file_name, index=False)


def _class_counts(labels):
    """Count rows labelled 0 (non-fraud) and 1 (fraud) as plain ints."""
    return {
        'non_fraud': int((labels == 0).sum()),
        'fraud': int((labels == 1).sum()),
    }


# Persist processed features, targets and the fitted preprocessor
_save_feature_matrix(X_train_processed, "X_train_scaled.csv")
_save_feature_matrix(X_test_processed, "X_test_scaled.csv")

for _labels, _file_name in ((y_train, "y_train.csv"), (y_test, "y_test.csv")):
    _labels.to_csv(PROCESSED_DIR / _file_name, index=False)

joblib.dump(preprocessor, PROCESSED_DIR / "preprocessor.joblib")

print(f"\nSaved processed files to: {PROCESSED_DIR}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

# Record what this run did (shapes, feature lists, split settings, class
# balance) in a JSON file under the metrics directory.
preprocessing_metadata = {
    'original_shape': {
        'train': list(X_train.shape),
        'test': list(X_test.shape),
    },
    'processed_shape': {
        'train': [X_train_processed.shape[0], _n_features],
        'test': [X_test_processed.shape[0], _n_features],
    },
    'numeric_features': list(numeric_features),
    'categorical_features': list(categorical_features),
    'total_features_after_encoding': _n_features,
    'test_split_size': 0.2,
    'random_state': 42,
    'class_distribution': {
        'train': _class_counts(y_train),
        'test': _class_counts(y_test),
    },
}

_metadata_path = METRICS_DIR / '02_preprocessing_metadata.json'
with open(_metadata_path, 'w') as f:
    json.dump(preprocessing_metadata, f, indent=2)
print(f"Saved preprocessing metadata to: {_metadata_path}")

# Last expression of the cell: displayed as the notebook cell output
X_train.shape, X_test.shape

Original feature names (first 5): ['num__TransactionID' 'num__Amount' 'num__MerchantID'
 'cat__TransactionDate_00:35.5' 'cat__TransactionDate_00:35.6']
Cleaned feature names (first 5): ['num__TransactionID', 'num__Amount', 'num__MerchantID', 'cat__TransactionDate_00_35_5', 'cat__TransactionDate_00_35_6']
Total features: 375

Saved processed files to: ..\data\processed
X_train shape: (80000, 6)
X_test shape: (20000, 6)
Saved preprocessing metadata to: ..\results\metrics\02_preprocessing_metadata.json


((80000, 6), (20000, 6))

Note: Resampling to address imbalance is deferred to `03_imbalance_handling.ipynb` to keep data leakage risks low.