# Data Preprocessing - Bank Fraud Detection

This notebook covers:
- Data cleaning
- Feature engineering
- Data transformation
- Handling class imbalance
- Train/validation/test split


In [2]:
# NOTE(review): pd, np, FeatureEngineer and LabelEncoder are not referenced in the
# cells visible here — confirm they are needed before keeping them.
import pandas as pd
import numpy as np
import sys
# Append the parent directory to the module search path so the `src.*` imports
# below can resolve. Assumes the kernel's working directory sits one level
# below the project root — TODO confirm.
sys.path.append('..')

from src.data.data_processor import process_fraud_data
from src.features.feature_engineering import FeatureEngineer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including ones that may point at real problems; consider a narrower filter.
warnings.filterwarnings('ignore')


In [3]:
# Run the project preprocessing step on a random sample of the raw file.
# Sampling keeps the notebook fast; for production, use the full dataset
# or a larger sample.
processing_kwargs = {
    'input_path': '../data/raw/fraud.csv',
    'output_path': '../data/processed/fraud_data.csv',
    'sample_size': 500000,  # Adjust based on your computational resources
    'random_state': 42,
}
df = process_fraud_data(**processing_kwargs)

# Quick sanity check on the result: dimensions, column names, first rows.
print(f"Processed dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
df.head()


INFO:src.data.data_processor:Loading data from ../data/raw/fraud.csv...
INFO:src.data.data_processor:Sampling 500,000 rows from dataset...
INFO:src.data.data_processor:Loaded 500,000 sampled rows
INFO:src.data.data_processor:Starting data preprocessing...
INFO:src.data.data_processor:Processed data shape: (500000, 20)
INFO:src.data.data_processor:Fraud cases: 636 (0.1272%)
INFO:src.data.data_processor:Processed data saved to ..\data\processed\fraud_data.csv


Processed dataset shape: (500000, 20)

Columns: ['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'is_fraud', 'step_hour', 'step_day', 'is_weekend', 'amount_log', 'amount_sqrt', 'amount_squared', 'balance_change_org', 'orig_zero_balance', 'balance_change_dest', 'dest_zero_balance', 'amount_balance_ratio', 'type_encoded']


Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,is_fraud,step_hour,step_day,is_weekend,amount_log,amount_sqrt,amount_squared,balance_change_org,orig_zero_balance,balance_change_dest,dest_zero_balance,amount_balance_ratio,type_encoded
0,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,1,0,0,9.364703,108.019165,136145500.0,-11668.14,0,0.0,1,0.280788,3
1,1,DEBIT,9302.79,11299.0,1996.21,29832.0,16896.7,0,1,0,0,9.138177,96.450974,86541900.0,-9302.79,0,-12935.3,0,0.823256,2
2,1,PAYMENT,24213.67,0.0,0.0,0.0,0.0,0,1,0,0,10.094714,155.607422,586301800.0,0.0,1,0.0,1,24213.67,3
3,1,PAYMENT,2791.42,300481.0,297689.58,0.0,0.0,0,1,0,0,7.934664,52.833889,7792026.0,-2791.42,0,0.0,1,0.00929,3
4,1,PAYMENT,5281.48,152019.0,146737.52,0.0,0.0,0,1,0,0,8.572151,72.67379,27894030.0,-5281.48,0,0.0,1,0.034742,3


In [4]:
# Split the processed frame into model inputs (X) and the label (y).
target_col = 'is_fraud'
# Every column is a feature except the label itself and the raw 'type' column.
non_feature_cols = {target_col, 'type'}
feature_cols = [name for name in df.columns if name not in non_feature_cols]

X = df.loc[:, feature_cols]
y = df[target_col]

# Report dimensions and the share of positive (fraud) rows.
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFraud cases: {y.sum()} ({(y.sum()/len(y)*100):.4f}%)")


Features shape: (500000, 18)
Target shape: (500000,)

Fraud cases: 636 (0.1272%)


In [5]:
# Two-stage stratified split: 60% train / 20% validation / 20% test.
# Stage 1 — hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Stage 2 — carve validation out of the remaining 80%:
# 0.25 * 0.8 = 0.2 of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

# Size of each partition, as a count and as a share of all rows.
print(f"Train set: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Validation set: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"Test set: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

# Fraud rate per partition; stratification should keep these nearly equal.
print(f"\nTrain fraud rate: {(y_train.sum()/len(y_train)*100):.4f}%")
print(f"Validation fraud rate: {(y_val.sum()/len(y_val)*100):.4f}%")
print(f"Test fraud rate: {(y_test.sum()/len(y_test)*100):.4f}%")


Train set: 300,000 samples (60.0%)
Validation set: 100,000 samples (20.0%)
Test set: 100,000 samples (20.0%)

Train fraud rate: 0.1273%
Validation fraud rate: 0.1270%
Test fraud rate: 0.1270%


In [None]:
# Save processed splits as CSV files under ../data/processed/.
# Feature frames are written without the index column.
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_val.to_csv('../data/processed/X_val.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
# Targets are written as a single column with the header 'is_fraud'.
y_train.to_csv('../data/processed/y_train.csv', index=False, header=['is_fraud'])
y_val.to_csv('../data/processed/y_val.csv', index=False, header=['is_fraud'])
y_test.to_csv('../data/processed/y_test.csv', index=False, header=['is_fraud'])

print("Processed data splits saved successfully!")


Processed data splits saved successfully!
