# Fraud Detection Pipeline Execution

This notebook provides an interactive environment to run the fraud detection pipeline stages: Data Processing, Feature Engineering, and Model Training.

In [1]:
import sys
import logging
from pathlib import Path
import pandas as pd
import numpy as np
import pickle

# Project root is the parent of the notebook's working directory; put its
# src/ folder first on the import path so the pipeline modules can be imported.
BASE_PATH = Path('..').resolve()
src_dir = BASE_PATH / 'src'
sys.path.insert(0, str(src_dir))

# Log bare messages (no timestamp/level prefix) at INFO and above
logging.basicConfig(format='%(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

from data_processing import DataProcessor
from feature_engineering import FeatureEngineer
from train import FraudDetectionTrainer

XGBoost not available.


## 1. Data Processing

In [2]:
# Raw inputs are read from Data/raw; processed outputs go to Data/processed
data_root = BASE_PATH / 'Data'
data_path = data_root / 'raw'
processed_path = data_root / 'processed'
processed_path.mkdir(parents=True, exist_ok=True)

# Load the raw tables and merge them into a single frame
processor = DataProcessor(data_path)
df = processor.load_and_merge_data()

# Report (rows, columns) of the merged frame
print(f"Data Shape: {df.shape}")

Loading transaction data...
Transaction data loaded: (590540, 394)
Loading identity data...
Identity data loaded: (144233, 41)
Merging datasets...
Identity coverage: 24.4%
Memory reduced from 2513.97 MB to 1603.31 MB (36.2% reduction)
Final dataset shape: (590540, 434)


Data Shape: (590540, 434)


## 2. Feature Engineering

We use the optimized threshold (0.95) from our previous validation: columns with more than 95% missing values are dropped.

In [3]:
# Fitted feature-engineering artifacts are stored under Data/features
features_path = BASE_PATH / 'Data' / 'features'
features_path.mkdir(parents=True, exist_ok=True)

# 0.95 is the missing-value threshold passed to the engineer for dropping columns
fe = FeatureEngineer(drop_missing_threshold=0.95)

# NOTE(review): fit_transform is applied to the full frame, including the rows
# that are later held out as the validation split — confirm that no fitted
# statistics leak from validation into training.
train_processed = fe.fit_transform(df)

# Persist the fitted artifacts first, then the transformed frame
artifacts_file = features_path / 'feature_artifacts.pkl'
parquet_file = processed_path / 'train_processed.parquet'
fe.save_artifacts(artifacts_file)
train_processed.to_parquet(parquet_file, index=False)

print(f"Processed Data Shape: {train_processed.shape}")

Starting feature engineering pipeline...
Handling missing values (threshold: 0.95)...
Dropping 9 columns with >95.0% missing
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = train_df[col].isnull().astype(np.int8)
  train_df[f'{col}_missing'] = trai

Processed Data Shape: (590540, 451)


## 3. Model Training

We train LightGBM, Random Forest, and a Decision Tree baseline.

In [4]:
# Load feature columns from artifacts.
# SECURITY: pickle.load can execute arbitrary code. It is used here only on the
# file written by fe.save_artifacts() in the feature-engineering step of this
# notebook; never point it at an artifact from an untrusted source.
with open(features_path / 'feature_artifacts.pkl', 'rb') as f:
    artifacts = pickle.load(f)
feature_cols = artifacts['feature_cols']

# Split parameters, named so they are easy to find and tune
TIME_COL = 'TransactionDT'
TARGET_COL = 'isFraud'
TRAIN_FRACTION = 0.8

# Prepare Training Data (Time-based split): order rows by transaction time so
# the first 80% are used for training and the last 20% for validation.
train_df_sorted = train_processed.sort_values(TIME_COL).reset_index(drop=True)
split_idx = int(len(train_df_sorted) * TRAIN_FRACTION)

# Keep only the features that exist in the processed frame, and report any
# that are listed in the artifacts but absent instead of dropping them silently.
available_features = [c for c in feature_cols if c in train_df_sorted.columns]
missing_features = [c for c in feature_cols if c not in train_df_sorted.columns]
if missing_features:
    logger.warning(
        f"{len(missing_features)} feature(s) listed in the artifacts are absent "
        f"from the processed data and will be skipped: {missing_features}"
    )

# Training on the label itself would make every metric meaningless
if TARGET_COL in available_features:
    raise ValueError(f"Target column '{TARGET_COL}' must not appear in the feature list")

train_part = train_df_sorted.iloc[:split_idx]
val_part = train_df_sorted.iloc[split_idx:]

X_train = train_part[available_features]
y_train = train_part[TARGET_COL].values.astype(int)
X_val = val_part[available_features]
y_val = val_part[TARGET_COL].values.astype(int)

# An empty side of the split (e.g. an empty or one-row frame) cannot be trained on
if len(X_train) == 0 or len(X_val) == 0:
    raise ValueError("Time-based split produced an empty training or validation set")

print(f"Training Samples: {len(X_train)}")
print(f"Validation Samples: {len(X_val)}")

Training Samples: 472432
Validation Samples: 118108


In [5]:
# Create the trainer under the "notebook_interactive" experiment name
trainer = FraudDetectionTrainer(experiment_name="notebook_interactive")

# Fit all models on the training split and evaluate them on the validation
# split; `results` is tabulated per model in the results section
results = trainer.train_all_models(
    X_train,
    y_train,
    X_val,
    y_val,
)

Training all models...
Class weights - 0: 0.5182, 1: 14.2307
Scale pos weight: 27.4615
Training LightGBM...
LightGBM PR-AUC: 0.2645
LightGBM ROC-AUC: 0.8344
Training Random Forest...
Random Forest PR-AUC: 0.4621
Random Forest ROC-AUC: 0.8812
Training Decision Tree Baseline...
Decision Tree PR-AUC: 0.2382
Decision Tree ROC-AUC: 0.8328
Best model: RandomForest (PR-AUC: 0.4621)


## 4. Results Comparison

In [6]:
# Tabulate the metrics per model: pd.DataFrame(results) is transposed so each
# model becomes a row, then only the headline metric columns are kept.
metric_columns = ['roc_auc', 'pr_auc', 'f1', 'optimal_threshold']
results_df = pd.DataFrame(results).T
results_df = results_df[metric_columns]
print(results_df)

# Save best model
output_model_path = BASE_PATH / 'outputs' / 'models' / 'model_notebook.pkl'
# Create the destination folder up front, as is done for Data/processed and
# Data/features, so saving does not depend on the directory already existing.
output_model_path.parent.mkdir(parents=True, exist_ok=True)
trainer.save_model(output_model_path, available_features)

Model saved to C:\Users\Lenovo\Desktop\fraud-detection-mlops\outputs\models\model_notebook.pkl


               roc_auc    pr_auc        f1  optimal_threshold
LightGBM      0.834371  0.264513  0.378164           0.117417
RandomForest  0.881206  0.462066  0.457026           0.782500
DecisionTree  0.832824  0.238226  0.349281           0.892985
