# Fraud Detection Pipeline Execution

This notebook provides an interactive environment to run the fraud detection pipeline stages: Data Processing, Feature Engineering, and Model Training.

In [None]:
import sys
import logging
from pathlib import Path
import pandas as pd
import numpy as np
import pickle

# Make the project's src/ directory importable from this notebook.
# BASE_PATH is the parent of the current working directory, resolved to an absolute path.
BASE_PATH = Path('..').resolve()
src_dir = BASE_PATH / 'src'
sys.path.insert(0, str(src_dir))

# Log bare messages (no level/timestamp prefix) at INFO and above
logging.basicConfig(format='%(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

from data_processing import DataProcessor
from feature_engineering import FeatureEngineer
from train import FraudDetectionTrainer

## 1. Data Processing

In [None]:
# Raw inputs are read from Data/raw; processed outputs are written to Data/processed
data_root = BASE_PATH / 'Data'
data_path = data_root / 'raw'
processed_path = data_root / 'processed'
processed_path.mkdir(parents=True, exist_ok=True)

# Build the processor on the raw directory and run its load-and-merge step
processor = DataProcessor(data_path)
df = processor.load_and_merge_data()

# Report the dimensions of the loaded data
print(f"Data Shape: {df.shape}")

## 2. Feature Engineering

We use the optimized missing-value threshold (0.95) for dropping columns, based on our previous validation.

In [None]:
# Directory that receives the feature-engineering artifacts
features_path = BASE_PATH / 'Data' / 'features'
features_path.mkdir(parents=True, exist_ok=True)

# Feature engineer configured with drop_missing_threshold=0.95
fe = FeatureEngineer(drop_missing_threshold=0.95)

# Fit on the loaded data and transform it in a single call
train_processed = fe.fit_transform(df)

# Persist the artifacts, then the processed table (written without its index)
artifacts_file = features_path / 'feature_artifacts.pkl'
fe.save_artifacts(artifacts_file)
processed_file = processed_path / 'train_processed.parquet'
train_processed.to_parquet(processed_file, index=False)

print(f"Processed Data Shape: {train_processed.shape}")

## 3. Model Training

We train LightGBM, Random Forest, and a Decision Tree baseline.

In [None]:
# Read the saved artifacts back to recover the list of feature columns
artifacts_file = features_path / 'feature_artifacts.pkl'
with artifacts_file.open('rb') as fh:
    artifacts = pickle.load(fh)
feature_cols = artifacts['feature_cols']

# Order rows by TransactionDT and cut at 80%: the first part trains, the rest validates
train_df_sorted = train_processed.sort_values('TransactionDT').reset_index(drop=True)
split_idx = int(len(train_df_sorted) * 0.8)
train_part = train_df_sorted.iloc[:split_idx]
val_part = train_df_sorted.iloc[split_idx:]

# Keep only the artifact features that are actually present as columns
available_features = [c for c in feature_cols if c in train_df_sorted.columns]

# Feature matrices and integer 'isFraud' label arrays for each split
X_train, X_val = train_part[available_features], val_part[available_features]
y_train = train_part['isFraud'].values.astype(int)
y_val = val_part['isFraud'].values.astype(int)

print(f"Training Samples: {len(X_train)}")
print(f"Validation Samples: {len(X_val)}")

In [None]:
# Trainer for this session, registered under the "notebook_interactive" experiment name
trainer = FraudDetectionTrainer(experiment_name="notebook_interactive")

# Run training, passing the training split followed by the validation split
results = trainer.train_all_models(
    X_train,
    y_train,
    X_val,
    y_val,
)

## 4. Results Comparison

In [None]:
# Tabulate the training results (transposed so each key of `results` becomes a
# row -- presumably one row per model; confirm against train_all_models) and
# keep only the headline metric columns.
metric_cols = ['roc_auc', 'pr_auc', 'f1', 'optimal_threshold']
results_df = pd.DataFrame(results).T
results_df = results_df[metric_cols]
print(results_df)

# Save the model through the trainer (which model gets written is decided by
# save_model -- presumably the best one; confirm in train.py).
# The output directory is created first, mirroring how the Data/ directories
# are prepared in this notebook, so the save does not depend on outputs/models
# already existing on a fresh checkout.
output_model_path = BASE_PATH / 'outputs' / 'models' / 'model_notebook.pkl'
output_model_path.parent.mkdir(parents=True, exist_ok=True)
trainer.save_model(output_model_path, available_features)