# Cost-Sensitive Fraud Detection with Threshold Optimization

This notebook implements a cost-sensitive machine learning approach to fraud detection with decision threshold optimization.

## 1. Introduction

Financial fraud detection requires explicit modeling of asymmetric misclassification costs.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

## 2. Dataset Loading and Exploration

In [None]:
# Read the credit card transactions CSV into `df`, then report the frame's
# dimensions and how many rows carry each value of the `Class` column.
df = pd.read_csv('data/creditcard.csv')
print(
    f'Dataset shape: {df.shape}',
    f'\nClass distribution:\n{df["Class"].value_counts()}',
    sep='\n',
)

## 3. Feature Engineering and Data Preparation

In [None]:
# Separate the label column (`Class`) from the remaining predictor columns.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise the features. The scaler's statistics are learned from the
# training split only and then applied unchanged to both splits, so nothing
# about the test rows influences the transformation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 4. Cost-Sensitive Model Training

In [None]:
# Fit a 100-tree random forest on the training split. class_weight='balanced'
# asks scikit-learn to weight classes inversely to their frequency in y_train;
# the fixed random_state makes the fit repeatable.
model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Keep column 1 of predict_proba as the per-row score for the test split
# (presumably the probability of the fraud label 1 — confirm via model.classes_),
# then summarise ranking quality with ROC-AUC.
test_probabilities = model.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC-AUC Score: {roc_auc:.4f}')

## 5. Threshold Optimization

In [None]:
# Misclassification costs used to score every candidate threshold.
fp_cost = 1  # False positive cost
fn_cost = 10  # False negative cost (fraud)

# Candidate thresholds 0.00, 0.01, ..., 0.99.
# Dividing an integer range yields the float closest to k/100 for every k.
# np.arange(0, 1, 0.01) instead builds the grid as i * 0.01 with an inexact
# step, so some points land just off k/100 and the `>=` comparison below can
# flip for scores that sit exactly on the grid (e.g. averages over 100 trees).
thresholds = np.arange(100) / 100
costs = []

# NOTE(review): the threshold is selected on the same test split that is used
# for evaluation, so the minimum cost found here is optimistically biased.
# Consider choosing it on a separate validation split.
for thresh in thresholds:
    # A row is flagged as positive when its score reaches the threshold.
    y_pred_thresh = (y_pred_proba >= thresh).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 regardless of which classes appear
    # in y_test / the predictions, so the four-way unpack cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1]).ravel()
    total_cost = fp * fp_cost + fn * fn_cost
    costs.append(total_cost)

# np.argmin returns the first minimum, so ties resolve to the lowest threshold.
optimal_threshold = thresholds[np.argmin(costs)]
print(f'Optimal Threshold: {optimal_threshold:.2f}')

## 6. Model Explainability

In [None]:
# Pair each predictor column name with the fitted forest's importance score,
# then order the table from most to least important.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Show only the ten highest-ranked rows.
print('Top 10 Features:')
print(feature_importance.iloc[:10])

## 7. Discussion and Results

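Cost-sensitive learning with threshold optimization provides a principled approach to fraud detection that explicitly incorporates asymmetric misclassification costs into the decision rule. Note that the threshold above is tuned on the same held-out split used for evaluation, so the resulting cost estimate is optimistic; a separate validation split should be used to select the threshold before reporting final results.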