<div class="alert alert-block alert-info" align=center><h1>Accounting Fraud Detection Modeling</h1></div>

## <div class="alert alert-block alert-info" ><h1>Imports</h1></div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import joblib
import json
from datetime import datetime
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier
from scipy.stats import uniform, randint

# Start a wall-clock timer; the final cell subtracts this value to report the notebook's total run time.
start_time = time.time()

## <div class="alert alert-block alert-info" ><h1>Function Definitions</h1></div>

In [None]:
def build_model(learning_rate=0.001, dropout_rate=0.3, neurons=128, input_dim=None):
    """
    Build and compile a deep neural network with 3 hidden layers using LeakyReLU activations.

    Each hidden block is Dense -> LeakyReLU(alpha=0.1) -> Dropout, followed by a
    single sigmoid output unit for binary classification.

    Parameters:
    - learning_rate (float): Learning rate for the Adam optimizer.
    - dropout_rate (float): Dropout rate applied after every hidden layer.
    - neurons (int): Number of neurons in the first hidden layer. The second and
      third hidden layers use ``neurons // 2`` and ``neurons // 4`` respectively.
    - input_dim (int or None): Number of input features. When None (the default),
      the width is read from the notebook-level ``X_train_scaled`` array, which
      must therefore exist before the model is built.

    Returns:
    - model (Sequential): Compiled Keras model ready for training.
    """

    if input_dim is None:
        # Backward-compatible fallback: existing callers pass no input_dim and
        # rely on the global training matrix defined later in the notebook.
        input_dim = X_train_scaled.shape[1]

    model = Sequential()

    # First hidden layer: full neuron count with LeakyReLU and Dropout
    model.add(Dense(neurons, input_shape=(input_dim,)))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(dropout_rate))

    # Second hidden layer: half the neurons
    model.add(Dense(neurons // 2))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(dropout_rate))

    # Third hidden layer: quarter the neurons
    model.add(Dense(neurons // 4))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(dropout_rate))

    # Output layer: binary classification with sigmoid
    model.add(Dense(1, activation='sigmoid'))

    # Compile model with Adam optimizer and AUC tracking
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['AUC']
    )

    return model




## <div class="alert alert-info"><h1>Load the Models and the Data</h1></div>

In [None]:
# Define paths (relative to the directory the notebook is run from)
modelpath = "../models"
datapath = "../data"

# NOTE(security): joblib.load unpickles arbitrary Python objects. Only load
# artifacts that were produced by this project's own training notebooks.

# Load pre-fit ColumnTransformer
scaler = joblib.load(os.path.join(modelpath, "column_scaler.pkl"))

# Load models
catboost_model = joblib.load(os.path.join(modelpath, "catboost_model.pkl"))
dnn_model = load_model(os.path.join(modelpath, "dnn_model.keras"))

# Load metadata. The encoding is pinned to UTF-8 so the JSON files are read
# the same way regardless of the platform's default locale encoding.
with open(os.path.join(modelpath, "catboost_model_meta.json"), encoding="utf-8") as f:
    catboost_metadata = json.load(f)

with open(os.path.join(modelpath, "dnn_model_meta.json"), encoding="utf-8") as f:
    dnn_metadata = json.load(f)

# Load data
df = pd.read_csv(os.path.join(datapath, "fraud_data_eda.csv"))

<div class="alert alert-info">We are tuning the models starting with CatBoost.<br>
Before tuning, we will evaluate the saved baseline model on the test set to establish a reference point.

</div>

In [None]:
# Pick the feature columns listed in the CatBoost metadata, plus the target column
features = catboost_metadata["features"]
X, y = df[features], df["misstate"]

# Hold out 20% of the rows for testing; stratify on the target and fix the seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Transform both splits with the scaler loaded from disk (transform only, no re-fit)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

# Score the saved CatBoost model on the scaled test split:
# hard labels for accuracy/F1, positive-class probabilities for ROC AUC
saved_pred = catboost_model.predict(X_test_scaled)
saved_proba = catboost_model.predict_proba(X_test_scaled)[:, 1]

# One print call, one item per line (same output as printing each line separately)
print(
    "📦 Saved CatBoost Model",
    f"Accuracy:     {accuracy_score(y_test, saved_pred):.4f}",
    f"F1 Score:     {f1_score(y_test, saved_pred):.4f}",
    f"ROC AUC: {roc_auc_score(y_test, saved_proba):.4f}",
    classification_report(y_test, saved_pred),
    sep="\n",
)

<div class="alert alert-info">We will now tune the CatBoost model.

</div>

In [None]:
# Balanced class weights computed from the training labels, keyed by class label
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {label: weight for label, weight in zip(classes, weights)}

# CatBoost with fixed hyperparameters and the class weights from above
tuned_catboost_scaled = CatBoostClassifier(
    # model capacity / regularisation
    iterations=530,
    depth=6,
    learning_rate=0.03837,
    l2_leaf_reg=3.296,
    border_count=114,
    # class imbalance handling and evaluation metric
    class_weights=class_weights,
    eval_metric='AUC',
    # reproducibility / logging
    random_state=42,
    verbose=0,
)

tuned_catboost_scaled.fit(X_train_scaled, y_train)

# Evaluate on scaled test set: hard labels and positive-class probabilities
y_pred = tuned_catboost_scaled.predict(X_test_scaled)
y_proba = tuned_catboost_scaled.predict_proba(X_test_scaled)[:, 1]

# One print call, one item per line (same output as printing each line separately)
print(
    "📦 Tuned CatBoost Model",
    f"Accuracy:     {accuracy_score(y_test, y_pred):.4f}",
    f"F1 Score:     {f1_score(y_test, y_pred):.4f}",
    f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}",
    classification_report(y_test, y_pred),
    sep="\n",
)


<div class="alert alert-info">Our tuned model significantly outperformed the baseline. 
Time for some cross-validation.

</div>

In [None]:
CV = 5  # number of cross-validation folds, passed as `cv` to the RandomizedSearchCV objects below

In [None]:
# Opt-in switch for the CatBoost randomized search. It is off by default, so this
# cell only builds the search object; set it to True to actually run the search.
RUN_CATBOOST_SEARCH = False

# Define hyperparameter space.
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale];
# scipy.stats.randint(low, high) samples integers from [low, high).
param_dist = {
    "iterations": randint(200, 1000),
    "depth": randint(4, 8),
    "learning_rate": uniform(0.005, 0.1),
    "l2_leaf_reg": uniform(1, 5),
    "border_count": randint(32, 128)
}

# Instantiate base model (balanced class weights passed as a list ordered by class label)
catboost_base = CatBoostClassifier(
    verbose=0,
    random_state=42,
    eval_metric="AUC",
    class_weights=compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train).tolist()
)

# Randomized Search
catboost_random_search = RandomizedSearchCV(
    estimator=catboost_base,
    param_distributions=param_dist,
    n_iter=20,
    cv=CV,
    scoring="roc_auc",
    verbose=2,
    random_state=42,
    n_jobs=-1
)

if RUN_CATBOOST_SEARCH:
    # NOTE(review): the search is fit on the unscaled X_train / X_test, whereas the
    # fixed-parameter CatBoost model above is fit on X_train_scaled. Confirm which
    # feature matrix is intended before comparing the results.

    # Run search
    catboost_random_search.fit(X_train, y_train)

    # Best model
    best_catboost = catboost_random_search.best_estimator_

    # Evaluate
    y_pred = best_catboost.predict(X_test)
    y_proba = best_catboost.predict_proba(X_test)[:, 1]

    print("🔍 Tuned CatBoost")
    print("Best Params:", catboost_random_search.best_params_)
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
    print(classification_report(y_test, y_pred))


<div class="alert alert-info">We will now tune our DNN model.<br>
Before tuning, we will evaluate the saved baseline model on the test set to establish a reference point.

</div>

In [None]:
# Evaluate saved DNN model
# Flatten the predictions to 1-D so the metrics receive the same shape as y_test
# (assumes the saved model has a single output unit — TODO confirm).
saved_dnn_proba = dnn_model.predict(X_test_scaled).ravel()
# Strictly greater than 0.5 counts as the positive class
saved_dnn_pred = (saved_dnn_proba > 0.5).astype(int)


print("📦 Saved DNN Model")
print(f"Accuracy:     {accuracy_score(y_test, saved_dnn_pred):.4f}")
print(f"F1 Score:     {f1_score(y_test, saved_dnn_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, saved_dnn_proba):.4f}")
print(classification_report(y_test, saved_dnn_pred))

<div class="alert alert-info">Time to perform hyperparameter tuning on the DNN model.

</div>

In [None]:
# Initialize classifier.
# random_state seeds TensorFlow for each fit so weight initialisation and dropout
# are repeatable; without it only the hyperparameter sampling below is seeded.
keras_clf = KerasClassifier(
    model=build_model,
    random_state=42,
    verbose=0
)


# Define hyperparameter space.
# Keys prefixed with "model__" are routed to build_model's keyword arguments;
# batch_size and epochs are parameters of the wrapper itself.
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    "batch_size": [32, 64, 128],
    "epochs": [30, 50],
    "model__learning_rate": uniform(1e-4, 9e-4),
    "model__dropout_rate": uniform(0.2, 0.3),
    "model__neurons": [64, 128, 256]
}

# NOTE(review): n_jobs=-1 trains several Keras models in parallel worker
# processes; lower it if memory becomes a problem.
dnn_random_search = RandomizedSearchCV(
    estimator=keras_clf,
    param_distributions=param_dist,
    n_iter=20,
    scoring='roc_auc',
    cv=CV,
    n_jobs=-1,
    random_state=42,
    verbose=2
)

dnn_random_search.fit(X_train_scaled, y_train)

# Best estimator found by the search (refit on the full training set by default)
best_dnn = dnn_random_search.best_estimator_

In [None]:
# 🔍 Post-training prediction and threshold evaluation
# Predict with the fitted Keras model held by the scikeras wrapper and flatten to 1-D
y_proba = best_dnn.model_.predict(X_test_scaled).ravel()

# Report the same metrics at the default cut-off (0.5) and at a lower one (0.3).
# `lead` puts a blank line before the second report. ROC AUC does not depend on
# the threshold, so it is the same in both reports.
for lead, threshold in (("", 0.5), ("\n", 0.3)):
    print(f"{lead}Thresholds: ", threshold)
    y_pred = (y_proba >= threshold).astype(int)
    print(f"Accuracy:     {accuracy_score(y_test, y_pred):.4f}")
    print(f"F1 Score:     {f1_score(y_test, y_pred):.4f}")
    print(f"ROC AUC:      {roc_auc_score(y_test, y_proba):.4f}")
    print(classification_report(y_test, y_pred))


In [None]:
# Sweep a few much lower cut-offs and show the classification report for each
for t in (0.05, 0.03, 0.02, 0.01):
    print(f"\nThreshold: {t}")
    # Label a sample positive when its predicted probability reaches the cut-off
    y_pred = np.greater_equal(y_proba, t).astype(int)
    print(classification_report(y_test, y_pred))

In [None]:
# Histogram of the tuned DNN's predicted probabilities on the test set.
# Uses explicit figure/axes objects and labels both axes so the figure stands alone.
fig, ax = plt.subplots()
ax.hist(y_proba, bins=50)
ax.set_title("Predicted probabilities")
ax.set_xlabel("Predicted probability")
ax.set_ylabel("Number of test samples")
plt.show()

# Hyperparameters selected by the DNN randomized search
print("DNN Best Params", dnn_random_search.best_params_)

<div style="border: 2px solid #4CAF50; padding: 16px; border-radius: 8px; background-color: #F9FFFB; font-family: sans-serif; font-size: 15px">
  <strong>💡 Tuning Strategy Note:</strong><br><br>
  Since we used <code>RandomizedSearchCV</code> with <code>cv=5</code> to broadly explore the hyperparameter space, we can now use <code>GridSearchCV</code> with <code>cv=3</code> for fine-tuning.<br><br>

  <ul>
    <li><strong>RandomizedSearchCV (cv=5):</strong> best for wide, randomized search with higher confidence in initial rankings.</li>
    <li><strong>GridSearchCV (cv=3):</strong> faster when narrowing in on the optimal region around the best parameters.</li>
  </ul>

  This strategy gives us a good balance between performance and compute efficiency:
  <ul>
    <li>Use <code>GridSearchCV</code> to test smaller, precise hyperparameter ranges</li>
    <li>Lowering to <code>cv=3</code> is acceptable at this stage, especially since folds are stratified</li>
    <li>If two or more models are very close in performance, we can optionally rerun with <code>cv=5</code> on just the top configs</li>
  </ul>

  ✅ This staged approach ensures reliable model selection without excessive training time.
</div>


In [None]:
# Stop the timer and report how long the whole notebook took to run.
end_time = time.time()
elapsed_seconds = round(end_time - start_time, 2)
print(f"Total execution time: {elapsed_seconds} seconds")