<div class="alert alert-block alert-info" align=center><h1>Accounting Fraud Detection Modeling</h1></div>

## <div class="alert alert-block alert-info" ><h1>Imports</h1></div>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import joblib
import json
from datetime import datetime
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scipy.stats import uniform, randint

# Record the wall-clock start so the last cell can report the notebook's total runtime.
start_time: float = time.time()




## <div class="alert alert-info"><h1>Load the Models and the Data</h1></div>

In [2]:
# Locations of the persisted artifacts, relative to the notebook's working directory.
modelpath = "../models"
datapath = "../data"

# Load the previously saved models.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts produced by this project, never files from an untrusted source.
catboost_model = joblib.load(os.path.join(modelpath, "catboost_model.pkl"))
dnn_model = load_model(os.path.join(modelpath, "dnn_model.keras"))

# Load the JSON metadata stored next to each model. The encoding is passed
# explicitly so the files are decoded the same way on every platform instead of
# falling back to the locale-dependent default.
with open(os.path.join(modelpath, "catboost_model_meta.json"), encoding="utf-8") as f:
    catboost_metadata = json.load(f)

with open(os.path.join(modelpath, "dnn_model_meta.json"), encoding="utf-8") as f:
    dnn_metadata = json.load(f)

# Load the modeling dataset.
df = pd.read_csv(os.path.join(datapath, "fraud_data_eda.csv"))




<div class="alert alert-info">We tune the models, starting with CatBoost.<br>
Before tuning, we evaluate the saved CatBoost model and train a baseline to compare against.

</div>

In [3]:
# The feature columns are listed in the saved CatBoost metadata; "misstate" is the target.
features = catboost_metadata["features"]
y = df["misstate"]
X = df[features]

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [9]:
# Evaluate the saved CatBoost model on the held-out test split.
saved_pred = catboost_model.predict(X_test)
# Keep only the second probability column (the positive class) for ROC AUC.
saved_proba = catboost_model.predict_proba(X_test)[:, 1]

print("📦 Saved CatBoost Model")
print(f"Accuracy:     {accuracy_score(y_test, saved_pred):.4f}")
print(f"F1 Score:     {f1_score(y_test, saved_pred):.4f}")
print("ROC AUC:", roc_auc_score(y_test, saved_proba))
# When the model predicts no positives, precision for class 1 is undefined.
# zero_division=0 reports it as 0 (the same value as the default) without
# emitting an UndefinedMetricWarning into the notebook output.
print(classification_report(y_test, saved_pred, zero_division=0))

📦 Saved CatBoost Model
Accuracy:     0.9933
F1 Score:     0.0000
ROC AUC: 0.5798299955286417
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     29016
           1       0.00      0.00      0.00       193

    accuracy                           0.99     29209
   macro avg       0.50      0.50      0.50     29209
weighted avg       0.99      0.99      0.99     29209



In [5]:


# Baseline CatBoost model with balanced class weights and otherwise default settings.

# Map each class label found in the training target to its "balanced" weight.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {label: weight for label, weight in zip(classes, weights)}

# Build and fit the baseline on the training split.
baseline_settings = {
    "verbose": 0,
    "random_state": 42,
    "eval_metric": 'AUC',
    "class_weights": class_weights,
}
baseline_catboost = CatBoostClassifier(**baseline_settings)
baseline_catboost.fit(X_train, y_train)

# Score the baseline on the held-out test split.
y_proba = baseline_catboost.predict_proba(X_test)[:, 1]
y_pred = baseline_catboost.predict(X_test)

print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))


ROC AUC: 0.8356784750525349
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     29016
           1       0.36      0.16      0.22       193

    accuracy                           0.99     29209
   macro avg       0.68      0.58      0.61     29209
weighted avg       0.99      0.99      0.99     29209



<div class="alert alert-info">The re-trained, class-weighted baseline significantly outperformed the saved model (ROC AUC of about 0.84 vs. 0.58).<br>
Time for some cross-validated hyperparameter tuning.

</div>

In [6]:
CV: int = 5  # number of cross-validation folds used during hyperparameter tuning

In [7]:
# Search space for the randomized search. scipy semantics:
#   randint(low, high)  -> integers in [low, high)  (upper bound excluded)
#   uniform(loc, scale) -> floats in [loc, loc + scale]
param_dist = {
    "iterations": randint(200, 1000),      # 200 .. 999
    "depth": randint(4, 8),                # 4 .. 7
    "learning_rate": uniform(0.005, 0.1),  # 0.005 .. 0.105
    "l2_leaf_reg": uniform(1, 5),          # 1 .. 6
    "border_count": randint(32, 128),      # 32 .. 127
}

# Balanced class weights as a plain list, ordered like np.unique(y_train).
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train,
).tolist()

# Untuned estimator that the search clones and fits for every candidate and fold.
catboost_base = CatBoostClassifier(
    class_weights=balanced_weights,
    eval_metric="AUC",
    random_state=42,
    verbose=0,
)

# Randomized search: 20 sampled candidates, CV-fold cross-validation, ranked by ROC AUC.
search_settings = dict(
    param_distributions=param_dist,
    n_iter=20,
    cv=CV,
    scoring="roc_auc",
    verbose=2,
    random_state=42,
    n_jobs=-1,  # run the fits in parallel on all available cores
)
catboost_random_search = RandomizedSearchCV(catboost_base, **search_settings)
catboost_random_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ is the winning configuration
# refit on the whole training split.
best_catboost = catboost_random_search.best_estimator_

# Score the tuned model on the held-out test split.
y_proba = best_catboost.predict_proba(X_test)[:, 1]
y_pred = best_catboost.predict(X_test)

print("🔍 Tuned CatBoost")
print("Best Params:", catboost_random_search.best_params_)
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
🔍 Tuned CatBoost
Best Params: {'border_count': 114, 'depth': 6, 'iterations': 530, 'l2_leaf_reg': 3.2962444598293357, 'learning_rate': 0.038370861113902185}
ROC AUC: 0.8397050189211313
              precision    recall  f1-score   support

           0       1.00      0.95      0.97     29016
           1       0.05      0.42      0.09       193

    accuracy                           0.94     29209
   macro avg       0.52      0.69      0.53     29209
weighted avg       0.99      0.94      0.97     29209



In [8]:
# Stop the timer and report how long the whole notebook took to run.
end_time = time.time()
elapsed_seconds = round(end_time - start_time, 2)
print(f"Total execution time: {elapsed_seconds} seconds")

Total execution time: 421.58 seconds
