In [1]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import StratifiedKFold

In [2]:
# --- Tunable constants ------------------------------------------------------
RANDOM_STATE = 42  # seed shared by every shuffle / fold split in this notebook
TEST_RATIO = 0.2   # share of each class held out for the final test set
N_SPLITS = 5       # number of folds used for cross-validation

# --- File locations ---------------------------------------------------------
# NOTE(review): data_path is not read by any of the visible cells; the raw CSV
# is loaded from a literal path in the next cell. Confirm whether it is still needed.
data_path = os.path.join("output", "trainset.parquet")
model_path = os.path.join("models", "model.json")

# Make sure the folder that will hold the saved model exists before training.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

In [3]:
# Read the raw claims table from CSV and preview its first rows.
csv_source = "data/fraud_oracle.csv"
dataset_df = pd.read_csv(csv_source)
dataset_df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


## Data Preprocessing

In [4]:
# Label-encode every text-valued column so the frame is fully numeric.
# The fitted encoder of each column is kept in `label_encoders`, so the integer
# codes can be mapped back to the original labels (`inverse_transform`) and the
# same mapping can be re-applied to new rows before scoring them with the saved
# model. Without it the encoding is lost as soon as this cell finishes.
label_encoders = {}
for col in dataset_df.columns:
    col_dtype = dataset_df[col].dtype
    # Comparing only against "object" misses columns that pandas stores with a
    # dedicated string dtype; those have to be encoded as well.
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
        le = LabelEncoder()
        # astype(str) gives the encoder one uniform, sortable value type.
        dataset_df[col] = le.fit_transform(dataset_df[col].astype(str))
        label_encoders[col] = le

# Split into fraud / non-fraud rows by the target column.
frauds_df = dataset_df[dataset_df["FraudFound_P"] == 1]
non_frauds_df = dataset_df[dataset_df["FraudFound_P"] == 0]

total_frauds = len(frauds_df)
total_non_frauds = len(non_frauds_df)
# Share of fraud rows in the whole data set.
fraud_ratio = total_frauds / len(dataset_df)

print("Total Frauds:", total_frauds)
print("Total Non-Frauds:", total_non_frauds)
print("Fraud Ratio:", fraud_ratio)

Total Frauds: 923
Total Non-Frauds: 14497
Fraud Ratio: 0.05985732814526589


## Train/Test Split (stratifiziert über Fraud / Non-Fraud)

In [5]:
# Shuffle the fraud (positive) and non-fraud (negative) rows independently,
# using the same seed, and renumber them from 0.
pos_samples = frauds_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
neg_samples = non_frauds_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Number of rows per class that go into the training split
# (the remaining TEST_RATIO share becomes the test split).
train_share = 1 - TEST_RATIO
train_num_pos = round(len(pos_samples) * train_share)
train_num_neg = round(len(neg_samples) * train_share)

# Class weight for XGBoost: negatives per positive in the training split.
scale_pos_weight = train_num_neg / train_num_pos
print("scale_pos_weight (Train):", scale_pos_weight)

# Training rows: the leading slice of each shuffled class.
train_pos_samples = pos_samples.iloc[:train_num_pos]
train_neg_samples = neg_samples.iloc[:train_num_neg]

# Test rows: everything after the training slice.
test_pos_samples = pos_samples.iloc[train_num_pos:]
test_neg_samples = neg_samples.iloc[train_num_neg:]

# Stack both classes and shuffle again so the classes are interleaved.
train_df = (
    pd.concat([train_pos_samples, train_neg_samples], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
test_df = (
    pd.concat([test_pos_samples, test_neg_samples], ignore_index=True)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

# Sanity check of the class distribution in both splits.
train_frauds_df = train_df.loc[train_df["FraudFound_P"].eq(1)]
train_non_frauds_df = train_df.loc[train_df["FraudFound_P"].eq(0)]

test_frauds_df = test_df.loc[test_df["FraudFound_P"].eq(1)]
test_non_frauds_df = test_df.loc[test_df["FraudFound_P"].eq(0)]

print("Train Fraud / Non-Fraud:", len(train_frauds_df), len(train_non_frauds_df))
print("Test  Fraud / Non-Fraud:", len(test_frauds_df), len(test_non_frauds_df))

scale_pos_weight (Train): 15.715447154471544
Train Fraud / Non-Fraud: 738 11598
Test  Fraud / Non-Fraud: 185 2899


## Cross-Validation auf dem Trainingsdatensatz

In [12]:
# Features / target of the training split used for cross-validation.
X_train_full = train_df.drop(columns=["FraudFound_P"])
y_train_full = train_df["FraudFound_P"]

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# One score per fold for each metric.
cv_accuracies, cv_precisions, cv_recalls, cv_f1s = [], [], [], []

fold = 1
for train_index, val_index in skf.split(X_train_full, y_train_full):
    X_tr = X_train_full.iloc[train_index]
    X_val = X_train_full.iloc[val_index]
    y_tr = y_train_full.iloc[train_index]
    y_val = y_train_full.iloc[val_index]

    # Class weight is recomputed from the rows this fold actually trains on.
    pos = y_tr.sum()
    neg = len(y_tr) - pos
    fold_scale_pos_weight = neg / pos

    model_cv = xgb.XGBClassifier(scale_pos_weight=fold_scale_pos_weight)
    model_cv.fit(X_tr, y_tr)

    y_val_pred = model_cv.predict(X_val)

    # zero_division=0 reports 0 for a metric whose denominator is empty.
    acc = accuracy_score(y_val, y_val_pred)
    prec = precision_score(y_val, y_val_pred, zero_division=0)
    rec = recall_score(y_val, y_val_pred, zero_division=0)
    f1 = f1_score(y_val, y_val_pred, zero_division=0)

    print(f"Fold {fold}: Acc={acc:.4f}, Prec={prec:.4f}, Rec={rec:.4f}, F1={f1:.4f}")

    for scores, value in (
        (cv_accuracies, acc),
        (cv_precisions, prec),
        (cv_recalls, rec),
        (cv_f1s, f1),
    ):
        scores.append(value)

    fold += 1

# Mean and (population) standard deviation of every metric across the folds.
print("\nDurchschnitt über alle Folds:")
for label, scores in (
    ("Accuracy ", cv_accuracies),
    ("Precision", cv_precisions),
    ("Recall   ", cv_recalls),
    ("F1-Score ", cv_f1s),
):
    print(f"{label}: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

Fold 1: Acc=0.9579, Prec=0.6310, Rec=0.7162, F1=0.6709
Fold 2: Acc=0.9477, Prec=0.5652, Rec=0.5306, F1=0.5474
Fold 3: Acc=0.9514, Prec=0.5780, Rec=0.6803, F1=0.6250
Fold 4: Acc=0.9574, Prec=0.6335, Rec=0.6892, F1=0.6602
Fold 5: Acc=0.9445, Prec=0.5355, Rec=0.5608, F1=0.5479

Durchschnitt über alle Folds:
Accuracy : 0.9518 ± 0.0053
Precision: 0.5886 ± 0.0382
Recall   : 0.6354 ± 0.0748
F1-Score : 0.6103 ± 0.0534


## Finales Modell-Training auf dem gesamten Trainingsdatensatz

In [9]:
# Fit the final classifier on the complete training split.
y_train_final = train_df["FraudFound_P"]
X_train_final = train_df.drop(columns=["FraudFound_P"])

# scale_pos_weight is the negatives-per-positive ratio computed in the split cell.
final_model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight)
final_model.fit(X_train_final, y_train_final)

# Persist the fitted model; the evaluation cell reloads it from this path.
final_model.save_model(model_path)
print("Finales Modell gespeichert unter:", model_path)

Finales Modell gespeichert unter: models\model.json


## Modell-Testing auf dem Testdatensatz

In [10]:
# Evaluate the persisted model on the held-out test split.
eval_X = test_df.drop(columns=["FraudFound_P"])
y_true = test_df["FraudFound_P"]

# Reload the model from disk instead of reusing the in-memory estimator, so the
# reported numbers describe the artifact that was actually saved.
loaded_model = xgb.XGBClassifier()
loaded_model.load_model(model_path)

y_pred = loaded_model.predict(eval_X)

# labels=[0, 1] pins the matrix to a 2x2 layout (rows = true class, columns =
# predicted class) even if one class happens to be absent from y_true / y_pred.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# zero_division=0 matches the cross-validation cell, so both evaluations treat
# an empty denominator the same way (score 0, no warning).
print("Modellbewertung für Testdatensatz:")
print(f"Precision : {precision_score(y_true, y_pred, zero_division=0):.4f}")
print(f"Accuracy  : {accuracy_score(y_true, y_pred):.4f}")
print(f"Recall    : {recall_score(y_true, y_pred, zero_division=0):.4f}")
print(f"F1-Score  : {f1_score(y_true, y_pred, zero_division=0):.4f}")

print("\nKonfusionsmatrix:")
print(cm)

Modellbewertung für Testdatensatz:
Precision : 0.5771
Accuracy  : 0.9514
Recall    : 0.7081
F1-Score  : 0.6359

Konfusionsmatrix:
[[2803   96]
 [  54  131]]
