In [14]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import os

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

In [2]:
# Path of the training-set parquet file, assembled from the output folder
# and the file name.
output_dir = "output"
trainset_filename = "trainset.parquet"
data_path = os.path.join(output_dir, trainset_filename)

In [3]:
# Read the fraud data set from its CSV file; the bare name on the last line
# makes the notebook render a preview of the frame.
raw_csv_path = "data/fraud_oracle.csv"
dataset_df = pd.read_csv(raw_csv_path)
dataset_df

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15415,Nov,4,Friday,Toyota,Urban,Tuesday,Nov,5,Male,Married,...,6 years,31 to 35,No,No,External,none,no change,1 vehicle,1996,Collision
15416,Nov,5,Thursday,Pontiac,Urban,Friday,Dec,1,Male,Married,...,6 years,31 to 35,No,No,External,more than 5,no change,3 to 4,1996,Liability
15417,Nov,5,Thursday,Toyota,Rural,Friday,Dec,1,Male,Single,...,5 years,26 to 30,No,No,External,1 to 2,no change,1 vehicle,1996,Collision
15418,Dec,1,Monday,Toyota,Urban,Thursday,Dec,2,Female,Married,...,2 years,31 to 35,No,No,External,more than 5,no change,1 vehicle,1996,All Perils


## Data Preprocessing

In [4]:
# NOTE(review): `le` is not used by any visible cell (every column gets its
# own encoder below); kept in case a cell outside this view relies on it.
le = LabelEncoder()

# Fitted encoder per encoded column, so the integer codes can be mapped back
# to the original categories later via
# label_encoders[col].inverse_transform(...).
label_encoders = {}

# Label-encode every text column so the frame handed to the model is numeric.
for col in dataset_df.columns:
    column = dataset_df[col]
    # Match the classic "object" dtype as well as pandas' dedicated string
    # dtypes; a plain `dtype == "object"` comparison skips the latter and
    # would leave those columns unencoded.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        encoder = LabelEncoder()
        dataset_df[col] = encoder.fit_transform(column.astype(str))
        label_encoders[col] = encoder

# Split the rows by the target column: FraudFound_P == 1 goes to frauds_df,
# FraudFound_P == 0 to non_frauds_df.
frauds_df = dataset_df[dataset_df["FraudFound_P"] == 1]
non_frauds_df = dataset_df[dataset_df["FraudFound_P"] == 0]

In [5]:
# Class sizes and the share of fraud rows in the whole data set.
total_frauds, total_non_frauds = len(frauds_df), len(non_frauds_df)
fraud_ratio = total_frauds / len(dataset_df)

print(total_frauds, total_non_frauds, fraud_ratio)

923 14497 0.05985732814526589


### Trainingsdatensatz

In [6]:
# Shuffle each class separately with a fixed seed and renumber the rows, so
# the head/tail slices taken in the following cells are random but
# reproducible.
pos_samples = frauds_df.sample(frac=1, random_state=42).reset_index(drop=True)
neg_samples = non_frauds_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
# Build the training set from the first 80 % of each (already shuffled)
# class, so both classes contribute in proportion to their size.
num_pos = round(len(pos_samples) * 0.8)
num_neg = round(len(neg_samples) * 0.8)

train_pos_samples = pos_samples.head(num_pos)
train_neg_samples = neg_samples.head(num_neg)

train_df = pd.concat(
    [train_pos_samples, train_neg_samples],
    ignore_index=True,
)

# Assign the shuffled frame back: `sample` returns a new frame, so without
# the assignment train_df stays ordered "all frauds, then all non-frauds"
# while the cell only displays a shuffled copy that nothing else uses.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Disabled export of the positive training samples, kept for reference:
#train_pos_samples.to_parquet(data_path, index=False)

train_df

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,2,2,5,5,1,5,3,3,0,2,...,5,4,0,0,0,2,3,0,1995,2
1,10,1,4,2,1,1,11,1,0,1,...,5,6,0,0,0,2,3,0,1994,2
2,7,2,1,13,1,7,8,2,1,2,...,3,3,0,0,0,3,3,0,1994,0
3,10,5,2,13,1,2,11,5,1,2,...,2,3,0,0,0,0,3,0,1995,1
4,7,4,6,13,1,5,8,4,1,1,...,6,6,0,0,0,3,3,0,1995,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12331,0,5,0,2,0,5,9,2,1,1,...,6,5,0,0,0,3,3,0,1995,2
12332,6,4,4,6,1,1,7,4,1,1,...,5,6,0,0,0,1,3,0,1994,2
12333,6,4,3,6,0,2,7,4,1,2,...,7,0,0,0,0,3,3,0,1996,2
12334,10,4,1,6,1,5,11,4,1,1,...,3,3,0,0,0,3,3,0,1995,1


### Testdatensatz

In [8]:
# Build the test set from the last 20 % of each shuffled class; the training
# cell takes the first 80 % from the head of the same frames.
num_pos = round(len(pos_samples) * 0.2)
num_neg = round(len(neg_samples) * 0.2)

test_pos_samples = pos_samples.tail(num_pos)
test_neg_samples = neg_samples.tail(num_neg)

test_df = pd.concat(
    [test_pos_samples, test_neg_samples],
    ignore_index=True,
)

# The head and tail counts are rounded independently, so verify that the two
# splits together cover every row exactly once (no overlap, no gap).
assert len(train_df) + len(test_df) == len(pos_samples) + len(neg_samples), (
    "train/test split does not partition the data set"
)

# Assign the shuffled frame back so the displayed frame is the one that the
# evaluation cells actually use.
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

test_df

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,1,1,4,13,1,6,12,2,0,1,...,6,5,0,0,0,2,3,0,1994,1
1,5,5,2,2,1,2,2,2,1,1,...,6,8,0,0,0,2,3,0,1996,0
2,6,1,1,17,1,6,7,1,1,1,...,5,4,0,0,0,3,3,0,1994,1
3,0,1,2,6,1,2,1,1,1,1,...,5,6,0,0,0,0,3,0,1994,2
4,6,5,4,17,1,2,6,1,1,1,...,5,7,0,0,0,1,3,0,1995,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3079,0,4,4,0,1,1,9,1,1,1,...,5,5,0,0,0,3,3,0,1995,0
3080,11,4,3,13,0,7,12,5,1,2,...,6,6,0,0,0,0,3,0,1996,0
3081,9,4,0,13,1,1,10,4,0,2,...,5,5,0,0,0,0,3,0,1994,2
3082,8,5,3,9,1,7,7,4,1,1,...,6,6,1,0,0,3,3,0,1996,1


### Modell-Training

In [11]:
# Initialise an XGBoost classifier with default parameters.
model = xgb.XGBClassifier()

# Feature matrix (X) and target variable (y).
X = train_df.drop(columns=["FraudFound_P"])
y = train_df["FraudFound_P"]

# Fit the model on the training data.
model.fit(X, y)

# Target path for the trained model.
model_path = "models/model.json"

# Make sure the target folder exists so saving does not fail on a fresh
# checkout where models/ has not been created yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the model in JSON format (intended to stay compatible with a later
# SHAP analysis).
model.save_model(model_path)

### Modell-Testing

In [12]:
# Reload the persisted model from disk before predicting.
model_path = "models/model.json"
model.load_model(model_path)

# Separate the target column from the test features.
y_true = test_df["FraudFound_P"]
eval_X = test_df.drop(columns=["FraudFound_P"])

# Predict on the test data.
y_pred = model.predict(eval_X)

### Evaluation

In [13]:
# Berechne Konfusionsmatrix und Klassifikationsmetriken
cm = confusion_matrix(y_true, y_pred)

# Gebe zentrale Bewertungskennzahlen aus
print("Modellbewertung für Testdatensatz:")
print(f"Precision : {precision_score(y_true, y_pred):.4f}")
print(f"Recall    : {recall_score(y_true, y_pred):.4f}")
print(f"F1-Score  : {f1_score(y_true, y_pred):.4f}")

# Ausgabe der Konfusionsmatrix (TP, FP, FN, TN)
print("\nKonfusionsmatrix:")
print(cm)

NameError: name 'confusion_matrix' is not defined