In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
from benchmark import PerformanceMonitor
from xgboost import XGBClassifier
from data_loader import load_and_standardize_data

  import pynvml  # type: ignore[import]


In [2]:
FILE_PATH = r"..\data\processed\feature_all.txt"
# Hinweis: feature_all.txt ist oft leerzeichen-getrennt
X, y = load_and_standardize_data(FILE_PATH, target_col_name="Phishing?", delimiter=" ")

--- [Loader] Starte Laden von: feature_all.txt ---
--- [Loader] Fertig. Features: 201, Samples: 507171 ---


In [3]:
# Split (Standardisiert)
print("Führe Split durch...")
url_train_x, url_temp_x, url_train_y, url_temp_y = train_test_split(X, y, test_size=0.3, random_state=42)
url_val_x, url_test_x, url_val_y, url_test_y = train_test_split(url_temp_x, url_temp_y, test_size=0.5, random_state=42)

Führe Split durch...


In [4]:
# --- 4. FORMATIERUNG ---
X_train = url_train_x.astype('float32')
y_train = url_train_y.astype('float32')

X_val = url_val_x.astype('float32')
y_val = url_val_y.astype('float32')

X_test = url_test_x.astype('float32')
y_test = url_test_y.astype('float32')

In [5]:
# 1. Monitor initialisieren
monitor = PerformanceMonitor("XGBoost")

# --- TRAINING ---
print("Starte Training...")
monitor.start_measurement()

# --- VORBEREITUNG ---
# Daten vorher umwandeln, um Kopieren während des Trainings zu vermeiden
X_train = url_train_x.astype('float32')
y_train = url_train_y.astype('float32')
X_val = url_val_x.astype('float32')
y_val = url_val_y.astype('float32')


bst = XGBClassifier(
    n_estimators=5000,
    max_depth=6,
    learning_rate=0.01,
    objective='binary:logistic',
    early_stopping_rounds=50,
    tree_method="hist",  # Effizientester Algorithmus für GPU
    device="cuda"        # Aktiviert die GPU
)

# Training durchführen
# Wir nutzen .astype(float), um Warnungen bei booleschen/object Spalten zu vermeiden
bst.fit(
    url_train_x.astype(float), 
    url_train_y.astype(float),
    eval_set=[(url_val_x.astype(float), url_val_y.astype(float))], 
    verbose=False
)

monitor.end_measurement(task_name="Training")


Starte Training...
--- Ergebnisse XGBoost (Training) ---
Zeit: 52.7698s | GPU-Last: 76.8%
VRAM (System): 2173.56 MB | VRAM (Torch): 0.0 MB


{'model': 'XGBoost',
 'task': 'Training',
 'time_sec': 52.7698,
 'ram_mb': 3590.32,
 'vram_mb': 2173.56,
 'torch_vram_mb': 0.0,
 'cpu_percent': 112.7,
 'gpu_util_percent': 76.8}

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# --- INFERENZ ---
print("Starte Inferenz (gesamtes Testset)...")
monitor.start_measurement()

X_test_ready = url_test_x.astype(float)
y_scores = bst.predict_proba(X_test_ready)[:, 1]
y_pred = bst.predict(X_test_ready)

# --- METRIKEN BERECHNEN ---
y_true = url_test_y.astype(float).values

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
auc = roc_auc_score(y_true, y_scores)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# KORREKTUR: Dictionary umbenannt, damit 'metrics' Modul nicht überschrieben wird
metrics_dict = {
    "accuracy": round(acc, 4),
    "precision": round(prec, 4),
    "recall": round(rec, 4),
    "f1_score": round(f1, 4),
    "auc": round(auc, 4),
    "fpr": round(fpr, 4)
}

monitor.end_measurement(task_name="Inferenz", extra_metrics=metrics_dict)

Starte Inferenz (gesamtes Testset)...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


--- Ergebnisse XGBoost (Inferenz) ---
Zeit: 2.0028s | GPU-Last: 80.0%
VRAM (System): 2145.56 MB | VRAM (Torch): 0.0 MB


{'model': 'XGBoost',
 'task': 'Inferenz',
 'time_sec': 2.0028,
 'ram_mb': 3713.11,
 'vram_mb': 2145.56,
 'torch_vram_mb': 0.0,
 'cpu_percent': 316.4,
 'gpu_util_percent': 80.0,
 'accuracy': 0.9908,
 'precision': 0.9874,
 'recall': 0.9547,
 'f1_score': 0.9708,
 'auc': 0.9979,
 'fpr': np.float64(0.0023)}