In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
from benchmark import PerformanceMonitor
from xgboost import XGBClassifier

In [8]:
# Input configuration for the notebook.
# Forward slashes keep this relative path valid on Linux/macOS as well as on
# Windows; a backslash-separated path only resolves on Windows.
filename = "../data/processed/feature_all.txt"  # or your own path
target_col_name = "Phishing?"
delimiter = " "  # the original file is usually space-separated

In [9]:
# --- 2. LOAD & CLEAN (everything from here on is identical) ---
# Reads the delimited feature file, resolves the target column, and produces
# `X` (numeric feature columns only) and `y` (the label column).
print(f"Lade Daten aus {filename}...")
# on_bad_lines='skip' (error_bad_lines=False on older pandas) skips malformed rows
try:
    dataset = pd.read_csv(filename, delimiter=delimiter, on_bad_lines='skip')
except TypeError:
    # Fallback for older pandas versions
    dataset = pd.read_csv(filename, delimiter=delimiter, error_bad_lines=False)

# A) Make sure the target column exists
if target_col_name not in dataset.columns:
    # If the configured name is wrong, fall back to the last column (heuristic)
    print(f"Warnung: Zielspalte '{target_col_name}' nicht gefunden. Nutze letzte Spalte.")
    target_col_name = dataset.columns[-1]

# A2) Drop rows whose target is missing *before* X and y are derived, so both
# stay aligned. Otherwise a text target would be encoded as -1 by `.cat.codes`
# and a numeric target would carry NaN labels into training.
target_missing = dataset[target_col_name].isna()
if target_missing.any():
    print(f"Warnung: {int(target_missing.sum())} Zeilen ohne Zielwert werden entfernt.")
    dataset = dataset.loc[~target_missing]

# B) Radically remove everything that is not numeric (fairness for XGBoost).
# This also takes care of the 'mw72424.txt' problem.
dataset_numeric = dataset.select_dtypes(include=[np.number])

# C) Separate X and y
# The target may have been dropped in step B if it was not numeric.
if target_col_name not in dataset_numeric.columns:
    # Text target (e.g. "yes"/"no"): take it from the unfiltered frame and
    # encode it as integer category codes.
    y = dataset[target_col_name].astype('category').cat.codes
else:
    y = dataset_numeric[target_col_name]
    # Remove the target from the feature matrix
    dataset_numeric = dataset_numeric.drop(columns=[target_col_name])

X = dataset_numeric


Lade Daten aus ..\data\processed\feature_all.txt...


In [10]:
print("Führe Train/Test Split durch (Random Seed 42)...")

# Stage 1: hold out 30 % of the rows; the remainder is the training set.
first_stage = train_test_split(X, y, test_size=0.3, random_state=42)
url_train_x, url_temp_x, url_train_y, url_temp_y = first_stage

# Stage 2: split the held-out rows in half into validation and test parts.
second_stage = train_test_split(
    url_temp_x,
    url_temp_y,
    test_size=0.5,
    random_state=42,
)
url_val_x, url_test_x, url_val_y, url_test_y = second_stage

Führe Train/Test Split durch (Random Seed 42)...


In [11]:
# --- 4. FORMATIERUNG ---
X_train = url_train_x.astype('float32')
y_train = url_train_y.astype('float32')

X_val = url_val_x.astype('float32')
y_val = url_val_y.astype('float32')

X_test = url_test_x.astype('float32')
y_test = url_test_y.astype('float32')

In [12]:
# 1. Monitor initialisieren
monitor = PerformanceMonitor("XGBoost")

# --- TRAINING ---
print("Starte Training...")
monitor.start_measurement()

# --- VORBEREITUNG ---
# Daten vorher umwandeln, um Kopieren während des Trainings zu vermeiden
X_train = url_train_x.astype('float32')
y_train = url_train_y.astype('float32')
X_val = url_val_x.astype('float32')
y_val = url_val_y.astype('float32')


bst = XGBClassifier(
    n_estimators=5000,
    max_depth=6,
    learning_rate=0.01,
    objective='binary:logistic',
    early_stopping_rounds=50,
    tree_method="hist",  # Effizientester Algorithmus für GPU
    device="cuda"        # Aktiviert die GPU
)

# Training durchführen
# Wir nutzen .astype(float), um Warnungen bei booleschen/object Spalten zu vermeiden
bst.fit(
    url_train_x.astype(float), 
    url_train_y.astype(float),
    eval_set=[(url_val_x.astype(float), url_val_y.astype(float))], 
    verbose=False
)

monitor.end_measurement(task_name="Training")


Starte Training...
--- Ergebnisse XGBoost (Training) ---
Zeit: 52.9987s | GPU-Last: 76.7%
VRAM (System): 2103.05 MB | VRAM (Torch): 0.0 MB


{'model': 'XGBoost',
 'task': 'Training',
 'time_sec': 52.9987,
 'ram_mb': 4560.37,
 'vram_mb': 2103.05,
 'torch_vram_mb': 0.0,
 'cpu_percent': 115.4,
 'gpu_util_percent': 76.7}

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# --- INFERENCE ---
print("Starte Inferenz (gesamtes Testset)...")
monitor.start_measurement()

X_test_ready = url_test_x.astype(float)
y_scores = bst.predict_proba(X_test_ready)[:, 1]  # second column of predict_proba
y_pred = bst.predict(X_test_ready)                # predicted class labels

# --- COMPUTE METRICS ---
y_true = url_test_y.astype(float).values

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
auc = roc_auc_score(y_true, y_scores)

# labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
# y_true/y_pred; without it the four-way unpacking below raises ValueError.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
# False-positive rate; guarded against a test set without negatives.
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# Named `metrics_dict` so that a `metrics` module name is not shadowed.
# float(...) normalises NumPy scalars (e.g. fpr derived from int64 counts) to
# plain Python floats so every entry has the same type and repr.
metrics_dict = {
    "accuracy": round(float(acc), 4),
    "precision": round(float(prec), 4),
    "recall": round(float(rec), 4),
    "f1_score": round(float(f1), 4),
    "auc": round(float(auc), 4),
    "fpr": round(float(fpr), 4)
}

# Last expression of the cell: its return value is shown as the cell output.
monitor.end_measurement(task_name="Inferenz", extra_metrics=metrics_dict)

Starte Inferenz (gesamtes Testset)...
--- Ergebnisse XGBoost (Inferenz) ---
Zeit: 2.0038s | GPU-Last: 75.6%
VRAM (System): 2069.55 MB | VRAM (Torch): 0.0 MB


{'model': 'XGBoost',
 'task': 'Inferenz',
 'time_sec': 2.0038,
 'ram_mb': 4561.02,
 'vram_mb': 2069.55,
 'torch_vram_mb': 0.0,
 'cpu_percent': 312.5,
 'gpu_util_percent': 75.6,
 'accuracy': 0.9908,
 'precision': 0.9874,
 'recall': 0.9547,
 'f1_score': 0.9708,
 'auc': 0.9979,
 'fpr': np.float64(0.0023)}