In [1]:
# Logistic Regression mit PhiUSIIL Dataset

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from benchmark import PerformanceMonitor
import warnings
from sklearn.exceptions import UndefinedMetricWarning
import torch
import torch.nn as nn
import torch.optim as optim
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
from sklearn.preprocessing import StandardScaler
from data_loader import load_and_standardize_data

  import pynvml  # type: ignore[import]


In [2]:
FILE_PATH = r"..\data\processed\PhiUSIIL_Phishing_URL_Dataset.csv" 
TARGET_COL = "Phishing?"
DELIMITER = ","

X, y = load_and_standardize_data(FILE_PATH, TARGET_COL, DELIMITER)

--- [Loader] Starte Laden von: PhiUSIIL_Phishing_URL_Dataset.csv ---
Warnung: Target 'Phishing?' nicht gefunden. Versuche Alternativen...
-> Habe 'label' als Target identifiziert.
--- [Loader] Fertig. Features: 50, Samples: 235795 ---


In [3]:
url_train_x, url_temp_x, url_train_y, url_temp_y = train_test_split(X, y, test_size=0.3, random_state=42)
url_val_x, url_test_x, url_val_y, url_test_y = train_test_split(url_temp_x, url_temp_y, test_size=0.5, random_state=42)

In [4]:
# 3. WICHTIG: Skalierung für Logistische Regression
# LogReg braucht zwingend skalierte Daten (Mittelwert 0, Varianz 1)
print("Skaliere Daten...")
scaler = StandardScaler()
url_train_x = scaler.fit_transform(url_train_x)
url_val_x = scaler.transform(url_val_x) # Nur transform, nicht fitten!
url_test_x = scaler.transform(url_test_x)

Skaliere Daten...


In [5]:
url_train_x, url_temp_x, url_train_y, url_temp_y = train_test_split(X, y, test_size=0.3, random_state=42)
url_val_x, url_test_x, url_val_y, url_test_y = train_test_split(url_temp_x, url_temp_y, test_size=0.5, random_state=42)

In [6]:
# Monitor starten
monitor = PerformanceMonitor("Logistic Regression PhiUSIIL")
monitor.start_measurement()

# 1. GPU Setup prüfen
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training läuft auf: {device}")

# 2. PyTorch Model Definition (Wrapper, der sich wie sklearn verhält)
class GPULogisticRegression(nn.Module):
    def __init__(self, input_dim):
        super(GPULogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

    # Hilfsfunktion, um sklearn-API nachzubauen
    def fit(self, X, y, epochs=100, lr=0.01, batch_size=4096):
        self.to(device)
        self.train()
        
        # Daten zu Tensoren konvertieren
        # .values nutzen, falls es Pandas Dataframes sind
        X_np = X.values if hasattr(X, 'values') else X
        y_np = y.values if hasattr(y, 'values') else y
        
        X_tensor = torch.tensor(X_np, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y_np, dtype=torch.float32).view(-1, 1).to(device)
        
        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        
        # Training Loop (Mini-Batch)
        num_samples = X_tensor.shape[0]
        num_batches = int(np.ceil(num_samples / batch_size))
        
        print(f"Starte Training für {epochs} Epochen...")
        for epoch in range(epochs):
            # Shuffle indices
            indices = torch.randperm(num_samples, device=device)
            
            for i in range(num_batches):
                start = i * batch_size
                end = min(start + batch_size, num_samples)
                batch_idx = indices[start:end]
                
                optimizer.zero_grad()
                outputs = self.forward(X_tensor[batch_idx])
                loss = criterion(outputs, y_tensor[batch_idx])
                loss.backward()
                optimizer.step()
                
    def predict(self, X):
        self.eval()
        with torch.no_grad():
            probs = self.predict_proba(X)[:, 1]
            return (probs >= 0.5).astype(int)

    def predict_proba(self, X):
        self.eval()
        X_np = X.values if hasattr(X, 'values') else X
        X_tensor = torch.tensor(X_np, dtype=torch.float32).to(device)
        
        with torch.no_grad():
            outputs = self.forward(X_tensor)
            probs = outputs.cpu().numpy().flatten()
            
        # Formatieren wie sklearn: [[prob_0, prob_1], ...]
        return np.vstack(((1 - probs), probs)).T

# 3. Modell initialisieren und trainieren
input_dim = url_train_x.shape[1]
log_reg = GPULogisticRegression(input_dim)

# Hyperparameter anpassen falls nötig (batch_size erhöht für GPU-Effizienz)
log_reg.fit(url_train_x, url_train_y, epochs=500, lr=0.001, batch_size=16384)

monitor.end_measurement(task_name="Training")

Training läuft auf: cuda
Starte Training für 500 Epochen...
--- Ergebnisse Logistic Regression PhiUSIIL (Training) ---
Zeit: 6.232s | GPU-Last: 32.7%
VRAM (System): 1673.44 MB | VRAM (Torch): 58.37 MB


{'model': 'Logistic Regression PhiUSIIL',
 'task': 'Training',
 'time_sec': 6.232,
 'ram_mb': 1250.3,
 'vram_mb': 1673.44,
 'torch_vram_mb': 58.37,
 'cpu_percent': 121.3,
 'gpu_util_percent': 32.7}

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# --- 5. INFERENZ & EVALUIERUNG ---
print("Starte Inferenz (gesamtes Testset)...")
monitor.start_measurement()

# 1. Vorhersage (Inferenz)
# Deine Klasse kümmert sich intern um .to(device) und .cpu()
# predict_proba gibt [[prob_0, prob_1], ...] zurück, wir brauchen Spalte 1
y_scores = log_reg.predict_proba(url_test_x)[:, 1]

# 2. Ground Truth (Echte Labels)
# Sicherstellen, dass es ein Numpy Array ist
y_true = url_test_y.values if hasattr(url_test_y, 'values') else url_test_y

# 3. Binäre Vorhersagen (Threshold 0.5)
y_pred_binary = (y_scores > 0.5).astype(int)

# --- METRIKEN BERECHNEN ---
# 1. Accuracy
acc = accuracy_score(y_true, y_pred_binary)
# 2. Precision
prec = precision_score(y_true, y_pred_binary, zero_division=0)
# 3. Recall
rec = recall_score(y_true, y_pred_binary, zero_division=0)
# 4. F1 Score
f1 = f1_score(y_true, y_pred_binary, zero_division=0)
# 5. AUC
auc = roc_auc_score(y_true, y_scores)

# 6. False Positive Rate (FPR)
# Confusion Matrix: tn, fp, fn, tp
tn, fp, fn, tp = confusion_matrix(y_true, y_pred_binary).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# Ergebnisse zusammenpacken
metrics_dict = {
    "accuracy": round(acc, 4),
    "precision": round(prec, 4),
    "recall": round(rec, 4),
    "f1_score": round(f1, 4),
    "auc": round(auc, 4),
    "fpr": round(fpr, 4)
}

# An Monitor übergeben
monitor.end_measurement(task_name="Inferenz", extra_metrics=metrics_dict)

Starte Inferenz (gesamtes Testset)...
--- Ergebnisse Logistic Regression PhiUSIIL (Inferenz) ---
Zeit: 0.1011s | GPU-Last: 31.0%
VRAM (System): 1641.75 MB | VRAM (Torch): 24.01 MB


{'model': 'Logistic Regression PhiUSIIL',
 'task': 'Inferenz',
 'time_sec': 0.1011,
 'ram_mb': 1236.14,
 'vram_mb': 1641.75,
 'torch_vram_mb': 24.01,
 'cpu_percent': 714.8,
 'gpu_util_percent': 31.0,
 'accuracy': 0.9987,
 'precision': 0.9993,
 'recall': 0.9984,
 'f1_score': 0.9988,
 'auc': 1.0,
 'fpr': np.float64(0.001)}