In [1]:
# Logistic Regression mit PhiUSIIL Dataset

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from ucimlrepo import fetch_ucirepo
from benchmark import PerformanceMonitor
import warnings
from sklearn.exceptions import UndefinedMetricWarning
import torch
import torch.nn as nn
import torch.optim as optim
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

  import pynvml  # type: ignore[import]


In [7]:
# --- NEUER TEIL: LOKALE DATEN LADEN ---

# 1. PFAD ANPASSEN: Hier den Pfad zu deiner CSV-Datei eintragen
# Das 'r' davor ist wichtig für Windows-Pfade!
dateipfad = r"..\data\processed\PhiUSIIL_Phishing_URL_Dataset.csv" 

print(f"Lade lokalen Datensatz: {dateipfad}")

# Einlesen der CSV
dataset = pd.read_csv(dateipfad)

# 1. 'URL'-Textspalte entfernen (falls vorhanden), da Modelle Zahlen brauchen
if 'URL' in dataset.columns:
    dataset = dataset.drop(columns=['URL'])

# 2. Target-Spalte umbenennen zu 'Phishing?'
# Hinweis: In der CSV heißt die Zielspalte oft 'label', 'URLLabel' oder 'class'.
# Wir suchen sie und benennen sie um, damit dein restlicher Code funktioniert.

# Versuche typische Namen zu finden:
target_candidates = ['label', 'URLLabel', 'class', 'Phishing']
target_found = False

for candidate in target_candidates:
    if candidate in dataset.columns:
        dataset.rename(columns={candidate: 'Phishing?'}, inplace=True)
        target_found = True
        break

# Fallback: Wenn wir den Namen nicht erraten, nehmen wir einfach die allerletzte Spalte
if not target_found:
    print("Warnung: Konnte Target-Name nicht automatisch finden. Nutze die letzte Spalte als Target.")
    last_col = dataset.columns[-1]
    dataset.rename(columns={last_col: 'Phishing?'}, inplace=True)

# 3. Sicherstellen, dass alles numerisch ist (Textspalten entfernen)
dataset = dataset.select_dtypes(include=[np.number])

# 4. X und y definieren (damit der Rest deines Codes weiterläuft)
# y ist jetzt die Spalte 'Phishing?'
y = dataset['Phishing?']
X = dataset.drop(columns=['Phishing?'])

Lade lokalen Datensatz: ..\data\processed\PhiUSIIL_Phishing_URL_Dataset.csv


In [3]:
url_train, url_dummy = train_test_split(dataset, test_size=0.2)
url_val, url_test = train_test_split(url_dummy, test_size=0.5)

In [4]:
url_train_x = url_train.drop(columns="Phishing?", inplace=False)
url_train_y = url_train["Phishing?"]

url_val_x = url_val.drop(columns="Phishing?", inplace=False)
url_val_y = url_val["Phishing?"]

url_test_x = url_test.drop(columns="Phishing?", inplace=False)
url_test_y = url_test["Phishing?"]

In [5]:
# Monitor starten
monitor = PerformanceMonitor("Logistic Regression PhiUSIIL")
monitor.start_measurement()

# 1. GPU Setup prüfen
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training läuft auf: {device}")

# 2. PyTorch Model Definition (Wrapper, der sich wie sklearn verhält)
class GPULogisticRegression(nn.Module):
    def __init__(self, input_dim):
        super(GPULogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

    # Hilfsfunktion, um sklearn-API nachzubauen
    def fit(self, X, y, epochs=100, lr=0.01, batch_size=4096):
        self.to(device)
        self.train()
        
        # Daten zu Tensoren konvertieren
        # .values nutzen, falls es Pandas Dataframes sind
        X_np = X.values if hasattr(X, 'values') else X
        y_np = y.values if hasattr(y, 'values') else y
        
        X_tensor = torch.tensor(X_np, dtype=torch.float32).to(device)
        y_tensor = torch.tensor(y_np, dtype=torch.float32).view(-1, 1).to(device)
        
        criterion = nn.BCELoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        
        # Training Loop (Mini-Batch)
        num_samples = X_tensor.shape[0]
        num_batches = int(np.ceil(num_samples / batch_size))
        
        print(f"Starte Training für {epochs} Epochen...")
        for epoch in range(epochs):
            # Shuffle indices
            indices = torch.randperm(num_samples, device=device)
            
            for i in range(num_batches):
                start = i * batch_size
                end = min(start + batch_size, num_samples)
                batch_idx = indices[start:end]
                
                optimizer.zero_grad()
                outputs = self.forward(X_tensor[batch_idx])
                loss = criterion(outputs, y_tensor[batch_idx])
                loss.backward()
                optimizer.step()
                
    def predict(self, X):
        self.eval()
        with torch.no_grad():
            probs = self.predict_proba(X)[:, 1]
            return (probs >= 0.5).astype(int)

    def predict_proba(self, X):
        self.eval()
        X_np = X.values if hasattr(X, 'values') else X
        X_tensor = torch.tensor(X_np, dtype=torch.float32).to(device)
        
        with torch.no_grad():
            outputs = self.forward(X_tensor)
            probs = outputs.cpu().numpy().flatten()
            
        # Formatieren wie sklearn: [[prob_0, prob_1], ...]
        return np.vstack(((1 - probs), probs)).T

# 3. Modell initialisieren und trainieren
input_dim = url_train_x.shape[1]
log_reg = GPULogisticRegression(input_dim)

# Hyperparameter anpassen falls nötig (batch_size erhöht für GPU-Effizienz)
log_reg.fit(url_train_x, url_train_y, epochs=500, lr=0.001, batch_size=16384)

monitor.end_measurement(task_name="Training")

Training läuft auf: cuda
Starte Training für 500 Epochen...
--- Ergebnisse Logistic Regression PhiUSIIL (Training) ---
Zeit: 6.8372s | GPU-Last: 30.7%
VRAM (System): 2109.67 MB | VRAM (Torch): 63.42 MB


{'model': 'Logistic Regression PhiUSIIL',
 'task': 'Training',
 'time_sec': 6.8372,
 'ram_mb': 1362.38,
 'vram_mb': 2109.67,
 'torch_vram_mb': 63.42,
 'cpu_percent': 120.4,
 'gpu_util_percent': 30.7}

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# --- 5. INFERENZ & EVALUIERUNG ---
print("Starte Inferenz (gesamtes Testset)...")
monitor.start_measurement()

# 1. Vorhersage (Inferenz)
# Deine Klasse kümmert sich intern um .to(device) und .cpu()
# predict_proba gibt [[prob_0, prob_1], ...] zurück, wir brauchen Spalte 1
y_scores = log_reg.predict_proba(url_test_x)[:, 1]

# 2. Ground Truth (Echte Labels)
# Sicherstellen, dass es ein Numpy Array ist
y_true = url_test_y.values if hasattr(url_test_y, 'values') else url_test_y

# 3. Binäre Vorhersagen (Threshold 0.5)
y_pred_binary = (y_scores > 0.5).astype(int)

# --- METRIKEN BERECHNEN ---
# 1. Accuracy
acc = accuracy_score(y_true, y_pred_binary)
# 2. Precision
prec = precision_score(y_true, y_pred_binary, zero_division=0)
# 3. Recall
rec = recall_score(y_true, y_pred_binary, zero_division=0)
# 4. F1 Score
f1 = f1_score(y_true, y_pred_binary, zero_division=0)
# 5. AUC
auc = roc_auc_score(y_true, y_scores)

# 6. False Positive Rate (FPR)
# Confusion Matrix: tn, fp, fn, tp
tn, fp, fn, tp = confusion_matrix(y_true, y_pred_binary).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# Ergebnisse zusammenpacken
metrics_dict = {
    "accuracy": round(acc, 4),
    "precision": round(prec, 4),
    "recall": round(rec, 4),
    "f1_score": round(f1, 4),
    "auc": round(auc, 4),
    "fpr": round(fpr, 4)
}

# An Monitor übergeben
monitor.end_measurement(task_name="Inferenz", extra_metrics=metrics_dict)

Starte Inferenz (gesamtes Testset)...
--- Ergebnisse Logistic Regression PhiUSIIL (Inferenz) ---
Zeit: 0.1011s | GPU-Last: 31.0%
VRAM (System): 1639.8 MB | VRAM (Torch): 21.0 MB


{'model': 'Logistic Regression PhiUSIIL',
 'task': 'Inferenz',
 'time_sec': 0.1011,
 'ram_mb': 1578.99,
 'vram_mb': 1639.8,
 'torch_vram_mb': 21.0,
 'cpu_percent': 724.4,
 'gpu_util_percent': 31.0,
 'accuracy': 0.9992,
 'precision': 0.999,
 'recall': 0.9995,
 'f1_score': 0.9993,
 'auc': 0.9999,
 'fpr': np.float64(0.0013)}