In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd # Pandas muss importiert sein!
from benchmark import PerformanceMonitor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# --- 1. FUNKTIONS-DEFINITION (Muss oben stehen!) ---
def load_data_from_csv(csv_file_path, url_col='URL', label_col='label'):
    """
    Load URLs and their labels from a CSV file.

    Rows with a missing URL or label are dropped. A numeric label column is
    returned unchanged. A text label column is matched case-insensitively,
    ignoring surrounding whitespace, against a fixed mapping
    (phishing/bad/malicious -> 1, benign/good/legitimate -> 0); rows whose
    label is not in that mapping are discarded with a printed warning.

    Args:
        csv_file_path: Path of the CSV file to read.
        url_col: Name of the column that holds the URLs.
        label_col: Name of the column that holds the labels.

    Returns:
        A tuple ``(urls, labels)`` where ``urls`` is a list of strings and
        ``labels`` is a NumPy array of the same length. If the file cannot be
        read or a required column is missing, the problem is printed and
        ``([], np.array([]))`` is returned instead of raising.
    """
    print(f"Lade Daten aus: {csv_file_path}...")

    try:
        df = pd.read_csv(csv_file_path)

        # Both columns are required; report what is available to ease fixing the names.
        if url_col not in df.columns or label_col not in df.columns:
            print(f"FEHLER: Spalten '{url_col}' oder '{label_col}' nicht in CSV gefunden.")
            print(f"Vorhandene Spalten: {df.columns.tolist()}")
            return [], np.array([])

        # Drop rows that lack either a URL or a label.
        df = df.dropna(subset=[url_col, label_col])

        if pd.api.types.is_numeric_dtype(df[label_col]):
            labels = df[label_col].values
        else:
            print("Wandle Text-Labels in Zahlen um...")
            label_mapping = {'phishing': 1, 'bad': 1, 'malicious': 1,
                             'benign': 0, 'good': 0, 'legitimate': 0}
            # Normalise before the lookup so that e.g. "Phishing" or " benign "
            # are recognised instead of being discarded as unknown.
            normalized = df[label_col].astype(str).str.strip().str.lower()
            mapped = normalized.map(label_mapping)  # unknown labels become NaN

            valid = mapped.notna()
            unmapped_count = int((~valid).sum())
            if unmapped_count:
                print(f"Warnung: {unmapped_count} Labels konnten nicht zugeordnet werden und werden entfernt.")
                # Filter frame and labels with the same mask so URLs and labels stay aligned.
                df = df[valid]
                mapped = mapped[valid]

            labels = mapped.values.astype(int)

        urls = df[url_col].astype(str).tolist()

        print(f"Erfolgreich geladen: {len(urls)} URLs.")
        return urls, np.array(labels)

    except Exception as e:
        # Deliberate best effort: report the error and hand back empty results
        # so the caller can decide how to proceed.
        print(f"Fehler beim Laden der CSV: {e}")
        return [], np.array([])

# --- 2. CONFIGURATION & MAIN PROGRAM ---
MAX_LEN = 2000
BATCH_SIZE = 64
EPOCHS = 3
RANDOM_SEED = 42

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable from one "Restart & Run All" to the next.
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Laufe auf: {device}")

# Relative path written with forward slashes so that it resolves on Windows,
# Linux and macOS alike.
CSV_PATH = "../data/processed/PhiUSIIL_Phishing_URL_Dataset.csv"

# The loader is defined above, so it can be called here.
raw_urls, labels = load_data_from_csv(CSV_PATH)

# Fail fast when nothing was loaded: every later cell needs X_train/X_test,
# and stopping here gives a clear message instead of a NameError further down.
if len(raw_urls) == 0:
    raise RuntimeError("ACHTUNG: Keine Daten geladen. Bitte Dateipfad und Spaltennamen prüfen!")

# Split into train and test sets (80 % / 20 %).
X_train, X_test, y_train, y_test = train_test_split(
    raw_urls, labels, test_size=0.2, random_state=RANDOM_SEED
)
print(f"Training on {len(X_train)} URLs, Testing on {len(X_test)} URLs.")

In [None]:
# --- 2. VECTORIZATION (manual, for PyTorch) ---
# Character-level vocabulary in the spirit of Keras TextVectorization.
# Only the first 1000 training URLs are scanned to keep the build quick.
vocab_sample = "".join(X_train[:1000])
chars = sorted(set(vocab_sample))
# Ids 0 and 1 are reserved for padding and unknown characters, so real characters start at 2.
char_to_int = dict(zip(chars, range(2, len(chars) + 2)))
vocab_size = 2 + len(char_to_int)

def encode_urls(urls, max_len=MAX_LEN):
    """
    Turn URLs into a fixed-width matrix of character ids.

    Each character is looked up in the module-level ``char_to_int`` table;
    characters missing from it get id 1 (unknown). URLs longer than
    ``max_len`` are truncated, shorter ones are right-padded with 0.

    Args:
        urls: Iterable of URL strings.
        max_len: Number of columns of the result.

    Returns:
        ``np.ndarray`` of dtype int64 with shape ``(len(urls), max_len)``.
        The result is 2-D even when ``urls`` is empty.
    """
    urls = list(urls)  # allow any iterable while still knowing the row count
    # Fill a pre-allocated, zero-initialised (= already padded) matrix row by row
    # instead of building a list of padded lists first; this avoids keeping a
    # second full copy of the data in memory.
    encoded = np.zeros((len(urls), max_len), dtype=np.int64)
    for row, url in enumerate(urls):
        # Slice first so characters beyond max_len are never encoded.
        ids = [char_to_int.get(c, 1) for c in url[:max_len]]  # 1 = unknown
        if ids:
            encoded[row, :len(ids)] = ids
    return encoded

print("Vektorisiere Daten (das dauert kurz)...")

# Inputs: integer-encoded URLs as long tensors.
X_train_enc = torch.tensor(encode_urls(X_train), dtype=torch.long)
X_test_enc = torch.tensor(encode_urls(X_test), dtype=torch.long)

# Targets: float tensors with an extra axis inserted at position 1.
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)[:, None]
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)[:, None]

# Wrap the tensors in DataLoaders for batching; only the training data is shuffled.
train_dataset = TensorDataset(X_train_enc, y_train_tensor)
test_dataset = TensorDataset(X_test_enc, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# --- 3. DAS CNN MODELL (PyTorch Version) ---
class CNNModel(nn.Module):
    """
    Character-level 1-D CNN producing one sigmoid score per input sequence.

    Pipeline: embedding -> Conv1d(k=5) + ReLU + max-pool(2)
    -> Conv1d(k=3) + ReLU + global max-pool -> Linear(64, 64) + ReLU + dropout
    -> Linear(64, 1) + sigmoid.

    Args:
        vocab_size: Number of distinct token ids (rows of the embedding table).
        embed_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim=32):
        super().__init__()
        # Id 0 is the padding token (padding_idx=0).
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim, padding_idx=0
        )

        # First convolution stage
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=5)
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool1d(2)

        # Second convolution stage, followed by global max pooling
        self.conv2 = nn.Conv1d(128, 64, kernel_size=3)
        self.global_pool = nn.AdaptiveMaxPool1d(output_size=1)

        # Dense head
        self.fc1 = nn.Linear(in_features=64, out_features=64)
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=64, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map token ids of shape [batch, seq_len] to scores of shape [batch, 1]."""
        embedded = self.embedding(x)             # [batch, seq_len, embed]

        # Conv1d expects [batch, channels, seq_len], so swap the last two axes.
        features = embedded.transpose(1, 2)

        features = self.conv1(features)
        features = self.relu(features)
        features = self.pool1(features)

        features = self.conv2(features)
        features = self.relu(features)
        pooled = self.global_pool(features)      # [batch, 64, 1]

        flat = pooled.squeeze(-1)                # [batch, 64]

        hidden = self.fc1(flat)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.sigmoid(self.fc2(hidden))

# Build the network and move it to the selected device (GPU when available).
model = CNNModel(vocab_size=vocab_size)
model = model.to(device)

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam (default settings).
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

# Measurement helper imported from the project's benchmark module.
monitor = PerformanceMonitor("CNN PhiUSIIL")

In [None]:

# --- 4. TRAINING ---
print("Starte Training...")
monitor.start_measurement()

model.train()  # training mode, so dropout is active
for epoch_number in range(1, EPOCHS + 1):
    for batch in train_loader:
        # Move inputs and targets to the same device as the model.
        inputs, targets = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()
    print(f"Epoch {epoch_number} fertig.")

monitor.end_measurement(task_name="Training")

In [None]:
# --- 5. INFERENCE & EVALUATION ---
print("Starte Inferenz (gesamtes Testset)...")
monitor.start_measurement()

model.eval()  # evaluation mode, so dropout is disabled
all_preds, all_labels = [], []

with torch.no_grad():
    for inputs, targets in test_loader:
        # Forward pass on the model's device
        probabilities = model(inputs.to(device))

        # Collect per-sample results on the CPU as NumPy rows (needed for sklearn)
        all_preds.extend(probabilities.cpu().numpy())
        all_labels.extend(targets.numpy())

# NOTE(review): the intent stated here was to stop the clock before computing the
# metrics, so that only model inference plus data transfer is timed. As written,
# monitor.end_measurement() is called only after the metrics are computed, so their
# computation is included in the measured interval. Confirm against the
# PerformanceMonitor API whether the timer can be stopped earlier.

# --- COMPUTE METRICS ---
# Flatten the collected per-sample rows into 1-D arrays, the shape sklearn expects.
y_true = np.asarray(all_labels).ravel().astype(int)
y_scores = np.asarray(all_preds).ravel()      # probabilities (sigmoid output)
y_pred_binary = (y_scores > 0.5).astype(int)  # hard predictions (0 or 1)

# 1. Accuracy
acc = accuracy_score(y_true, y_pred_binary)
# 2. Precision
prec = precision_score(y_true, y_pred_binary, zero_division=0)
# 3. Recall
rec = recall_score(y_true, y_pred_binary, zero_division=0)
# 4. F1 score
f1 = f1_score(y_true, y_pred_binary, zero_division=0)
# 5. AUC - undefined when only one class occurs in y_true, so report NaN in that case.
auc = roc_auc_score(y_true, y_scores) if np.unique(y_true).size > 1 else float("nan")

# 6. False positive rate (FPR)
# labels=[0, 1] forces a 2x2 matrix even if one class is absent from both arrays,
# so unpacking into tn, fp, fn, tp always works.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred_binary, labels=[0, 1]).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# Bundle the results
metrics = {
    "accuracy": round(acc, 4),
    "precision": round(prec, 4),
    "recall": round(rec, 4),
    "f1_score": round(f1, 4),
    "auc": round(auc, 4),
    "fpr": round(fpr, 4)
}

# Hand the metrics to the monitor and close the measurement
monitor.end_measurement(task_name="Inferenz", extra_metrics=metrics)