In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from benchmark import PerformanceMonitor
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC as GPU_SVC 
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from benchmark import PerformanceMonitor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn import metrics as sklearn_metrics 

def ClassPrintMetrics(y_test, y_pred):
    """Print scikit-learn's per-class classification report.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The report text is written to stdout.
    """
    report_text = classification_report(y_test, y_pred)
    print(report_text)

In [None]:
# Load the processed PhiUSIIL phishing-URL dataset.
# Forward slashes keep this relative path valid on Windows, Linux and macOS;
# a backslash-separated path is only resolved as a directory path on Windows.
df_phiusiil = pd.read_csv("../data/processed/PhiUSIIL_Phishing_URL_Dataset.csv")

# Build the modelling frame with exactly two columns: 'text' and 'label'.
df = pd.DataFrame()
# Collapse every run of non-word characters into a single space so that the
# URL components become whitespace-separated tokens for the vectorizer.
df['text'] = df_phiusiil['URL'].apply(lambda x: re.sub(r'\W+', ' ', str(x)))

# Swap the 0/1 encoding of the source column so that the resulting labels
# match the legend printed below (1 = phishing, 0 = safe).
df['label'] = df_phiusiil['label'].map({0: 1, 1: 0})

# Series.map() yields NaN for any value that is not a key of the mapping.
# Fail here with a clear message instead of a cryptic integer-cast error later.
unmapped_mask = df['label'].isna()
if unmapped_mask.any():
    raise ValueError(
        f"{int(unmapped_mask.sum())} rows have a 'label' value other than 0 or 1 "
        "and cannot be mapped."
    )

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.1,
    random_state=42
)

print(f"Dataset erfolgreich geladen. Zeilen: {len(df)}")
print("Verteilung (1=Phishing, 0=Safe):")
print(y_train.value_counts())

In [None]:
# Cast the labels to plain integers for the estimators and metric functions.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Learn the TF-IDF vocabulary on the training split only, then reuse it
# unchanged for the test split so no test information leaks into the features.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# dual=False is recommended when n_samples > n_features.
base_model = LinearSVC(dual=False)
# Wrapping the linear SVM in CalibratedClassifierCV provides predict_proba.
# Despite the variable name, this is the regular (CPU) scikit-learn estimator.
svm_gpu = CalibratedClassifierCV(base_model)
# The model is deliberately NOT fitted in this cell: fitting happens exactly
# once, inside the measured training section, so the expensive training run is
# neither duplicated nor executed outside the performance measurement.

In [None]:
# 1. Initialise the performance monitor for this model/dataset combination.
monitor = PerformanceMonitor("Linear SVC PhiUSIIL")

# --- TRAINING ---
print("Starte Training...")
monitor.start_measurement()

svm_gpu.fit(X_train_vec, y_train)

# Close the measurement directly after fit() so the "Training" record covers
# only model fitting, not the prediction and report printing that follow.
monitor.end_measurement(task_name="Training")

# Quick sanity check of the freshly trained model on the held-out split.
y_pred = svm_gpu.predict(X_test_vec)
ClassPrintMetrics(y_test, y_pred)

In [None]:
# --- 5. INFERENCE & EVALUATION ---
print("Starte Inferenz (gesamtes Testset)...")

# Start the measurement.
# NOTE(review): the measurement is only closed after the metrics have been
# computed, because end_measurement() receives them via extra_metrics. The
# recorded "Inferenz" duration therefore also contains the metric computation;
# confirm whether PerformanceMonitor allows attaching metrics separately.
monitor.start_measurement()

# Predict on the whole test matrix at once.
# X_test_vec was already produced by vectorizer.transform(X_test).
y_pred_binary = svm_gpu.predict(X_test_vec)

# AUC needs continuous confidence scores rather than hard labels.
# Estimators that expose decision_function (e.g. a bare LinearSVC) provide the
# signed distance to the separating hyperplane; otherwise (e.g. a
# CalibratedClassifierCV wrapper) use the predicted probability of class 1.
if hasattr(svm_gpu, "decision_function"):
    y_scores = svm_gpu.decision_function(X_test_vec)
else:
    y_scores = svm_gpu.predict_proba(X_test_vec)[:, 1]

# --- COMPUTE METRICS ---
y_true = y_test.astype(int).values  # make sure this is a NumPy array

# 1. Accuracy
acc = accuracy_score(y_true, y_pred_binary)
# 2. Precision
prec = precision_score(y_true, y_pred_binary, zero_division=0)
# 3. Recall
rec = recall_score(y_true, y_pred_binary, zero_division=0)
# 4. F1 score
f1 = f1_score(y_true, y_pred_binary, zero_division=0)
# 5. AUC
auc = roc_auc_score(y_true, y_scores)

# 6. False positive rate (FPR)
# labels=[0, 1] forces a 2x2 matrix even when only one class occurs in the
# labels/predictions; without it ravel() can return a single value and the
# four-way unpacking below raises ValueError.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred_binary, labels=[0, 1]).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

# Bundle the results
results = {
    "accuracy": round(acc, 4),
    "precision": round(prec, 4),
    "recall": round(rec, 4),
    "f1_score": round(f1, 4),
    "auc": round(auc, 4),
    "fpr": round(fpr, 4)
}

# Hand the metrics to the monitor and close the measurement
monitor.end_measurement(task_name="Inferenz", extra_metrics=results)

# Print the metrics
print("\n--- Ergebnisse ---")
for key, value in results.items():
    print(f"{key.upper()}: {value}")