In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.pipeline import Pipeline
import os

def load_data(filepath):
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Die Datei {filepath} wurde nicht gefunden.")
    
    raw_data, meta = arff.loadarff(filepath)
    df = pd.DataFrame(raw_data)
    df.replace([b'', ''], np.nan, inplace=True)
    for col in df.select_dtypes([object]):
        df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
    df['class1'] = df['class1'].astype(str)
    return df

# Datensatzpfad
filepath = r'Scenario A1-ARFF\TimeBasedFeatures-Dataset-15s-VPN.arff'

try:
    df = load_data(filepath)
except Exception as e:
    print(f"Fehler beim Laden der Datei: {e}")
    raise

# Feature-Auswahl
selected_features = [
    'duration', 'total_fiat', 'total_biat', 'min_fiat', 'max_fiat',
    'min_flowiat', 'max_flowiat', 'mean_flowiat', 'std_flowiat',
    'mean_active', 'mean_idle', 'std_active', 'std_idle',
    'flowBytesPerSecond', 'flowPktsPerSecond'
]
df_selected = df[selected_features]
labels = df['class1']

# Fehlende Werte auffüllen
df_selected = df_selected.fillna(df_selected.median())

# Daten skalieren und Pipeline erstellen
pipeline = Pipeline([
    ('scaler', StandardScaler())
])
df_scaled = pd.DataFrame(pipeline.fit_transform(df_selected), columns=selected_features)

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled, labels, test_size=0.2, random_state=42, stratify=labels
)
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

# Isolation Forest
print("\n Training Isolation Forest...")
iso_forest = IsolationForest(n_estimators=100, contamination=0.5, random_state=42)
iso_forest.fit(X_train)
y_pred_iso = iso_forest.predict(X_test)
y_pred_iso = np.where(y_pred_iso == 1, 0, 1)  # 1 = Anomalie (VPN), 0 = Normal (Non-VPN)
y_true = np.where(y_test_np == "VPN", 1, 0)

print("\n Ergebnisse für Isolation Forest:")
print("CONFUSION MATRIX:\n", confusion_matrix(y_true, y_pred_iso))
print("CLASSIFICATION REPORT:\n", classification_report(y_true, y_pred_iso, target_names=["Non-VPN", "VPN"]))

# One-Class SVM
print("\n Training One-Class SVM...")
one_class_svm = OneClassSVM(nu=0.1, kernel="rbf", gamma="scale")
one_class_svm.fit(X_train)
y_pred_svm = one_class_svm.predict(X_test)
y_pred_svm = np.where(y_pred_svm == 1, 0, 1) 

print("\n Ergebnisse für One-Class SVM:")
print("CONFUSION MATRIX:\n", confusion_matrix(y_true, y_pred_svm))
print("CLASSIFICATION REPORT:\n", classification_report(y_true, y_pred_svm, target_names=["Non-VPN", "VPN"]))



🔹 Training Isolation Forest...

🏆 Ergebnisse für Isolation Forest:
CONFUSION MATRIX:
 [[1024  769]
 [ 873 1086]]
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

     Non-VPN       0.54      0.57      0.56      1793
         VPN       0.59      0.55      0.57      1959

    accuracy                           0.56      3752
   macro avg       0.56      0.56      0.56      3752
weighted avg       0.56      0.56      0.56      3752


🔹 Training One-Class SVM...

🏆 Ergebnisse für One-Class SVM:
CONFUSION MATRIX:
 [[1619  174]
 [1674  285]]
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

     Non-VPN       0.49      0.90      0.64      1793
         VPN       0.62      0.15      0.24      1959

    accuracy                           0.51      3752
   macro avg       0.56      0.52      0.44      3752
weighted avg       0.56      0.51      0.43      3752

