In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix


In [9]:
df_vpn = pd.read_csv(r"C:\Users\berkb\Desktop\Uni\Bachelor\Projekt\Scenario A1-ARFF\Scenario A1-ARFF\VPN-WS.csv", sep=";")
df_nonvpn = pd.read_csv("C:\\Users\\berkb\\Desktop\\Uni\\Bachelor\\Projekt\\Scenario A1-ARFF\\Scenario A1-ARFF\\Non-VPN-WS.csv", sep=";")

In [15]:
df_combined = pd.concat([df_vpn, df_nonvpn], ignore_index=True)
df_combined.to_csv("C:\\Users\\berkb\\Desktop\\Uni\\Bachelor\\Projekt\\Scenario A1-ARFF\\Scenario A1-ARFF\\Combined-VPN-WS.csv", index=False)


In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

##############################
#         1) DATENSET        #
##############################

# Pfad und CSV-Einstellungen
CSV_PATH = r"C:\Users\berkb\Desktop\Uni\Bachelor\Projekt\Scenario A1-ARFF\Scenario A1-ARFF\Combined-VPN-WS.csv"

df_combined = pd.read_csv(
    CSV_PATH,     # Pfad zu deiner CSV
    sep=";",      # ODER sep=",", je nachdem, wie dein CSV formatiert ist
    engine="python"
)

# Komma->Punkt ersetzen
df_combined = df_combined.applymap(lambda x: str(x).replace(",", ".") if isinstance(x, str) else x)
df_combined = df_combined.apply(pd.to_numeric, errors='coerce')

# Entferne unbrauchbare Spalten
unwanted_columns = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp', 'Label.']
df_combined.drop(columns=[col for col in unwanted_columns if col in df_combined.columns], errors='ignore', inplace=True)

# Spalte "Label" => 0 / 1
y = df_combined["Label"].astype(int).values
df_features = df_combined.drop(columns=["Label"])


#   2) VORVERARBEITUNG       #


df_features.replace([np.inf, -np.inf], np.nan, inplace=True)
df_features.fillna(df_features.median(), inplace=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_features)

# (X, y) sind nun unsere Features + Labels
X = X_scaled


#  3) TRAIN/TEST-SPLIT       #


X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# Nur Non-VPN (Label=0) zum Training
X_train_norm = X_train[y_train == 0]


# 4) AUTOENCODER DEFINIEREN  #


input_dim = X_train.shape[1]
input_layer = keras.Input(shape=(input_dim,))
x = layers.Dense(128, activation='elu')(input_layer)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(96, activation='elu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dense(48, activation='elu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dense(16, activation='elu')(x)  # Bottleneck
x = layers.BatchNormalization()(x)
x = layers.Dense(48, activation='elu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(96, activation='elu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dense(128, activation='elu')(x)
output_layer = layers.Dense(input_dim, activation='linear')(x)

autoencoder = keras.Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(
    optimizer=keras.optimizers.Adamax(learning_rate=0.001),
    loss=tf.keras.losses.Huber(delta=1.0)
)

# EarlyStopping und Learning Rate Reduktion
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=5, min_lr=1e-6)


#        5) TRAINING         #


autoencoder.fit(
    X_train_norm, X_train_norm,
    epochs=100,
    batch_size=32,
    shuffle=True,
    validation_split=0.1,
    verbose=1,
    callbacks=[early_stopping, reduce_lr]
)


#    6) MSE BERECHNEN        #


# a) MSE auf Trainings-Norm
X_train_pred = autoencoder.predict(X_train_norm, verbose=0)
mse_train = np.mean(np.power(X_train_norm - X_train_pred, 2), axis=1)

# b) MSE auf Testset
X_test_pred = autoencoder.predict(X_test, verbose=0)
mse_test = np.mean(np.power(X_test - X_test_pred, 2), axis=1)


# 7) YOUDEN'S INDEX FUNKTION #


def find_best_threshold_youden(y_true, mse_values, steps=100):

    min_mse, max_mse = mse_values.min(), mse_values.max()
    best_thr = None
    best_j = -999

    for thr in np.linspace(min_mse, max_mse, steps):
        y_pred = (mse_values > thr).astype(int)

        cm = confusion_matrix(y_true, y_pred)
        if cm.shape != (2,2):
            continue  # z.B. wenn wir nur 1 Klasse?
        tn, fp, fn, tp = cm.ravel()

        # TPR (Recall) = TP / (TP+FN)
        tpr = tp / (tp+fn) if (tp+fn)>0 else 0
        # FPR = FP / (FP+TN)
        fpr = fp / (fp+tn) if (fp+tn)>0 else 0

        # Youden's index = TPR - FPR
        J = tpr - fpr
        if J > best_j:
            best_j = J
            best_thr = thr

    return best_thr, best_j


#   8) YOUDEN INDEX TEST     #


best_threshold, best_j = find_best_threshold_youden(y_test, mse_test, steps=200)
print(f"\n Bester Threshold laut Youden's Index: {best_threshold:.6f}")
print(f"Youden's Index = {best_j:.4f}")

# Vorhersage mit diesem best_threshold
y_pred_youden = (mse_test > best_threshold).astype(int)

cm_youden = confusion_matrix(y_test, y_pred_youden)
report_youden = classification_report(y_test, y_pred_youden, target_names=["Non-VPN", "VPN"])

print("\n=== CONFUSION MATRIX (Youden) ===")
print(cm_youden)
print("\n=== CLASSIFICATION REPORT (Youden) ===")
print(report_youden)

##############################
#   9) OPTIONAL: VERGLEICH   #
##############################

# a) Mittels Perzentil (z.B. 50)
threshold_median = np.percentile(mse_train, 50)
y_pred_median = (mse_test > threshold_median).astype(int)

cm_median = confusion_matrix(y_test, y_pred_median)
report_median = classification_report(y_test, y_pred_median, target_names=["Non-VPN", "VPN"])

print("\n=== VERGLEICH: Perzentil (50) ===")
print("Threshold Median:", threshold_median)
print(cm_median)
print(report_median)


Epoch 1/100


  df_combined = df_combined.applymap(lambda x: str(x).replace(",", ".") if isinstance(x, str) else x)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7