# Preparation

## 1. Existing Data Files Check

In [None]:
import helper
from helper import clear_folder
import os

# Run clear_folder on every output directory used by this notebook so that
# results from an earlier run are not mixed with new ones.
# NOTE(review): clear_folder comes from the local `helper` module; presumably
# it empties the given directory -- confirm against helper.py.
for output_folder in ("ray_results", "outputs", "results"):
    clear_folder(output_folder)

## 2. Data Preparation

In [None]:
import pandas as pd
import os

# Normalized input files, one CSV per dataset
filenames = [
    'data/normalized/A2.csv',
    'data/normalized/A3.csv',
    'data/normalized/A4.csv',
    'data/normalized/A12.csv',
    'data/normalized/A21.csv',
]

# The feature columns are named "0" .. "29"; build the list once
feature_columns = [str(i) for i in range(30)]

# Read every CSV into its own DataFrame
datasets = [pd.read_csv(path) for path in filenames]

# Per dataset: the 30 feature columns and the "track" target column
features_list = [frame[feature_columns] for frame in datasets]
targets_list = [frame["track"] for frame in datasets]

# Dataset name = file name without directory and extension (e.g. "A2")
dataset_names = [os.path.splitext(os.path.basename(path))[0] for path in filenames]


# Autoencoder SVM
## 3. Definition of the Autoencoder Class

In [None]:
import torch
import torch.nn as nn

# Flexible autoencoder class
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder and decoder.

    ``layer_sizes`` lists the layer widths from the input down to the
    bottleneck, e.g. ``[30, 16, 4]``. The encoder applies linear layers
    through these widths in order; the decoder uses the same widths in
    reverse. Every linear layer is followed by a ReLU except the last one of
    each half, so neither the bottleneck code nor the reconstruction is
    passed through an activation.
    """

    def __init__(self, layer_sizes):
        super().__init__()
        widths = list(layer_sizes)
        # Encoder is created first, then the decoder, so parameter
        # registration (and random initialisation) order is fixed.
        self.encoder = self._linear_stack(widths)
        self.decoder = self._linear_stack(widths[::-1])

    @staticmethod
    def _linear_stack(widths):
        """Return Linear/ReLU layers chaining consecutive ``widths``."""
        modules = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            modules.extend((nn.Linear(fan_in, fan_out), nn.ReLU()))
        # Drop the trailing ReLU so the final layer of the stack stays linear
        modules.pop()
        return nn.Sequential(*modules)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))

## 4. Definition of the Optuna Training Function

In [None]:
import optuna
import json
import os
from torch.utils.data import DataLoader, TensorDataset

def optuna_train_autoencoder(trial, dataset_idx, dataset_name):
    """Train one autoencoder for an Optuna trial and return its training loss.

    The autoencoder is trained on every dataset in ``features_list`` except
    the one at ``dataset_idx`` (leave-one-dataset-out). The model weights and
    a JSON description of the configuration are written to ``results/`` only
    when this trial improves on the best value recorded so far in the trial's
    study. The files on disk therefore always belong to the best trial rather
    than to whichever trial happened to run last.

    Args:
        trial: Optuna trial used to sample ``hidden_dim``, ``bottleneck_dim``
            and ``lr``.
        dataset_idx: Index into ``features_list`` of the held-out dataset
            (e.g. 0 for A2).
        dataset_name: Name of the held-out dataset; used in the output file
            names.

    Returns:
        Mean per-batch MSE reconstruction loss of the final training epoch.
    """
    # Search space: hidden width 10..30; the bottleneck ranges from 2 to
    # hidden_dim - 1 so it is always strictly smaller than the hidden layer;
    # the learning rate is sampled log-uniformly between 1e-4 and 1e-2.
    hidden_dim = trial.suggest_int("hidden_dim", 10, 30)
    bottleneck_dim = trial.suggest_int("bottleneck_dim", 2, hidden_dim - 1)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)

    # Layer widths from the 30 input features down to the bottleneck
    layer_sizes = [30, hidden_dim, bottleneck_dim]

    # Leave-one-out (dataset): stack the rows of all datasets except the
    # held-out one.
    train_features = pd.concat(
        [feats for j, feats in enumerate(features_list) if j != dataset_idx],
        axis=0,
    )

    batch_size = 64
    X_tensor = torch.tensor(train_features.values, dtype=torch.float32)
    loader = DataLoader(TensorDataset(X_tensor), batch_size=batch_size, shuffle=True)

    # Initialize model
    model = Autoencoder(layer_sizes)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # Training
    model.train()
    epochs = 50
    for _ in range(epochs):
        total_loss = 0
        for batch in loader:
            x = batch[0]
            out = model(x)
            loss = criterion(out, x)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(loader)

    # Best objective value among the trials completed before this one.
    # Optuna raises ValueError while no trial has completed yet; a trial
    # object without a ``study`` attribute is treated the same way.
    study = getattr(trial, "study", None)
    try:
        best_so_far = study.best_value if study is not None else float("inf")
    except ValueError:
        best_so_far = float("inf")

    # Persist only an improving trial. A NaN loss compares False and is
    # therefore never written over a valid model.
    if avg_loss < best_so_far:
        os.makedirs("results", exist_ok=True)
        model_path = f"results/autoencoder_leave_out_{dataset_name}.pt"
        torch.save(model.state_dict(), model_path)

        # Save the model configuration and training parameters
        config = {
            "layer_sizes": layer_sizes,
            "num_layers": len(layer_sizes),
            "activation_function": "ReLU",
            "loss_function": "MSELoss",
            "optimizer": "Adam",
            "learning_rate": lr,
            "batch_size": batch_size,
            "epochs": epochs
        }
        config_path = f"results/autoencoder_config_{dataset_name}.json"
        with open(config_path, "w") as f:
            json.dump(config, f, indent=4)

    # Return loss to Optuna
    return avg_loss


## 5. Optuna Hyperparameter Search

In [None]:
import optuna
import json

# Best hyperparameters found per held-out dataset, keyed by dataset name
best_params_all = {}

for i, name in enumerate(dataset_names):
    print(f"\n🔍 Optuna-Tuning for: {name}")

    # One study per held-out dataset, minimising the returned loss.
    # The dataset index and name are bound as lambda defaults so the
    # objective only takes the trial.
    study = optuna.create_study(direction="minimize")
    study.optimize(
        lambda trial, idx=i, held_out=name: optuna_train_autoencoder(
            trial, dataset_idx=idx, dataset_name=held_out
        ),
        n_trials=50,
    )

    # Keep the winning parameter set for this dataset
    best = study.best_params
    best_params_all[name] = best
    print(f"  Best parameters for {name}: {best}")

# Write all best parameter sets to one JSON file
os.makedirs("results", exist_ok=True)
with open("results/ae_best_params.json", "w") as f:
    json.dump(best_params_all, f, indent=4)


## 6. SVM

SVM hyperparameter configuration per dataset

Dataset A2 (2 classes)
- **Hyperparameters:** `C = 1`, `kernel = rbf`, `gamma = 1`

Dataset A3 (2 classes)
- **Hyperparameters:** `C = 1`, `kernel = rbf`, `gamma = 1`

Dataset A4 (3 classes)
- **Hyperparameters:** `C = 10`, `kernel = rbf`, `gamma = scale`

Dataset A12 (6 classes)
- **Hyperparameters:** `C = 10`, `kernel = rbf`, `gamma = scale`

Dataset A21 (2 classes)
- **Hyperparameters:** `C = 100`, `kernel = rbf`, `gamma = auto`

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import json

# SVC hyperparameters, keyed by dataset name
svm_params = {
    "A2":  {"C": 1,   "kernel": "rbf", "gamma": 1},
    "A3":  {"C": 1,   "kernel": "rbf", "gamma": 1},
    "A4":  {"C": 10,  "kernel": "rbf", "gamma": "scale"},
    "A12": {"C": 10,  "kernel": "rbf", "gamma": "scale"},
    "A21": {"C": 100, "kernel": "rbf", "gamma": "auto"},
}

# Stratified 5-fold splitter (shared by all datasets) and result containers
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_metrics = []      # one formatted "mean ± std" row per dataset
fold_results = {}     # dataset name -> list of per-fold accuracies


def _mean_std(values):
    """Format a list of fold scores as 'mean ± std' with three decimals."""
    return f"{np.mean(values):.3f} ± {np.std(values):.3f}"


for name, features, targets in zip(dataset_names, features_list, targets_list):
    print(f"\nTraining SVM for: {name}")

    # Rebuild the autoencoder with the layer widths stored next to its weights
    with open(f"results/autoencoder_config_{name}.json", "r") as f:
        layer_sizes = json.load(f)["layer_sizes"]

    autoencoder = Autoencoder(layer_sizes)
    autoencoder.load_state_dict(torch.load(f"results/autoencoder_leave_out_{name}.pt"))
    autoencoder.eval()

    # Project the features through the encoder half only (no gradients needed)
    with torch.no_grad():
        encoded = autoencoder.encoder(
            torch.tensor(features.values, dtype=torch.float32)
        ).numpy()

    # Map the target labels to integer class ids
    le = LabelEncoder()
    y = le.fit_transform(targets)

    # The parameter dict holds exactly the C / kernel / gamma keyword arguments
    svm = SVC(**svm_params[name])

    # Per-fold scores; insertion order defines the column order of the table
    scores = {"acc": [], "f1_micro": [], "f1_macro": [], "mcc": []}

    for train_idx, test_idx in cv.split(encoded, y):
        svm.fit(encoded[train_idx], y[train_idx])
        y_true = y[test_idx]
        y_pred = svm.predict(encoded[test_idx])

        scores["acc"].append(accuracy_score(y_true, y_pred))
        scores["f1_micro"].append(f1_score(y_true, y_pred, average='micro'))
        scores["f1_macro"].append(f1_score(y_true, y_pred, average='macro'))
        scores["mcc"].append(matthews_corrcoef(y_true, y_pred))

    all_metrics.append([name] + [_mean_std(vals) for vals in scores.values()])
    fold_results[name] = scores["acc"]

# Summary table: one row per dataset
results_df = pd.DataFrame(all_metrics, columns=["Dataset", "Accuracy", "F1 Micro", "F1 Macro", "MCC"])
results_df.to_csv("results/svm_evaluation_metrics.csv", index=False)

# Raw per-fold accuracies: one column per dataset
fold_df = pd.DataFrame({ds: pd.Series(accs) for ds, accs in fold_results.items()})
fold_df.to_csv("results/svm_cv_fold_accuracies.csv", index=False)


# Visualise Results
## Plot

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load the per-fold accuracies (one column per dataset)
fold_df = pd.read_csv("results/svm_cv_fold_accuracies.csv")

# Mean and standard deviation per dataset for the annotation under each box.
# ddof=0 (population SD) is used so the numbers agree with the np.std()
# values written to results/svm_evaluation_metrics.csv; pandas defaults to
# ddof=1, which would print a different "± SD" for the same folds.
means = fold_df.mean()
stds = fold_df.std(ddof=0)
labels = fold_df.columns
table_data = [f"{m:.3f} ± {s:.3f}" for m, s in zip(means, stds)]

# Create boxplot: one box per dataset, built from its fold accuracies
plt.figure(figsize=(10, 6))
plt.boxplot(
    [fold_df[col].dropna() for col in labels],
    tick_labels=labels,
    showmeans=False
)

plt.ylim(0, 1)
plt.ylabel("Accuracy")
plt.title("SVM 5-Fold CV Accuracy per Dataset")

# Add mean ± std below each box (boxes sit at x = 1, 2, ...; y is in data
# coordinates, so a negative y places the text below the axis)
for position, text in enumerate(table_data, start=1):
    plt.text(
        x=position, y=-0.08, s=text,
        ha='center', va='top', fontsize=9, rotation=0, color='black'
    )

# Add explanation, centred under all boxes
plt.text(
    x=len(labels)/2 + 0.5, y=-0.15,
    s="Mean ± SD of 5-Fold CV Accuracy",
    ha='center', va='top', fontsize=10, color='gray'
)

plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# Reserve room at the bottom for the annotations placed below the axis
plt.subplots_adjust(bottom=0.3)
plt.savefig("results/svm_cv_accuracy_boxplot.png", dpi=300)

# Show plot
plt.show()


## Table

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Read the summary metrics written by the SVM evaluation step
results_df = pd.read_csv("results/svm_evaluation_metrics.csv")

# Figure height grows with the number of table rows
row_count = len(results_df)
fig, ax = plt.subplots(figsize=(10, row_count * 0.6 + 1))
ax.axis('off')  # only the table should be visible, no axes

# Render the DataFrame as a centred table with its column names as header
table = ax.table(
    cellText=results_df.values,
    colLabels=results_df.columns,
    cellLoc='center',
    loc='center',
)

# Fixed font size and taller rows for readability
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.5)

# Write the image to disk, then display it
fig.tight_layout()
fig.savefig("results/svm_evaluation_results.png", dpi=300)
plt.show()
