## Existing Data Files Check

In [None]:
from helper import clear_folder

# Reset the folders this notebook writes to.
# NOTE(review): presumably clear_folder empties the given directory - confirm in helper.py.
for stale_dir in ("ray_results", "outputs", "results"):
    clear_folder(stale_dir)

# Repeated stratified nested Cross Validation

The code performs repeated stratified nested cross-validation according to Krstajic et al. (2014):
- 10 repetitions
- 5 outer folds
- 4 inner folds

## Function

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score
from statistics import mean, stdev
import logging
from collections import Counter
import os

def run_nested_cv(
    data_path,
    output_path,
    outer_folds=5,
    inner_folds=4,
    repeats=10,
    param_grid=None,
):
    """Run repeated stratified nested cross-validation of an SVC on one data set.

    The CSV at ``data_path`` is loaded; the column ``'track'`` is the target and
    all remaining columns are the features. For every repetition the data is
    split into ``outer_folds`` stratified folds. On each outer training split a
    ``GridSearchCV`` over ``param_grid`` (with ``inner_folds`` stratified inner
    folds) selects the hyperparameters, and the refitted best estimator is
    scored on the outer test split.

    Files written to ``output_path`` (created if missing):
        * ``inner_cv_scores_per_fold.csv`` - per outer fold, mean/std/min/max
          of the grid search's ``mean_test_score`` over all candidates.
        * ``nested_cv_accuracy_summary.csv`` - mean/sd/min/max of the outer
          (nested) accuracies.
        * ``best_hyperparameter_combinations.png`` - bar chart of how often
          each hyperparameter combination was selected.

    Args:
        data_path: Path to the CSV file containing features and ``'track'``.
        output_path: Directory that receives the result files.
        outer_folds: Number of outer stratified folds.
        inner_folds: Number of inner stratified folds used by the grid search.
        repeats: Number of repetitions; the repetition index seeds the
            shuffling of both the outer and the inner splitter.
        param_grid: Grid passed to ``GridSearchCV``. ``None`` selects the
            default SVC grid over ``C``, ``kernel`` and ``gamma``.

    Returns:
        dict: The summary with the keys ``mean_nested_accuracy``,
        ``sd_nested_accuracy``, ``min_nested_accuracy``,
        ``max_nested_accuracy``, ``nested_accuracies``,
        ``best_params_per_fold`` and ``inner_cv_scores_per_fold``.
    """
    # Build the default grid per call instead of sharing one mutable dict
    # object between all invocations.
    if param_grid is None:
        param_grid = {
            'C': [0.01, 0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
        }

    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logger = logging.getLogger()

    # Data import
    df = pd.read_csv(data_path)
    X = df.drop(columns=['track'])
    y = df['track']

    # Result container
    nested_accuracies = []
    best_params_per_fold = []
    inner_cv_scores_per_fold = []

    # Outer loop (repetitions)
    for rep in range(repeats):
        logger.info("Repetition %d/%d", rep + 1, repeats)
        outer_cv = StratifiedKFold(n_splits=outer_folds, shuffle=True, random_state=rep)

        for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            logger.info("  Outer Fold %d/%d", fold_idx + 1, outer_folds)

            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Inner splitter: hyperparameter selection only sees the outer training split
            inner_cv = StratifiedKFold(n_splits=inner_folds, shuffle=True, random_state=rep)
            grid_search = GridSearchCV(
                SVC(),
                param_grid,
                cv=inner_cv,
                scoring='accuracy',
                return_train_score=False,
                n_jobs=1  # n_jobs sets the number of CPU cores used in parallel; -1 means using all available cores
            )
            grid_search.fit(X_train, y_train)

            # Score the refitted best estimator on the untouched outer test split
            best_model = grid_search.best_estimator_
            y_pred = best_model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            nested_accuracies.append(acc)

            logger.info("    Best params: %s", grid_search.best_params_)
            logger.info("    Nested CV accuracy: %.4f", acc)

            best_params_per_fold.append(grid_search.best_params_)

            # Statistics over the mean inner-CV score of every candidate in the grid
            inner_scores = grid_search.cv_results_['mean_test_score']
            inner_cv_scores_per_fold.append({
                'mean': float(np.mean(inner_scores)),
                'std': float(np.std(inner_scores)),
                'min': float(np.min(inner_scores)),
                'max': float(np.max(inner_scores))
            })

    # Summary
    summary = {
        'mean_nested_accuracy': mean(nested_accuracies),
        'sd_nested_accuracy': stdev(nested_accuracies),
        'min_nested_accuracy': min(nested_accuracies),
        'max_nested_accuracy': max(nested_accuracies),
        'nested_accuracies': nested_accuracies,
        'best_params_per_fold': best_params_per_fold,
        'inner_cv_scores_per_fold': inner_cv_scores_per_fold
    }

    # Create results folder
    os.makedirs(output_path, exist_ok=True)

    # Save results
    df_inner = pd.DataFrame(summary['inner_cv_scores_per_fold'])
    df_inner.to_csv(os.path.join(output_path, "inner_cv_scores_per_fold.csv"), index=False)

    df_summary = pd.DataFrame([{
        'mean_nested_accuracy': summary['mean_nested_accuracy'],
        'sd_nested_accuracy': summary['sd_nested_accuracy'],
        'min_nested_accuracy': summary['min_nested_accuracy'],
        'max_nested_accuracy': summary['max_nested_accuracy']
    }])
    df_summary.to_csv(os.path.join(output_path, "nested_cv_accuracy_summary.csv"), index=False)

    print("\nNested CV Accuracy (gesamt):")
    print(f"Mean: {summary['mean_nested_accuracy']:.4f}, "
          f"SD: {summary['sd_nested_accuracy']:.4f}, "
          f"Min: {summary['min_nested_accuracy']:.4f}, "
          f"Max: {summary['max_nested_accuracy']:.4f}")

    # Visualisation
    def _combo_label(params):
        # Label from whatever keys were actually tuned, so a custom grid
        # without 'C'/'kernel'/'gamma' no longer raises KeyError. For a dict
        # grid the grid's own key order is used, which reproduces
        # "C=..., kernel=..., gamma=..." for the default grid.
        if isinstance(param_grid, dict):
            ordered_keys = [key for key in param_grid if key in params]
        else:
            ordered_keys = list(params)
        return ", ".join(f"{key}={params[key]}" for key in ordered_keys)

    combo_counts = Counter(_combo_label(p) for p in best_params_per_fold)
    df_combos = pd.DataFrame(combo_counts.items(), columns=['Parameter Combination', 'Frequency'])

    fig = plt.figure(figsize=(10, 6))
    sns.barplot(y='Parameter Combination', x='Frequency', data=df_combos.sort_values('Frequency', ascending=False))
    plt.title('Frequency of Best Hyperparameter Combinations')
    plt.xlabel('Number of Occurrences')
    plt.ylabel('Hyperparameter Combination')
    plt.tight_layout()
    plt.savefig(os.path.join(output_path, "best_hyperparameter_combinations.png"))
    plt.show()
    # The function is called once per data set; release the figure so open
    # figures do not accumulate across calls.
    plt.close(fig)

    return summary


## Application

In [None]:
import os

# Root folders: normalized input CSVs and the per-data-set result folders
base_data_path = os.path.join("data", "normalized")
base_output_path = "results"

# Data sets to analyze (one CSV per entry)
datasets = ["A2", "A3", "A4", "A12", "A21"]

# Run the nested cross-validation once per data set
for name in datasets:
    data_path = os.path.join(base_data_path, name + ".csv")
    output_path = os.path.join(base_output_path, "results_" + name)

    # Make sure the target folder exists before the run writes into it
    os.makedirs(output_path, exist_ok=True)
    run_nested_cv(data_path=data_path, output_path=output_path)


## Follow Up Analysis

For each data set, the hyperparameter configuration that was most frequently selected as the best model in the repeated stratified nested cross-validation:

- **Data set A2** (2 classes/tracks)  
  - `C = 100`, `kernel = rbf`, `gamma = auto`

- **Data set A3** (2 classes/tracks)  
  - `C = 10`, `kernel = rbf`, `gamma = scale`

- **Data set A4** (3 classes/tracks)  
  - `C = 1`, `kernel = rbf`, `gamma = 1`

- **Data set A12** (6 classes/tracks)  
  - `C = 10`, `kernel = rbf`, `gamma = scale`

- **Data set A21** (2 classes/tracks)  
  - `C = 100`, `kernel = rbf`, `gamma = auto`


### Function "evaluate models on all data sets"

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

def evaluate_model_on_dataframes(param_grid, dataframes):
    """
    Score one fixed SVC configuration on several datasets via 5-fold cross-validation.

    Every DataFrame must contain a 'track' column, which is used as the target;
    all other columns are used as features.

    Args:
        param_grid (dict): Single configuration with the keys 'C', 'kernel' and 'gamma'.
        dataframes (list): pandas DataFrames, one per dataset.

    Returns:
        list: Mean cross-validated accuracy per dataset, in input order.
    """
    total = len(dataframes)
    mean_scores = []

    for position, frame in enumerate(dataframes, start=1):
        print(f"\nEvaluating dataset {position}/{total}...")

        features = frame.drop(columns=['track'])
        target = frame['track']

        # A fresh, unfitted estimator is built for every dataset
        classifier = SVC(
            C=param_grid['C'],
            kernel=param_grid['kernel'],
            gamma=param_grid['gamma'],
        )

        fold_scores = cross_val_score(classifier, features, target, cv=5, scoring='accuracy')
        mean_accuracy = fold_scores.mean()

        print(f"→ Accuracy (mean over 5 folds): {mean_accuracy:.4f}")
        mean_scores.append(mean_accuracy)

    print("\nEvaluation completed.")
    return mean_scores


### Application of function

In [None]:
# Read the five normalized data sets
df_a2, df_a3, df_a4, df_a12, df_a21 = [
    pd.read_csv(f"data/normalized/{tag}.csv")
    for tag in ("A2", "A3", "A4", "A12", "A21")
]

# One fixed SVC configuration per data set (most frequent winner of the nested CV)
param_grid_A2 = dict(C=100, kernel='rbf', gamma='auto')
param_grid_A3 = dict(C=10, kernel='rbf', gamma='scale')
param_grid_A4 = dict(C=1, kernel='rbf', gamma=1)
param_grid_A12 = dict(C=10, kernel='rbf', gamma='scale')
param_grid_A21 = dict(C=100, kernel='rbf', gamma='auto')

# Every configuration is evaluated on all data sets, in this order
datasets = [df_a2, df_a3, df_a4, df_a12, df_a21]

(
    results_model_A2,
    results_model_A3,
    results_model_A4,
    results_model_A12,
    results_model_A21,
) = [
    evaluate_model_on_dataframes(config, datasets)
    for config in (
        param_grid_A2,
        param_grid_A3,
        param_grid_A4,
        param_grid_A12,
        param_grid_A21,
    )
]




### Graphical representation of the results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib.lines import Line2D
import os

# Names of the evaluated configurations and their per-data-set accuracies
model_names = ['Model A2', 'Model A3', 'Model A4', 'Model A12', 'Model A21']
results_lists = [results_model_A2, results_model_A3, results_model_A4, results_model_A12, results_model_A21]

# New labels for x-axis
new_model_labels = [f'Best HP configuration for {name}' for name in model_names]
label_map = dict(zip(model_names, new_model_labels))

# Legend labels, one per data set in evaluation order
legend_labels = [
    "Data A2: 2 classes",
    "Data A3: 2 classes",
    "Data A4: 3 classes",
    "Data A12: 6 classes",
    "Data A21: 2 classes"
]

# Long-format DataFrame: one row per (configuration, data set) accuracy
data = [
    {'Model': model_name, 'Accuracy': score, 'Dataset Index': f'Dataset {idx+1}'}
    for model_name, scores in zip(model_names, results_lists)
    for idx, score in enumerate(scores)
]

df_plot = pd.DataFrame(data)
df_plot['Model'] = df_plot['Model'].map(label_map)

# Hue levels and colours are defined once and shared by the strip plot and
# the hand-built legend. Previously the palette was sized by the number of
# models and never passed to stripplot, so legend and point colours only
# matched as long as seaborn's defaults happened to agree.
hue_levels = [f'Dataset {idx+1}' for idx in range(len(legend_labels))]
palette = sns.color_palette(n_colors=len(hue_levels))

# create Plot
plt.figure(figsize=(12, 7))

# Boxplots
sns.boxplot(data=df_plot, x='Model', y='Accuracy', showfliers=False,
            boxprops=dict(facecolor='white', edgecolor='black'),
            medianprops=dict(color='black'),
            whiskerprops=dict(color='black'),
            capprops=dict(color='black'))

# Stripplot
strip = sns.stripplot(data=df_plot, x='Model', y='Accuracy', hue='Dataset Index',
                      hue_order=hue_levels, palette=palette,
                      jitter=True, dodge=True, marker='o', linewidth=1, edgecolor='gray')

# configure legend (replaces the automatic hue legend with descriptive labels)
legend_handles = [
    Line2D([0], [0], marker='o', color='w', label=label,
           markerfacecolor=palette[i], markeredgecolor='gray', markersize=8)
    for i, label in enumerate(legend_labels)
]

plt.legend(handles=legend_handles, title='Dataset', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('Accuracy Scores of Different Models Across Datasets')
plt.xticks(rotation=20, ha='right')
plt.tight_layout()

# save (create the target folder first so this cell also works on its own)
os.makedirs("results", exist_ok=True)
plt.savefig(os.path.join("results", "follow_up_analysis_plot.png"), dpi=300, bbox_inches='tight')
plt.show()


### Tabular presentation of the results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Accuracies per configuration (each list is ordered like dataset_names)
model_results = {
    "Model A2": results_model_A2,
    "Model A3": results_model_A3,
    "Model A4": results_model_A4,
    "Model A12": results_model_A12,
    "Model A21": results_model_A21
}

# SVC hyperparameters behind each configuration
param_configs = {
    "Model A2": dict(C=100, kernel='rbf', gamma='auto'),
    "Model A3": dict(C=10, kernel='rbf', gamma='scale'),
    "Model A4": dict(C=1, kernel='rbf', gamma=1),
    "Model A12": dict(C=10, kernel='rbf', gamma='scale'),
    "Model A21": dict(C=100, kernel='rbf', gamma='auto')
}

# Dataset names
dataset_names = ["A2", "A3", "A4", "A12", "A21"]

# Rows = data sets, columns = configurations
accuracy_matrix = pd.DataFrame(model_results, index=dataset_names)

# Best configuration and its accuracy per data set (first maximum wins on ties)
winning_models = accuracy_matrix.idxmax(axis=1)
winning_scores = accuracy_matrix.max(axis=1)

summary_data = {
    "Dataset": list(dataset_names),
    "Winning Model": [winning_models[ds] for ds in dataset_names],
    "Accuracy": [round(winning_scores[ds], 3) for ds in dataset_names],
    "C": [param_configs[winning_models[ds]]["C"] for ds in dataset_names],
    "Kernel": [param_configs[winning_models[ds]]["kernel"] for ds in dataset_names],
    "Gamma": [param_configs[winning_models[ds]]["gamma"] for ds in dataset_names],
}

df_summary = pd.DataFrame(summary_data)

# Render the table as a figure, save it and show it
fig, ax = plt.subplots(figsize=(10, 2.5))
ax.axis('off')

table = ax.table(
    cellText=df_summary.values,
    colLabels=df_summary.columns,
    cellLoc='center',
    loc='center',
)

table.scale(1, 2)
table.auto_set_font_size(False)
table.set_fontsize(11)

plt.title("Winning Hyperparameter Configuration per Dataset", pad=20)
plt.tight_layout()

output_path = os.path.join("results", "winning_hyperparameters_per_dataset.png")
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.show()


# Final Analysis
For each dataset, an SVM was trained and evaluated with 5-fold CV using the winning hyperparameter configuration from the previous analysis. The evaluation included the calculation of key classification metrics, namely accuracy, micro F1 score, macro F1 score, and the Matthews Correlation Coefficient (MCC). For each of these metrics, the mean and standard deviation are provided.

Data Set A2 (2 classes)
- **Hyperparameter:** `C = 1`, `kernel = rbf`, `gamma = 1`

Data Set A3 (2 classes)
- **Hyperparameter:** `C = 1`, `kernel = rbf`, `gamma = 1`

Data Set A4 (3 classes)
- **Hyperparameter:** `C = 10`, `kernel = rbf`, `gamma = scale`

Data Set A12 (6 classes)
- **Hyperparameter:** `C = 10`, `kernel = rbf`, `gamma = scale`

Data Set A21 (2 classes)
- **Hyperparameter:** `C = 100`, `kernel = rbf`, `gamma = auto`


In [None]:
import pandas as pd
import numpy as np
import logging
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
from sklearn.svm import SVC

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# load data
df_a2 = pd.read_csv("data/normalized/A2.csv")
df_a3 = pd.read_csv("data/normalized/A3.csv")
df_a4 = pd.read_csv("data/normalized/A4.csv")
df_a12 = pd.read_csv("data/normalized/A12.csv")
df_a21 = pd.read_csv("data/normalized/A21.csv")

# Each data set is paired with its winning hyperparameter configuration
datasets = {
    "A2": (df_a2, dict(C=1, kernel='rbf', gamma=1)),
    "A3": (df_a3, dict(C=1, kernel='rbf', gamma=1)),
    "A4": (df_a4, dict(C=10, kernel='rbf', gamma='scale')),
    "A12": (df_a12, dict(C=10, kernel='rbf', gamma='scale')),
    "A21": (df_a21, dict(C=100, kernel='rbf', gamma='auto')),
}


def fmt(mean, std):
    """Format a mean and a standard deviation as 'mean ± std' with three decimals."""
    return f"{mean:.3f} ± {std:.3f}"


# Result column name -> function(y_true, y_pred) computing the metric for one fold
scorers = {
    "Accuracy": accuracy_score,
    "Micro F1": lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro'),
    "Macro F1": lambda y_true, y_pred: f1_score(y_true, y_pred, average='macro'),
    "MCC": matthews_corrcoef,
}

results = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, (df, params) in datasets.items():
    logging.info(f"Process data set {name} with parameters {params}")

    features = df.drop(columns=['track']).values
    labels = df['track'].values

    model = SVC(**params, random_state=42)

    # One list of fold scores per metric
    fold_scores = {metric: [] for metric in scorers}

    for train_idx, test_idx in cv.split(features, labels):
        model.fit(features[train_idx], labels[train_idx])
        predicted = model.predict(features[test_idx])
        expected = labels[test_idx]

        for metric, scorer in scorers.items():
            fold_scores[metric].append(scorer(expected, predicted))

    # Mean and sample standard deviation (ddof=1) over the folds
    row = {"Dataset": name}
    for metric, values in fold_scores.items():
        row[metric] = fmt(np.mean(values), np.std(values, ddof=1))
    results.append(row)

# Results
results_df = pd.DataFrame(results)
print("\nSVM Evaluation Results:\n")
print(results_df.to_string(index=False))

results_df.to_csv("results/svm_evaluation_results.csv", index=False)



## Tabular presentation of the results

In [None]:
import matplotlib.pyplot as plt
import os

# Figure height grows with the number of result rows
n_rows = len(results_df)
fig, ax = plt.subplots(figsize=(10, n_rows * 0.6 + 1))
ax.axis('off')

# Draw the results DataFrame as a centered matplotlib table
table_kwargs = dict(
    cellText=results_df.values,
    colLabels=results_df.columns,
    cellLoc='center',
    loc='center',
)
table = ax.table(**table_kwargs)

# Fixed font size and taller rows for readability
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 1.5)

# Write the PNG and display the figure
plt.tight_layout()
plt.savefig("results/svm_evaluation_results.png", dpi=300)
plt.show()
