In [1]:
# Run this to use from colab environment

import zipfile
import os
import pandas as pd
import numpy as np

# TODO: double-check this archive path before handing in.
# Unpack the archive into the 'ecg' folder (presumably it contains ecg_data.csv).
with zipfile.ZipFile('ecg/ecg_data.zip') as archive:
    archive.extractall(path='ecg')

# The first CSV column is used as the row index rather than as data.
data = pd.read_csv('ecg/ecg_data.csv', index_col=0)

# data.shape is (rows, columns): report both dimensions.
print(f'The number of samples: {data.shape[0]}')
print(f'The number of columns: {data.shape[1]}')

The number of samples: 827
The number of columns: 9001


In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Separate the feature matrix from the label vector.
# Every column except the last one is a feature; the last column is the label (0 or 1).
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [4]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def evaluate_with_nested_cv(X, y, model, param_grid, use_scaling=True, n_splits_outer=5, n_splits_inner=5, n_iter=10):
    """
    Estimate a classifier's ROC AUC with nested cross-validation.

    The outer stratified loop holds out one fold for testing. On the remaining
    data an inner RandomizedSearchCV (scored with ROC AUC) tunes the
    hyperparameters, and the refitted best estimator is scored on the
    held-out fold.

    When ``use_scaling`` is true, the StandardScaler is placed in a Pipeline
    together with the model, so it is re-fitted on the training part of every
    inner split. Scaling the whole outer-training fold up front would let the
    inner validation folds influence the scaling statistics used during tuning.

    Args:
        X (np.ndarray): Feature matrix, one row per sample.
        y (np.ndarray): Labels; the positive class is taken to be the second
            column of ``predict_proba``.
        model: Classifier (e.g. SVC(), RandomForestClassifier()). It must offer
            ``predict_proba`` or ``decision_function``.
        param_grid (dict or list of dict): Hyperparameter search space, keyed
            by the model's own parameter names (no pipeline prefix needed).
        use_scaling (bool): Whether to standardize features (important for SVM).
        n_splits_outer (int): Folds for outer CV.
        n_splits_inner (int): Folds for inner CV.
        n_iter (int): Iterations for RandomizedSearchCV.

    Returns:
        list: One ROC AUC per outer fold. The per-fold values and their mean
        are also printed; the mean itself is not returned.
    """
    if use_scaling:
        estimator = Pipeline([("scaler", StandardScaler()), ("model", model)])
        # Route the caller's parameter names to the "model" step of the pipeline.
        is_single_grid = hasattr(param_grid, "items")
        grids = [param_grid] if is_single_grid else list(param_grid)
        prefixed = [
            {f"model__{name}": values for name, values in grid.items()}
            for grid in grids
        ]
        search_space = prefixed[0] if is_single_grid else prefixed
    else:
        estimator = model
        search_space = param_grid

    outer_cv = StratifiedKFold(n_splits=n_splits_outer, shuffle=True, random_state=42)
    # Fixed seed: the inner splitter is identical for every outer fold.
    inner_cv = StratifiedKFold(n_splits=n_splits_inner, shuffle=True, random_state=1)
    scores = []

    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=search_space,
            scoring='roc_auc',
            n_iter=n_iter,
            cv=inner_cv,
            n_jobs=-1,
            random_state=1
        )
        clf.fit(X_train, y_train)
        # With the default refit=True this estimator was refitted on the whole
        # outer-training fold (scaler included when use_scaling is true).
        best_model = clf.best_estimator_

        # Prefer class probabilities; fall back to raw decision scores for
        # classifiers that do not expose predict_proba.
        if hasattr(best_model, "predict_proba"):
            y_score = best_model.predict_proba(X_test)[:, 1]
        else:
            y_score = best_model.decision_function(X_test)
        scores.append(roc_auc_score(y_test, y_score))

    print("AUCs per fold:", scores)
    print("Mean AUC:", np.mean(scores))
    return scores


In [None]:
from sklearn.svm import SVC

# probability=True makes SVC fit an internally cross-validated calibration step
# that shuffles the data; a fixed random_state keeps the reported AUCs repeatable.
svm = SVC(probability=True, class_weight='balanced', random_state=42)

# Search space for the RBF-kernel SVM.
svm_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.01, 0.1],
    'kernel': ['rbf']
}

# The function prints the per-fold and mean AUC itself; keep the returned list
# under a name instead of echoing it a second time as cell output.
svm_scores = evaluate_with_nested_cv(X, y, model=svm, param_grid=svm_grid, use_scaling=True)


