In [8]:
# Run this to use from colab environment

import zipfile
import os
import pandas as pd
import numpy as np

with zipfile.ZipFile('ecg/ecg_data.zip', 'r') as zip_ref: #TODO: let hierop voor inleveren
    zip_ref.extractall('ecg')

data = pd.read_csv('ecg/ecg_data.csv', index_col=0)

print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

The number of samples: 827
The number of columns: 9001


In [5]:
# Splits features en labels
X = data.iloc[:, :-1].values  # Alle kolommen behalve de laatste zijn de features
y = data.iloc[:, -1].values   # De laatste kolom is de label (0 of 1)

In [None]:
import random
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from deap import base, creator, tools, algorithms

# Set random seed for reproducibility
random.seed(42)  # Set for random module
np.random.seed(42)  # Set for numpy module, used in data scaling, etc.

# Function to run Genetic Algorithm-based feature selection
def ga_feature_selection(X_train_scaled, y_train):
    # Set random seed for GA
    random.seed(42)
    
    # Define the evaluation function
    def evaluate(individual):
        # Individual is a binary list representing selected features of a potential solution

        # Convert binary list to feature indices
        selected_features = [i for i, bit in enumerate(individual) if bit == 1]
        
        # If no features are selected, return worst fitness
        if len(selected_features) == 0:
            return 0.0,
        
        # Select the features from the training data
        X_selected = X_train_scaled[:, selected_features]

        # Train Random Forest on selected features
        model = RandomForestClassifier(random_state=42)
        model.fit(X_selected, y_train)
        
        # Evaluate the accuracy of the model
        accuracy = model.score(X_selected, y_train)
        
        return accuracy,  # Return the accuracy as fitness score

    # Set up Genetic Algorithm using DEAP
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximize fitness (so accuracy)
    creator.create("Individual", list, fitness=creator.FitnessMax)  # Create individual which represents a solution

    toolbox = base.Toolbox()  # Toolbox to hold genetic operations
    toolbox.register("attr_bool", random.randint, 0, 1)  # How to create a bit for the individual
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X_train_scaled.shape[1])  # Create an individual with n features
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)  # Creates a population of individuals
    toolbox.register("mate", tools.cxTwoPoint)  # Crossover operation which combines two individuals
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Randomly flips bits in the individual with a probability of 0.05
    toolbox.register("select", tools.selTournament, tournsize=3)  # Selects individuals for the next generation using tournament selection
    toolbox.register("evaluate", evaluate)

    # Run the Genetic Algorithm
    population = toolbox.population(n=5)  # Create a population of n individuals
    algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=5, verbose=True)  # ngen is the number of iterations of the GA

    # Extract the best individual after the GA completes
    best_individual = tools.selBest(population, k=1)[0]  # Select the best individual (solution)
    selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]

    # Print the number of features selected
    print(f"Number of features selected: {len(selected_features)}")
    
    return selected_features

def inner_cv_with_ga(X, y, feature_selection_method, k_features=2100):
    skf_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Set random_state for reproducibility

    # Lists to accumulate F1 scores across all folds
    f1_normal_scores = []
    f1_abnormal_scores = []

    for inner_train_idx, inner_val_idx in skf_inner.split(X, y):
        X_train, X_val = X[inner_train_idx], X[inner_val_idx]
        y_train, y_val = y[inner_train_idx], y[inner_val_idx]

        # --- Scale the data (fit on training, transform on both train/validation) ---
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)  # Fit on training set
        X_val_scaled = scaler.transform(X_val)        # Use the same scaler for the validation set

        # --- Feature Selection using the GA method in the inner CV ---
        selected_features_ga = ga_feature_selection(X_train_scaled, y_train)

        # --- Select the features based on the GA result ---
        X_train_sel = X_train_scaled[:, selected_features_ga]
        X_val_sel = X_val_scaled[:, selected_features_ga]

        # --- Train the model with the selected features ---
        rf = RandomForestClassifier(random_state=42)  # Set random_state for reproducibility
        rf.fit(X_train_sel, y_train)

        # Evaluate the performance on the validation set
        y_pred = rf.predict(X_val_sel)

        # Calculate F1 scores for both classes (normal and abnormal)
        f1_rf = f1_score(y_val, y_pred, average=None, labels=[0, 1])

        # Print F1 scores for each class
        print(f"Random Forest F1 - Normal: {f1_rf[0]:.4f}, Abnormal: {f1_rf[1]:.4f}")

        # Append the F1 scores to the lists
        f1_normal_scores.append(f1_rf[0])
        f1_abnormal_scores.append(f1_rf[1])

    # Calculate the average F1 scores for both classes across all folds
    avg_f1_normal = sum(f1_normal_scores) / len(f1_normal_scores)
    avg_f1_abnormal = sum(f1_abnormal_scores) / len(f1_abnormal_scores)

    # Print the average F1 scores
    print(f"Average F1 - Normal: {avg_f1_normal:.4f}, Abnormal: {avg_f1_abnormal:.4f}")


In [19]:
inner_cv_with_ga(X, y, feature_selection_method=ga_feature_selection)



gen	nevals
0  	5     
1  	4     
2  	4     
3  	4     
4  	0     
5  	4     
Number of features selected: 4491
Random Forest F1 - Normal: 0.9158, Abnormal: 0.2857




KeyboardInterrupt: 