This file consists of all feature selection methods that were tried and evaluated that aren't used in the final model

Unzipping and loading data

In [None]:
import zipfile
import pandas as pd

with zipfile.ZipFile('ecg/ecg_data.zip', 'r') as zip_ref: #TODO: let hierop voor inleveren
    zip_ref.extractall('ecg')
# Load dataset (assuming a CSV format)
df = pd.read_csv("ecg/ecg_data.csv")

Feature correlation for determining number of redundant features

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

features = df.iloc[:, :-1]  # Assuming all but the last column are features
labels = df.iloc[:, -1]     # Assuming the last column is the target (0/1)

# Caculate the correlation matrix
subset_of_features = features #.iloc[:, :3000]  
correlation_matrix = subset_of_features.corr()

#Identify highly correlated features
threshold = 0.88
upper = correlation_matrix.where( 
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)) #only upper triangle of the matrix (to avoid checking each pair twice)

to_drop = set()
for column in upper.columns:
    high_corr = upper[column][abs(upper[column]) > threshold].index.tolist() # checks for correlation above the threshold
    for correlated_feature in high_corr:
        if correlated_feature not in to_drop and column not in to_drop:
            to_drop.add(correlated_feature)

print(f"Number of features with correlation to drop (one per correlated pair for which correlation > {threshold}): {len(to_drop)}")

# Drop redundant features
subset_of_features_reduced = subset_of_features.drop(columns=to_drop)
print(f"Number of features that remain: {len(subset_of_features_reduced.columns)}")

# Splitting the data into training, validation and testing sets
x_train, x_test, y_train, y_test = train_test_split(subset_of_features_reduced, labels, test_size=0.2, random_state=42, stratify=labels)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Train Random Forest with class balancing to determine optimal threshold
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'  
)
rf_model.fit(x_train, y_train)

# Predict and evaluate
y_pred = rf_model.predict(x_val)
y_prob = rf_model.predict_proba(x_val)[:, 1]

# Metrics
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))

print("\nROC AUC Score:", roc_auc_score(y_val, y_prob))

T-test, SelectKBest, Random Forest feature importance, Mutual Information and Optimization-based feature selection

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
#from deap import base, creator, tools, algorithms
import random

# Load dataset
df = pd.read_csv("ecg/ecg_data.csv")

# Separate features and labels
X = df.iloc[:, :-1].values  # First 9000 columns: features
y = df.iloc[:, -1].values   # Last column: labels (0 = normal, 1 = abnormal)

# Stratified Train-Test Split (80%-20%) to preserve class distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Split training set into training (80%) and validation (20%) with stratification
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Standardize the data (scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# --- T-test Feature Selection --- 
# Perform t-test for each feature on the scaled data
p_values = []
for i in range(X_train_scaled.shape[1]):  # Use X_train_scaled
    t_stat, p_value = stats.ttest_ind(X_train_scaled[y_train == 0, i], X_train_scaled[y_train == 1, i])
    p_values.append(p_value)
p_values = np.array(p_values)
sorted_indices = np.argsort(p_values)
top_2100_features_ttest = sorted_indices[:2100]
X_train_ttest = X_train_scaled[:, top_2100_features_ttest]
X_val_ttest = X_val_scaled[:, top_2100_features_ttest]
X_test_ttest = X_test_scaled[:, top_2100_features_ttest]

# --- SelectKBest Feature Selection --- 
selector = SelectKBest(score_func=f_classif, k=2100)
X_train_kbest = selector.fit_transform(X_train_scaled, y_train)  # Use X_train_scaled
X_val_kbest = selector.transform(X_val_scaled)
X_test_kbest = selector.transform(X_test_scaled)

# --- Random Forest Feature Importance Selection --- 
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)  # Use X_train_scaled
importances = rf.feature_importances_
indices_rf = np.argsort(importances)[-2100:]
X_train_rf = X_train_scaled[:, indices_rf]
X_val_rf = X_val_scaled[:, indices_rf]
X_test_rf = X_test_scaled[:, indices_rf]

# --- Mutual Information Feature Selection --- 
mi_scores = mutual_info_classif(X_train_scaled, y_train)  # Use X_train_scaled
mi_feature_indices = np.argsort(mi_scores)[-2100:]
X_train_mi = X_train_scaled[:, mi_feature_indices]
X_val_mi = X_val_scaled[:, mi_feature_indices]
X_test_mi = X_test_scaled[:, mi_feature_indices]

# --- Optimization-based Feature Selection (Commented) ---
def evaluate(individual):
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    if len(selected_features) == 0:
        return 0.0,
    X_selected = X_train_scaled[:, selected_features]
    model = RandomForestClassifier(random_state=42)
    model.fit(X_selected, y_train)
    accuracy = model.score(X_selected, y_train)
    return accuracy,

#creator.create("FitnessMax", base.Fitness, weights=(1.0,))
#creator.create("Individual", list, fitness=creator.FitnessMax)
#toolbox = base.Toolbox()
#toolbox.register("attr_bool", random.randint, 0, 1)
#toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X_train_scaled.shape[1])
#toolbox.register("population", tools.initRepeat, list, toolbox.individual)
#toolbox.register("mate", tools.cxTwoPoint)
#toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
#toolbox.register("select", tools.selTournament, tournsize=3)
#toolbox.register("evaluate", evaluate)

#population = toolbox.population(n=10)
#algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=10, verbose=True)
#best_individual = tools.selBest(population, k=1)[0]
#selected_features_ob = [i for i, bit in enumerate(best_individual) if bit == 1]
#X_train_ob = X_train_scaled[:, selected_features_ob]
#X_val_ob = X_val_scaled[:, selected_features_ob]
#X_test_ob = X_test_scaled[:, selected_features_ob]

# --- Model Training and Evaluation ---
def evaluate_model(X_train_selected, X_val_selected, X_test_selected):
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train_selected, y_train)
    y_pred = rf.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{cm}")

# Evaluate each feature selection method
print("Evaluating T-test Feature Selection:")
evaluate_model(X_train_ttest, X_val_ttest, X_test_ttest)

print("\nEvaluating SelectKBest Feature Selection:")
evaluate_model(X_train_kbest, X_val_kbest, X_test_kbest)

print("\nEvaluating Random Forest Feature Importance Selection:")
evaluate_model(X_train_rf, X_val_rf, X_test_rf)

print("\nEvaluating Mutual Information Feature Selection:")
evaluate_model(X_train_mi, X_val_mi, X_test_mi)

#print("\nEvaluating Optimization-based Feature Selection:")
#evaluate_model(X_train_ob, X_val_ob, X_test_ob)


Evaluating T-test Feature Selection:
Accuracy: 0.8133
              precision    recall  f1-score   support

           0       0.83      0.97      0.90       137
           1       0.33      0.07      0.11        29

    accuracy                           0.81       166
   macro avg       0.58      0.52      0.50       166
weighted avg       0.74      0.81      0.76       166

Confusion Matrix:
[[133   4]
 [ 27   2]]

Evaluating SelectKBest Feature Selection:
Accuracy: 0.8313
              precision    recall  f1-score   support

           0       0.85      0.97      0.90       137
           1       0.56      0.17      0.26        29

    accuracy                           0.83       166
   macro avg       0.70      0.57      0.58       166
weighted avg       0.80      0.83      0.79       166

Confusion Matrix:
[[133   4]
 [ 24   5]]

Evaluating Random Forest Feature Importance Selection:
Accuracy: 0.8434
              precision    recall  f1-score   support

           0       0.8

Preprocess data

In [9]:
# Separate features and labels
X = df.iloc[:, :-1].values  # First 9000 columns: features
y = df.iloc[:, -1].values   # Last column: labels (0 = normal, 1 = abnormal)

# Stratified Train-Test Split (80-20) to preserve the class distribution
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Split training set into training (80%) and validation (20%) with stratification
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Standardize Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

1. T-test

In [None]:
# Instantiëren van het Random Forest-model met aangepaste hyperparameters

# Step 2: T-test for each feature on the training data
p_values = []

# Loop through all features (columns of X_train)
for i in range(X_train.shape[1]):
    # Perform t-test between the two classes for each feature
    t_stat, p_value = stats.ttest_ind(X_train[y_train == 0, i], X_train[y_train == 1, i])
    p_values.append(p_value)

# Convert p-values to a numpy array and sort them from low to high
p_values = np.array(p_values)
sorted_indices = np.argsort(p_values)

# Step 3: Select the top 2100 features based on the lowest p-values (most significant features)
top_2100_features = sorted_indices[:2100]  # Select the indices of the top 2100 features

# Create a new dataset with only the top 2100 features
X_train_selected = X_train[:, top_2100_features]
X_val_selected = X_val[:, top_2100_features]
X_test_selected = X_test[:, top_2100_features]

# Step 4: Standardize the data (using training data statistics)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)  # Standardize the selected features
X_val_scaled = scaler.transform(X_val_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Step 5: Train a RandomForest model using the selected and standardized features
rf = RandomForestClassifier(n_estimators=100, 
                            max_depth=10, 
                            min_samples_split=5, 
                            min_samples_leaf=3, 
                            class_weight='balanced', 
                            random_state=42)
rf.fit(X_train_scaled, y_train)  # Train on the selected features

# Step 6: Make predictions
y_pred = rf.predict(X_test_scaled)

# Show classification results
print(classification_report(y_test, y_pred))

1. SelectKBest

In [None]:
# Step 3: Use SelectKBest to select the top 2100 features
selector = SelectKBest(score_func=f_classif, k=2100)
X_train_selected_kb = selector.fit_transform(X_train, y_train)
X_val_selected_kb = selector.transform(X_val)
X_test_selected_kb = selector.transform(X_test)

# Get selected feature indices
selected_feature_indices = selector.get_support(indices=True)
print(f"Selected feature indices: {selected_feature_indices}")

2. Random Forest feature importance

In [26]:
# Train een RandomForest-model met alle features om de feature importances te verkrijgen
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Verkrijg de feature importances van het getrainde model
importances = rf.feature_importances_

# Selecteer de top 2100 meest belangrijke features
indices = np.argsort(importances)[-2100:]  # Top 2100 features

# Maak nieuwe datasets met de geselecteerde features
X_train_selected_rf = X_train[:, indices]
X_val_selected_rf = X_val[:, indices]
X_test_selected_rf = X_test[:, indices]

# Train opnieuw een RandomForest-model met de geselecteerde features
rf.fit(X_train_selected_rf, y_train)

# Voorspel de labels voor de testdata met de geselecteerde features
y_pred_rf = rf.predict(X_test_selected_rf)

# Print de classification report van de voorspellingen
print(classification_report(y_test, y_pred_rf))


              precision    recall  f1-score   support

           0       0.84      1.00      0.91       137
           1       1.00      0.10      0.19        29

    accuracy                           0.84       166
   macro avg       0.92      0.55      0.55       166
weighted avg       0.87      0.84      0.79       166



3. Mutual information

In [19]:
# Select Features using Mutual Information
mi_scores = mutual_info_classif(X_train_scaled, y_train)
mi_feature_indices = np.argsort(mi_scores)[-2100:]  # Select top 100 features
X_train_mi = X_train_scaled[:, mi_feature_indices]
X_val_mi = X_val_scaled[:, mi_feature_indices]

# Train Random Forest with Selected Features
rf_mi = RandomForestClassifier(random_state=42)
rf_mi.fit(X_train_mi, y_train)
y_pred_mi = rf_mi.predict(X_val_mi)

# Compute Accuracy, classification report & Confusion Matrix
accuracy_mi = accuracy_score(y_val, y_pred_mi)
print(classification_report(y_val, y_pred_mi))
cm_mi = confusion_matrix(y_val, y_pred_mi)

print(f"Accuracy (Mutual Information Feature Selection): {accuracy_mi:.4f}")

              precision    recall  f1-score   support

           0       0.83      0.97      0.90       109
           1       0.50      0.12      0.20        24

    accuracy                           0.82       133
   macro avg       0.67      0.55      0.55       133
weighted avg       0.77      0.82      0.77       133

Accuracy (Mutual Information Feature Selection): 0.8195


4. Genetic Algorithm

In [None]:
import random

# Define the evaluation function
def evaluate(individual):
    # Individual is a binary list representing selected features of a potential solution

    # Convert binary list to feature indices
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]
    
    # If no features are selected, return worst fitness
    if len(selected_features) == 0:
        return 0.0,
    
    # Select the features from the training data
    X_selected = X_train_scaled[:, selected_features]

    # Train Random Forest on selected features
    model = RandomForestClassifier(random_state=42)
    model.fit(X_selected, y_train)
    
    # Evaluate the accuracy of the model
    accuracy = model.score(X_selected, y_train)
    
    return accuracy,  # Return the accuracy as fitness score

# Set up Genetic Algorithm using DEAP
creator.create("FitnessMax", base.Fitness, weights=(1.0,)) # Maximize fitness (so accuracy)
creator.create("Individual", list, fitness=creator.FitnessMax) # Create individual which represents a solution

toolbox = base.Toolbox() # Toolbox to hold genetic operations
toolbox.register("attr_bool", random.randint, 0, 1) # How to create a bit for the individual
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X_train_scaled.shape[1]) # Creates and invidividual with n features
toolbox.register("population", tools.initRepeat, list, toolbox.individual) # Creates a population of individuals
toolbox.register("mate", tools.cxTwoPoint) # Crossover operation which combines two individuals
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05) # Randomly flips bits in the individual with a probability of 0.05
toolbox.register("select", tools.selTournament, tournsize=3) # Selects individuals for the next generation using tournament selection
toolbox.register("evaluate", evaluate)

# Run the Genetic Algorithm
population = toolbox.population(n=10) # Create a population of 10 individuals
algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=10, verbose=True) # ngen is the number of iterations of the GA

# Extract the best individual after the GA completes
best_individual = tools.selBest(population, k=1)[0] # Select the best individual (solution)
selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]

# Print the number of features selected
print(f"Number of features selected: {len(selected_features)}")

In [None]:
### Train Random Forest With Selected Features ###
X_train_ga = X_train_scaled[:, selected_features]
X_val_ga = X_val_scaled[:, selected_features]

model = RandomForestClassifier(random_state=42)
model.fit(X_train_ga, y_train)

# Predict on test data
y_pred_ga = model.predict(X_val_ga)

# Compute confusion matrix
cm_ga = confusion_matrix(y_val, y_pred_ga)
acc = accuracy_score(y_val, y_pred_ga)
print(classification_report(y_val, y_pred_ga))

print(f"Accuracy (Genetic algorithm feature selection): {acc:.4f}")

# Plot confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm_ga, annot=True, fmt='d', cmap='Blues', xticklabels=["Normal", "Abnormal"], yticklabels=["Normal", "Abnormal"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

Evaluation of methods

In [None]:
def evaluate_model(X_train, X_val, y_train, y_val):
    """Train and evaluate a RandomForest model."""
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    
    # Overall metrics
    scores = {
        "Accuracy": accuracy_score(y_val, y_pred),
        "F1-score (Macro)": f1_score(y_val, y_pred, average='macro'),
        "F1-score (Weighted)": f1_score(y_val, y_pred, average='weighted'),
        "Precision": precision_score(y_val, y_pred),
        "Recall": recall_score(y_val, y_pred),
        "AUC-ROC": roc_auc_score(y_val, y_pred)
    }
    
    # Detailed classification report
    class_report = classification_report(y_val, y_pred)
    
    return scores, class_report

# Evaluate model performance with SelectKBest features
scores_kbest, class_report_kbest = evaluate_model(X_train_selected_kb, X_val_selected_kb, y_train, y_val)

# Evaluate model performance with RandomForest feature importance selection
scores_rf_importance, class_report_rf_importance = evaluate_model(X_train_selected_rf, X_val_selected_rf, y_train, y_val)

# Print results
print("Performance with SelectKBest:")
print(scores_kbest)
print("Classification Report with SelectKBest:")
print(class_report_kbest)

print("Performance with RF Feature Importance:")
print(scores_rf_importance)
print("Classification Report with RF Feature Importance:")
print(class_report_rf_importance)

Proberen alles in 1