This file contains all the feature selection methods that were tried and evaluated but are not used in the final model.

In [None]:
## Mutual information

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Select features using mutual information.
# Number of top-scoring features to keep (the slice below previously carried a
# comment saying "top 100" while actually keeping 2100).
N_MI_FEATURES = 2100

# mutual_info_classif is randomised; seed it (same seed as the forest below)
# so the feature ranking is reproducible between runs.
mi_scores = mutual_info_classif(X_train_scaled, y_train, random_state=42)
# argsort is ascending, so the last N_MI_FEATURES indices are the highest scores.
mi_feature_indices = np.argsort(mi_scores)[-N_MI_FEATURES:]
X_train_mi = X_train_scaled[:, mi_feature_indices]
X_val_mi = X_val_scaled[:, mi_feature_indices]

# Train a random forest on the selected features only
rf_mi = RandomForestClassifier(random_state=42)
rf_mi.fit(X_train_mi, y_train)
y_pred_mi = rf_mi.predict(X_val_mi)

# Compute accuracy, classification report and confusion matrix on the validation set
accuracy_mi = accuracy_score(y_val, y_pred_mi)
print(classification_report(y_val, y_pred_mi))
cm_mi = confusion_matrix(y_val, y_pred_mi)

print(f"Accuracy (Mutual Information Feature Selection): {accuracy_mi:.4f}")

# Plot the confusion matrix (rows = actual class, columns = predicted class)
plt.figure(figsize=(6, 5))
sns.heatmap(cm_mi, annot=True, fmt='d', cmap='Blues', xticklabels=["Normal", "Abnormal"], yticklabels=["Normal", "Abnormal"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Feature selection using Genetic Algorithm

import random
from sklearn.ensemble import RandomForestClassifier
from deap import base, creator, tools, algorithms

# Define the evaluation function
def evaluate(individual):
    """Fitness function for the genetic algorithm.

    Trains a random forest on the features switched on in ``individual`` and
    scores it on the held-out validation split.

    Args:
        individual: Sequence of 0/1 bits, one per feature column; a 1 means
            the feature at that index is selected.

    Returns:
        A one-element tuple ``(accuracy,)`` as DEAP expects for a
        single-objective fitness. ``(0.0,)`` when no feature is selected.
    """
    # Convert the bit mask into column indices
    selected_features = [i for i, bit in enumerate(individual) if bit == 1]

    # An empty selection cannot be trained on: give it the worst fitness
    if not selected_features:
        return (0.0,)

    # Fit on the training split restricted to the selected columns
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_scaled[:, selected_features], y_train)

    # Score on the validation split rather than the data the model was fitted
    # on: training accuracy of a random forest is typically close to 1.0 for
    # almost any subset, which gives the GA little to discriminate on.
    accuracy = model.score(X_val_scaled[:, selected_features], y_val)

    return (accuracy,)

# Set up the genetic algorithm using DEAP
creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Single objective, maximised (accuracy)
creator.create("Individual", list, fitness=creator.FitnessMax)  # An individual is a list of bits with a fitness

toolbox = base.Toolbox()  # Holds the genetic operators
toolbox.register("attr_bool", random.randint, 0, 1)  # One random bit (0 or 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=X_train_scaled.shape[1])  # Creates an individual with one bit per feature
toolbox.register("population", tools.initRepeat, list, toolbox.individual)  # Creates a population of individuals
toolbox.register("mate", tools.cxTwoPoint)  # Two-point crossover between two individuals
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Flips each bit independently with probability 0.05
toolbox.register("select", tools.selTournament, tournsize=3)  # Tournament selection with 3 contestants
toolbox.register("evaluate", evaluate)

# Run the genetic algorithm
population = toolbox.population(n=10)  # Create a population of 10 individuals
# eaSimple has no elitism, so the best solution can be lost to crossover or
# mutation before the final generation; the hall of fame keeps the best
# individual seen across all generations.
hall_of_fame = tools.HallOfFame(1)
algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=10, halloffame=hall_of_fame, verbose=True)  # ngen is the number of generations

# Extract the best individual found over the whole run
best_individual = hall_of_fame[0]
selected_features = [i for i, bit in enumerate(best_individual) if bit == 1]

# Print the number of features selected
print(f"Number of features selected: {len(selected_features)}")