In [85]:
import numpy as np
import sys
np.set_printoptions(threshold=np.inf)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
import warnings
from joblib import Parallel, delayed
import time

## Global Variables

In [80]:
# Notebook-level classifier key, left empty here; the GA helpers below take their own CLASSIFIER argument.
CLASSIFIER = ""

## Notebook Start Time

In [81]:
# Wall-clock timestamp (seconds since the epoch) captured when this cell runs.
start_time = time.time()

## Data

In [86]:
# Fraction of rows drawn into the sample frames
my_frac = 0.01

# The four target columns; every remaining column is used as a feature
label_columns = ['label', 'class_label', 'category_label', 'attack_label']

# Load dataset as pandas DataFrame
df_train = pd.read_parquet('./data/cic_iomt_2024_wifi_mqtt_train.parquet')
df_test = pd.read_parquet('./data/cic_iomt_2024_wifi_mqtt_test.parquet')

# Create sample DataFrame for feature selection (fixed seed so the sample is repeatable)
df_train_sample = df_train.sample(frac=my_frac, random_state=1984)
df_test_sample = df_test.sample(frac=my_frac, random_state=1984)

# Create sample X and y from train and test, convert to numpy arrays
# (_2 / _6 / _19 suffixes follow the binary / 6-class / 19-class headings used below)
X_train_sample = df_train_sample.drop(columns=label_columns).to_numpy()
y_train_sample_2 = df_train_sample['class_label'].to_numpy()
y_train_sample_6 = df_train_sample['category_label'].to_numpy()
y_train_sample_19 = df_train_sample['attack_label'].to_numpy()

X_test_sample = df_test_sample.drop(columns=label_columns).to_numpy()
y_test_sample_2 = df_test_sample['class_label'].to_numpy()
y_test_sample_6 = df_test_sample['category_label'].to_numpy()
y_test_sample_19 = df_test_sample['attack_label'].to_numpy()

# Create full data X and y from train and test, convert to numpy arrays.
# These must come from the unsampled frames; building them from the *_sample
# frames would silently make the "full" arrays identical to the sample arrays.
X_train_full = df_train.drop(columns=label_columns).to_numpy()
y_train_full_2 = df_train['class_label'].to_numpy()
y_train_full_6 = df_train['category_label'].to_numpy()
y_train_full_19 = df_train['attack_label'].to_numpy()

X_test_full = df_test.drop(columns=label_columns).to_numpy()
y_test_full_2 = df_test['class_label'].to_numpy()
y_test_full_6 = df_test['category_label'].to_numpy()
y_test_full_19 = df_test['attack_label'].to_numpy()

## Benchmarks

We use the hyperparameters from CICIoMT2024 to establish benchmarks for classification on the dataset. We will then use a genetic algorithm (GA) to select the best features and compare the results against these benchmarks.

### Logistic Regression

In [99]:
# Benchmark logistic regression; every hyperparameter is spelled out explicitly.
lr_benchmark = LogisticRegression(
    penalty='l2', C=1.0, dual=False,
    fit_intercept=True, intercept_scaling=1,
    solver='lbfgs', max_iter=100, tol=0.0001,
    warm_start=False,
    n_jobs=-1,
)

#### Binary Classification

In [None]:
# Fit on the sampled training split (binary labels) and predict the sampled test split
y_pred_lr_benchmark_2 = lr_benchmark.fit(X_train_sample, y_train_sample_2).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_lr_benchmark_2 = accuracy_score(y_test_sample_2, y_pred_lr_benchmark_2)

# Per-class metrics: stored as a dict and printed as text
report_lr_benchmark_2 = classification_report(y_test_sample_2, y_pred_lr_benchmark_2, output_dict=True)
print(classification_report(y_test_sample_2, y_pred_lr_benchmark_2))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_lr_benchmark_2:.5f}")

#### 6 Class Classification

In [9]:
# Refit the same estimator on the 6-class labels and predict the sampled test split
y_pred_lr_benchmark_6 = lr_benchmark.fit(X_train_sample, y_train_sample_6).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_lr_benchmark_6 = accuracy_score(y_test_sample_6, y_pred_lr_benchmark_6)

# Per-class metrics: stored as a dict and printed as text
report_lr_benchmark_6 = classification_report(y_test_sample_6, y_pred_lr_benchmark_6, output_dict=True)
print(classification_report(y_test_sample_6, y_pred_lr_benchmark_6))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_lr_benchmark_6:.5f}")

#### 19 Class Classification

In [11]:
# Refit the same estimator on the 19-class labels and predict the sampled test split
y_pred_lr_benchmark_19 = lr_benchmark.fit(X_train_sample, y_train_sample_19).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_lr_benchmark_19 = accuracy_score(y_test_sample_19, y_pred_lr_benchmark_19)

# Per-class metrics: stored as a dict and printed as text
report_lr_benchmark_19 = classification_report(y_test_sample_19, y_pred_lr_benchmark_19, output_dict=True)
print(classification_report(y_test_sample_19, y_pred_lr_benchmark_19))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_lr_benchmark_19:.5f}")

### AdaBoost

In [100]:
# Benchmark AdaBoost over decision-tree base learners, seeded for repeatability.
ada_benchmark = AdaBoostClassifier(
    DecisionTreeClassifier(),
    n_estimators=50, learning_rate=1.0,
    algorithm='SAMME.R',
    random_state=1984,
)

#### Binary Classification

In [14]:
# Fit on the sampled training split (binary labels) and predict the sampled test split
y_pred_ada_benchmark_2 = ada_benchmark.fit(X_train_sample, y_train_sample_2).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_ada_benchmark_2 = accuracy_score(y_test_sample_2, y_pred_ada_benchmark_2)

# Per-class metrics: stored as a dict and printed as text
report_ada_benchmark_2 = classification_report(y_test_sample_2, y_pred_ada_benchmark_2, output_dict=True)
print(classification_report(y_test_sample_2, y_pred_ada_benchmark_2))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_ada_benchmark_2:.5f}")

#### 6 Class Classification

In [16]:
# Refit the same estimator on the 6-class labels and predict the sampled test split
y_pred_ada_benchmark_6 = ada_benchmark.fit(X_train_sample, y_train_sample_6).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_ada_benchmark_6 = accuracy_score(y_test_sample_6, y_pred_ada_benchmark_6)

# Per-class metrics: stored as a dict and printed as text
report_ada_benchmark_6 = classification_report(y_test_sample_6, y_pred_ada_benchmark_6, output_dict=True)
print(classification_report(y_test_sample_6, y_pred_ada_benchmark_6))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_ada_benchmark_6:.5f}")

#### 19 Class Classification

In [18]:
# Refit the same estimator on the 19-class labels and predict the sampled test split
y_pred_ada_benchmark_19 = ada_benchmark.fit(X_train_sample, y_train_sample_19).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_ada_benchmark_19 = accuracy_score(y_test_sample_19, y_pred_ada_benchmark_19)

# Per-class metrics: stored as a dict and printed as text
report_ada_benchmark_19 = classification_report(y_test_sample_19, y_pred_ada_benchmark_19, output_dict=True)
print(classification_report(y_test_sample_19, y_pred_ada_benchmark_19))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_ada_benchmark_19:.5f}")

### Random Forest

In [101]:
# Benchmark random forest; every hyperparameter is spelled out explicitly and the seed is fixed.
rf_benchmark = RandomForestClassifier(
    n_estimators=100, criterion='gini', max_features='sqrt',
    min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0, ccp_alpha=0.0,
    bootstrap=True, oob_score=False, warm_start=False,
    n_jobs=-1,
    random_state=1984,
)

#### Binary Classification

In [21]:
# Fit on the sampled training split (binary labels) and predict the sampled test split
y_pred_rf_benchmark_2 = rf_benchmark.fit(X_train_sample, y_train_sample_2).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_rf_benchmark_2 = accuracy_score(y_test_sample_2, y_pred_rf_benchmark_2)

# Per-class metrics: stored as a dict and printed as text
report_rf_benchmark_2 = classification_report(y_test_sample_2, y_pred_rf_benchmark_2, output_dict=True)
print(classification_report(y_test_sample_2, y_pred_rf_benchmark_2))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_rf_benchmark_2:.5f}")

#### 6 Class Classification

In [23]:
# Refit the same estimator on the 6-class labels and predict the sampled test split
y_pred_rf_benchmark_6 = rf_benchmark.fit(X_train_sample, y_train_sample_6).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_rf_benchmark_6 = accuracy_score(y_test_sample_6, y_pred_rf_benchmark_6)

# Per-class metrics: stored as a dict and printed as text
report_rf_benchmark_6 = classification_report(y_test_sample_6, y_pred_rf_benchmark_6, output_dict=True)
print(classification_report(y_test_sample_6, y_pred_rf_benchmark_6))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_rf_benchmark_6:.5f}")

#### 19 Class Classification

In [25]:
# Refit the same estimator on the 19-class labels and predict the sampled test split
y_pred_rf_benchmark_19 = rf_benchmark.fit(X_train_sample, y_train_sample_19).predict(X_test_sample)

In [None]:
# Overall accuracy on the sampled test split
accuracy_rf_benchmark_19 = accuracy_score(y_test_sample_19, y_pred_rf_benchmark_19)

# Per-class metrics: stored as a dict and printed as text
report_rf_benchmark_19 = classification_report(y_test_sample_19, y_pred_rf_benchmark_19, output_dict=True)
print(classification_report(y_test_sample_19, y_pred_rf_benchmark_19))

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_rf_benchmark_19:.5f}")

## GA Code

## Feature Selection With GA 

The code below (up until "FUNCTION: Feature reduction with Genetics Algorithm") was created for interactive exploration of the genetic algorithm. It should be removed from this notebook once users are comfortable with the implementation. The function ga_feature_reduction() is a copy of the code chunks below, excluding the printing statements.

### GA Variables

In [61]:
# Genetic Algorithm parameters for the interactive walkthrough below
population_size = 20            # individuals per generation; must be even because crossover pairs parents two at a time
n_generations = 50              # number of generations the main loop runs
mutation_rate = 0.1             # per-gene probability of flipping 0 <-> 1
#fitness_threshold = 1          # fitness goal (threshold for stopping) - currently disabled
#stagnation_limit = 3           # generations without improvement before stopping - currently disabled
num_elites = 2                  # best individuals copied unchanged into the next generation

### Fitness Function

In [62]:
# define the fitness function for evaluating feature subsets
def fitness_function(individual, y_train_sample, y_test_sample, CLASSIFIER="rf", metric="accuracy"):
    """Score one GA individual by training a classifier on its selected features.

    The feature matrices are read from the notebook-level ``X_train_sample`` and
    ``X_test_sample``; the 'lr' and 'rf' models reuse the hyperparameters of the
    notebook-level ``lr_benchmark`` / ``rf_benchmark`` estimators.

    Parameters
    ----------
    individual : array of 0/1 genes, one per feature column; genes equal to 1 are selected.
    y_train_sample : labels used to fit the classifier.
    y_test_sample : labels the predictions are scored against.
    CLASSIFIER : 'lr', 'ada' or 'rf'.
    metric : 'accuracy', 'precision', 'recall' or 'f1' (the last three are weighted averages).

    Returns
    -------
    The chosen score, or 0 when the individual selects no feature.

    Raises
    ------
    ValueError
        If ``CLASSIFIER`` or ``metric`` is not one of the supported names.
    """
    # genes are binary: a feature is selected when its gene equals 1
    selected_features = np.where(individual == 1)[0]

    if len(selected_features) == 0:                   # avoid empty feature set
        return 0

    # Reject bad options before any training happens; previously an unknown
    # classifier fell through every branch and crashed on an unbound y_pred,
    # and a bad metric was only noticed after a full model fit.
    if CLASSIFIER not in ('lr', 'ada', 'rf'):
        raise ValueError("Invalid CLASSIFIER. Choose from 'lr', 'ada', or 'rf'.")
    if metric not in ('accuracy', 'precision', 'recall', 'f1'):
        raise ValueError("Invalid metric. Choose from 'accuracy', 'precision', 'recall', or 'f1'.")

    X_train_selected = X_train_sample[:, selected_features]
    X_test_selected = X_test_sample[:, selected_features]

    if CLASSIFIER == 'lr':
        model = LogisticRegression(**lr_benchmark.get_params())
    elif CLASSIFIER == 'ada':
        model = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=1984)
    else:  # 'rf'
        model = RandomForestClassifier(**rf_benchmark.get_params())

    model.fit(X_train_selected, y_train_sample)
    y_pred = model.predict(X_test_selected)

    # Calculate the chosen metric
    if metric == 'accuracy':
        score = accuracy_score(y_test_sample, y_pred)
    elif metric == 'precision':
        score = precision_score(y_test_sample, y_pred, average='weighted')
    elif metric == 'recall':
        score = recall_score(y_test_sample, y_pred, average='weighted')
    else:  # 'f1'
        score = f1_score(y_test_sample, y_pred, average='weighted')

    return score

### Generate Population and Objects for Collecting Results

In [63]:
# One gene per feature column of the training matrix
n_features = X_train_sample.shape[1]

# Random starting population of 0/1 genes: one row per individual, one column per feature
population = np.random.randint(2, size=(population_size, n_features))

# Bookkeeping used by the main loop
termination_reason = None       # filled in if/when a stopping rule fires
no_improvement_count = 0        # generations since the best fitness last improved
best_individual_overall = None  # best feature mask seen so far
best_fitness_overall = 0        # fitness of that mask

In [None]:
# Genetic Algorithm main loop for feature selection (verbose walkthrough version).
# Each generation: score the population, remember the best individual seen so far,
# then build the next population from the elites plus mutated crossover offspring.
for generation in range(n_generations):

    # printing population at the start of each loop
    print(f"LOOP STEP: Generation {generation + 1}")
    print(f"Population {generation + 1}: {population}")

    # using fitness_function (defined above) to calculate the fitness score for each individual in population
    fitness_scores = np.array([
        fitness_function(individual, y_train_sample=y_train_sample_2, y_test_sample=y_test_sample_2)
        for individual in population
    ])

    ### PROGRESS UPDATE: display calculated fitness scores
    print(f"Fitness Scores: {fitness_scores}")

    # best individual of the current generation
    current_best_fitness = np.max(fitness_scores)
    current_best_individual = population[np.argmax(fitness_scores)]

    # genes are 0/1, so counting the values > 0.5 counts the selected features
    num_selected_features = np.sum(current_best_individual > 0.5)

    ### PROGRESS UPDATE: print generation stats including number of selected features
    print(f"Best Fitness = {current_best_fitness:.7f}, Number of Features = {num_selected_features}, Selected Features = {current_best_individual}")

    # Update the overall best only on a real improvement. Overwriting it every
    # generation would make "overall best" mean "best of the last generation".
    if current_best_fitness > best_fitness_overall:
        best_fitness_overall = current_best_fitness
        best_individual_overall = current_best_individual.copy()
        no_improvement_count = 0   # reset stagnation counter if there's improvement
    else:
        no_improvement_count += 1  # increase stagnation counter if no improvement

    ### PROGRESS UPDATE: print current state of best model
    print(f"Overall best fitness: {best_fitness_overall:.7f}, Overall best individual: {best_individual_overall}")

    ###___________________START___________________###
    ### These two criteria were introduced to terminate the loop earlier.
    ### They depend on fitness_threshold / stagnation_limit, which are not defined
    ### while those parameters stay commented out, so they remain disabled for now.

    # check if the fitness threshold is reached
    #if best_fitness_overall >= fitness_threshold:
    #    termination_reason = f"Desired fitness threshold of {fitness_threshold} reached."
    #    break

    # check if there's been no improvement for stagnation_limit generations
    #if no_improvement_count >= stagnation_limit:
    #    termination_reason = f"No improvement for {stagnation_limit} generations."
    #    break
    ###___________________END_____________________###

    ###___________________START___________________###
    # SELECTION: roulette wheel selection based on fitness - the better an individual's
    # fitness score, the higher its probability of being picked as a parent
    # ("survival of the fittest").
    fitness_total = np.sum(fitness_scores)
    if fitness_total > 0:
        probabilities = fitness_scores / fitness_total
    else:
        # every individual scored 0: dividing would give NaN, so pick parents uniformly
        probabilities = np.full(population_size, 1.0 / population_size)
    ### PROGRESS UPDATE: print probabilities
    print(f"Probabilities: {probabilities}")

    # Randomly selects individuals with replacement, using the probabilities array to bias the selection
    selected_indices = np.random.choice(np.arange(population_size), size=population_size, p=probabilities)
    ### PROGRESS UPDATE: print selected indicies
    print(f"Selected indicies: {selected_indices}")

    # creating a new array that contains only the chosen individuals from the current generation.
    selected_population = population[selected_indices]
    ### PROGRESS UPDATE: print new population
    print(f"Selected population: {selected_population}")
    ###___________________END_____________________###

    ###___________________START___________________###
    # ELITISM: carry the best individuals over unchanged so the best solutions
    # found so far are not lost during evolution.
    # Slicing from (size - num_elites) rather than [-num_elites:] keeps num_elites == 0
    # meaning "no elites"; [-0:] would return the whole population.
    elite_indices = np.argsort(fitness_scores)[len(fitness_scores) - num_elites:]
    ### PROGRESS UPDATE: print elite indices
    print(f"Elites indicies: {elite_indices}")

    elite_population = population[elite_indices]
    ### PROGRESS UPDATE: print elite individuals
    print(f"Elites individuals: {elite_population}")
    ###___________________END_____________________###

    ###___________________START___________________###
    # CROSSOVER: single-point crossover. Two parents swap the tails of their gene
    # vectors so each child inherits a contiguous part from each parent.
    offspring = []
    # the loop goes through pairs of individuals, iterating by 2
    for i in range(0, population_size, 2):
        # selecting 2 consecutive individuals as parents
        parent1, parent2 = selected_population[i], selected_population[i+1]
        ### PROGRESS UPDATE: parents
        print(f"Parent 1: {parent1}; Parent 2: {parent2}")

        # randomly defines where the "split" between the parents' genes will occur.
        # randint's upper bound is exclusive, so n_features allows every split point 1..n_features-1
        crossover_point = np.random.randint(1, n_features)
        ### PROGRESS UPDATE: crossover point
        print(f"Crossover_point: {crossover_point}")

        child1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
        child2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
        ### PROGRESS UPDATE: children
        print(f"Child 1: {child1}; Child 2: {child2}")

        offspring.extend([child1, child2])

    # Trim the offspring so that elites + offspring add up to population_size again
    offspring = np.array(offspring[:population_size - num_elites])
    ###___________________END___________________###

    ###___________________START___________________###
    # MUTATION: flip each gene (0 <-> 1) with probability mutation_rate. This adds
    # diversity so the search does not rely only on recombining existing solutions.
    for individual in offspring:
        # TRUE where a uniform random draw falls below mutation_rate
        mutation_mask = np.random.rand(n_features) < mutation_rate
        # PROGRESS UPDATE: mutation_mask
        print(f"Mutation mask: {mutation_mask}")

        # rows of offspring are views, so this flips the genes in place
        individual[mutation_mask] = 1 - individual[mutation_mask]
    ###___________________END___________________###

    # Replace the population with offspring plus elite individuals
    population = np.vstack((elite_population, offspring))

# Keep the reported feature count in step with the overall best individual
if best_individual_overall is not None:
    num_selected_features = int(np.sum(best_individual_overall))

# if no termination reason was set, the loop ran for the maximum number of generations
if termination_reason is None:
    termination_reason = f"Reached maximum number of generations ({n_generations})."

# print termination reason
print(f"Termination Reason: {termination_reason}")

In [None]:
# Derive the feature count from the overall best individual itself, so the number
# always describes the individual printed next to it (and the cell still works
# when no individual was ever recorded).
best_feature_count = 0 if best_individual_overall is None else int(np.sum(best_individual_overall))
print(f"Best Fitness Overall: {best_fitness_overall}; Number of Selected Features: {best_feature_count}; Best inidvidual Overall: {best_individual_overall}")

### FUNCTION: Feature reduction with Genetics Algorithm

In [97]:
# define overall function (the GA steps from the walkthrough cells, without the verbose printing)
def _check_ga_choices(CLASSIFIER, metric):
    """Raise ValueError unless CLASSIFIER and metric are supported option names."""
    if CLASSIFIER not in ('lr', 'ada', 'rf'):
        raise ValueError("Invalid CLASSIFIER. Choose from 'lr', 'ada', or 'rf'.")
    if metric not in ('accuracy', 'precision', 'recall', 'f1'):
        raise ValueError("Invalid metric. Choose from 'accuracy', 'precision', 'recall', or 'f1'.")


def ga_feature_reduction(X_train_sample, y_train_sample, X_test_sample, y_test_sample,
                         population_size=20, n_generations=50, mutation_rate=0.1,
                         num_elites=2, CLASSIFIER="rf", metric="accuracy", random_state=None):
    """Search for a good feature subset with a genetic algorithm.

    Individuals are 0/1 masks over the columns of ``X_train_sample``. Every
    generation the masks are scored with ``fitness_function``, parents are drawn
    by roulette-wheel selection, recombined with single-point crossover, mutated
    by bit flips, and merged with the ``num_elites`` best individuals.

    NOTE: candidates are scored on ``X_test_sample`` / ``y_test_sample``. Scores
    later reported on that same data are therefore optimistic; pass a separate
    validation split here if an unbiased final estimate is needed.

    Parameters
    ----------
    X_train_sample, y_train_sample : data each candidate classifier is fitted on.
    X_test_sample, y_test_sample : data each candidate is scored on.
    population_size : number of individuals; must be even (parents are paired).
    n_generations : number of generations to run.
    mutation_rate : per-gene probability of a bit flip.
    num_elites : individuals copied unchanged into the next generation (0..population_size).
    CLASSIFIER : 'lr', 'ada' or 'rf'.
    metric : 'accuracy', 'precision', 'recall' or 'f1'.
    random_state : optional int seed for the GA's own random draws. ``None`` (default)
        keeps using numpy's global random state, as before.

    Returns
    -------
    (best_fitness_overall, best_individual_overall); the individual is ``None``
    when no candidate ever scored above 0.

    Raises
    ------
    ValueError
        For an odd ``population_size``, an out-of-range ``num_elites``, or an
        unsupported ``CLASSIFIER`` / ``metric``.
    """
    # Validate inputs
    if population_size % 2 != 0:
        raise ValueError("Population size must be an even number.")
    if not 0 <= num_elites <= population_size:
        raise ValueError("num_elites must be between 0 and population_size.")
    _check_ga_choices(CLASSIFIER, metric)

    # np.random (module) and RandomState expose the same randint / choice / rand API
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Get the number of features for individual length
    n_features = X_train_sample.shape[1]

    # Initialize population with random binary values (0 or 1)
    population = rng.randint(2, size=(population_size, n_features))

    best_fitness_overall = 0
    best_individual_overall = None

    # Main loop for the genetic algorithm
    for generation in range(n_generations):
        print(f"LOOP STEP: Generation {generation + 1}")

        # Score the population on the matrices passed to this function
        # (not on whatever notebook globals happen to exist).
        fitness_scores = np.array([
            fitness_function(individual, y_train_sample, y_test_sample, CLASSIFIER, metric,
                             X_train=X_train_sample, X_test=X_test_sample)
            for individual in population
        ])
        print(f"Fitness Scores: {fitness_scores}")

        # Update overall best fitness and individual if this generation improves on it
        best_index = np.argmax(fitness_scores)
        if fitness_scores[best_index] > best_fitness_overall:
            best_fitness_overall = fitness_scores[best_index]
            best_individual_overall = population[best_index].copy()

        print(f"Overall best fitness: {best_fitness_overall:.7f}, Overall best individual: {best_individual_overall}")

        # Selection: roulette wheel selection based on fitness
        fitness_total = fitness_scores.sum()
        if fitness_total > 0:
            probabilities = fitness_scores / fitness_total
        else:
            # all scores are 0: dividing would give NaN, so select uniformly instead
            probabilities = np.full(population_size, 1.0 / population_size)
        selected_indices = rng.choice(np.arange(population_size), size=population_size, p=probabilities)
        selected_population = population[selected_indices]

        # Elitism: preserve the best individuals. Slicing from (size - num_elites)
        # keeps num_elites == 0 meaning "none"; [-0:] would select everyone.
        elite_indices = np.argsort(fitness_scores)[population_size - num_elites:]
        elite_population = population[elite_indices]

        # Crossover: single-point crossover on consecutive pairs of selected parents
        children = []
        for i in range(0, population_size, 2):
            parent1, parent2 = selected_population[i], selected_population[i + 1]
            if n_features > 1:
                # upper bound is exclusive, so this covers every split point 1..n_features-1
                crossover_point = rng.randint(1, n_features)
            else:
                crossover_point = 0  # nothing to split; children are copies of the parents
            children.append(np.concatenate([parent1[:crossover_point], parent2[crossover_point:]]))
            children.append(np.concatenate([parent2[:crossover_point], parent1[crossover_point:]]))

        # Trim so that elites + offspring add up to population_size; the explicit
        # reshape keeps a 2-D array even when no offspring are kept.
        children = children[:population_size - num_elites]
        offspring = np.array(children, dtype=population.dtype).reshape(len(children), n_features)

        # Mutation: flip bits with a probability of mutation_rate
        mutation_mask = rng.rand(*offspring.shape) < mutation_rate
        offspring[mutation_mask] = 1 - offspring[mutation_mask]

        # Replace the population with offspring plus elite individuals
        population = np.vstack((elite_population, offspring))

    # The generation budget is the only stopping rule implemented here
    termination_reason = f"Reached maximum number of generations ({n_generations})."

    # Function output
    print(f"Termination Reason: {termination_reason}")
    print(f"Best Fitness: {best_fitness_overall}")
    print(f"Best Individual: {best_individual_overall}")
    return best_fitness_overall, best_individual_overall


def fitness_function(individual, y_train_sample, y_test_sample, CLASSIFIER="rf", metric="accuracy",
                     X_train=None, X_test=None):
    """Score one GA individual by training a classifier on its selected features.

    Parameters
    ----------
    individual : array of 0/1 genes, one per feature column; genes equal to 1 are selected.
    y_train_sample : labels used to fit the classifier.
    y_test_sample : labels the predictions are scored against.
    CLASSIFIER : 'lr', 'ada' or 'rf'. 'lr' and 'rf' reuse the hyperparameters of the
        notebook-level ``lr_benchmark`` / ``rf_benchmark`` so that GA scores are
        reproducible and comparable with the benchmark runs.
    metric : 'accuracy', 'precision', 'recall' or 'f1' (the last three are weighted averages).
    X_train, X_test : feature matrices. When omitted, the notebook-level
        ``X_train_sample`` / ``X_test_sample`` are used.

    Returns
    -------
    The chosen score, or 0 when the individual selects no feature.

    Raises
    ------
    ValueError
        If ``CLASSIFIER`` or ``metric`` is not one of the supported names.
    """
    selected_features = np.where(individual == 1)[0]  # Select features based on individual's genes

    if len(selected_features) == 0:  # Avoid empty feature set
        return 0

    # Fail fast, before any model is trained
    _check_ga_choices(CLASSIFIER, metric)

    # Fall back to the notebook globals for callers that do not pass matrices
    if X_train is None:
        X_train = X_train_sample
    if X_test is None:
        X_test = X_test_sample

    X_train_selected = X_train[:, selected_features]
    X_test_selected = X_test[:, selected_features]

    if CLASSIFIER == 'lr':
        model = LogisticRegression(**lr_benchmark.get_params())
    elif CLASSIFIER == 'ada':
        model = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=1984)
    else:  # 'rf'
        model = RandomForestClassifier(**rf_benchmark.get_params())

    model.fit(X_train_selected, y_train_sample)
    y_pred = model.predict(X_test_selected)

    # Calculate the chosen metric
    if metric == 'accuracy':
        score = accuracy_score(y_test_sample, y_pred)
    elif metric == 'precision':
        score = precision_score(y_test_sample, y_pred, average='weighted')
    elif metric == 'recall':
        score = recall_score(y_test_sample, y_pred, average='weighted')
    else:  # 'f1'
        score = f1_score(y_test_sample, y_pred, average='weighted')

    return score


In [None]:
# Quick run of the GA wrapper on the binary labels with a small population and few generations
ga_feature_reduction(
    X_train_sample=X_train_sample,
    y_train_sample=y_train_sample_2,
    X_test_sample=X_test_sample,
    y_test_sample=y_test_sample_2,
    population_size=6,
    n_generations=5,
    mutation_rate=0.1,
    num_elites=2,
    CLASSIFIER="rf",
    metric="accuracy",
)

## Feature Selection With GA

### Logistic Regression

#### Binary Classification

In [None]:
# GA search on the binary labels, scoring candidates with logistic regression
global_best_fitness, global_best_position = ga_feature_reduction(
    X_train_sample=X_train_sample,
    y_train_sample=y_train_sample_2,
    X_test_sample=X_test_sample,
    y_test_sample=y_test_sample_2,
    population_size=6,
    n_generations=3,
    mutation_rate=0.1,
    num_elites=2,
    CLASSIFIER="lr",
    metric="accuracy",
)

In [145]:
# Turn the best 0/1 individual returned by the GA into a boolean column mask
selected_features = np.array(global_best_position, dtype=bool)

# Keep only the selected columns in both splits
X_test_selected = X_test_sample[:, selected_features]
X_train_selected = X_train_sample[:, selected_features]

# Refit the benchmark logistic regression on the reduced features (binary labels) and predict
y_pred_lr_ga_2 = lr_benchmark.fit(X_train_selected, y_train_sample_2).predict(X_test_selected)


In [None]:
# Overall accuracy on the sampled test split
accuracy_lr_ga_2 = accuracy_score(y_test_sample_2, y_pred_lr_ga_2)

# Per-class metrics: stored as a dict and printed as text
report_lr_ga_2 = classification_report(y_test_sample_2, y_pred_lr_ga_2, output_dict=True)
print(classification_report(y_test_sample_2, y_pred_lr_ga_2))

# Width of the reduced training matrix = number of features the GA kept
print(f"Number of selected features: {X_train_selected.shape[1]}")

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_lr_ga_2:.5f}")

In [None]:
# Fix the label order once, from the labels that actually occur, and use it for both
# the matrix and the tick labels. Reading the ticks from lr_benchmark.classes_ would
# depend on whichever task that shared estimator was fitted on most recently.
labels_lr_ga_2 = np.unique(np.concatenate([y_test_sample_2, y_pred_lr_ga_2]))

# generate the confusion matrix
cm_lr_ga_2 = confusion_matrix(y_test_sample_2, y_pred_lr_ga_2, labels=labels_lr_ga_2)

# visualize the confusion matrix using seaborn
plt.figure(figsize=(15, 12))
sns.heatmap(cm_lr_ga_2, annot=True, fmt='d', cmap='Blues', xticklabels=labels_lr_ga_2, yticklabels=labels_lr_ga_2)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

#### 6 Class Classification

In [None]:
# GA search on the 6-class labels, scoring candidates with logistic regression
global_best_fitness, global_best_position = ga_feature_reduction(
    X_train_sample=X_train_sample,
    y_train_sample=y_train_sample_6,
    X_test_sample=X_test_sample,
    y_test_sample=y_test_sample_6,
    population_size=6,
    n_generations=3,
    mutation_rate=0.1,
    num_elites=2,
    CLASSIFIER="lr",
    metric="accuracy",
)

In [152]:
# Turn the best 0/1 individual returned by the GA into a boolean column mask
selected_features = np.array(global_best_position, dtype=bool)

# Keep only the selected columns in both splits
X_test_selected = X_test_sample[:, selected_features]
X_train_selected = X_train_sample[:, selected_features]

# Refit the benchmark logistic regression on the reduced features (6-class labels) and predict
y_pred_lr_ga_6 = lr_benchmark.fit(X_train_selected, y_train_sample_6).predict(X_test_selected)

In [None]:
# Overall accuracy on the sampled test split
accuracy_lr_ga_6 = accuracy_score(y_test_sample_6, y_pred_lr_ga_6)

# Per-class metrics: stored as a dict and printed as text
report_lr_ga_6 = classification_report(y_test_sample_6, y_pred_lr_ga_6, output_dict=True)
print(classification_report(y_test_sample_6, y_pred_lr_ga_6))

# Width of the reduced training matrix = number of features the GA kept
print(f"Number of selected features: {X_train_selected.shape[1]}")

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_lr_ga_6:.5f}")

In [None]:
# Fix the label order once, from the labels that actually occur, and use it for both
# the matrix and the tick labels. Reading the ticks from lr_benchmark.classes_ would
# depend on whichever task that shared estimator was fitted on most recently.
labels_lr_ga_6 = np.unique(np.concatenate([y_test_sample_6, y_pred_lr_ga_6]))

# generate the confusion matrix
cm_lr_ga_6 = confusion_matrix(y_test_sample_6, y_pred_lr_ga_6, labels=labels_lr_ga_6)

# visualize the confusion matrix using seaborn
plt.figure(figsize=(15, 12))
sns.heatmap(cm_lr_ga_6, annot=True, fmt='d', cmap='Blues', xticklabels=labels_lr_ga_6, yticklabels=labels_lr_ga_6)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

#### 19 Class Classification

In [None]:
# GA search on the 19-class labels, scoring candidates with logistic regression
global_best_fitness, global_best_position = ga_feature_reduction(
    X_train_sample=X_train_sample,
    y_train_sample=y_train_sample_19,
    X_test_sample=X_test_sample,
    y_test_sample=y_test_sample_19,
    population_size=6,
    n_generations=3,
    mutation_rate=0.1,
    num_elites=2,
    CLASSIFIER="lr",
    metric="accuracy",
)

In [156]:
# Turn the best 0/1 individual returned by the GA into a boolean column mask
selected_features = np.array(global_best_position, dtype=bool)

# Keep only the selected columns in both splits
X_test_selected = X_test_sample[:, selected_features]
X_train_selected = X_train_sample[:, selected_features]

# Refit the benchmark logistic regression on the reduced features (19-class labels) and predict
y_pred_lr_ga_19 = lr_benchmark.fit(X_train_selected, y_train_sample_19).predict(X_test_selected)

In [None]:
# Overall accuracy on the sampled test split
accuracy_lr_ga_19 = accuracy_score(y_test_sample_19, y_pred_lr_ga_19)

# Per-class metrics: stored as a dict and printed as text
report_lr_ga_19 = classification_report(y_test_sample_19, y_pred_lr_ga_19, output_dict=True)
print(classification_report(y_test_sample_19, y_pred_lr_ga_19))

# Width of the reduced training matrix = number of features the GA kept
print(f"Number of selected features: {X_train_selected.shape[1]}")

# Accuracy shown with 5 decimal places
print(f"Accuracy: {accuracy_lr_ga_19:.5f}")

In [None]:
# Fix the label order once, from the labels that actually occur, and use it for both
# the matrix and the tick labels. Reading the ticks from lr_benchmark.classes_ would
# depend on whichever task that shared estimator was fitted on most recently.
labels_lr_ga_19 = np.unique(np.concatenate([y_test_sample_19, y_pred_lr_ga_19]))

# generate the confusion matrix
cm_lr_ga_19 = confusion_matrix(y_test_sample_19, y_pred_lr_ga_19, labels=labels_lr_ga_19)

# visualize the confusion matrix using seaborn
plt.figure(figsize=(15, 12))
sns.heatmap(cm_lr_ga_19, annot=True, fmt='d', cmap='Blues', xticklabels=labels_lr_ga_19, yticklabels=labels_lr_ga_19)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()