In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pygad
import torch
# from google.colab import drive

In [2]:
# drive.mount('/content/drive')
# file_path = '/content/drive/My Drive/Research/dataset/cardio_train.csv'
file_path = 'dataset/cardio_train.csv'

df = pd.read_csv(file_path, sep=';')

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
columns = df.columns.drop('cardio')
print(columns)

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active'],
      dtype='object')


In [4]:
# remove duplicates rows of columns=columns
df.drop_duplicates(subset=columns, inplace=True)

In [5]:
# replace age in days to age in years

df['age'] = df['age'] / 365
df['age'] = df['age'].astype(int)

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0


In [6]:
# group by blood pressure
# 1- Normal (sys <= 120 and dia <= 80)
# 2- at risk (120 < sys <= 140 or 80 < dia <= 90)
# 3- high (sys > 140 or dia > 90)

df['blood_pressure'] = 0

df.loc[(df['ap_hi'] <= 120) & (df['ap_lo'] <= 80), 'blood_pressure'] = 1

df.loc[((df['ap_hi'] > 120) & (df['ap_hi'] <= 140)) | ((df['ap_lo'] > 80) & (df['ap_lo'] <= 90)), 'blood_pressure'] = 2

df.loc[(df['ap_hi'] > 140) | (df['ap_lo'] > 90), 'blood_pressure'] = 3

df.head()


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,blood_pressure
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,1
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,2
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1,2
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,3
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0,1


In [7]:
# calculate BMI

df['bmi'] = df['weight'] / (df['height'] / 100) ** 2

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,blood_pressure,bmi
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,1,21.96712
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,2,34.927679
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1,2,23.507805
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,3,28.710479
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0,1,23.011177


In [8]:
df.fillna(df.median(), inplace=True)
df = df.drop(columns=['id'],axis=1)

# Extract features (all columns except 'cardio') and target variable ('cardio')
X = df.drop(columns=['cardio'],axis=1)
y = df['cardio']

columns_to_standardize = ['age', 'height','weight','ap_hi','ap_lo','bmi']


# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

# Standardize the feature values
scaler = StandardScaler()
X_train[columns_to_standardize] = scaler.fit_transform(X_train[columns_to_standardize])
X_test[columns_to_standardize] = scaler.fit_transform(X_test[columns_to_standardize])
X_val[columns_to_standardize] = scaler.fit_transform(X_val[columns_to_standardize])

In [9]:
from deap import base, creator, tools, algorithms
import random
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score


In [10]:
# Create the Fitness and Individual classes
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

def evaluate(individual):
    # Ensure the number of hidden layers is an integer
    num_hidden_layers = int(individual[-3])
    
    # Extract the sizes of the hidden layers based on the number of hidden layers
    hidden_layers = tuple(individual[:num_hidden_layers])
    
    # Extract other hyperparameters
    activation = ['identity', 'logistic', 'tanh', 'relu'][individual[-2]]
    solver = 'adam'
    alpha = individual[-1]

    # Initialize the MLPClassifier with the hyperparameters
    model = MLPClassifier(hidden_layer_sizes=hidden_layers, activation=activation,
                          solver=solver, alpha=alpha, max_iter= 1000, random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_pred = model.predict(X_val)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)
    
    # Return the fitness score (accuracy)
    return accuracy,

# Define the genetic algorithm components
toolbox = base.Toolbox()

# Define individual components: up to 6 hidden layers, activation, solver, and alpha
toolbox.register("attr_hidden_layer_size", random.randint, 10, 200)
toolbox.register("attr_num_hidden_layers", random.randint, 1, 10)  # 1 to 10 hidden layers
toolbox.register("attr_activation", random.randint, 0, 3)
toolbox.register("attr_alpha", random.uniform, 0.0001, 0.01)

# Define the individual and population
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_hidden_layer_size,
                  toolbox.attr_num_hidden_layers,
                  toolbox.attr_activation,
                  toolbox.attr_alpha), n=1)

toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("mate",tools.cxTwoPoint)

# Update mutation function to ensure valid hidden layer sizes
toolbox.register("mutate", tools.mutUniformInt, 
                 low=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0.0001],  # Ensure minimum size of 1 for hidden layers
                 up=[200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 10, 3, 2, 0.01], indpb=0.2)

toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("evaluate", evaluate)

# Main function to run the genetic algorithm
def main():
    random.seed(42)
    
    # Initialize population
    population = toolbox.population(n=20)
    
    # Apply the genetic algorithm
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=20, verbose=True)
    
    # Get the best individual
    best_individual = tools.selBest(population, k=1)[0]
    print("Best individual is:", best_individual)
    print("Best validation accuracy is:", evaluate(best_individual)[0])

main()

gen	nevals
0  	20    
1  	15    
2  	4     
3  	13    
4  	11    
5  	14    
6  	12    
7  	10    
8  	12    
9  	15    
10 	7     
Best individual is: [135, 14, 38, 102, 114, 81, 181, 176, 155, 30, 1, 3, 0.0025056726997176727]
Best validation accuracy is: 0.7388095238095238


In [None]:
# select 20K randomly of data
X_train

In [None]:

# Convert pandas DataFrames to numpy arrays
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
X_val_np = X_val.to_numpy()

y_train_np = y_train.values.reshape(-1, 1)
y_test_np = y_test.values.reshape(-1, 1)
y_val_np = y_val.values.reshape(-1, 1)

# Convert numpy arrays to PyTorch tensors
X_train_torch = torch.tensor(X_train_np, dtype=torch.float32)
y_train_torch = torch.tensor(y_train_np, dtype=torch.float32)

X_val_torch = torch.tensor(X_val_np, dtype=torch.float32)
y_val_torch = torch.tensor(y_val_np, dtype=torch.float32)

X_test_torch = torch.tensor(X_test_np, dtype=torch.float32)
y_test_torch = torch.tensor(y_test_np, dtype=torch.float32)


In [None]:
class Model(nn.Module):
    def __init__(self, input_dim, activations, number_of_hidden_units):
        super(Model, self).__init__()
        self.net = nn.Sequential(

            nn.Linear(input_dim, number_of_hidden_units[0]),
            activations[0],

            nn.Linear(number_of_hidden_units[0], number_of_hidden_units[1]),
            activations[1],

            nn.Linear(number_of_hidden_units[1], number_of_hidden_units[2]),
            activations[2],

            nn.Linear(number_of_hidden_units[2], number_of_hidden_units[3]),
            activations[3],

            nn.Linear(number_of_hidden_units[3], number_of_hidden_units[4]),
            activations[4],

            nn.Linear(number_of_hidden_units[4], number_of_hidden_units[5]),
            activations[5],

            nn.Linear(number_of_hidden_units[5], 1),
            nn.Sigmoid()
        )
        

    def forward(self, x):
        return self.net(x)


In [None]:
sytem = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create a DataLoader
train_data = TensorDataset(X_test_torch, y_test_torch)
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# Create the model
number_of_hidden_units = [64, 128, 256, 512, 256, 128, 64]
activations = [nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU()]
model = Model(X_test_torch.size(1), activations, number_of_hidden_units).to(sytem)

# Define the loss function and the optimizer
criterion = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Function to train the model

def train_model(model, criterion, optimizer, train_loader, num_epochs=20):

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(sytem), labels.to(sytem)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        if (epoch + 1) % 25 == 0:
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss:.2f}")

In [None]:
# create genes which contains
#import pygad 1- number of hidden units at each 6 layers
# 2- activation function
# 3- number of epochs
# 4- learning rate
# 5- batch sizeimport torch
# 6- optimizer
# 7- loss function
# 8- feature selection

number_of_hidden_units_layer0 = range(512)
number_of_hidden_units_layer1 = range(512)
number_of_hidden_units_layer2 = range(512)
number_of_hidden_units_layer3 = range(512)
number_of_hidden_units_layer4 = range(512)
number_of_hidden_units_layer5 = range(512)

# Swish Function 
class Swish(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(x)

activation_functions = [nn.ReLU(), nn.Sigmoid(), nn.Tanh(), nn.Softmax(), nn.Softplus(), nn.Softsign(), nn.ELU(), nn.SELU(), nn.GELU(), nn.LeakyReLU(), nn.PReLU(), nn.RReLU(), nn.CELU(), nn.SiLU(), nn.Mish(), Swish()]
number_of_epochs = 200
learning_rates = [0.1, 0.01, 0.001, 0.0001]
batch_sizes = range(8, 256)
optimizers = [optim.Adam, optim.SGD, optim.Adagrad, optim.RMSprop]

loss_functions = [nn.MSELoss(), nn.L1Loss(), nn.BCELoss(), nn.BCEWithLogitsLoss()]
feature_selection = [True, False]

genes = range(12 + X_train.shape[1] + 5)
# gen 0, ..., 12: Feature selection
# gen 13, ..., 19: Number of hidden units
# gen 20, ..., 26: Activation functions
# gen 27: Number of epochs
# gen 28: Learning rate
# gen 29: Batch size
# gen 30: Optimizer
# gen 31: Loss function
gene_space = [{'low': 0, 'high': 2, 'step': 1}] * 13 + [{'low': 0, 'high': 512, 'step': 1}] * 6 + [{'low': 0, 'high': 15, 'step': 1}] * 6 + [{'low': 50, 'high': 5000, 'step': 1}] + [{'low': 0, 'high': 3, 'step': 1}] + [{'low': 8, 'high': 256, 'step': 1}] + [{'low': 0, 'high': 3, 'step': 1}] + [{'low': 0, 'high': 3, 'step': 1}]
len(gene_space), len(genes)


In [None]:
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


In [None]:
def fintness_func(ga_instance, solution,  solution_idx):
    logging.info(f'Evaluating solution {solution_idx}: {solution}')

    selected_features = []
    for i in range(12):
        if solution[i] == 1:
            selected_features.append(i)
            
    if len(selected_features) == 0:
        return 0
    
    selected_features = X.columns[selected_features]
    X_train_selected = X_train[selected_features]
    X_val_selected = X_val[selected_features]
    X_test_selected = X_test[selected_features]
    # convert the data to tensors
    X_train_torch = torch.tensor(X_train_selected.to_numpy(), dtype=torch.float32)
    y_train_torch = torch.tensor(y_train.to_numpy().reshape(-1, 1), dtype=torch.float32)

    X_val_torch = torch.tensor(X_val_selected.to_numpy(), dtype=torch.float32)
    y_val_torch = torch.tensor(y_val.to_numpy().reshape(-1, 1), dtype=torch.float32)

    X_test_torch = torch.tensor(X_test_selected.to_numpy(), dtype=torch.float32)
    y_test_torch = torch.tensor(y_test.to_numpy().reshape(-1, 1), dtype=torch.float32)


    system = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    idx = X_train.shape[1]

    number_of_hidden_units = [int(solution[idx]), int(solution[idx + 1]), int(solution[idx + 2]), int(solution[idx + 3]), int(solution[idx + 4]), int(solution[idx + 5])]
    idx += 6

    activations = [nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU()]
    for i in range(6):
        activations[i] = activation_functions[int(solution[idx])]
        idx += 1
        
    number_of_epochs = 50
    idx+=1

    lr = learning_rates[int(solution[idx])]
    idx += 1

    batch_size = int(solution[idx])
    idx += 1

    optimizer = optimizers[int(solution[idx])]
    idx += 1

    criterion = loss_functions[int(solution[idx])]
    idx += 1

    model = Model(X_train_torch.size(1), activations, number_of_hidden_units).to(system)
    optimizer = optimizer(model.parameters(), lr=lr)

    logging.info(f'Model structure: {model}')
    logging.info(f'Learning rate: {lr}')
    logging.info(f'Batch size: {batch_size}')
    logging.info(f'Optimizer: {optimizer}')
    logging.info(f'Criterion: {criterion}')

    train_data = TensorDataset(X_train_torch, y_train_torch)
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

    train_model(model, criterion, optimizer, train_loader, num_epochs=number_of_epochs)   
    #X_train, X_val, y_train, y_val
    model.eval()
    with torch.no_grad():
        y_pred = model(X_val_torch.to(sytem))
        y_pred_class = y_pred.round()

    accuracy = accuracy_score(y_val_torch.cpu().numpy(), y_pred_class.cpu().numpy())
    logging.info(f'Accuracy for solution {solution_idx}: {accuracy}')
    
    return accuracy


In [None]:
fitness_function = fintness_func

num_generations = 20
num_parents_mating = 4

sol_per_pop = 5
num_gens = 13

init_range_low  = -1
init_range_high = 1

parent_selection_type = "sss"

crossover_type = "single_point"

mutation_type = "random"
mutation_percent_genes = 10

keep_parents = 1

ga_instance = pygad.GA(num_generations=num_generations,
                          num_parents_mating=num_parents_mating,
                          fitness_func=fitness_function,
                          sol_per_pop=sol_per_pop,
                          num_genes=len(genes),
                          init_range_low=init_range_low,
                          init_range_high=init_range_high,
                          parent_selection_type=parent_selection_type,
                          crossover_type=crossover_type,
                          mutation_type=mutation_type,
                          mutation_percent_genes=mutation_percent_genes,
                          keep_parents=keep_parents,
                          gene_space=gene_space)

In [None]:
ga_instance.run()

In [None]:
# Retrieve the best solution
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print(f"Best solution: {solution}")
print(f"Fitness value of the best solution: {solution_fitness}")