In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
# from google.colab import drive

In [5]:
# drive.mount('/content/drive')
# file_path = '/content/drive/My Drive/Research/dataset/cardio_train.csv'
file_path = 'cardio_train.csv'

df = pd.read_csv(file_path, sep=';')

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [6]:
columns = df.columns.drop('cardio')
print(columns)

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active'],
      dtype='object')


In [7]:
# remove duplicates rows of columns=columns
df.drop_duplicates(subset=columns, inplace=True)

In [8]:
# replace age in days to age in years

df['age'] = df['age'] / 365
df['age'] = df['age'].astype(int)

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0


In [9]:
# group by blood pressure
# 1- Normal (sys <= 120 and dia <= 80)
# 2- at risk (120 < sys <= 140 or 80 < dia <= 90)
# 3- high (sys > 140 or dia > 90)

df['blood_pressure'] = 0

df.loc[(df['ap_hi'] <= 120) & (df['ap_lo'] <= 80), 'blood_pressure'] = 1

df.loc[((df['ap_hi'] > 120) & (df['ap_hi'] <= 140)) | ((df['ap_lo'] > 80) & (df['ap_lo'] <= 90)), 'blood_pressure'] = 2

df.loc[(df['ap_hi'] > 140) | (df['ap_lo'] > 90), 'blood_pressure'] = 3

df.head()


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,blood_pressure
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,1
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,2
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1,2
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,3
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0,1


In [10]:
# calculate BMI

df['bmi'] = df['weight'] / (df['height'] / 100) ** 2

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,blood_pressure,bmi
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,1,21.96712
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,2,34.927679
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1,2,23.507805
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,3,28.710479
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0,1,23.011177


In [11]:
df.fillna(df.median(), inplace=True)
df = df.drop(columns=['id'],axis=1)

# Extract features (all columns except 'cardio') and target variable ('cardio')
X = df.drop(columns=['cardio'],axis=1)
y = df['cardio']

columns_to_standardize = ['age', 'height','weight','ap_hi','ap_lo','bmi']


# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature values
scaler = StandardScaler()
X_train[columns_to_standardize] = scaler.fit_transform(X_train[columns_to_standardize])
X_test[columns_to_standardize] = scaler.fit_transform(X_test[columns_to_standardize])


In [12]:
# select 20K randomly of data
X_train

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,blood_pressure,bmi
47339,0.910835,1,-1.263910,0.404666,0.008508,-0.035487,2,1,0,0,1,2,1.011377
67456,-1.156104,2,-0.286798,-0.291664,0.075919,-0.035487,1,1,0,0,0,2,-0.144195
12308,0.615558,2,1.178871,1.240262,0.143330,0.015024,1,1,0,0,1,3,0.463786
32557,1.649027,2,1.056732,0.126134,-0.058903,-0.075896,1,1,0,0,1,2,-0.353602
664,0.320281,1,-0.531076,-0.987993,-0.058903,-0.085998,1,1,0,0,1,1,-0.673758
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37194,-1.451381,2,0.690315,0.056501,0.143330,-0.085998,1,1,1,0,1,3,-0.262245
6265,1.501389,2,-0.286798,-0.082765,0.210741,-0.035487,1,1,0,0,1,3,0.042916
54886,1.649027,1,0.568176,-0.013132,-0.058903,-0.085998,1,1,0,0,1,1,-0.269136
860,-0.565550,1,0.323898,-0.291664,-0.058903,-0.085998,1,1,0,0,0,1,-0.401714


In [37]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class Model(nn.Module):
    def __init__(self, input_dim, activations, number_of_hidden_units):
        super(Model, self).__init__()
        self.net = nn.Sequential(

            nn.Linear(input_dim, number_of_hidden_units[0]),
            activations[0],

            nn.Linear(number_of_hidden_units[0], number_of_hidden_units[1]),
            activations[1],

            nn.Linear(number_of_hidden_units[1], number_of_hidden_units[2]),
            activations[2],

            nn.Linear(number_of_hidden_units[2], number_of_hidden_units[3]),
            activations[3],

            nn.Linear(number_of_hidden_units[3], number_of_hidden_units[4]),
            activations[4],

            nn.Linear(number_of_hidden_units[4], number_of_hidden_units[5]),
            activations[5],

            nn.Linear(number_of_hidden_units[5], 1),
            nn.Sigmoid()
        )
        

    def forward(self, x):
        return self.net(x)

In [38]:

# Convert pandas DataFrames to numpy arrays
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
y_train_np = y_train.values.reshape(-1, 1)
y_test_np = y_test.values.reshape(-1, 1)

# Convert numpy arrays to PyTorch tensors
X_train_torch = torch.tensor(X_train_np, dtype=torch.float32)
y_train_torch = torch.tensor(y_train_np, dtype=torch.float32)
X_test_torch = torch.tensor(X_test_np, dtype=torch.float32)
y_test_torch = torch.tensor(y_test_np, dtype=torch.float32)


In [39]:
sytem = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create a DataLoader
train_data = TensorDataset(X_test_torch, y_test_torch)
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# Create the model
number_of_hidden_units = [64, 128, 256, 512, 256, 128, 64]
activations = [nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU()]
model = Model(X_test_torch.size(1), activations, number_of_hidden_units).to(sytem)

# Define the loss function and the optimizer
criterion = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=0.01)

In [27]:
# Function to train the model

def train_model(model, criterion, optimizer, train_loader, num_epochs=20):

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(sytem), labels.to(sytem)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss:.2f}")

In [28]:
# Function to calculate metrics

def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)
        y_pred_class = y_pred.round()
        accuracy = accuracy_score(y_test.cpu().numpy(), y_pred_class.cpu().numpy())
        precision = precision_score(y_test.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')
        recall = recall_score(y_test.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')
        f1 = f1_score(y_test.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')
        return accuracy, precision, recall, f1

In [29]:
# Train the model
train_model(model, criterion, optimizer, train_loader, num_epochs=10)

Epoch 10/10, Loss: 40.22


In [30]:
# Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = model(X_test_torch.to(sytem))
y_pred_class = y_pred.round()
# turn of requires_grad for evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_test_torch.to(sytem))
    y_pred_class = y_pred.round()
accuracy = accuracy_score(y_test_torch.cpu().numpy(), y_pred_class.cpu().numpy())
precision = precision_score(y_test_torch.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')
recall = recall_score(y_test_torch.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')
f1 = f1_score(y_test_torch.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')

print(f"PyTorch Model Metrics:\nAccuracy: {accuracy:.2f}\nPrecision: {precision:.2f}\nRecall: {recall:.2f}\nF1-score: {f1:.2f}\n")

# evaluate the model with train data
y_pred = model(X_train_torch.to(sytem))
y_pred_class = y_pred.round()

# turn of requires_grad for evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_train_torch.to(sytem))
    y_pred_class = y_pred.round()
accuracy = accuracy_score(y_train_torch.cpu().numpy(), y_pred_class.cpu().numpy())

print(f"PyTorch Model Metrics on Train Data:\nAccuracy: {accuracy:.2f}\n")



PyTorch Model Metrics:
Accuracy: 0.74
Precision: 0.74
Recall: 0.74
F1-score: 0.74

PyTorch Model Metrics on Train Data:
Accuracy: 0.73



In [1]:
!pip install -q pygad

In [129]:
# create genes which contains
# 1- number of hidden units at each 6 layers
# 2- activation function
# 3- number of epochs
# 4- learning rate
# 5- batch size
# 6- optimizer
# 7- loss function
# 8- feature selection

import pygad

number_of_hidden_units_layer0 = range(512)
number_of_hidden_units_layer1 = range(512)
number_of_hidden_units_layer2 = range(512)
number_of_hidden_units_layer3 = range(512)
number_of_hidden_units_layer4 = range(512)
number_of_hidden_units_layer5 = range(512)

# Swish Function 
class Swish(nn.Module):
    def forward(self, x):
        return x * torch.sigmoid(x)

activation_functions = [nn.ReLU(), nn.Sigmoid(), nn.Tanh(), nn.Softmax(), nn.Softplus(), nn.Softsign(), nn.ELU(), nn.SELU(), nn.GELU(), nn.LeakyReLU(), nn.PReLU(), nn.RReLU(), nn.CELU(), nn.SiLU(), nn.Mish(), Swish()]
number_of_epochs = range(50, 5000)
learning_rates = [0.1, 0.01, 0.001, 0.0001]
batch_sizes = range(8, 256)
optimizers = [optim.Adam, optim.SGD, optim.Adagrad, optim.RMSprop]

loss_functions = [nn.MSELoss(), nn.L1Loss(), nn.BCELoss(), nn.BCEWithLogitsLoss()]
feature_selection = [True, False]

genes = range(12 + X_train.shape[1] + 5)


In [130]:
# gen 0, ..., 12: Feature selection
# gen 13, ..., 19: Number of hidden units
# gen 20, ..., 26: Activation functions
# gen 27: Number of epochs
# gen 28: Learning rate
# gen 29: Batch size
# gen 30: Optimizer
# gen 31: Loss function

In [131]:
gene_space = [{'low': 0, 'high': 2, 'step': 1}] * 13 + [{'low': 0, 'high': 512, 'step': 1}] * 6 + [{'low': 0, 'high': 15, 'step': 1}] * 6 + [{'low': 50, 'high': 5000, 'step': 1}] + [{'low': 0, 'high': 3, 'step': 1}] + [{'low': 8, 'high': 256, 'step': 1}] + [{'low': 0, 'high': 3, 'step': 1}] + [{'low': 0, 'high': 3, 'step': 1}]


In [132]:
len(gene_space), len(genes)

(30, 30)

In [142]:
def fintness_func(ga_instance, solution,  solution_idx):
    selected_features = []
    for i in range(12):
        if solution[i] == 1:
            selected_features.append(i)
            
    if len(selected_features) == 0:
        return 0
    
    selected_features = X.columns[selected_features]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    # convert the data to tensors
    X_train_torch = torch.tensor(X_train_selected.to_numpy(), dtype=torch.float32)
    y_train_torch = torch.tensor(y_train.to_numpy().reshape(-1, 1), dtype=torch.float32)
    X_test_torch = torch.tensor(X_test_selected.to_numpy(), dtype=torch.float32)
    y_test_torch = torch.tensor(y_test.to_numpy().reshape(-1, 1), dtype=torch.float32)


    system = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    idx = X_train.shape[1]

    number_of_hidden_units = [int(solution[idx]), int(solution[idx + 1]), int(solution[idx + 2]), int(solution[idx + 3]), int(solution[idx + 4]), int(solution[idx + 5])]
    idx += 6

    activations = [nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU(), nn.ReLU()]
    for i in range(6):
        activations[i] = activation_functions[int(solution[idx])]
        idx += 1
        
    number_of_epochs = int(solution[idx])
    idx += 1

    lr = learning_rates[int(solution[idx])]
    idx += 1

    batch_size = int(solution[idx])
    idx += 1

    optimizer = optimizers[int(solution[idx])]
    idx += 1

    criterion = loss_functions[int(solution[idx])]
    idx += 1

    model = Model(X_train_torch.size(1), activations, number_of_hidden_units).to(system)
    optimizer = optimizer(model.parameters(), lr=lr)

    train_data = TensorDataset(X_train_torch, y_train_torch)
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

    train_model(model, criterion, optimizer, train_loader, num_epochs=number_of_epochs)   
    
    model_ga.eval()
    with torch.no_grad():
        y_pred = model_ga(X_test_torch.to(sytem))
        y_pred_class = y_pred.round()

    f1_score = f1_score(y_test_torch.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')

    return f1_score

     

In [143]:
import pygad


fitness_function = fintness_func

num_generations = 20
num_parents_mating = 4



sol_per_pop = 5
num_gens = 13

init_range_low  = -1
init_range_high = 1

parent_selection_type = "sss"

crossover_type = "single_point"

mutation_type = "random"
mutation_percent_genes = 10

keep_parents = 1

ga_instance = pygad.GA(num_generations=num_generations,
                       
                          num_parents_mating=num_parents_mating,
                          fitness_func=fitness_function,
                          sol_per_pop=sol_per_pop,
                          num_genes=len(genes),
                          init_range_low=init_range_low,
                          init_range_high=init_range_high,
                          parent_selection_type=parent_selection_type,
                          crossover_type=crossover_type,
                          mutation_type=mutation_type,
                          mutation_percent_genes=mutation_percent_genes,
                          keep_parents=keep_parents,
                          gene_space=gene_space)




In [144]:
ga_instance.run()

Epoch 10/2226, Loss: 75.48


KeyboardInterrupt: 