In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
!pip install -q pygad

In [3]:
#drive.mount('/content/drive')
file_path = 'dataset/cardio_train.csv'
df = pd.read_csv(file_path, sep=';')

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
columns = df.columns.drop('cardio')
print(columns)

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active'],
      dtype='object')


In [5]:
# remove duplicates rows of columns=columns
df.drop_duplicates(subset=columns, inplace=True)

In [6]:
# replace age in days to age in years

df['age'] = df['age'] / 365
df['age'] = df['age'].astype(int)

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0


In [7]:
# group by blood pressure
# 1- Normal (sys <= 120 and dia <= 80)
# 2- at risk (120 < sys <= 140 or 80 < dia <= 90)
# 3- high (sys > 140 or dia > 90)

df['blood_pressure'] = 0

df.loc[(df['ap_hi'] <= 120) & (df['ap_lo'] <= 80), 'blood_pressure'] = 1

df.loc[((df['ap_hi'] > 120) & (df['ap_hi'] <= 140)) | ((df['ap_lo'] > 80) & (df['ap_lo'] <= 90)), 'blood_pressure'] = 2

df.loc[(df['ap_hi'] > 140) | (df['ap_lo'] > 90), 'blood_pressure'] = 3

df.head()


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,blood_pressure
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,1
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,2
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1,2
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,3
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0,1


In [8]:
# calculate BMI

df['bmi'] = df['weight'] / (df['height'] / 100) ** 2

df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,blood_pressure,bmi
0,0,50,2,168,62.0,110,80,1,1,0,0,1,0,1,21.96712
1,1,55,1,156,85.0,140,90,3,1,0,0,1,1,2,34.927679
2,2,51,1,165,64.0,130,70,3,1,0,0,0,1,2,23.507805
3,3,48,2,169,82.0,150,100,1,1,0,0,1,1,3,28.710479
4,4,47,1,156,56.0,100,60,1,1,0,0,0,0,1,23.011177


In [9]:
df.fillna(df.median(), inplace=True)
df = df.drop(columns=['id'],axis=1)

# Extract features (all columns except 'cardio') and target variable ('cardio')
X = df.drop(columns=['cardio'],axis=1)
y = df['cardio']

columns_to_standardize = ['age', 'height','weight','ap_hi','ap_lo','bmi']


# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the feature values
scaler = StandardScaler()
X_train[columns_to_standardize] = scaler.fit_transform(X_train[columns_to_standardize])
X_test[columns_to_standardize] = scaler.fit_transform(X_test[columns_to_standardize])


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class Model(nn.Module):
    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

In [11]:
import torch
# Convert pandas DataFrames to numpy arrays
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
y_train_np = y_train.values.reshape(-1, 1)
y_test_np = y_test.values.reshape(-1, 1)

# Convert numpy arrays to PyTorch tensors
X_train_torch = torch.tensor(X_train_np, dtype=torch.float32)
y_train_torch = torch.tensor(y_train_np, dtype=torch.float32)
X_test_torch = torch.tensor(X_test_np, dtype=torch.float32)
y_test_torch = torch.tensor(y_test_np, dtype=torch.float32)


In [12]:
sytem = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create a DataLoader
train_data = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(dataset=train_data, batch_size=1024, shuffle=True)

# Create the model
model = Model(X_train_torch.size(1)).to(sytem)

# Define the loss function and the optimizer
criterion = nn.BCELoss()

optimizer = optim.Adam(model.parameters(), lr=0.01)

In [13]:
# Function to train the model
def train_model(model, criterion, optimizer, train_loader, num_epochs=100):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(sytem), labels.to(sytem)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {running_loss:.2f}")

In [14]:
# Function to calculate metrics
def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test)
        y_pred_class = y_pred.round()
        accuracy = accuracy_score(y_test.cpu().numpy(), y_pred_class.cpu().numpy())
        precision = precision_score(y_test.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')
        recall = recall_score(y_test.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')
        f1 = f1_score(y_test.cpu().numpy(), y_pred_class.cpu().numpy(), average='weighted')
        return accuracy, precision, recall, f1

In [15]:
# use accuracy as fitness function 
def fitness_func(ga_instance, solution, solution_idx):
    # select the features from the solution if value > 0 value is selected else not selected
    selected_features = [i for i in range(len(solution)) if solution[i] > 0]
    # check if there are no selected features
    if len(selected_features) == 0:
        return 0
    # get the selected data
    # select colums i if i is in selected_features
    selected_features = X.columns[selected_features]
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]
    # convert the data to tensors
    X_train_torch = torch.tensor(X_train_selected.to_numpy(), dtype=torch.float32)
    y_train_torch = torch.tensor(y_train.to_numpy().reshape(-1, 1), dtype=torch.float32)
    X_test_torch = torch.tensor(X_test_selected.to_numpy(), dtype=torch.float32)
    y_test_torch = torch.tensor(y_test.to_numpy().reshape(-1, 1), dtype=torch.float32)

    
    sytem = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Create a DataLoader
    train_data = TensorDataset(X_train_torch, y_train_torch)
    train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

    # Create the model
    model_ga = Model(X_train_torch.size(1)).to(sytem)

    # Define the loss function and the optimizer
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model_ga.parameters(), lr=0.01)


    # ValueError: Using a target size (torch.Size([64])) that is different to the input size (torch.Size([64, 1])) is deprecated. Please ensure they have the same size. Using a target size (torch.Size([64])) that is different to the input size (torch.Size([64, 1])) is deprecated. Please ensure they have the same size.Traceback (most recent call last):

    # resolve the error by using squeeze() on the target tensor
    y_train_torch = y_train_torch.squeeze()
    y_test_torch = y_test_torch.squeeze()
    

    # train the model
    train_model(model_ga, criterion, optimizer, train_loader, num_epochs=500)
    
    model_ga.eval()
    with torch.no_grad():
        y_pred = model_ga(X_test_torch.to(sytem))
        y_pred_class = y_pred.round()

    accuracy = accuracy_score(y_test_torch.cpu().numpy(), y_pred_class.cpu().numpy())
    return accuracy

In [16]:
import pygad
import matplotlib as plt

def adaptive_mutation_rate(generation, max_generations):
    # Decrease mutation rate over generations
    return max(1, 10 - int((generation / max_generations) * 10))

# Define a callback function to dynamically adjust mutation rate and check for early stopping
def on_generation(ga_instance):
    generation = ga_instance.generations_completed  
    # Early stopping if no improvement in the last 10 generations
    if generation > 10 and np.std(ga_instance.best_solutions_fitness[-10:]) < 1e-6:
        return "stop"



num_generations = 30
num_parents_mating = 4
sol_per_pop = 4  # Larger initial population
num_gens = 13
fitness_function = fitness_func
init_range_low  = -1
init_range_high = 1
parent_selection_type = "sss"
crossover_type = "single_point"
mutation_type = "random"
# Initialize with the first mutation rate
initial_mutation_percent_genes = 10


ga_instance = pygad.GA(num_generations=num_generations,
                       num_parents_mating=num_parents_mating,
                       fitness_func=fitness_function,
                       sol_per_pop=sol_per_pop,
                       num_genes=num_gens,
                       init_range_low=init_range_low,
                       init_range_high=init_range_high,
                       parent_selection_type=parent_selection_type,
                       crossover_type=crossover_type,
                       mutation_type=mutation_type,
                       mutation_percent_genes=initial_mutation_percent_genes,
                       on_generation=on_generation,
                       keep_parents=1)  # Elitism: keep the best parent




In [17]:
ga_instance.run()

Epoch 10/500, Loss: 493.42
Epoch 20/500, Loss: 491.13
Epoch 30/500, Loss: 490.63
Epoch 40/500, Loss: 489.58
Epoch 50/500, Loss: 489.84
Epoch 60/500, Loss: 489.06
Epoch 70/500, Loss: 489.55
Epoch 80/500, Loss: 488.73
Epoch 90/500, Loss: 488.73
Epoch 100/500, Loss: 488.16
Epoch 110/500, Loss: 488.11
Epoch 120/500, Loss: 488.44
Epoch 130/500, Loss: 487.48
Epoch 140/500, Loss: 487.64
Epoch 150/500, Loss: 487.52
Epoch 160/500, Loss: 487.45
Epoch 170/500, Loss: 487.01
Epoch 180/500, Loss: 486.71
Epoch 190/500, Loss: 487.21
Epoch 200/500, Loss: 486.79
Epoch 210/500, Loss: 486.76
Epoch 220/500, Loss: 486.57
Epoch 230/500, Loss: 487.10
Epoch 240/500, Loss: 486.29
Epoch 250/500, Loss: 486.37
Epoch 260/500, Loss: 486.28
Epoch 270/500, Loss: 486.26
Epoch 280/500, Loss: 485.79
Epoch 290/500, Loss: 486.09
Epoch 300/500, Loss: 485.70
Epoch 310/500, Loss: 485.59
Epoch 320/500, Loss: 486.03
Epoch 330/500, Loss: 485.52
Epoch 340/500, Loss: 485.73
Epoch 350/500, Loss: 485.74
Epoch 360/500, Loss: 485.77
E

In [18]:
# Retrieve the best solution
solution, solution_fitness, solution_idx = ga_instance.best_solution()
print(f"Best solution: {solution}")
print(f"Fitness value of the best solution: {solution_fitness}")

Best solution: [ 0.6898952   0.29287429  0.18194055  0.84303246  0.45606688 -0.69119194
  0.25178386  1.18947411 -1.28798733  0.10786774  0.24599728  0.71829953
 -0.05810386]
Fitness value of the best solution: 0.7362142857142857


In [19]:
# Extract the best fitness value for each generation
best_fitness_values = ga_instance.best_solutions_fitness

# Plotting the fitness values
plt.plot(best_fitness_values)
plt.xlabel('Generation')
plt.ylabel('Best Fitness Value')
plt.title('Best Fitness Value over Generations')
plt.grid(True)
plt.show()

AttributeError: module 'matplotlib' has no attribute 'plot'