In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Load the dataset and keep only the rows in which none of the listed
# measurement columns is 0 (presumably 0 marks a missing reading — TODO confirm).
df1 = pd.read_csv("diabetes.csv")
df1 = df1[
    (df1[['Glucose', 'Insulin', 'BloodPressure', 'SkinThickness', 'BMI', 'DiabetesPedigreeFunction']] != 0).all(axis=1)
].reset_index(drop=True)

# df2 is an alias of df1 (same object, not a copy).
df2 = df1

# Feature matrix = every column except the label; target = the 'Outcome' column.
X = df2.drop(columns='Outcome')
y = df2['Outcome']

In [2]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


class GreyWolf:
    """One candidate solution (a "wolf") in the hyperparameter search.

    Each ``*_range`` argument is a ``(low, high)`` pair of integers. Both ends
    are inclusive, which matches how ``gwo`` clips positions back into range.

    Attributes set by the constructor:
        position -- integer array laid out as
                    ``[n_estimators, max_depth, min_samples_split, min_samples_leaf]``,
                    drawn uniformly at random from the given ranges.
        fitness  -- ``float('inf')`` until the wolf has been evaluated.
    """

    def __init__(self, n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range):
        bounds = (n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range)
        # np.random.randint samples from the half-open interval [low, high), so
        # pass high + 1. Otherwise the upper bound could never be drawn and a
        # degenerate range such as (5, 5) would raise ValueError.
        self.position = np.array([
            np.random.randint(bound[0], bound[1] + 1) for bound in bounds
        ])
        self.fitness = float('inf')

def evaluate_wolf(wolf, X_train, y_train, X_test, y_test):
    """Score a wolf by training a random forest with its hyperparameters.

    The first four entries of ``wolf.position`` are read, in order, as
    ``n_estimators``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf``; each one is cast to ``int`` before being passed on.

    The forest (``random_state=42``) is fitted on ``X_train``/``y_train`` and
    the accuracy of its predictions for ``X_test`` against ``y_test`` is
    returned.
    """
    param_names = ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf")
    hyperparams = {
        name: int(wolf.position[slot]) for slot, name in enumerate(param_names)
    }

    forest = RandomForestClassifier(random_state=42, **hyperparams)
    forest.fit(X_train, y_train)

    return accuracy_score(y_test, forest.predict(X_test))

def gwo(num_wolves, n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range, max_iter,X,y):
    """Search random-forest hyperparameters with a Grey Wolf Optimizer.

    Parameters:
        num_wolves: size of the pack; must be at least 3 because the three
            fittest wolves (alpha, beta, delta) steer every update.
        n_estimators_range, max_depth_range, min_samples_split_range,
        min_samples_leaf_range: ``(low, high)`` bounds; positions are rounded
            and clipped into ``[low, high]`` after every move.
        max_iter: number of evaluation rounds.
        X, y: features and labels; split 80/20 with ``random_state=42``.

    Returns:
        ``(position, fitness)`` of the best wolf seen in any round.
        ``position`` is an integer array
        ``[n_estimators, max_depth, min_samples_split, min_samples_leaf]`` and
        ``fitness`` is the accuracy that exact position obtained.

    Raises:
        ValueError: if ``num_wolves`` is smaller than 3.

    NOTE(review): fitness is measured on the 20% hold-out split, and ``main``
    recreates the same split (same ``test_size``/``random_state``) to report
    its final metrics. Those metrics are therefore optimistic; consider
    scoring wolves on a separate validation split.
    """
    if num_wolves < 3:
        raise ValueError("gwo needs at least 3 wolves to pick alpha, beta and delta")

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    ranges = (n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range)
    lower = np.array([bound[0] for bound in ranges], dtype=float)
    upper = np.array([bound[1] for bound in ranges], dtype=float)
    n_dims = len(ranges)

    wolves = [GreyWolf(*ranges) for _ in range(num_wolves)]

    # Best (position, fitness) pair seen so far. It is tracked separately
    # because every wolf, including the leaders, moves after being scored.
    best_position, best_fitness = None, None

    for iteration in range(max_iter):
        for wolf in wolves:
            wolf.fitness = evaluate_wolf(wolf, X_train, y_train, X_test, y_test)

        wolves = sorted(wolves, key=lambda w: w.fitness, reverse=True)  # Sort by fitness (accuracy)

        if best_fitness is None or wolves[0].fitness > best_fitness:
            best_position = np.array(wolves[0].position)  # copy, not a view
            best_fitness = wolves[0].fitness

        # Positions produced after the final evaluation would never be
        # scored, so skip that last move.
        if iteration == max_iter - 1:
            break

        # Snapshot the leaders as floats so that moving the leaders themselves
        # does not change the reference points used by the rest of the pack.
        leaders = [np.array(leader.position, dtype=float) for leader in wolves[:3]]

        # Exploration coefficient, decreasing linearly from 2 towards 0.
        a = 2 - iteration * (2 / max_iter)

        for wolf in wolves:
            current = np.array(wolf.position, dtype=float)
            candidates = []
            for leader in leaders:
                A = 2 * a * np.random.rand(n_dims) - a
                C = 2 * np.random.rand(n_dims)
                distance = np.abs(C * leader - current)
                candidates.append(leader - A * distance)

            # The new position is the mean of the three leader-guided
            # candidates, computed in floating point, then rounded and clipped.
            moved = np.clip(np.round(np.mean(candidates, axis=0)), lower, upper)
            wolf.position = moved.astype(int)

    if best_fitness is None:
        # No round was run (max_iter <= 0): return an unevaluated wolf.
        return wolves[0].position, wolves[0].fitness
    return best_position, best_fitness

# Search configuration (read as module-level globals by main)

# Pack size and number of optimisation rounds
num_wolves = 20
max_iter = 25

# (low, high) bounds for each random-forest hyperparameter
n_estimators_range = (10, 200)
max_depth_range = (8, 20)
min_samples_split_range = (2, 10)
min_samples_leaf_range = (1, 10)


In [3]:
def main(X, y):
    """Tune a random forest with ``gwo``, refit it and report hold-out metrics.

    The data is split 80/20 (``random_state=42``). ``gwo`` is run with the
    module-level search settings, a forest is refitted on the training part
    with the returned hyperparameters, and it is scored on the test part.

    Prints the best position and the best value returned by ``gwo``.

    Returns:
        ``(model, accuracy, specificity, sensitivity, prevalence, f1)`` where
        every metric is a string formatted to three decimals.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    best_position, best_value = gwo(
        num_wolves,
        n_estimators_range,
        max_depth_range,
        min_samples_split_range,
        min_samples_leaf_range,
        max_iter,
        X,
        y,
    )

    trees, depth, split_min, leaf_min = best_position
    tuned_forest = RandomForestClassifier(
        n_estimators=trees,
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        random_state=42,
    )
    tuned_forest.fit(X_train, y_train)
    y_pred = tuned_forest.predict(X_test)

    three_decimals = "{:.3f}".format

    raw_accuracy = accuracy_score(y_test, y_pred)

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    actual_positives = tp + fn
    actual_negatives = tn + fp
    total = actual_positives + actual_negatives

    # Sensitivity = true positive rate, specificity = true negative rate,
    # prevalence = share of actual positives; each falls back to 0 when its
    # denominator is empty.
    raw_sensitivity = tp / actual_positives if actual_positives > 0 else 0
    raw_specificity = tn / actual_negatives if actual_negatives > 0 else 0
    raw_prevalence = actual_positives / total if total > 0 else 0

    raw_f1 = f1_score(y_test, y_pred)

    print('GWO: ')
    print(f"Best Position (n_estimators, max_depth, min_samples_split, min_samples_leaf): {best_position}")
    print(f"Best Value (Accuracy): {best_value}")

    return (
        tuned_forest,
        three_decimals(raw_accuracy),
        three_decimals(raw_specificity),
        three_decimals(raw_sensitivity),
        three_decimals(raw_prevalence),
        three_decimals(raw_f1),
    )

In [4]:
# Run the GWO tuning and evaluation pipeline on the features (X) and labels (y)
# prepared in the first cell. The tuple returned by main() is discarded here,
# so only the summary it prints is shown.
if __name__ == "__main__":
    main(X, y)

F1 Score: 0.7738356881214025
Best Position (n_estimators, max_depth, min_samples_split, min_samples_leaf): [23  8  2  1]
Best Value (Accuracy): 0.7755102040816326
