In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

df1 = pd.read_csv("diabetes.csv")
df1 = df1[(df1['Glucose'] != 0) & (df1['Insulin'] != 0)]
df1 = df1[(df1['BloodPressure'] != 0) & (df1['SkinThickness'] != 0) & (df1['BMI'] != 0) & (df1['DiabetesPedigreeFunction'] != 0)]
df1 = df1.reset_index(drop=True)
df2 = df1
X = df2.drop('Outcome', axis = 1)
y = df2.Outcome

In [9]:
def update_agents(agents, fitness, best_agent, alpha=0.5, beta=0.05, bounds=None):
    num_agents, dimension = agents.shape

    # Ensure bounds are provided
    if bounds is None:
        bounds = (np.array([10, 4, 2, 2]), np.array([200, 20, 10, 10]))

    lower_bound, upper_bound = bounds

    for i in range(num_agents):
        distance = best_agent - agents[i]

        if np.random.rand() < alpha:
            agents[i] += alpha * distance * np.random.rand(dimension)
        else:
            agents[i] += beta * (np.random.rand(dimension) - 0.5)

        agents[i] = np.clip(agents[i], lower_bound, upper_bound)

    return agents

# Hyperparameter optimization using MPA
def hyperparameter_optimization(X, y, max_iterations=25, num_agents=15):
    # Initialize agents randomly within specified bounds
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    agents = np.random.rand(num_agents, 4)
    agents[:, 0] = agents[:, 0] * 140 + 10  # n_estimators (10 to 90)
    agents[:, 1] = agents[:, 1] * 12 + 8     # max_depth (4 to 20)
    agents[:, 2] = agents[:, 2] * 8 + 2    # min_samples_split (2 to 10)
    agents[:, 3] = agents[:, 3] * 8 + 2    # min_samples_leaf (2 to 10)

    best_fitness = float('-inf')
    best_agent = None

    for iteration in range(max_iterations):
        for i in range(num_agents):
            # Extract hyperparameters
            n_estimators = int(agents[i, 0])
            max_depth = int(agents[i, 1])
            min_samples_split = int(agents[i, 2])
            min_samples_leaf = int(agents[i, 3])
            if min_samples_split <= 2:
                min_samples_split = 2

            # Create and evaluate the RandomForestClassifier
            clf = RandomForestClassifier(n_estimators=n_estimators,
                                         max_depth=max_depth,
                                         min_samples_split=min_samples_split,
                                         min_samples_leaf=min_samples_leaf,
                                         random_state=42)

            # Fit the model and evaluate fitness (accuracy)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            fitness = accuracy_score(y_test, y_pred)

            # Update the best agent if current fitness is better
            if fitness > best_fitness:
                best_fitness = fitness
                best_agent = agents[i]

        # Update agents based on MPA rules
        agents = update_agents(agents, fitness, best_agent)

    return best_agent, best_fitness



In [16]:
def main(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    best_position, best_value = hyperparameter_optimization(X, y)
    n_estimators,max_depth,min_samples_split,min_samples_leaf = best_position
    n_estimators,max_depth,min_samples_split,min_samples_leaf = int(n_estimators),int(max_depth),int(min_samples_split),int(min_samples_leaf)
    best_position = n_estimators,max_depth,min_samples_split,min_samples_leaf
    model_MPA = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model_MPA.fit(X_train, y_train)
    predictions = model_MPA.predict(X_test)
    
    accuracy = accuracy_score(y_test, predictions)
    
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

    # Calculate Sensitivity (True Positive Rate)
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0

    # Calculate Specificity (True Negative Rate)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # Calculate Prevalence
    prevalence = (tp + fn) / (tp + fn + tn + fp) if (tp + fn + tn + fp) > 0 else 0
    accuracy = f"{accuracy:.3f}"
    specificity = f"{specificity:.3f}"
    sensitivity = f"{sensitivity:.3f}"
    prevalence = f"{prevalence:.3f}"
    f1 = f1_score(y_test, predictions)
    f1 = f"{f1:.3f}"
    print("MPA: ")
    print(f"Best Position (n_estimators, max_depth, min_samples_split, min_samples_leaf): {best_position}")
    print(f"Best Value (Accuracy): {best_value}")
    return model_MPA, accuracy, specificity, sensitivity, prevalence, f1

In [17]:
if __name__ == "__main__":
    main(X, y)

F1 Score: 0.65
MPA: 
Best Position (n_estimators, max_depth, min_samples_split, min_samples_leaf): (60, 18, 4, 3)
Best Value (Accuracy): 0.7653061224489796


In [1]:
print(y_train)

NameError: name 'y_train' is not defined