In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

df1 = pd.read_csv("diabetes.csv")
df1 = df1[(df1['Glucose'] != 0) & (df1['Insulin'] != 0)]
df1 = df1[(df1['BloodPressure'] != 0) & (df1['SkinThickness'] != 0) & (df1['BMI'] != 0) & (df1['DiabetesPedigreeFunction'] != 0)]
df1 = df1.reset_index(drop=True)
df2 = df1
X = df2.drop('Outcome', axis = 1)
y = df2.Outcome

In [2]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

class Salp:
    def __init__(self, n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range):
        self.position = np.array([
            np.random.randint(n_estimators_range[0], n_estimators_range[1]),  
            np.random.randint(max_depth_range[0], max_depth_range[1]),         
            np.random.randint(min_samples_split_range[0], min_samples_split_range[1]),  
            np.random.randint(min_samples_leaf_range[0], min_samples_leaf_range[1])       
        ])
        self.fitness = float('inf')  

def evaluate_salp(salp, X_train, y_train, X_test, y_test):
    n_estimators = int(salp.position[0])
    max_depth = int(salp.position[1])
    min_samples_split = int(salp.position[2])
    min_samples_leaf = int(salp.position[3])
    
    model_SSA = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    
    # Fit the model
    model_SSA.fit(X_train, y_train)
    
    # Make predictions
    predictions = model_SSA.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, predictions)
    return accuracy

def ssa(num_salps, n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range, max_iter, X, y):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    salps = [Salp(n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range) for _ in range(num_salps)]
    
    for iter in range(max_iter):
        for salp in salps:
            salp.fitness = evaluate_salp(salp, X_train, y_train, X_test, y_test)

        # Sort salps by fitness (accuracy)
        salps = sorted(salps, key=lambda s: s.fitness, reverse=True)

        # Update positions of salps
        for i in range(len(salps)):
            if i == 0:  # Leader salp
                continue
            # Update position based on the leader
            salps[i].position = salps[0].position + np.random.uniform(-1, 1, size=salps[i].position.shape)

            # Ensure positions are within bounds
            salps[i].position[0] = np.clip(np.round(salps[i].position[0]), n_estimators_range[0], n_estimators_range[1])
            salps[i].position[1] = np.clip(np.round(salps[i].position[1]), max_depth_range[0], max_depth_range[1])
            salps[i].position[2] = np.clip(np.round(salps[i].position[2]), min_samples_split_range[0], min_samples_split_range[1])
            salps[i].position[3] = np.clip(np.round(salps[i].position[3]), min_samples_leaf_range[0], min_samples_leaf_range[1])

    best_salp = salps[0]
    return best_salp.position, best_salp.fitness

# Example usage
num_salps = 20  # Number of salps
n_estimators_range = (10, 200)  # Range for n_estimators
max_depth_range = (8, 20)  # Range for max_depth
min_samples_split_range = (2, 10)  # Range for min_samples_split
min_samples_leaf_range = (1, 10)  # Range for min_samples_leaf
max_iter = 25  # Number of iterations


In [5]:
def main(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    best_position, best_value = ssa(num_salps, n_estimators_range, max_depth_range, min_samples_split_range, min_samples_leaf_range, max_iter, X, y)
    n_estimators, max_depth, min_samples_split, min_samples_leaf = best_position
    n_estimators,max_depth,min_samples_split,min_samples_leaf = int(n_estimators),int(max_depth),int(min_samples_split),int(min_samples_leaf)
    model_SSA = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model_SSA.fit(X_train, y_train)
    predictions = model_SSA.predict(X_test)
    
    accuracy = accuracy_score(y_test, predictions)
    
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()

    # Calculate Sensitivity (True Positive Rate)
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0

    # Calculate Specificity (True Negative Rate)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # Calculate Prevalence
    prevalence = (tp + fn) / (tp + fn + tn + fp) if (tp + fn + tn + fp) > 0 else 0
    accuracy = f"{accuracy:.3f}"
    specificity = f"{specificity:.3f}"
    sensitivity = f"{sensitivity:.3f}"
    prevalence = f"{prevalence:.3f}"
    f1 = f1_score(y_test, predictions)
    f1 = f"{f1:.3f}"
    print('SSA: ')
    print(f"Best Position (n_estimators, max_depth, min_samples_split, min_samples_leaf): {best_position}")
    print(f"Best Value (Accuracy): {best_value}")
    return model_SSA, accuracy, specificity, sensitivity, prevalence, f1

In [6]:
if __name__ == "__main__":
    main(X, y)

InvalidParameterError: The 'min_samples_leaf' parameter of RandomForestClassifier must be an int in the range [1, inf) or a float in the range (0.0, 1.0). Got 1.0 instead.