In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [3]:
# Read the diabetes CSV into a DataFrame.
# The path is relative to the notebook's working directory; adjust it if the
# dataset lives somewhere else.
diabetes_data = pd.read_csv(filepath_or_buffer='../dataset/diabetes.csv')

In [4]:
# 'Outcome' is treated as the label column; every remaining column is a feature.
# NOTE(review): this assumes the CSV has an 'Outcome' column and that all other
# columns are usable as model inputs -- confirm against the dataset.
y = diabetes_data['Outcome']
X = diabetes_data.drop(columns='Outcome')

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Preprocessing: both transformers are fitted on the training split only and
# then applied to the test split, so no test-set statistics reach the models.

# Mean imputation: learn per-column means from the training features, then
# fill missing entries in both splits with those means.
# NOTE(review): with the default settings only NaN-coded entries count as
# missing; if this dataset encodes missing readings differently (e.g. as 0),
# they pass through unchanged -- confirm against the data.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardization: learn per-column scaling parameters from the imputed
# training features and apply the same scaling to both splits.
scaler = StandardScaler()
scaler.fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Candidate models, keyed by the display name used when reporting results.
# random_state=42 is passed wherever it is set below; the K-Nearest Neighbors
# and Naive Bayes estimators are constructed with default arguments.
# Insertion order is preserved, so models are trained and reported in this order.
classifiers = dict([
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('Neural Network', MLPClassifier(random_state=42)),
])


In [7]:
# Fit every candidate on the scaled training split and report its accuracy on
# the scaled test split, one line per model.
for name, clf in classifiers.items():
    # fit() hands back the fitted estimator, so prediction is chained onto it
    y_pred = clf.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Fraction of test samples whose predicted label matches the true label
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")

Random Forest Accuracy: 0.72
Support Vector Machine Accuracy: 0.73
K-Nearest Neighbors Accuracy: 0.69
Logistic Regression Accuracy: 0.75
Decision Tree Accuracy: 0.75
Naive Bayes Accuracy: 0.77
Neural Network Accuracy: 0.76


