In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from scipy.stats import norm

In [2]:
# Read the raw CSV into a DataFrame.
# NOTE(review): absolute path (looks like a Colab upload location) -- the file
# must exist there for the notebook to run; consider a configurable DATA_DIR.
df = pd.read_csv(filepath_or_buffer="/content/diabetes_data_upload.csv")


In [3]:
# Integer-encode every text column of `df` in place. Each fitted encoder is kept
# in `label_encoders`, keyed by column name, so the integer codes can be
# translated back with `label_encoders[col].inverse_transform(...)`.
# LabelEncoder assigns codes in sorted order of the distinct values.
label_encoders = {}
for column in df.columns:
    column_values = df[column]
    # Accept both the legacy `object` dtype and pandas' dedicated string dtypes;
    # comparing against 'object' alone skips text columns when pandas infers a
    # string dtype for them (the default from pandas 3 onwards).
    is_text_column = (
        pd.api.types.is_object_dtype(column_values)
        or pd.api.types.is_string_dtype(column_values)
    )
    if is_text_column:
        le = LabelEncoder()
        df[column] = le.fit_transform(column_values)
        label_encoders[column] = le

In [4]:
# Separate the target column ("class") from the predictor columns.
y = df["class"]
X = df.drop("class", axis=1)

In [5]:
# Hold out 20% of the rows as a test set. The split is stratified on the target
# and uses a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Candidate classifiers, keyed by display name. A fixed random_state is passed
# to every estimator constructed here that takes one.
# NOTE(review): the features are fed in unscaled. The recorded SVM row in the
# results table (recall 1.0, false positive rate 1.0) means it labels every test
# sample positive -- consider wrapping the distance/margin-based models (SVM,
# KNN, logistic regression) in a StandardScaler pipeline.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Fit each model on the training split and score it on the held-out test split.
# `results` maps model name -> {metric name -> value}.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Pin the label order so the matrix is always 2x2 laid out as
    # [[TN, FP], [FN, TP]], even if a class is absent from y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 keeps the usual fallback value (0) without the warning
    # when a model never predicts the positive class.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)

    # Guard the denominators: a rate is undefined (NaN) when the test split
    # holds no samples of the corresponding true class.
    n_actual_positive = fn + tp
    n_actual_negative = tn + fp
    fnr = fn / n_actual_positive if n_actual_positive else np.nan  # Type II Error (False Negatives)
    fpr = fp / n_actual_negative if n_actual_negative else np.nan  # Type I Error (False Positives)

    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "False Negative Rate": fnr,
        "False Positive Rate": fpr
    }

In [7]:
# Tabulate the metrics: transposing puts one model per row and one metric per
# column.
results_df = pd.DataFrame(results).transpose()
print(results_df)


                     Accuracy  Precision    Recall  False Negative Rate  \
Logistic Regression  0.942308   0.983333  0.921875             0.078125   
Decision Tree        0.990385   1.000000  0.984375             0.015625   
Random Forest        0.990385   1.000000  0.984375             0.015625   
SVM                  0.615385   0.615385  1.000000             0.000000   
KNN                  0.932692   0.983051  0.906250             0.093750   
Gradient Boosting    0.990385   1.000000  0.984375             0.015625   

                     False Positive Rate  
Logistic Regression                0.025  
Decision Tree                      0.000  
Random Forest                      0.000  
SVM                                1.000  
KNN                                0.025  
Gradient Boosting                  0.000  


In [8]:
# Among the truly diabetic test samples (label 1), compare the ages of those the
# logistic-regression model classified correctly against those it missed (false
# negatives), using an unpooled two-sample z-test on the group means.
# NOTE(review): the missed group is probably small (the recorded false negative
# rate for this model is about 0.08), so the normal approximation is
# questionable; Welch's t-test (scipy.stats.ttest_ind(..., equal_var=False))
# would be the safer choice.
logistic_preds = models["Logistic Regression"].predict(X_test)
diabetic_indices = (y_test == 1)
correctly_classified = (logistic_preds == y_test) & diabetic_indices
misclassified = (~correctly_classified) & diabetic_indices

correct_ages = X_test.loc[correctly_classified, "Age"]
misclassified_ages = X_test.loc[misclassified, "Age"]

mean_correct = np.mean(correct_ages)
mean_misclassified = np.mean(misclassified_ages)
std_correct = np.std(correct_ages, ddof=1)  # ddof=1 -> sample standard deviation
std_misclassified = np.std(misclassified_ages, ddof=1)
n_correct = len(correct_ages)
n_misclassified = len(misclassified_ages)

# A sample standard deviation needs at least two observations per group; with
# fewer the statistic is undefined, so report NaN instead of dividing by zero.
if n_correct >= 2 and n_misclassified >= 2:
    std_error = np.sqrt((std_correct**2 / n_correct) + (std_misclassified**2 / n_misclassified))
else:
    std_error = np.nan

# `std_error > 0` is False for both NaN and 0 (all ages identical), which are
# exactly the cases where the z statistic cannot be computed.
if std_error > 0:
    z_stat = (mean_correct - mean_misclassified) / std_error
    # Two-sided p-value; the survival function keeps precision in the far tail
    # where 1 - cdf would round to zero.
    p_value = 2 * norm.sf(abs(z_stat))
else:
    z_stat = np.nan
    p_value = np.nan

print(f"Z-Test for Age Difference: Z-Score = {z_stat:.2f}, P-Value = {p_value:.4f}")

Z-Test for Age Difference: Z-Score = 1.41, P-Value = 0.1592


In [9]:
# Two-proportion z-test (unpooled standard error) comparing the false negative
# rates of the SVM and logistic-regression models.
# NOTE(review): both rates are measured on the same test samples, so the two
# proportions are paired rather than independent; a paired test such as
# McNemar's would match the design better -- confirm before relying on this
# p-value.
fnr_svm = results["SVM"]["False Negative Rate"]
fnr_logistic = results["Logistic Regression"]["False Negative Rate"]

# The denominator of a false negative rate is the number of actual positives in
# the test split, which is the same for both models. Counting `== 1` explicitly
# avoids relying on the labels summing to that count.
n_positive = int((y_test == 1).sum())
n_svm = n_positive
n_logistic = n_positive

if n_positive > 0:
    std_error_fnr = np.sqrt((fnr_svm * (1 - fnr_svm) / n_svm) + (fnr_logistic * (1 - fnr_logistic) / n_logistic))
else:
    std_error_fnr = np.nan

# The standard error is 0 when both rates are exactly 0 or 1 (and NaN when there
# are no positives); in either case the z statistic is undefined.
if std_error_fnr > 0:
    z_stat_fnr = (fnr_svm - fnr_logistic) / std_error_fnr
    # Two-sided p-value via the survival function for better tail precision.
    p_value_fnr = 2 * norm.sf(abs(z_stat_fnr))
else:
    z_stat_fnr = np.nan
    p_value_fnr = np.nan

print(f"Z-Test for False Negative Rate: Z-Score = {z_stat_fnr:.2f}, P-Value = {p_value_fnr:.4f}")


Z-Test for False Negative Rate: Z-Score = -2.33, P-Value = 0.0199
