In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from statsmodels.stats.weightstats import ztest as ztest


# Load the dataset and echo its column names as a quick sanity check.
df = pd.read_csv("/content/diabetes_data_upload.csv")
print(df.columns)

# Target is the 'class' column; every other column is a predictor.
y = df['class']

# One-hot encode the predictors, dropping the first level of each encoded
# column so the indicators are not redundant.
X = pd.get_dummies(df.drop(columns=['class']), drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Label treated as the positive class by every metric below.
POSITIVE_LABEL = 'Positive'

# Seed for estimators with a stochastic component, so the results table is
# reproducible from run to run (the train/test split is already seeded).
MODEL_RANDOM_STATE = 42

# Explicit label order for the confusion matrix: non-positive label(s) first,
# positive label last. Taken from the full target so the matrix keeps its
# 2x2 shape even if one class is absent from the test split or predictions.
class_labels = sorted(y.unique(), key=lambda label: label == POSITIVE_LABEL)


def _safe_rate(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Candidate classifiers, keyed by the display name used in the results table.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=MODEL_RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=MODEL_RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_RANDOM_STATE),
    "SVM": SVC(probability=True, random_state=MODEL_RANDOM_STATE),
    "KNN": KNeighborsClassifier(),
}

# Fit each model on the scaled training data and score it on the test split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    # NOTE: `y_pred` is intentionally left as a module-level name; code after
    # this loop may read the predictions of the last model evaluated.
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
    rec = recall_score(y_test, y_pred, pos_label=POSITIVE_LABEL)

    # With labels=[negative, positive] the flattened matrix is guaranteed to
    # unpack as (tn, fp, fn, tp) rather than relying on implicit sorting.
    tn, fp, fn, tp = confusion_matrix(
        y_test, y_pred, labels=class_labels
    ).ravel()

    results[name] = {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "False Positive Rate (Type I Error)": _safe_rate(fp, fp + tn),
        "False Negative Rate (Type II Error)": _safe_rate(fn, fn + tp),
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)

# Compare the ages of correctly vs. incorrectly classified test cases.
#
# The model is named explicitly instead of reusing whatever `y_pred` the
# evaluation loop happened to leave behind. "KNN" is the last entry in
# `models`, so this selects the same predictions the leaked loop variable held.
# NOTE(review): confirm KNN is the model this analysis is meant to examine.
ZTEST_MODEL_NAME = "KNN"
print(f"Z-Test model: {ZTEST_MODEL_NAME}")

test_indices = y_test.index
# Re-attach the original row labels so predictions can be compared with
# y_test element-wise and used to look up 'Age' in the unscaled frame.
y_pred_aligned = pd.Series(
    models[ZTEST_MODEL_NAME].predict(X_test), index=test_indices
)

is_correct = y_test == y_pred_aligned
correct_idx = y_test[is_correct].index
incorrect_idx = y_test[~is_correct].index

if correct_idx.empty or incorrect_idx.empty:
    # A two-sample test is undefined when one group has no observations
    # (e.g. the model classified every test case correctly).
    z_stat, p_value = float("nan"), float("nan")
    print(
        "Z-Test skipped: need at least one correctly and one incorrectly "
        f"classified case (correct={len(correct_idx)}, "
        f"incorrect={len(incorrect_idx)})"
    )
else:
    z_stat, p_value = ztest(
        df.loc[correct_idx, 'Age'], df.loc[incorrect_idx, 'Age']
    )
    print(f"Z-Test between correctly and misclassified cases: Z={z_stat}, p={p_value}")

Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')
                     Accuracy  Precision    Recall  \
Logistic Regression  0.923077   0.931507  0.957746   
Decision Tree        0.980769   1.000000  0.971831   
Random Forest        0.980769   1.000000  0.971831   
Gradient Boosting    0.971154   1.000000  0.957746   
SVM                  0.990385   0.986111  1.000000   
KNN                  0.894231   0.954545  0.887324   

                     False Positive Rate (Type I Error)  \
Logistic Regression                            0.151515   
Decision Tree                                  0.000000   
Random Forest                                  0.000000   
Gradient Boosting                              0.000000   
SVM                           