<a href="https://colab.research.google.com/github/2203A52112/DAUP_LAB_2025/blob/main/2203A52112_DAUP_LAB_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from scipy.stats import norm, ttest_ind
# Import matplotlib.pyplot and seaborn
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv("/content/diabetes_data_upload.csv")

# Encode categorical variables
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])

# Define features and target
X = df.drop(columns=["class"])
y = df["class"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel="linear", probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

results = {}
fnr_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "False Positive Rate": fp / (fp + tn),
        "False Negative Rate": fn / (fn + tp)
    }
    fnr_results[name] = fn / (fn + tp)

# Perform Z-Test on FNR differences between two models
def perform_z_test(model1, model2):
    fnr1, fnr2 = fnr_results[model1], fnr_results[model2]
    n1, n2 = len(y_test), len(y_test)

    std_error = np.sqrt((fnr1 * (1 - fnr1) / n1) + (fnr2 * (1 - fnr2) / n2))
    z_stat = (fnr1 - fnr2) / std_error
    p_value = 2 * (1 - norm.cdf(abs(z_stat)))

    return {"Z-Statistic": z_stat, "P-Value": p_value}

z_test_fnr = perform_z_test("Logistic Regression", "SVM")

# Print results
print("Model Performance:")
for model, metrics in results.items():
    print(f"{model}: {metrics}")

print("\nZ-Test on False Negative Rates:")
print(z_test_fnr)


Model Performance:
Logistic Regression: {'Accuracy': 0.9423076923076923, 'Precision': 0.9833333333333333, 'Recall': 0.921875, 'False Positive Rate': np.float64(0.025), 'False Negative Rate': np.float64(0.078125)}
Decision Tree: {'Accuracy': 0.9903846153846154, 'Precision': 1.0, 'Recall': 0.984375, 'False Positive Rate': np.float64(0.0), 'False Negative Rate': np.float64(0.015625)}
Random Forest: {'Accuracy': 0.9903846153846154, 'Precision': 1.0, 'Recall': 0.984375, 'False Positive Rate': np.float64(0.0), 'False Negative Rate': np.float64(0.015625)}
Gradient Boosting: {'Accuracy': 0.9903846153846154, 'Precision': 1.0, 'Recall': 0.984375, 'False Positive Rate': np.float64(0.0), 'False Negative Rate': np.float64(0.015625)}
SVM: {'Accuracy': 0.9326923076923077, 'Precision': 0.9830508474576272, 'Recall': 0.90625, 'False Positive Rate': np.float64(0.025), 'False Negative Rate': np.float64(0.09375)}
KNN: {'Accuracy': 0.9423076923076923, 'Precision': 0.9833333333333333, 'Recall': 0.921875, 'Fa