In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


In [52]:
data = pd.read_csv("diabetes_binary_classification_data.csv")

In [53]:
##Checks whats in the dataset
data.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [54]:
#checks duplicated in dataset
# There is 24206 duplicates
data.duplicated().sum()

np.int64(24206)

In [55]:
# Define features (X) and target (y)
X = data.drop('Diabetes_binary', axis=1)
y = data['Diabetes_binary']

# Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)  # 70% train
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 15% validation, 15% test

# Create preprocessing pipelines for numeric and categorical features
numeric_features = ['BMI', 'MentHlth', 'PhysHlth', 'Age', 'Income']  # Non-binary variables
categorical_features = ['GenHlth', 'Education', 'HighBP', 'HighChol', 
                        'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 
                        'PhysActivity', 'Fruits', 'Veggies', 
                        'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 
                        'DiffWalk', 'Sex']  # Binary variables

# Create preprocessing for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocess training data (numerical and categorical features)
X_train_processed = preprocessor.fit_transform(X_train)

# Preprocess validation and test data
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)


In [71]:
# Define models
log_reg = LogisticRegression(max_iter=2000)
decision_tree = DecisionTreeClassifier(max_depth=50)
random_forest = RandomForestClassifier(n_estimators=100, max_depth=50)

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('decision_tree', decision_tree),
    ('random_forest', random_forest)],
    voting='hard')  # 'hard' for majority voting

# Fit models on the processed training data
log_reg.fit(X_train_processed, y_train)
decision_tree.fit(X_train_processed, y_train)
random_forest.fit(X_train_processed, y_train)
voting_clf.fit(X_train_processed, y_train)

# Make predictions on the validation set
log_reg_val_pred = log_reg.predict(X_val_processed)
decision_tree_val_pred = decision_tree.predict(X_val_processed)
random_forest_val_pred = random_forest.predict(X_val_processed)
voting_val_pred = voting_clf.predict(X_val_processed)

# Make predictions on the test set
log_reg_test_pred = log_reg.predict(X_test_processed)
decision_tree_test_pred = decision_tree.predict(X_test_processed)
random_forest_test_pred = random_forest.predict(X_test_processed)
voting_test_pred = voting_clf.predict(X_test_processed)

# Make predictions on the test set
log_reg_test_pred = log_reg.predict(X_test_processed)
decision_tree_test_pred = decision_tree.predict(X_test_processed)
random_forest_test_pred = random_forest.predict(X_test_processed)
voting_test_pred = voting_clf.predict(X_test_processed)

# Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('decision_tree', decision_tree),
    ('random_forest', random_forest)],
    voting='hard')  # 'hard' for majority voting

In [57]:
# Function to calculate false negative rate
def calculate_fnr(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0  # Avoid division by zero
    return tn, fn, tp, fp, fnr

In [64]:
def print_fnr(predictions, model_name):
    accuracy = accuracy_score(y_test, predictions)
    tn, fn, tp, fp, fnr = calculate_fnr(y_test, predictions)
    
    print(f"{model_name}:")
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"true Negatives: {tn}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")
    print(f"False Positives: {fp}")
    print(f"False Negative Rate: {fnr:.4f}")
    print(classification_report(y_test, predictions)) 

In [65]:
print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Test set size:", X_test.shape[0])

Training set size: 177576
Validation set size: 38052
Test set size: 38052


In [70]:
# Evaluate models on validation data and print results
print(f"Evaluation of models based on validation data:")
print(f"")
for model_name, predictions in zip(
    ["Logistic Regression", "Decision Tree", "Random Forest", "Voting Classifier"],
    [log_reg_val_pred, decision_tree_val_pred, random_forest_val_pred, voting_val_pred]
):
    print_fnr(predictions, model_name)

Evaluation of models based on validation data:

Logistic Regression
Test Accuracy: 0.8343
true Negatives: 31557
False Negatives: 5003
True Positives: 191
False Positives: 1301
False Negative Rate: 0.9632
              precision    recall  f1-score   support

         0.0       0.86      0.96      0.91     32858
         1.0       0.13      0.04      0.06      5194

    accuracy                           0.83     38052
   macro avg       0.50      0.50      0.48     38052
weighted avg       0.76      0.83      0.79     38052

Decision Tree
Test Accuracy: 0.7502
true Negatives: 27765
False Negatives: 4414
True Positives: 780
False Positives: 5093
False Negative Rate: 0.8498
              precision    recall  f1-score   support

         0.0       0.86      0.84      0.85     32858
         1.0       0.13      0.15      0.14      5194

    accuracy                           0.75     38052
   macro avg       0.50      0.50      0.50     38052
weighted avg       0.76      0.75      0.76    

In [67]:
# Evaluate models on test data and print results
for model_name, predictions in zip(
    ["Logistic Regression", "Decision Tree", "Random Forest", "Voting Classifier"],
    [log_reg_test_pred, decision_tree_test_pred, random_forest_test_pred, voting_test_pred]
):
    print_fnr(predictions, model_name)

Logistic Regression
Test Accuracy: 0.8686
true Negatives: 32248
False Negatives: 4390
True Positives: 804
False Positives: 610
False Negative Rate: 0.8452
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     32858
         1.0       0.57      0.15      0.24      5194

    accuracy                           0.87     38052
   macro avg       0.72      0.57      0.59     38052
weighted avg       0.84      0.87      0.83     38052

Decision Tree
Test Accuracy: 0.8007
true Negatives: 28747
False Negatives: 3474
True Positives: 1720
False Positives: 4111
False Negative Rate: 0.6688
              precision    recall  f1-score   support

         0.0       0.89      0.87      0.88     32858
         1.0       0.29      0.33      0.31      5194

    accuracy                           0.80     38052
   macro avg       0.59      0.60      0.60     38052
weighted avg       0.81      0.80      0.81     38052

Random Forest
Test Accuracy: 0.8598
true

In [62]:
# Perform correlation analysis
correlation_matrix = X_train_processed_df.corr()
sorted_correlations = correlation_matrix['Diabetes_binary'].sort_values(ascending=False)

print("Top correlations with Diabetes_binary:")
print(sorted_correlations)

Top correlations with Diabetes_binary:
Diabetes_binary             1.000000
HighBP_1.0                  0.263531
DiffWalk_1.0                0.221507
BMI                         0.218647
HighChol_1.0                0.197866
GenHlth_4.0                 0.186045
HeartDiseaseorAttack_1.0    0.179718
Age                         0.176577
PhysHlth                    0.171694
GenHlth_5.0                 0.156761
PhysActivity_0.0            0.118985
Stroke_1.0                  0.107874
GenHlth_3.0                 0.071468
MentHlth                    0.070958
CholCheck_1.0               0.065421
Education_4.0               0.062446
Smoker_1.0                  0.062066
Education_3.0               0.058617
Education_2.0               0.058124
Veggies_0.0                 0.057552
HvyAlcoholConsump_0.0       0.057065
Fruits_0.0                  0.039896
NoDocbcCost_1.0             0.033110
Sex_1.0                     0.032150
Education_5.0               0.014771
AnyHealthcare_1.0           0.014540

In [75]:
# Evaluate models on validation data and print results
for model_name, predictions in zip(
    ["Logistic Regression", "Decision Tree", "Random Forest", "Voting Classifier"],
    [log_reg_val_pred, decision_tree_val_pred, random_forest_val_pred, voting_val_pred]
):
    accuracy = accuracy_score(y_val, predictions)
    print(f"{model_name} Validation Accuracy: {accuracy:.4f}")
    print(classification_report(y_val, predictions))

Logistic Regression Validation Accuracy: 0.8637
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     32747
         1.0       0.54      0.15      0.24      5305

    accuracy                           0.86     38052
   macro avg       0.71      0.57      0.58     38052
weighted avg       0.83      0.86      0.83     38052

Decision Tree Validation Accuracy: 0.7961
              precision    recall  f1-score   support

         0.0       0.89      0.87      0.88     32747
         1.0       0.29      0.33      0.31      5305

    accuracy                           0.80     38052
   macro avg       0.59      0.60      0.59     38052
weighted avg       0.81      0.80      0.80     38052

Random Forest Validation Accuracy: 0.8551
              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92     32747
         1.0       0.45      0.17      0.25      5305

    accuracy                           0.86     38