In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


In [2]:
# Load the diabetes binary-classification dataset (CSV expected in the working directory).
data = pd.read_csv(filepath_or_buffer="diabetes_binary_classification_data.csv")

In [3]:
# Preview the first five rows to see which columns the dataset contains.
data.head(n=5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [4]:
# Count fully duplicated rows in the dataset.
# The recorded run reports 24206 duplicated rows.
duplicate_mask = data.duplicated()
print(duplicate_mask.sum())

24206


In [5]:
# Define features (X) and target (y)
X = data.drop(columns='Diabetes_binary')
y = data['Diabetes_binary']

# Split the dataset into 80% train / 10% validation / 10% test:
# first hold out 20% of the rows, then cut that hold-out set in half.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% held out
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 10% validation, 10% test

# Sanity check: the three splits must partition the full dataset.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "splits do not cover the dataset"

# Columns that are standardised as numeric values.
numeric_features = ['BMI', 'MentHlth', 'PhysHlth', 'Age', 'Income']
# Columns that are one-hot encoded: the binary 0/1 flags plus the
# multi-level GenHlth and Education codes.
categorical_features = ['GenHlth', 'Education', 'HighBP', 'HighChol',
                        'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies',
                        'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost',
                        'DiffWalk', 'Sex']

# Per-group transformers: z-score scaling for numeric columns, one-hot
# encoding for categorical columns.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# handle_unknown='ignore' encodes a category unseen during fit as all zeros
# instead of raising when transforming validation/test data.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Any column of X not listed in either group is dropped
# (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit the preprocessor on the training split only, so that scaling statistics
# and category lists are not learned from validation/test rows.
X_train_processed = preprocessor.fit_transform(X_train)

# Apply the already-fitted preprocessor to the validation and test splits.
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)


In [6]:
# Define models. class_weight='balanced' reweights the classes inversely to
# their frequency in y_train; fixed random_state values keep runs reproducible.
log_reg = LogisticRegression(class_weight='balanced', max_iter=2000, random_state=37)
decision_tree = DecisionTreeClassifier(class_weight='balanced', max_depth=50, random_state=74)
random_forest = RandomForestClassifier(class_weight='balanced', n_estimators=100, max_depth=50, random_state=20)

# Create a voting classifier over the three base models.
# It is defined exactly once: re-creating it after fit() would rebind
# `voting_clf` to an unfitted ensemble and break later predict() calls.
voting_clf = VotingClassifier(estimators=[
    ('log_reg', log_reg),
    ('decision_tree', decision_tree),
    ('random_forest', random_forest)],
    voting='hard')  # 'hard' for majority voting

# Fit models on the processed training data.
# VotingClassifier fits clones of its estimators, so the base models are also
# fitted individually to obtain their stand-alone predictions below.
log_reg.fit(X_train_processed, y_train)
decision_tree.fit(X_train_processed, y_train)
random_forest.fit(X_train_processed, y_train)
voting_clf.fit(X_train_processed, y_train)

# Make predictions on the validation set
log_reg_val_pred = log_reg.predict(X_val_processed)
decision_tree_val_pred = decision_tree.predict(X_val_processed)
random_forest_val_pred = random_forest.predict(X_val_processed)
voting_val_pred = voting_clf.predict(X_val_processed)

# Make predictions on the test set
log_reg_test_pred = log_reg.predict(X_test_processed)
decision_tree_test_pred = decision_tree.predict(X_test_processed)
random_forest_test_pred = random_forest.predict(X_test_processed)
voting_test_pred = voting_clf.predict(X_test_processed)

In [7]:
# Report the number of rows in each split.
split_frames = {
    "Training set size:": X_train,
    "Validation set size:": X_val,
    "Test set size:": X_test,
}
for split_label, split_frame in split_frames.items():
    print(split_label, split_frame.shape[0])

Training set size: 202944
Validation set size: 25368
Test set size: 25368


In [8]:
# Transform the training data with the preprocessor that was already fitted on
# X_train. Re-fitting here is unnecessary and would mutate the shared
# preprocessor, so the result goes into a cell-local name instead.
X_train_for_corr = preprocessor.transform(X_train)

# ColumnTransformer may return a SciPy sparse matrix when the stacked output is
# sparse enough (see its sparse_threshold parameter); densify before building
# a DataFrame with named columns.
if hasattr(X_train_for_corr, "toarray"):
    X_train_for_corr = X_train_for_corr.toarray()

# Retrieve the column names from preprocessor (numeric + one-hot encoded features),
# in the same order as the transformers are declared ('num' then 'cat').
processed_columns = numeric_features + list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features))

# Convert the processed NumPy array back into a DataFrame with column names
X_train_processed_df = pd.DataFrame(X_train_for_corr, columns=processed_columns)

# Add the target variable to the DataFrame.
# y_train still carries the row labels of the original dataset, while the new
# DataFrame has a fresh 0..n-1 index, so the index is reset to align by position.
X_train_processed_df['Diabetes_binary'] = y_train.reset_index(drop=True)

# Perform correlation analysis: correlation of every processed feature with
# the target, sorted from most positive to most negative.
correlation_matrix = X_train_processed_df.corr()
sorted_correlations = correlation_matrix['Diabetes_binary'].sort_values(ascending=False)

print("Top correlations with Diabetes_binary:")
print(sorted_correlations)

Top correlations with Diabetes_binary:
Diabetes_binary             1.000000
HighBP_1.0                  0.262282
DiffWalk_1.0                0.220457
BMI                         0.217411
HighChol_1.0                0.197345
GenHlth_4.0                 0.185752
HeartDiseaseorAttack_1.0    0.178962
Age                         0.177032
PhysHlth                    0.171067
GenHlth_5.0                 0.154718
PhysActivity_0.0            0.118177
Stroke_1.0                  0.107314
GenHlth_3.0                 0.071568
MentHlth                    0.069905
CholCheck_1.0               0.064777
Smoker_1.0                  0.062158
Education_4.0               0.061451
Education_3.0               0.058832
Veggies_0.0                 0.056861
Education_2.0               0.055913
HvyAlcoholConsump_0.0       0.055839
Fruits_0.0                  0.039602
NoDocbcCost_1.0             0.032707
Sex_1.0                     0.031906
AnyHealthcare_1.0           0.015831
Education_5.0               0.015747

In [22]:
# Evaluate every fitted model on the held-out TEST set (not the validation
# set) and print its classification report and confusion matrix.
# The accuracy figure is already part of classification_report, so it is not
# computed separately.
test_predictions = {
    "Logistic Regression": log_reg_test_pred,
    "Decision Tree": decision_tree_test_pred,
    "Random Forest": random_forest_test_pred,
    "Voting Classifier": voting_test_pred,
}

for model_name, predictions in test_predictions.items():
    print(f"{model_name}:")
    print(classification_report(y_test, predictions))
    print('confusion matrix:')
    # Rows are true classes, columns are predicted classes, both ordered 0 then 1.
    print(confusion_matrix(y_test, predictions, labels=[0,1]))
    print('-'*64)
    print('')

Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.95      0.72      0.82     21884
         1.0       0.30      0.78      0.44      3484

    accuracy                           0.72     25368
   macro avg       0.63      0.75      0.63     25368
weighted avg       0.86      0.72      0.77     25368

confusion matrix:
[[15664  6220]
 [  769  2715]]
----------------------------------------------------------------

Decision Tree:
              precision    recall  f1-score   support

         0.0       0.89      0.88      0.88     21884
         1.0       0.29      0.30      0.30      3484

    accuracy                           0.80     25368
   macro avg       0.59      0.59      0.59     25368
weighted avg       0.81      0.80      0.80     25368

confusion matrix:
[[19264  2620]
 [ 2423  1061]]
----------------------------------------------------------------

Random Forest:
              precision    recall  f1-score   support

         0