In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE


In [10]:
# Load data
df = pd.read_csv('balanced_diabetes_dataset.csv')
target_col = 'diabetes'

# Identify categorical and numerical feature columns by cardinality
# (<= 10 distinct values is treated as categorical). The target is left out
# of both lists so it is never imputed, scaled or encoded as a feature.
feature_cardinality = df.drop(columns=target_col).nunique()
categorical_cols = feature_cardinality[feature_cardinality <= 10].index.tolist()
numerical_cols = feature_cardinality[feature_cardinality > 10].index.tolist()

# Choose the train/test row positions BEFORE fitting any preprocessing step.
# Every transformer below is fitted on the training rows only and then applied
# to the whole frame, so no statistic of the held-out rows leaks into the
# features the model is trained and evaluated on.
train_pos, test_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Impute missing values (mean / most frequent value of the training rows)
imputer_num = SimpleImputer(strategy='mean')
imputer_num.fit(df[numerical_cols].iloc[train_pos])
df[numerical_cols] = imputer_num.transform(df[numerical_cols])

imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(df[categorical_cols].iloc[train_pos])
df[categorical_cols] = imputer_cat.transform(df[categorical_cols])

# Scale numerical columns (mean / std of the already-imputed training rows)
scaler = StandardScaler()
scaler.fit(df[numerical_cols].iloc[train_pos])
df[numerical_cols] = scaler.transform(df[numerical_cols])

# One-hot encode categorical columns.
# handle_unknown='ignore' keeps transform() from raising if a category occurs
# only in the held-out rows; such a value is encoded as all zeros.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoder.fit(df[categorical_cols].iloc[train_pos])
encoded_cols = encoder.transform(df[categorical_cols])
encoded_col_names = encoder.get_feature_names_out(categorical_cols)
df = df.drop(categorical_cols, axis=1)
df[encoded_col_names] = encoded_cols

# Prepare training and testing data using the row positions chosen above
X = df.drop(target_col, axis=1)
y = df[target_col]
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Balance the training set by oversampling the minority class.
# Only the training rows are resampled; the test set keeps its distribution.
# The majority label is read from the data instead of being assumed to be 0.
# A binary target is assumed: every row that is not in the majority class is
# treated as "minority".
training_data = pd.concat([X_train, y_train], axis=1)
majority_label = training_data[target_col].value_counts().idxmax()
is_majority = training_data[target_col] == majority_label
majority = training_data[is_majority]
minority = training_data[~is_majority]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
upsampled = pd.concat([majority, minority_upsampled])

y_train = upsampled[target_col]
X_train = upsampled.drop(target_col, axis=1)

# Train a decision tree classifier
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluate the model on the untouched (not resampled) test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.96
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     10053
           1       0.92      0.97      0.94      4947

    accuracy                           0.96     15000
   macro avg       0.95      0.96      0.96     15000
weighted avg       0.96      0.96      0.96     15000

Confusion Matrix:
[[9612  441]
 [ 154 4793]]
