In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

In [9]:
# Load the raw dataset from a CSV path relative to the working directory, then
# echo it (pandas abbreviates the printed frame to its first and last rows).
csv_path = 'diabetes_prediction_dataset.csv'
df = pd.read_csv(csv_path)
print(df)

       gender   age  hypertension  heart_disease smoking_history    bmi  \
0      Female  80.0             0              1           never  25.19   
1      Female  54.0             0              0         No Info  27.32   
2        Male  28.0             0              0           never  27.32   
3      Female  36.0             0              0         current  23.45   
4        Male  76.0             1              1         current  20.14   
...       ...   ...           ...            ...             ...    ...   
99995  Female  80.0             0              0         No Info  27.32   
99996  Female   2.0             0              0         No Info  17.37   
99997    Male  66.0             0              0          former  27.83   
99998  Female  24.0             0              0           never  35.42   
99999  Female  57.0             0              0         current  22.43   

       HbA1c_level  blood_glucose_level  diabetes  
0              6.6                  140        

In [10]:
# Split the columns by cardinality: a column with at most 10 distinct values is
# treated as categorical, anything above that as numerical.
target_col = 'diabetes'
unique_counts = df.nunique()  # one pass over the frame, reused for both lists
low_cardinality = unique_counts <= 10
categorical_cols = df.columns[low_cardinality].tolist()  # plain list so .remove() works
numerical_cols = df.columns[~low_cardinality].tolist()
# The label must not be imputed/encoded as a feature, so take it out of the
# categorical list when the cardinality rule put it there.
if target_col in categorical_cols:
    categorical_cols.remove(target_col)

In [11]:
# Fill missing entries in the numerical columns with each column's mean.
# NOTE(review): the imputer is fitted on every row, before the train/test split
# performed in a later cell, so the means also reflect rows that end up in the
# test set — consider fitting on the training rows only.
imputer_num = SimpleImputer(strategy='mean')
numerical_filled = imputer_num.fit_transform(df[numerical_cols])
df[numerical_cols] = numerical_filled

In [12]:
# Fill missing entries in the categorical columns with each column's most
# frequent value.
imputer_cat = SimpleImputer(strategy='most_frequent')
categorical_filled = imputer_cat.fit_transform(df[categorical_cols])
df[categorical_cols] = categorical_filled

In [13]:
# Standardise the numerical columns (Z-score normalization) and write the
# scaled values back over the originals.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed in a later cell — consider fitting on the training rows only.
scaler = StandardScaler()
numerical_scaled = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = numerical_scaled

In [14]:
# One-hot encode categorical columns, using drop='first' to avoid the dummy variable trap.
# The keyword that requests a dense array was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the current name first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')
encoded_cols = encoder.fit_transform(df[categorical_cols])
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever method this version provides.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_col_names = encoder.get_feature_names_out(categorical_cols)
else:
    encoded_col_names = encoder.get_feature_names(categorical_cols)
# NOTE: this cell rebinds `df` without the original categorical columns, so
# running it a second time on the same kernel fails when it looks them up again.
df = df.drop(categorical_cols, axis=1)  # Drop original categorical columns
df[encoded_col_names] = encoded_cols  # Add encoded columns

In [15]:
# Preview the fully preprocessed frame.
print(df.head())

# Separate the label from the features, then hold out 20% of the rows for
# evaluation; the fixed seed keeps the split reproducible.
y = df[target_col]
X = df.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

        age       bmi  HbA1c_level  blood_glucose_level  diabetes  \
0  1.692704 -0.321056     1.001706             0.047704         0   
1  0.538006 -0.000116     1.001706            -1.426210         0   
2 -0.616691 -0.000116     0.161108             0.489878         0   
3 -0.261399 -0.583232    -0.492690             0.416183         0   
4  1.515058 -1.081970    -0.679490             0.416183         0   

   gender_Male  gender_Other  hypertension_1  heart_disease_1  \
0          0.0           0.0             0.0              1.0   
1          0.0           0.0             0.0              0.0   
2          1.0           0.0             0.0              0.0   
3          0.0           0.0             0.0              0.0   
4          1.0           0.0             1.0              1.0   

   smoking_history_current  smoking_history_ever  smoking_history_former  \
0                      0.0                   0.0                     0.0   
1                      0.0                

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that chooses splits by entropy; the fixed seed makes the fitted
# tree reproducible between runs.
clf = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
)

In [18]:
# Train on the training split, then predict labels for the held-out rows.
# `fit` returns the estimator itself, so the two steps can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compute every metric on the held-out predictions first, then print them.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
# sep='\n' puts each heading on its own line, directly above its table.
print('Classification Report:', report, sep='\n')
print('Confusion Matrix:', matrix, sep='\n')

Accuracy: 0.95
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     18292
           1       0.71      0.75      0.73      1708

    accuracy                           0.95     20000
   macro avg       0.84      0.86      0.85     20000
weighted avg       0.95      0.95      0.95     20000

Confusion Matrix:
[[17781   511]
 [  435  1273]]
