In [2]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.model_selection import train_test_split

In [38]:
df = pd.read_csv("diabetes_prediction_dataset.csv")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
df.shape

(100000, 9)

In [5]:
df.apply(pd.unique)

gender                                             [Female, Male, Other]
age                    [80.0, 54.0, 28.0, 36.0, 76.0, 20.0, 44.0, 79....
hypertension                                                      [0, 1]
heart_disease                                                     [1, 0]
smoking_history        [never, No Info, current, former, ever, not cu...
bmi                    [25.19, 27.32, 23.45, 20.14, 19.31, 23.86, 33....
HbA1c_level            [6.6, 5.7, 5.0, 4.8, 6.5, 6.1, 6.0, 5.8, 3.5, ...
blood_glucose_level    [140, 80, 158, 155, 85, 200, 145, 100, 130, 16...
diabetes                                                          [0, 1]
dtype: object

In [42]:
df = df[(df["smoking_history"] != "No Info")&(df["gender"] != "Other")]
df.shape

(64172, 9)

In [43]:
X = df.drop("diabetes",axis = 1)
y = df["diabetes"]

In [44]:
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
0,Female,80.0,0,1,never,25.19,6.6,140
2,Male,28.0,0,0,never,27.32,5.7,158
3,Female,36.0,0,0,current,23.45,5.0,155
4,Male,76.0,1,1,current,20.14,4.8,155
5,Female,20.0,0,0,never,27.32,6.6,85


In [45]:
X.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
dtype: int64

In [33]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [46]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.28, random_state = 42)

In [61]:
cat_features = ["gender", "smoking_history"]
ohe = OneHotEncoder(handle_unknown='ignore')
transform = ColumnTransformer([("ohe", ohe, cat_features)], remainder="passthrough")
transformed_X_train = transform.fit_transform(X_train)
transformed_X_test = transform.transform(X_test)
print(transformed_X_train.shape)
ohe_columns = transform.named_transformers_['ohe'].get_feature_names_out(cat_features)
numeric_features = [col for col in X_train.columns if col not in cat_features]
all_columns = list(ohe_columns) + numeric_features
transformed_X_train_df = pd.DataFrame(transformed_X_train, columns=all_columns)
transformed_X_test_df = pd.DataFrame(transformed_X_test, columns=all_columns)


(46203, 13)


In [63]:
transformed_X_train_df.head()

Unnamed: 0,gender_Female,gender_Male,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,76.0,1.0,0.0,28.45,4.8,85.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,60.0,0.0,0.0,31.0,5.8,200.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,62.0,1.0,0.0,32.26,4.0,126.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,54.0,0.0,0.0,49.65,8.2,300.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,45.0,0.0,0.0,32.7,6.5,80.0


In [76]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [85]:
model = RandomForestClassifier(n_estimators = 20,
                               max_depth=10, 
                               min_samples_split=5, 
                               min_samples_leaf=4)
model.fit(transformed_X_train_df,y_train)

In [92]:
print(f"Accuracy on training set : {model.score(transformed_X_train_df,y_train)*100:.2f}%\nAccuracy on test set : {model.score(transformed_X_test_df,y_test)*100:.2f}%")

Accuracy on training set : 96.32%
Accuracy on test set : 96.49%


In [82]:
from sklearn.model_selection import cross_val_score

In [91]:
print(f"Cross val score : {cross_val_score(model,transformed_X_train_df,y_train,cv = 5)} ")

Cross val score : [0.96320745 0.96169246 0.96407315 0.96255411 0.96255411] 


In [93]:
y_preds = model.predict(transformed_X_test_df)

In [94]:
confusion_matrix(y_test,y_preds)

array([[16028,     3],
       [  627,  1311]], dtype=int64)

In [96]:
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     16031
           1       1.00      0.68      0.81      1938

    accuracy                           0.96     17969
   macro avg       0.98      0.84      0.89     17969
weighted avg       0.97      0.96      0.96     17969

