## Data Ingestion

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
!/opt/conda/bin/python -m pip install kagglehub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/conda/bin/python -m pip install --upgrade pip[0m


In [3]:
import kagglehub

path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/codespace/.cache/kagglehub/datasets/alexteboul/diabetes-health-indicators-dataset/versions/1


In [4]:
csv_path = os.path.join(path, "diabetes_binary_health_indicators_BRFSS2015.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


## Data Preparation

In [5]:
df.columns = df.columns.str.lower().str.replace(' ','_')
df = df.astype(int) # convert entire dataframe from float to int

In [6]:
df.dtypes

diabetes_binary         int64
highbp                  int64
highchol                int64
cholcheck               int64
bmi                     int64
smoker                  int64
stroke                  int64
heartdiseaseorattack    int64
physactivity            int64
fruits                  int64
veggies                 int64
hvyalcoholconsump       int64
anyhealthcare           int64
nodocbccost             int64
genhlth                 int64
menthlth                int64
physhlth                int64
diffwalk                int64
sex                     int64
age                     int64
education               int64
income                  int64
dtype: object

In [7]:
df['bmi'].describe() # checking the only continuised column bmi

count    253680.000000
mean         28.382364
std           6.608694
min          12.000000
25%          24.000000
50%          27.000000
75%          31.000000
max          98.000000
Name: bmi, dtype: float64

In [8]:
df.isnull().sum()  # all values are not null

diabetes_binary         0
highbp                  0
highchol                0
cholcheck               0
bmi                     0
smoker                  0
stroke                  0
heartdiseaseorattack    0
physactivity            0
fruits                  0
veggies                 0
hvyalcoholconsump       0
anyhealthcare           0
nodocbccost             0
genhlth                 0
menthlth                0
physhlth                0
diffwalk                0
sex                     0
age                     0
education               0
income                  0
dtype: int64

In [9]:
df.diabetes_binary.value_counts()

diabetes_binary
0    218334
1     35346
Name: count, dtype: int64

In [10]:
df['sex']=df['sex'].map({0:'female',1:'male'})

## Set up train/validation/test datasets

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
df_full_train,df_test = train_test_split(df,test_size=0.2,random_state=1)

In [13]:
df_train,df_val = train_test_split(df_full_train,test_size=0.25,random_state=1)

In [14]:
len(df_train),len(df_val),len(df_test)

(152208, 50736, 50736)

In [15]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_full_train = df_full_train.reset_index(drop=True)

In [16]:
y_train = df_train.diabetes_binary.values
y_val = df_val.diabetes_binary.values
y_test = df_test.diabetes_binary.values


In [17]:
del df_train['diabetes_binary']
del df_val['diabetes_binary']
del df_test['diabetes_binary']

## EDA

In [18]:
df_full_train['diabetes_binary'].value_counts(normalize=True)

diabetes_binary
0    0.860646
1    0.139354
Name: proportion, dtype: float64

In [19]:
global_rate = df_full_train.diabetes_binary.mean()
global_rate

np.float64(0.1393537133396405)

In [20]:
for col in df_full_train.columns:
    print(df_full_train[col].value_counts(normalize=True))

diabetes_binary
0    0.860646
1    0.139354
Name: proportion, dtype: float64
highbp
0    0.570783
1    0.429217
Name: proportion, dtype: float64
highchol
0    0.576026
1    0.423974
Name: proportion, dtype: float64
cholcheck
1    0.962167
0    0.037833
Name: proportion, dtype: float64
bmi
27    0.097180
26    0.081362
24    0.076864
25    0.067827
28    0.065314
        ...   
88    0.000005
78    0.000005
91    0.000005
96    0.000005
86    0.000005
Name: proportion, Length: 83, dtype: float64
smoker
0    0.556213
1    0.443787
Name: proportion, dtype: float64
stroke
0    0.959486
1    0.040514
Name: proportion, dtype: float64
heartdiseaseorattack
0    0.905585
1    0.094415
Name: proportion, dtype: float64
physactivity
1    0.756578
0    0.243422
Name: proportion, dtype: float64
fruits
1    0.634372
0    0.365628
Name: proportion, dtype: float64
veggies
1    0.811618
0    0.188382
Name: proportion, dtype: float64
hvyalcoholconsump
0    0.943541
1    0.056459
Name: proportion, dtype: 

In [21]:
categorical=['highbp', 'highchol', 'cholcheck', 'smoker',
       'stroke', 'heartdiseaseorattack', 'physactivity', 'fruits', 'veggies',
       'hvyalcoholconsump', 'anyhealthcare', 'nodocbccost', 'genhlth', 'diffwalk', 'sex', 'age', 'education',
       'income']
numerical = ['bmi','menthlth', 'physhlth']

In [22]:
from IPython.display import display

In [23]:
df_full_train.groupby('sex').diabetes_binary.agg(['mean'])

Unnamed: 0_level_0,mean
sex,Unnamed: 1_level_1
female,0.129762
male,0.151536


In [24]:
for c in categorical:
    df_group = df_full_train.groupby(c).diabetes_binary.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_rate
    df_group['ratio'] = df_group['mean'] / global_rate
    display(df_group)

Unnamed: 0_level_0,mean,diff,ratio
highbp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.059998,-0.079356,0.430545
1,0.244883,0.105529,1.757275


Unnamed: 0_level_0,mean,diff,ratio
highchol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.079315,-0.060039,0.569163
1,0.220924,0.081571,1.58535


Unnamed: 0_level_0,mean,diff,ratio
cholcheck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.025788,-0.113566,0.185054
1,0.143819,0.004465,1.032044


Unnamed: 0_level_0,mean,diff,ratio
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.120898,-0.018455,0.867564
1,0.162484,0.023131,1.165986


Unnamed: 0_level_0,mean,diff,ratio
stroke,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.131783,-0.007571,0.945671
1,0.318657,0.179304,2.286679


Unnamed: 0_level_0,mean,diff,ratio
heartdiseaseorattack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.119527,-0.019827,0.857723
1,0.329524,0.19017,2.364655


Unnamed: 0_level_0,mean,diff,ratio
physactivity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.210988,0.071634,1.514044
1,0.116306,-0.023048,0.834611


Unnamed: 0_level_0,mean,diff,ratio
fruits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.158001,0.018647,1.133814
1,0.128606,-0.010748,0.922875


Unnamed: 0_level_0,mean,diff,ratio
veggies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.179723,0.04037,1.289691
1,0.129984,-0.00937,0.932761


Unnamed: 0_level_0,mean,diff,ratio
hvyalcoholconsump,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.144167,0.004813,1.034541
1,0.058911,-0.080443,0.422743


Unnamed: 0_level_0,mean,diff,ratio
anyhealthcare,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.116546,-0.022808,0.836331
1,0.140528,0.001174,1.008426


Unnamed: 0_level_0,mean,diff,ratio
nodocbccost,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.136066,-0.003288,0.976405
1,0.175038,0.035684,1.25607


Unnamed: 0_level_0,mean,diff,ratio
genhlth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.025408,-0.113946,0.182325
2,0.070919,-0.068435,0.508914
3,0.178398,0.039045,1.280183
4,0.310283,0.17093,2.226588
5,0.381356,0.242002,2.736604


Unnamed: 0_level_0,mean,diff,ratio
diffwalk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.105517,-0.033837,0.757186
1,0.306737,0.167383,2.201137


Unnamed: 0_level_0,mean,diff,ratio
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.129762,-0.009591,0.931173
male,0.151536,0.012182,1.087419


Unnamed: 0_level_0,mean,diff,ratio
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.013637,-0.125716,0.097862
2,0.018346,-0.121008,0.131648
3,0.027818,-0.111535,0.199624
4,0.046334,-0.09302,0.332493
5,0.06546,-0.073894,0.469739
6,0.088795,-0.050559,0.63719
7,0.118246,-0.021107,0.848535
8,0.137857,-0.001497,0.98926
9,0.16993,0.030576,1.219413
10,0.203528,0.064174,1.460515


Unnamed: 0_level_0,mean,diff,ratio
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.295455,0.156101,2.120177
2,0.285011,0.145657,2.045233
3,0.244197,0.104843,1.752353
4,0.176823,0.037469,1.268876
5,0.148913,0.009559,1.068594
6,0.096169,-0.043185,0.690105


Unnamed: 0_level_0,mean,diff,ratio
income,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.244741,0.105387,1.756257
2,0.262034,0.12268,1.88035
3,0.224716,0.085362,1.612555
4,0.200136,0.060783,1.436175
5,0.172992,0.033638,1.241389
6,0.14564,0.006287,1.045113
7,0.121509,-0.017845,0.871945
8,0.079835,-0.059519,0.572892


In [25]:
from sklearn.metrics import mutual_info_score

In [26]:
def mi_diabetes_score(series):
    return mutual_info_score(series,df_full_train.diabetes_binary)

In [27]:
mi = df_full_train[categorical].apply(mi_diabetes_score)
mi.sort_values(ascending = False)

genhlth                 0.044296
highbp                  0.035322
highchol                0.020285
age                     0.020104
diffwalk                0.019762
income                  0.013465
heartdiseaseorattack    0.012517
education               0.007721
physactivity            0.006373
stroke                  0.004462
cholcheck               0.003002
hvyalcoholconsump       0.001974
smoker                  0.001769
veggies                 0.001492
fruits                  0.000824
sex                     0.000485
nodocbccost             0.000461
anyhealthcare           0.000117
dtype: float64

## Feature importance: Correlation coefficient for numerical

In [28]:
df_full_train[numerical].corrwith(df_full_train.diabetes_binary)

bmi         0.218196
menthlth    0.070515
physhlth    0.173303
dtype: float64

## one-Hot Encoding

In [29]:
from sklearn.feature_extraction import DictVectorizer

In [30]:
df_train[categorical]=df_train[categorical].astype(str)
df_val[categorical]=df_val[categorical].astype(str)
df_test[categorical]=df_test[categorical].astype(str)

In [31]:
df_train_dict = df_train.to_dict(orient='records')
df_val_dict = df_val.to_dict(orient='records')

df_train_dict[0]

{'highbp': '0',
 'highchol': '0',
 'cholcheck': '1',
 'bmi': 22,
 'smoker': '0',
 'stroke': '0',
 'heartdiseaseorattack': '0',
 'physactivity': '1',
 'fruits': '0',
 'veggies': '1',
 'hvyalcoholconsump': '0',
 'anyhealthcare': '1',
 'nodocbccost': '0',
 'genhlth': '1',
 'menthlth': 0,
 'physhlth': 0,
 'diffwalk': '0',
 'sex': 'female',
 'age': '9',
 'education': '5',
 'income': '6'}

In [32]:
dv = DictVectorizer(sparse=False)

In [33]:
X_train = dv.fit_transform(df_train_dict)
X_val = dv.fit_transform(df_val_dict)

X_train

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.]], shape=(152208, 63))

In [34]:
dv.get_feature_names_out()

array(['age=1', 'age=10', 'age=11', 'age=12', 'age=13', 'age=2', 'age=3',
       'age=4', 'age=5', 'age=6', 'age=7', 'age=8', 'age=9',
       'anyhealthcare=0', 'anyhealthcare=1', 'bmi', 'cholcheck=0',
       'cholcheck=1', 'diffwalk=0', 'diffwalk=1', 'education=1',
       'education=2', 'education=3', 'education=4', 'education=5',
       'education=6', 'fruits=0', 'fruits=1', 'genhlth=1', 'genhlth=2',
       'genhlth=3', 'genhlth=4', 'genhlth=5', 'heartdiseaseorattack=0',
       'heartdiseaseorattack=1', 'highbp=0', 'highbp=1', 'highchol=0',
       'highchol=1', 'hvyalcoholconsump=0', 'hvyalcoholconsump=1',
       'income=1', 'income=2', 'income=3', 'income=4', 'income=5',
       'income=6', 'income=7', 'income=8', 'menthlth', 'nodocbccost=0',
       'nodocbccost=1', 'physactivity=0', 'physactivity=1', 'physhlth',
       'sex=female', 'sex=male', 'smoker=0', 'smoker=1', 'stroke=0',
       'stroke=1', 'veggies=0', 'veggies=1'], dtype=object)

## Logistic Regression Model

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [36]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [37]:
y_pred = model.predict_proba(X_val)[:,1]
y_pred

array([0.49702127, 0.14939146, 0.01301514, ..., 0.34919232, 0.02910848,
       0.50206331], shape=(50736,))

In [38]:
diabetes_decision = (y_pred >=0.5)
diabetes_decision

array([False, False, False, ..., False, False,  True], shape=(50736,))

In [39]:
diabetes_decision.astype(int)

array([0, 0, 0, ..., 0, 0, 1], shape=(50736,))

In [40]:
(y_val == diabetes_decision).mean()

np.float64(0.8651056449069694)

In [41]:
from sklearn.metrics import roc_auc_score

In [42]:
roc_auc_score(y_val,y_pred)


0.8267469450650347

## Tune the model: Cross-validation

In [43]:
def train(df_train,y_train,C=1.0):
    dict = df_train[categorical + numerical].to_dict(orient = 'records') # globally drop 'diabetes_binary' column
    
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dict)

    model = LogisticRegression(C=C, max_iter = 10000)
    model.fit(X_train,y_train)

    return dv,model

In [44]:
def predict(df,dv,model):
    dict = df[categorical + numerical].to_dict(orient='records') # globally drop 'diabetes_binary' column
    X = dv.transform(dict)
    y_pred = model.predict_proba(X)[:,1]
    return y_pred

In [45]:
dv,model = train(df_train,y_train,C=0.001)

In [46]:
y_pred = predict(df_val,dv,model)

In [47]:
from sklearn.model_selection import KFold

In [48]:
!/opt/conda/bin/python -m pip install tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/conda/bin/python -m pip install --upgrade pip[0m


In [49]:
from tqdm.auto import tqdm

In [50]:
results = {}

n_splits = 5

for C in tqdm([0.001,0.01,0.1,0.5,1,5,10]):
    scores = []
    kfold = KFold(n_splits=n_splits,shuffle=True,random_state=1)
    for train_idx, val_idx in kfold.split(df_full_train):
        df2_train = df_full_train.iloc[train_idx]
        df2_val = df_full_train.iloc[val_idx]

        y2_train = df2_train.diabetes_binary.values
        y2_val = df2_val.diabetes_binary.values

        dv,model = train(df2_train,y2_train,C=C)
        y2_pred = predict(df2_val,dv,model)

        auc = roc_auc_score(y2_val,y2_pred)
        scores.append(auc)
    
    results[C] = {
        "scores": scores,
        "mean": np.mean(scores),
        "std": np.std(scores)
    }
        
    print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

    

 14%|████████▋                                                    | 1/7 [00:40<04:01, 40.22s/it]

C=0.001 0.822 +- 0.002


 29%|█████████████████▍                                           | 2/7 [01:45<04:34, 54.83s/it]

C=0.01 0.822 +- 0.002


 43%|██████████████████████████▏                                  | 3/7 [03:15<04:44, 71.06s/it]

C=0.1 0.822 +- 0.002


 57%|██████████████████████████████████▊                          | 4/7 [04:10<03:14, 64.71s/it]

C=0.5 0.822 +- 0.002


 71%|███████████████████████████████████████████▌                 | 5/7 [05:00<01:58, 59.48s/it]

C=1 0.822 +- 0.002


 86%|████████████████████████████████████████████████████▎        | 6/7 [05:54<00:57, 57.65s/it]

C=5 0.822 +- 0.002


100%|█████████████████████████████████████████████████████████████| 7/7 [06:49<00:00, 58.45s/it]

C=10 0.822 +- 0.002





In [51]:
for C in results:
    print(f"C={C}  mean={results[C]['mean']:.4f}  std={results[C]['std']:.5f}")

C=0.001  mean=0.8216  std=0.00226
C=0.01  mean=0.8225  std=0.00228
C=0.1  mean=0.8224  std=0.00227
C=0.5  mean=0.8224  std=0.00227
C=1  mean=0.8224  std=0.00228
C=5  mean=0.8224  std=0.00227
C=10  mean=0.8224  std=0.00227


## Use the Logistic Regression model with the best parameter

In [52]:
len(df),len(df_full_train),len(df_test),len(df_train),len(df_val)

(253680, 202944, 50736, 152208, 50736)

In [53]:
dv,model = train(df_full_train,df_full_train.diabetes_binary.values, C = 0.01)
y_pred = predict (df_test, dv, model)

auc = roc_auc_score(y_test, y_pred)
auc

0.6592016208566392