In [1]:
import os
os.chdir('..')
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

from Packages.QC import QC

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import confusion_matrix,roc_auc_score,classification_report, f1_score

# Import raw data

In [3]:
data = pd.read_csv('data/diabetes_012_health_indicators_BRFSS2015.csv')
QC(data).qc()

Unnamed: 0,Example,Type,N_rows,Missing values,Unique values,Most frequent,Second most frequent,Third most frequent,25th percentile,50th percentile,75th percentile,Maximum value,Minimum value,Standard deviation
Age,5.0,float64,253680,0,13,9.0,10.0,8.0,6.0,8.0,10.0,13.0,1.0,3.05422
AnyHealthcare,1.0,float64,253680,0,2,1.0,0.0,,1.0,1.0,1.0,1.0,0.0,0.215759
BMI,27.0,float64,253680,0,84,27.0,26.0,24.0,24.0,27.0,31.0,98.0,12.0,6.608694
CholCheck,1.0,float64,253680,0,2,1.0,0.0,,1.0,1.0,1.0,1.0,0.0,0.189571
Diabetes_012,0.0,float64,253680,0,3,0.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,0.69816
DiffWalk,1.0,float64,253680,0,2,0.0,1.0,,0.0,0.0,0.0,1.0,0.0,0.374066
Education,6.0,float64,253680,0,6,6.0,5.0,4.0,4.0,5.0,6.0,6.0,1.0,0.985774
Fruits,1.0,float64,253680,0,2,1.0,0.0,,0.0,1.0,1.0,1.0,0.0,0.481639
GenHlth,2.0,float64,253680,0,5,2.0,3.0,1.0,2.0,2.0,3.0,5.0,1.0,1.068477
HeartDiseaseorAttack,0.0,float64,253680,0,2,0.0,1.0,,0.0,0.0,0.0,1.0,0.0,0.292087


# Train test split

In [4]:
%%time
X = data.drop(columns=['Diabetes_012'])
y = data['Diabetes_012']

# Split the data into training (80%) and the rest (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=100, stratify=y)

# Split the remaining data (20%) into validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=100, stratify=y_temp)

print(f'Shape before split: {data.shape}')
print(f'''After split\nX_train: {X_train.shape}\t\tX_val: {X_val.shape}\t\tX_test: {X_test.shape}\n
y_train: Shape - {y_train.shape}\t\ty_val shape - {y_val.shape}\t\ty_test shape - {y_test.shape}
\n\n\ny_train Distribution - {y_train.value_counts()}\n\n\ny_val Distribution - {y_val.value_counts()}\n\n\ny_test Distribution - {y_test.value_counts()}''')

Shape before split: (253680, 22)
After split
X_train: (202944, 21)		X_val: (25368, 21)		X_test: (25368, 21)

y_train: Shape - (202944,)		y_val shape - (25368,)		y_test shape - (25368,)



y_train Distribution - Diabetes_012
0.0    170962
2.0     28277
1.0      3705
Name: count, dtype: int64


y_val Distribution - Diabetes_012
0.0    21370
2.0     3535
1.0      463
Name: count, dtype: int64


y_test Distribution - Diabetes_012
0.0    21371
2.0     3534
1.0      463
Name: count, dtype: int64
CPU times: total: 15.6 ms
Wall time: 123 ms


In [5]:
dist_train = px.pie(y_train.value_counts(),values=y_train.value_counts().values,names=y_train.value_counts().index,width=400,height=300,title='Train set distribution')
dist_val = px.pie(y_val.value_counts(),values=y_val.value_counts().values,names=y_val.value_counts().index,width=400,height=300,title='Validation set distribution')
dist_test = px.pie(y_test.value_counts(),values=y_test.value_counts().values,names=y_test.value_counts().index,width=400,height=300,title='Test set distribution')

dist_train.show()
dist_val.show()
dist_test.show()

# Random forest classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
rf_param_grid = {'n_estimators':[250,300,350],'max_features':['sqrt','log2']}
rf = RandomForestClassifier(class_weight='balanced_subsample',n_jobs=-1, random_state=100)

In [8]:
%%time
rf_grid_search = GridSearchCV(rf, rf_param_grid, cv=5, scoring='roc_auc_ovr_weighted')      
rf_grid_search.fit(X_train, y_train)

CPU times: total: 55min 16s
Wall time: 8min 23s


In [9]:
print(f'Best parameters from grid search are {rf_grid_search.best_params_} corresponding to best roc_auc_ovr_weighted score {rf_grid_search.best_score_}')

Best parameters from grid search are {'max_features': 'sqrt', 'n_estimators': 350} corresponding to best roc_auc_ovr_weighted score 0.7942256218235537


In [10]:
rf_best = rf_grid_search.best_estimator_
rf_best.fit(X_train,y_train)

In [11]:
rfFI = pd.DataFrame({'Features':X_train.columns.tolist(),'RF_Importance':rf_best.feature_importances_})
rfFI[:7]

Unnamed: 0,Features,RF_Importance
0,HighBP,0.034985
1,HighChol,0.026986
2,CholCheck,0.004858
3,BMI,0.178143
4,Smoker,0.03541
5,Stroke,0.010937
6,HeartDiseaseorAttack,0.015969


In [12]:
rfFIpie = px.pie(rfFI,values=rfFI.RF_Importance,names=rfFI.Features,width=950,height=900,title='Random forest feature importance')
rfFIpie.show()

In [13]:
y_val_rfpred = rf_best.predict(X_val)
y_test_rfpred = rf_best.predict(X_test)

In [16]:
confusion_matrix(y_val,y_val_rfpred)

array([[20639,    55,   676],
       [  421,     0,    42],
       [ 2952,    12,   571]], dtype=int64)

In [14]:
confusion_matrix(y_test,y_test_rfpred)

array([[20668,    67,   636],
       [  424,     0,    39],
       [ 2968,     6,   560]], dtype=int64)

In [17]:
print(classification_report(y_val,y_val_rfpred,target_names=['0','2','1']))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91     21370
           2       0.00      0.00      0.00       463
           1       0.44      0.16      0.24      3535

    accuracy                           0.84     25368
   macro avg       0.43      0.38      0.38     25368
weighted avg       0.79      0.84      0.80     25368



In [15]:
print(classification_report(y_test,y_test_rfpred,target_names=['0','2','1']))

              precision    recall  f1-score   support

           0       0.86      0.97      0.91     21371
           2       0.00      0.00      0.00       463
           1       0.45      0.16      0.23      3534

    accuracy                           0.84     25368
   macro avg       0.44      0.38      0.38     25368
weighted avg       0.79      0.84      0.80     25368



In [18]:
roc_auc_score(y_val, rf_best.predict_proba(X_val), multi_class='ovr')

0.7483971942365327

# XGBoost classifier

In [30]:
from xgboost import XGBClassifier

In [20]:
param_grid_xgb = {'booster':['gbtree','dart'],'learning_rate':[0.01,0.05,0.1],'max_depth':[1,2]}
xgb_classifier = XGBClassifier(objective='multi:softmax',n_jobs=-1,num_class=3,random_state=100,device='cuda')

In [21]:
%%time
xgb_grid_search = GridSearchCV(xgb_classifier, param_grid_xgb, cv=5, scoring='roc_auc_ovr_weighted')
xgb_grid_search.fit(X_train, y_train)

CPU times: total: 3min 27s
Wall time: 5min 26s


In [22]:
print(f'Best parameters from grid search are {xgb_grid_search.best_params_} corresponding to best roc_auc_ovr_weighted score {xgb_grid_search.best_score_}')

Best parameters from grid search are {'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 2} corresponding to best roc_auc_ovr_weighted score 0.821868336782576


In [23]:
xgb_best = xgb_grid_search.best_estimator_
xgb_best.fit(X_train,y_train)

In [25]:
xgbFI = pd.DataFrame({'Feature':X_train.columns.tolist(),'Importance':xgb_best.feature_importances_})
xgbFI[:5]

Unnamed: 0,Feature,Importance
0,HighBP,0.453771
1,HighChol,0.07312
2,CholCheck,0.023157
3,BMI,0.058383
4,Smoker,0.0


In [26]:
xgbFIpie = px.pie(xgbFI,values=xgbFI.Importance,names=xgbFI.Feature,width=1450,height=900,title='XGBoost feature importance')
xgbFIpie.show()

In [27]:
y_val_xgbpred = xgb_best.predict(X_val)
y_test_xgbpred = xgb_best.predict(X_test)

In [29]:
confusion_matrix(y_val,y_val_xgbpred)

array([[20959,     0,   411],
       [  419,     0,    44],
       [ 2977,     0,   558]], dtype=int64)

In [28]:
print(classification_report(y_val,y_val_xgbpred,target_names=['0','2','1']))

              precision    recall  f1-score   support

           0       0.86      0.98      0.92     21370
           2       0.00      0.00      0.00       463
           1       0.55      0.16      0.25      3535

    accuracy                           0.85     25368
   macro avg       0.47      0.38      0.39     25368
weighted avg       0.80      0.85      0.81     25368



# LightGBM

In [6]:
import lightgbm as lgb

In [7]:
lgbm = lgb.LGBMClassifier(objective='multiclass',num_class=3,random_state=100)
param_grid = {
    'num_leaves': [31, 63, 127],
    'max_depth': [1,3,5],
    'learning_rate': [0.01,0.05, 0.1],
    'n_estimators': [100, 200, 300],
}

class_weights = {0:202944/(3*170962), 1:202944/(3*3705), 2:253680/(3*28277)}

lgbm_grid_search = GridSearchCV(lgbm, param_grid, cv=5, scoring='roc_auc_ovr_weighted')

In [8]:
%%time
lgbm_grid_search.fit(X_train,y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 162355, number of used features: 21
[LightGBM] [Info] Start training from score -0.171492
[LightGBM] [Info] Start training from score -4.003246
[LightGBM] [Info] Start training from score -1.970862
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008526 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 201
[LightGBM] [Info] Number of data points in the train set: 162355, number of used features: 21
[LightGBM] [Info] Start training from score -0.171492
[LightGBM] [Info] Start training from score -4.003246
[LightGBM] [Info] Start 

In [14]:
print(f'Best parameters from grid search are {lgbm_grid_search.best_params_} corresponding to best roc_auc_ovr_weighted score {lgbm_grid_search.best_score_}')

Best parameters from grid search are {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'num_leaves': 31} corresponding to best roc_auc_ovr_weighted score 0.8244214429390612


In [10]:
%%time
lgbm_best = lgbm_grid_search.best_estimator_
lgbm_best.fit(X_train,y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 202
[LightGBM] [Info] Number of data points in the train set: 202944, number of used features: 21
[LightGBM] [Info] Start training from score -0.171489
[LightGBM] [Info] Start training from score -4.003247
[LightGBM] [Info] Start training from score -1.970881
CPU times: total: 19.8 s
Wall time: 1.99 s


In [12]:
y_val_lgbmpred = lgbm_best.predict(X_val)
y_test_lgbmpred = lgbm_best.predict(X_test)

In [13]:
confusion_matrix(y_val,y_val_lgbmpred)

array([[20861,     0,   509],
       [  422,     0,    41],
       [ 2884,     0,   651]], dtype=int64)

In [None]:
print(classification_report(y_val,y_val_lgbmpred,target_names=['0','2','1']))

              precision    recall  f1-score   support

           0       0.86      0.98      0.92     21356
           2       0.00      0.00      0.00       462
           1       0.58      0.20      0.30      3550

    accuracy                           0.85     25368
   macro avg       0.48      0.39      0.40     25368
weighted avg       0.81      0.85      0.81     25368



In [15]:
roc_auc_score(y_val, lgbm_best.predict_proba(X_val), multi_class='ovr')

0.7923650893110429

# Resampling

In [19]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [20]:
oversampler = RandomOverSampler(sampling_strategy={1: 15346})
undersampler = RandomUnderSampler(sampling_strategy={0: 25346})

X_resampled, y_resampled = undersampler.fit_resample(X, y)
X_resampled, y_resampled = oversampler.fit_resample(X_resampled, y_resampled)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=100, stratify=y_resampled)

### Random Forest

In [25]:
%%time
rf_resample = RandomForestClassifier(n_estimators=500,max_features='sqrt',class_weight='balanced_subsample',n_jobs=-1, random_state=100)
rf_resample.fit(X_train,y_train)

CPU times: total: 1min 7s
Wall time: 9.28 s


In [27]:
y_val_rfpred = rf_resample.predict(X_val)
y_test_rfpred = rf_resample.predict(X_test)

confusion_matrix(y_val,y_val_rfpred)

array([[14001,   255,  7114],
       [    5,   448,    10],
       [  131,    21,  3383]], dtype=int64)

In [28]:
print(classification_report(y_val,y_val_rfpred,target_names=['0','2','1']))

              precision    recall  f1-score   support

           0       0.99      0.66      0.79     21370
           2       0.62      0.97      0.75       463
           1       0.32      0.96      0.48      3535

    accuracy                           0.70     25368
   macro avg       0.64      0.86      0.68     25368
weighted avg       0.89      0.70      0.75     25368



In [29]:
roc_auc_score(y_test, rf_resample.predict_proba(X_test), multi_class='ovr')

0.8935608487529967

### XGBoost

In [47]:
%%time
xgb_resample = xgb_classifier = XGBClassifier(learning_rate=0.01,max_depth=1,objective='multi:softmax',n_jobs=-1,num_class=3,random_state=100,device='cuda')
xgb_resample.fit(X_train,y_train)

CPU times: total: 906 ms
Wall time: 535 ms


In [48]:
y_val_xgbpred = xgb_resample.predict(X_val)
y_test_xgbpred = xgb_resample.predict(X_test)

confusion_matrix(y_val,y_val_xgbpred)

array([[ 8965,     0, 12405],
       [   56,     0,   407],
       [  250,     0,  3285]], dtype=int64)

In [49]:
print(classification_report(y_val,y_val_xgbpred,target_names=['0','2','1']))

              precision    recall  f1-score   support

           0       0.97      0.42      0.59     21370
           2       0.00      0.00      0.00       463
           1       0.20      0.93      0.33      3535

    accuracy                           0.48     25368
   macro avg       0.39      0.45      0.31     25368
weighted avg       0.84      0.48      0.54     25368



In [46]:
roc_auc_score(y_test, xgb_resample.predict_proba(X_test), multi_class='ovr')

0.7200659914977307