In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Age,Gender,BMI,SBP,DBP,FPG,Chol,Tri,HDL,LDL,ALT,BUN,CCR,FFPG,smoking,drinking,family_histroy,Diabetes
0,26,1,20.1,119,81,5.8,4.36,0.86,0.9,2.43,12.0,5.4,63.8,5.4,3.0,3.0,0,0
1,40,1,17.7,97,54,4.6,3.7,1.02,1.5,2.04,9.2,3.7,70.3,4.1,1.0,1.0,0,0
2,40,2,19.7,85,53,5.3,5.87,1.29,1.75,3.37,10.1,4.1,61.1,4.85,3.0,3.0,0,0
3,43,1,23.1,111,71,4.5,4.05,0.74,1.27,2.6,36.5,4.38,73.4,5.3,2.0,3.0,0,0
4,36,1,26.5,130,82,5.54,6.69,3.49,0.91,3.64,69.3,3.86,67.5,5.53,3.0,3.0,0,0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4303 entries, 0 to 4302
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             4303 non-null   int64  
 1   Gender          4303 non-null   int64  
 2   BMI             4303 non-null   float64
 3   SBP             4303 non-null   int64  
 4   DBP             4303 non-null   int64  
 5   FPG             4303 non-null   float64
 6   Chol            4303 non-null   float64
 7   Tri             4303 non-null   float64
 8   HDL             4303 non-null   float64
 9   LDL             4303 non-null   float64
 10  ALT             4303 non-null   float64
 11  BUN             4303 non-null   float64
 12  CCR             4303 non-null   float64
 13  FFPG            4303 non-null   float64
 14  smoking         4303 non-null   float64
 15  drinking        4303 non-null   float64
 16  family_histroy  4303 non-null   int64  
 17  Diabetes        4303 non-null   i

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier

In [14]:
X = df.drop('Diabetes', axis=1)
y = df['Diabetes']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([
    ('classifier', RandomForestClassifier())
])
param_grid = {
    'classifier': [RandomForestClassifier(), XGBClassifier(), LGBMClassifier(), LogisticRegression()],
    'classifier__n_estimators': [50, 100,]
}

In [17]:
# Random forest
rf_base = RandomForestClassifier(n_estimators=100, random_state=42)
bagging_rf = BaggingClassifier(
    estimator=rf_base,
    n_estimators=10,  # number of random forests
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)
bagging_rf.fit(x_train, y_train)

y_pred_val = bagging_rf.predict(x_test)
print("Accuracy :", accuracy_score(y_test,y_pred_val.round()))

Accuracy : 0.9651567944250871


In [18]:
# XGB
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric= 'logloss'
)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred.round())
print("Accuracy:", accuracy)

Accuracy: 0.9639953542392566


In [19]:
# LightGBM
# To avoid the feature name warning, convert scaled data back to DataFrames
x_train_df = pd.DataFrame(x_train, columns=X.columns)
x_test_df = pd.DataFrame(x_test, columns=X.columns)


lgbm_model = LGBMClassifier(
    n_estimators=50,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42)
lgbm_model.fit(x_train_df, y_train)

y_pred_val = lgbm_model.predict(x_test_df)
print("Accuracy :", accuracy_score(y_test,y_pred_val))

[LightGBM] [Info] Number of positive: 1042, number of negative: 2400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2608
[LightGBM] [Info] Number of data points in the train set: 3442, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302731 -> initscore=-0.834327
[LightGBM] [Info] Start training from score -0.834327
Accuracy : 0.9547038327526133


In [20]:
# LGR
log_model = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    class_weight='balanced',
    random_state=42
)
log_model.fit(x_train, y_train)

y_pred_val = log_model.predict(x_test)
print("Accuracy :", accuracy_score(y_test,y_pred_val))

Accuracy : 0.9488966318234611


In [21]:
# VotingClassifier
voting_clf_soft = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('lb', LGBMClassifier()),
        ('xb',XGBClassifier())
        ],
    voting='soft',

)
voting_clf_soft.fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 1042, number of negative: 2400
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2608
[LightGBM] [Info] Number of data points in the train set: 3442, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302731 -> initscore=-0.834327
[LightGBM] [Info] Start training from score -0.834327


In [22]:
accuracy = voting_clf_soft.score(x_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9616724738675958




In [23]:
from sklearn.ensemble import StackingClassifier

estimator = [
         ('lr',LogisticRegression(solver='liblinear', random_state=42)),
         ('rf',RandomForestClassifier(n_estimators=100,random_state=42)),
         ('lb',LGBMClassifier(n_estimators=100,random_state=42)),
         ('xb',XGBClassifier(n_estimators=100,random_state=42))
        ]

In [25]:
# stacking
Stacking_Clf= StackingClassifier(
    estimators=estimator,
    final_estimator=LogisticRegression(),
    passthrough=False,
    n_jobs=1
)

In [26]:
Stacking_Clf.fit(x_train,y_train)

[LightGBM] [Info] Number of positive: 1042, number of negative: 2400
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000734 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2608
[LightGBM] [Info] Number of data points in the train set: 3442, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302731 -> initscore=-0.834327
[LightGBM] [Info] Start training from score -0.834327
[LightGBM] [Info] Number of positive: 833, number of negative: 1920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000885 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2592
[LightGBM] [Info] Number of data points in the train set: 2753, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302579 -> initscore=-0.835047
[LightGBM] [Info] Start training from score -0.835047




[LightGBM] [Info] Number of positive: 833, number of negative: 1920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2587
[LightGBM] [Info] Number of data points in the train set: 2753, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302579 -> initscore=-0.835047
[LightGBM] [Info] Start training from score -0.835047




[LightGBM] [Info] Number of positive: 834, number of negative: 1920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2594
[LightGBM] [Info] Number of data points in the train set: 2754, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302832 -> initscore=-0.833847
[LightGBM] [Info] Start training from score -0.833847




[LightGBM] [Info] Number of positive: 834, number of negative: 1920
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2582
[LightGBM] [Info] Number of data points in the train set: 2754, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302832 -> initscore=-0.833847
[LightGBM] [Info] Start training from score -0.833847




[LightGBM] [Info] Number of positive: 834, number of negative: 1920
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001289 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2590
[LightGBM] [Info] Number of data points in the train set: 2754, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302832 -> initscore=-0.833847
[LightGBM] [Info] Start training from score -0.833847




In [27]:
y_pred_stack = Stacking_Clf.predict(x_test)
print("Accuracy :", accuracy_score(y_test,y_pred_stack))

Accuracy : 0.9686411149825784


