In [57]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import lightgbm as lgb

In [2]:
# read the data in
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.shape

(768, 9)

In [4]:
# Let's use some weak features as predictors
predictors = ['Age','Insulin']
target = 'Outcome'


In [5]:
# Most common preprocessing step include label encoding and missing value treatment
from sklearn import preprocessing
for f in df.columns:
 if df[f].dtype=='object':
  lbl = preprocessing.LabelEncoder()
  lbl.fit(list(df[f].values))
  df[f] = lbl.transform(list(df[f].values))

In [6]:
df.fillna((-999), inplace=True) # missing value treatment

In [7]:
# Let's use some week features to build the tree
X = df[['Age','Insulin']] # independent variables
y = df['Outcome'].values # dependent variables


In [8]:
#Normalize
X = StandardScaler().fit_transform(X)

In [9]:
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=2017)
num_rounds = 100


In [10]:
clf_XGB = XGBClassifier(n_estimators = num_rounds,
 objective= 'binary:logistic',
seed=2017)

In [11]:
# use early_stopping_rounds to stop the cv when there is no score imporovement
clf_XGB.fit(X_train,y_train, eval_set=[(X_test,
y_test)], verbose=False)

In [12]:
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=2017)

In [13]:
# Perform cross-validation
results = cross_val_score(clf_XGB, X_train, y_train, cv=kf)

# Print cross-validation results
print("\nxgBoost - CV Train : %.2f" % results.mean())

# Train the model on the entire training set
clf_XGB.fit(X_train, y_train)

# Print training accuracy
print("xgBoost - Train : %.2f" % accuracy_score(clf_XGB.predict(X_train), y_train))

# Print test accuracy
print("xgBoost - Test : %.2f" % accuracy_score(clf_XGB.predict(X_test), y_test))



xgBoost - CV Train : 0.67
xgBoost - Train : 0.85
xgBoost - Test : 0.65


In [14]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
# Function to get user input and make prediction
def predict_diabetes():
    # Get user input
    Age = float(input("Enter age: "))
    Insulin = float(input("Enter insulin level: "))

    # Create input features list
    input_features = [[Age, Insulin]]

    # Make prediction
    y_predict = clf_XGB.predict(input_features)

    # Print the prediction
    if y_predict[0] == 1:
        print("Diabetic")
    else:
        print("Non Diabetic")

# Call the function to get user input and make prediction
predict_diabetes()


Enter age:  34
Enter insulin level:  0


Non Diabetic


# Now let’s also look at how to build a model using xgboost native interface.
a

# DMatrixthe internal data structure of xgboost for input data

In [18]:
xgtrain = xgb.DMatrix(X_train, label=y_train, missing=-999)
xgtest = xgb.DMatrix(X_test, label=y_test, missing=-999)

In [19]:
xgtrain

<xgboost.core.DMatrix at 0x15f52a63650>

In [20]:
# set xgboost params
param = {'max_depth': 3, # the maximum depth of each tree
 'objective': 'binary:logistic'}

In [21]:
# Perform cross-validation
num_rounds = 1000  # Set a large number of rounds for cross-validation
clf_xgb_cv = xgb.cv(
    params=param,
    dtrain=xgtrain,
    num_boost_round=num_rounds,
    nfold=5,
    stratified=True,
    early_stopping_rounds=20,
    seed=2017
)

In [24]:
best_num_boost_round = clf_xgb_cv.shape[0]
print(best_num_boost_round)

8


In [26]:
# Train the model
watchlist = [(xgtest, 'test'), (xgtrain, 'train')]
clf_xgb = xgb.train(
    params=param,
    dtrain=xgtrain,
    num_boost_round=best_num_boost_round,
    evals=watchlist,
    verbose_eval=True
)


[0]	test-logloss:0.60036	train-logloss:0.61159
[1]	test-logloss:0.57427	train-logloss:0.58858
[2]	test-logloss:0.56364	train-logloss:0.57108
[3]	test-logloss:0.55409	train-logloss:0.55749
[4]	test-logloss:0.54665	train-logloss:0.54483
[5]	test-logloss:0.54283	train-logloss:0.53793
[6]	test-logloss:0.54200	train-logloss:0.53390
[7]	test-logloss:0.54120	train-logloss:0.52889


In [28]:

# Make predictions and convert probabilities to class labels using 0.5 cutoff
y_train_pred_prob = clf_xgb.predict(xgtrain)
y_test_pred_prob = clf_xgb.predict(xgtest)

y_train_pred = (y_train_pred_prob > 0.5).astype(int)
y_test_pred = (y_test_pred_prob > 0.5).astype(int)

In [30]:
# Print accuracy scores
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

Train Accuracy: 0.7296416938110749
Test Accuracy: 0.7402597402597403


# Using LightGBM

In [63]:
# Initialize LightGBM model with adjusted parameters
clf_LGBM = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.05,  # Lower learning rate
    num_leaves=50,       # Increase the number of leaves
    max_depth=-1,        # No limit on depth
    objective='binary',
    random_state=2017
)

In [65]:
# Train the model with early stopping
clf_LGBM.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.early_stopping(stopping_rounds=20)]
    # verbose=10
)

[LightGBM] [Info] Number of positive: 217, number of negative: 397
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 128
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.353420 -> initscore=-0.604039
[LightGBM] [Info] Start training from score -0.604039
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[41]	valid_0's binary_logloss: 0.542859


In [67]:
# Make predictions
y_train_pred_prob = clf_LGBM.predict_proba(X_train)[:, 1]
y_test_pred_prob = clf_LGBM.predict_proba(X_test)[:, 1]


In [69]:

# Convert probabilities to class labels using 0.5 cutoff
y_train_pred = (y_train_pred_prob > 0.5).astype(int)
y_test_pred = (y_test_pred_prob > 0.5).astype(int)

In [71]:
# Print training and test accuracy
print("LightGBM - Train Accuracy: %.2f" % accuracy_score(y_train, y_train_pred))
print("LightGBM - Test Accuracy: %.2f" % accuracy_score(y_test, y_test_pred))

LightGBM - Train Accuracy: 0.75
LightGBM - Test Accuracy: 0.70


In [73]:
# Print additional evaluation metrics
roc_auc = roc_auc_score(y_test, y_test_pred_prob)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

print(f"ROC AUC: {roc_auc:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

ROC AUC: 0.76
Precision: 0.54
Recall: 0.61
F1 Score: 0.57


In [75]:
# Perform cross-validation
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=2017)
cv_results = cross_val_score(clf_LGBM, X_train, y_train, cv=kf)

# Print cross-validation results
print("\nLightGBM - CV Train Accuracy: %.2f" % cv_results.mean())

[LightGBM] [Info] Number of positive: 174, number of negative: 317
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 114
[LightGBM] [Info] Number of data points in the train set: 491, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354379 -> initscore=-0.599846
[LightGBM] [Info] Start training from score -0.599846
[LightGBM] [Info] Number of positive: 173, number of negative: 318
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000017 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 113
[LightGBM] [Info] Number of data points in the train set: 491, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.352342 -> initscore=-0.608760
[LightGBM] [Info] 

In [39]:
print(df[['Age', 'Insulin']].describe())

              Age     Insulin
count  768.000000  768.000000
mean    33.240885   79.799479
std     11.760232  115.244002
min     21.000000    0.000000
25%     24.000000    0.000000
50%     29.000000   30.500000
75%     41.000000  127.250000
max     81.000000  846.000000


In [41]:
print(df[['Age', 'Insulin', 'Outcome']].corr())


              Age   Insulin   Outcome
Age      1.000000 -0.042163  0.238356
Insulin -0.042163  1.000000  0.130548
Outcome  0.238356  0.130548  1.000000
