# XGB Boosting

In [1]:
# Importing Libraries

import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
from sklearn import datasets
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading data

data=pd.read_csv('pima-indian_diabetes.csv')
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [5]:
cols=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']

In [6]:
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
for col in cols:
    data[col]=data[col].replace(0,np.nan)

In [9]:
data.isna().sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [10]:
data['Glucose']=data['Glucose'].fillna(data['Glucose'].mode()[0])
data['BloodPressure']=data['BloodPressure'].fillna(data['BloodPressure'].mode()[0])
data['SkinThickness']=data['SkinThickness'].fillna(data['SkinThickness'].mean())
data['Insulin']=data['Insulin'].fillna(data['Insulin'].mean())
data['BMI']=data['BMI'].fillna(data['BMI'].mean())

In [11]:
data.isna().sum()

Pregnancies                 111
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [12]:
x=data.drop(columns='Outcome')
y=data['Outcome']

In [13]:
x.sample(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
162,,114.0,80.0,34.0,285.0,44.2,0.167,27
364,4.0,147.0,74.0,25.0,293.0,34.9,0.385,30
404,5.0,168.0,64.0,29.15342,155.548223,32.9,0.135,41
522,6.0,114.0,70.0,29.15342,155.548223,32.457464,0.189,26
228,4.0,197.0,70.0,39.0,744.0,36.7,2.329,31


In [16]:
#Scaling the data
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaled_data=scaler.fit_transform(x)

In [17]:
# Training and testing data
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y=train_test_split(scaled_data,y,test_size=0.3,random_state=42)

In [19]:
# Creating a model
model=XGBClassifier(objective='binary:logistic')
model.fit(train_x,train_y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [20]:
#Checking training accuracy
y_pred=model.predict(train_x)
prediction=[round(value) for value in y_pred]
accuracy=accuracy_score(train_y,prediction)
accuracy

1.0

In [21]:
#Checking initial test accuracy
y_pred=model.predict(test_x)
prediction=[round(value) for value in y_pred]
accuracy=accuracy_score(test_y,prediction)
accuracy

0.7445887445887446

In [22]:
test_x[1]

array([-7.75986173e-01, -3.13055830e-01,  2.23527220e-01,  3.24019370e-01,
       -3.34507888e-16,  4.71938618e-01, -9.78144869e-01, -1.04154944e+00])

In [28]:
#Increasing the accuracy of the model, we use hyperparmeter tuning GridSearchCV
from sklearn.model_selection import GridSearchCV

param_grid={
    'learning_rate':[1.0,5.0,1.0,0.1,0.001],
    'max_depth':[3,5,10,20],
    'n_estimators':[10,50,100,200]
}

In [29]:
grid=GridSearchCV(XGBClassifier(objective='binary:logistics'),
                 param_grid,verbose=3)

In [33]:
grid.best_params_

{'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 10}

In [34]:
new_model=XGBClassifier(learning_rate=.1,max_depth=5,n_estimators=45)
new_model.fit(train_x,train_y)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=45, n_jobs=4,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [35]:
y_pred_new=new_model.predict(test_x)
prediction_new=[round(value) for value in y_pred_new]
accuracy_new=accuracy_score(test_y,prediction_new)
accuracy

0.7445887445887446

In [37]:
#Trying a Random Prediction
d=scaler.transform([[6,148,72,56,89,76,44,99]])
pred=new_model.predict(d)
print("This data belongs to:",pred)

This data belongs to: [1]
