# Extreme Gradient Boosting

### Importing Libraries

In [1]:
#Importing required libraries
import pandas as pd 
import numpy as np

### Loading the dataset

In [2]:
# Load data_cleaned.csv into a pandas DataFrame
data = pd.read_csv(filepath_or_buffer='data_cleaned.csv')

In [3]:
# Preview the top five records of the loaded frame
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


### Separating independent and dependent variables

In [4]:
# Target (dependent variable): the 'Survived' column
y = data['Survived']

# Features (independent variables): every column except the target
x = data.drop(columns=['Survived'])

### Creating the train and test dataset

In [5]:
#import the train-test split
from sklearn.model_selection import train_test_split

In [6]:
# Partition features and target into train and test sets.
# Stratifying on y keeps the class proportions alike in both partitions,
# and the fixed random_state makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

## Install XGBoost

Use the following command in terminal or command prompt

_**$ pip install xgboost**_

## Building an XGBM Model

In [7]:
#Importing XGBM Classifier 
from xgboost import XGBClassifier

In [8]:
# Baseline model: an XGBoost classifier with only the random seed set,
# so every other hyperparameter keeps its library default.
clf = XGBClassifier(random_state=96)

In [9]:
# Fit the baseline classifier on the training partition
clf.fit(X=train_x, y=train_y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=96, ...)

In [10]:
# Score the fitted model on the same data it was trained on
clf.score(X=train_x, y=train_y)

0.9775449101796407

In [11]:
# Score the fitted model on the held-out test partition
clf.score(X=test_x, y=test_y)

0.7623318385650224

# Hyperparameter Tuning

Same as GBDT

1. **n_estimators:** Total number of trees
2. **learning_rate:** This determines the impact of each tree on the final outcome
3. **random_state:** The random number seed so that same random numbers are generated every time
4. **max_depth:** Maximum depth to which tree can grow (stopping criteria)
5. **subsample:** The fraction of observations to be selected for each tree. Selection is done by random sampling
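6. **objective:** Defines the loss function (*binary:logistic* is logistic regression for binary classification and outputs a probability, *reg:logistic* is logistic regression, *reg:squarederror* — formerly *reg:linear*, which is deprecated — is regression with squared loss)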
7. **colsample_bylevel:** The fraction of features (columns) randomly sampled at each depth level of a tree
8. **colsample_bytree:** The fraction of features (columns) randomly sampled once for each tree

In [12]:
# Rebuild the classifier with tuned hyperparameters:
# colsample_bytree=0.7 and max_depth=6, same seed as before.
clf = XGBClassifier(
    max_depth=6,
    colsample_bytree=0.7,
    random_state=96,
)

In [13]:
# Fit the tuned classifier on the training partition
clf.fit(X=train_x, y=train_y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=96, ...)

In [14]:
# Score the tuned model on the held-out test partition
clf.score(X=test_x, y=test_y)

0.7892376681614349

Regularization

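1. **gamma:** Minimum reduction in loss required to make a further split on a leaf node; larger values make the model more conservative
2. **reg_alpha:** L1 regularization term on leaf weights; can shrink some leaf weights to exactly 0
3. **reg_lambda:** L2 regularization term on leaf weights; shrinks leaf weights more smoothly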

In [15]:
# Rebuild the classifier with the gamma regularization parameter set to 0.1
clf = XGBClassifier(random_state=96, gamma=0.1)

In [16]:
# Fit the gamma-regularized classifier on the training partition
clf.fit(X=train_x, y=train_y)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=96, ...)

In [17]:
# Score the gamma-regularized model on the held-out test partition
clf.score(X=test_x, y=test_y)

0.7892376681614349

In [18]:
# Load diabetic_data.csv into a separate DataFrame
df = pd.read_csv(filepath_or_buffer="diabetic_data.csv")

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [21]:
!pip install dtale

Collecting dtale
  Downloading dtale-2.12.3-py2.py3-none-any.whl (14.0 MB)
     ---------------------------------------- 14.0/14.0 MB 4.1 MB/s eta 0:00:00
Collecting dash-colorscales
  Downloading dash_colorscales-0.0.4.tar.gz (62 kB)
     ---------------------------------------- 62.3/62.3 kB 1.7 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting Flask-Compress
  Downloading Flask_Compress-1.13-py3-none-any.whl (7.9 kB)
Collecting missingno<=0.4.2
  Downloading missingno-0.4.2-py3-none-any.whl (9.7 kB)
Collecting matplotlib==3.6.0
  Downloading matplotlib-3.6.0-cp39-cp39-win_amd64.whl (7.2 MB)
     ---------------------------------------- 7.2/7.2 MB 4.3 MB/s eta 0:00:00
Collecting strsimpy
  Downloading strsimpy-0.2.1-py3-none-any.whl (45 kB)
     ---------------------------------------- 45.9/45.9 kB 2.4 MB/s eta 0:00:00
Collecting squarify
  Downloading squarify-0.4.3-py3-none-any.whl (4.3 kB)
Collecting ka

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'c:\\users\\barath murugan\\anaconda3\\lib\\site-packages\\matplotlib\\mpl-data\\fonts\\ttf\\DejaVuSans.ttf'
Consider using the `--user` option or check the permissions.



In [None]:
import dtale