### XGBM & LGBM

#### XGBM ~ Extreme Gradient Boosting Machine

#### Installing the XGBM library
!pip install xgboost

In [1]:
#Loading the Required Library
import numpy as np
import pandas as pd
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")  #--to ignore warnings

In [2]:
#Load the data set
dataset = pd.read_csv("C:/Users/Akaash/Downloads/bank-full.csv", delimiter = ";")
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
#Checking Null Value
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


Inference: No NA Value, Also Datasets Has X as well as the Y Variable Categorial, So for Y variable Using Label Encoder then for X Variable will Use Dummies

In [4]:
#Complete Bank dataset - applying lable encoder to species column - (Y Variable is Categorial)
label_encoder = preprocessing.LabelEncoder()
dataset['y'] = label_encoder.fit_transform(dataset['y']) 
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [5]:
#Dummies for X Variable
dataset = pd.get_dummies(dataset)
dataset.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


Inference: All the Variables are now Numberical

In [6]:
# Dividing our data into input and output variables 
X = pd.concat([dataset.iloc[:,0:7],dataset.iloc[:,8:]],axis = 1)
Y = dataset.iloc[:,7]

In [7]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

#### Grid Search CV

In [8]:
#importing The Required library
from sklearn.model_selection import GridSearchCV

#Creating XGBM instance
clf = XGBClassifier()

#Defining Grid Search Parameter
param_grid = [{'n_estimators':[70,80,90,100],'max_depth':[3,4,5],'learning_rate':[0.1,0.2] }]

#Creating Grid Search instance and fitting the Model
gsv = GridSearchCV(clf,param_grid,cv=5)
gsv.fit(X_train,y_train)











GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight

In [9]:
#Getting Search Results From Grid Search
gsv.best_params_ , gsv.best_score_ 

({'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 100},
 0.908454587444874)

In [10]:
# fit model no training data
model = XGBClassifier(n_estimators = 100, max_depth = 4,learning_rate=0.2)
model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=2,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [12]:
predictions

[0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [13]:
#evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 91.05%


Inference: Using XGBM with Parameter n_estimators = 100 ans max_dept = 6, Comes the Best Accuracy of 91.05%

#### Light GBM ~ Light Gradient Boosting Machine

#### Installing the LGBM library
!pip install lightgbm

In [14]:
#loading the Required library
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [15]:
#Load the data set
dataset = pd.read_csv("C:/Users/Akaash/Downloads/bank-full.csv", delimiter = ";")
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [16]:
#Checking Null Value
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


Inference: No NA Value, Also Datasets Has X as well as the Y Variable Categorial, So for Y variable Using Label Encoder then for X Variable will Use Dummies

In [17]:
#Complete Bank dataset - applying lable encoder to species column - (Y Variable is Categorial)
label_encoder = preprocessing.LabelEncoder()
dataset['y'] = label_encoder.fit_transform(dataset['y']) 
dataset.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [18]:
#Dummies for X Variable
dataset = pd.get_dummies(dataset)
dataset.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [19]:
# Dividing our data into input and output variables 
X = pd.concat([dataset.iloc[:,0:7],dataset.iloc[:,8:]],axis = 1)
Y = dataset.iloc[:,7]

In [20]:
# Splitting the dataset into the Training set and Test set
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

In [22]:
# Importing the required Library
import lightgbm as lgb
from lightgbm import LGBMClassifier
#Creating the LGBM Dataset (Gradient Dataset)
d_train = lgb.Dataset(x_train, label=y_train)

#### Grid Search CV

In [23]:
#importing The Required library
from sklearn.model_selection import GridSearchCV

#Creating LGBM instance
clf = lgb.LGBMClassifier(boosting_type = 'gbdt', objective = 'binary')
#Defining Grid Search Parameter
param_grid = [{'n_estimators':[80,100],'num_leaves':[10,15],'max_depth':[5,10],'learning_rate':[0.1,0.2] }]

#Creating Grid Search instance and fitting the Model
gsv = GridSearchCV(clf,param_grid,cv=5)
gsv.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=LGBMClassifier(objective='binary'),
             param_grid=[{'learning_rate': [0.1, 0.2], 'max_depth': [5, 10],
                          'n_estimators': [80, 100], 'num_leaves': [10, 15]}])

In [24]:
#Getting Search Results From Grid Search
gsv.best_params_ , gsv.best_score_ 

({'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 100, 'num_leaves': 10},
 0.9103163509017055)

In [25]:
#Defining the Parameters for LGBM
params = {}
params['learning_rate'] = 0.2
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 10
params['min_data'] = 50
params['max_depth'] = 5

In [26]:
#Training the Lgbm Dataset 
clf = lgb.train(params, d_train, 100) # n_estimators = 100 

[LightGBM] [Info] Number of positive: 3964, number of negative: 29944
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 33908, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116905 -> initscore=-2.022075
[LightGBM] [Info] Start training from score -2.022075


In [27]:
#Prediction
y_pred=clf.predict(x_test)
predictions = [round(value) for value in y_pred]

In [28]:
accuracy = accuracy_score(y_test, predictions)
accuracy*100

90.683889232947

Inference: Using LGBM with Parameter n_estimators =100 ans max_dept = 5, Comes the Best Accuracy of 90.68%