### XGBM & LGBM

#### XGBM ~ Extreme Gradient Boosting Machine

#### Installing the XGBM library
!pip install xgboost

In [1]:
#Loading the Required Library

import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
# Suppress every warning for the rest of the session so library notices do not
# clutter the cell outputs.
# NOTE(review): a blanket "ignore" also hides warnings that may matter
# (e.g. deprecations) - consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Load the claimants dataset.
# The CSV location can be overridden through the CLAIMANTS_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original local path is used unchanged.
DATA_PATH = os.environ.get('CLAIMANTS_CSV', 'C:/Users/Akaash/Downloads/claimants.csv')
dataset = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check the load
dataset.head()

Unnamed: 0,CASENUM,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,5,0,0.0,1.0,0.0,50.0,34.94
1,3,1,1.0,0.0,0.0,18.0,0.891
2,66,1,0.0,1.0,0.0,5.0,0.33
3,70,0,0.0,1.0,1.0,31.0,0.037
4,96,1,0.0,1.0,0.0,30.0,0.038


In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing (NA) values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CASENUM   1340 non-null   int64  
 1   ATTORNEY  1340 non-null   int64  
 2   CLMSEX    1328 non-null   float64
 3   CLMINSUR  1299 non-null   float64
 4   SEATBELT  1292 non-null   float64
 5   CLMAGE    1151 non-null   float64
 6   LOSS      1340 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 73.4 KB


Inference: The "CASENUM" column is not relevant, as it is just a serial number for each record. The dataset also has NA values, so the column and the rows containing NA values are dropped.

In [4]:
# Drop the CASENUM column (treated here as a plain serial number, not a
# feature) and then every row that contains a missing value.
# `dataset` is re-bound instead of being mutated in place, and
# errors="ignore" tolerates CASENUM already being gone, so re-running this
# cell no longer fails with a KeyError.
dataset = dataset.drop(columns=["CASENUM"], errors="ignore").dropna()

# Final dataset preview
dataset.head()

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,0,0.0,1.0,0.0,50.0,34.94
1,1,1.0,0.0,0.0,18.0,0.891
2,1,0.0,1.0,0.0,5.0,0.33
3,0,0.0,1.0,1.0,31.0,0.037
4,1,0.0,1.0,0.0,30.0,0.038


Inference: The Dataset is Now Correct

In [5]:
# Separate the target from the predictors:
# the first column is the output (Y), every remaining column is an input (X).
Y, X = dataset.iloc[:, 0], dataset.iloc[:, 1:]

In [6]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
seed, test_size = 7, 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=seed,
    test_size=test_size,
)

#### Grid Search CV

In [7]:
# GridSearchCV runs the exhaustive, cross-validated hyper-parameter search
from sklearn.model_selection import GridSearchCV

# Base XGBoost classifier; each grid candidate overrides the listed parameters
clf = XGBClassifier()

# Candidate values to try: 3 x 3 x 2 = 18 combinations
param_grid = [
    {
        'n_estimators': [50, 75, 100],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.2],
    }
]

# 5-fold cross-validation, fitted on the training split only
gsv = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
gsv.fit(X_train, y_train)









GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight

In [8]:
# Show the best hyper-parameter combination found by the 5-fold grid search
# together with its cross-validation score
gsv.best_params_ , gsv.best_score_ 

({'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50},
 0.7098499673842139)

In [9]:
# Fit the final model on the training data.
# The hyper-parameters come straight from the grid-search result instead of
# being retyped by hand, so this cell stays in sync if the data or the grid
# changes (for the run recorded in this notebook they are
# n_estimators=50, max_depth=3, learning_rate=0.1).
model = XGBClassifier(**gsv.best_params_)
model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=50, n_jobs=2,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
# Predict on the held-out rows, then round every predicted value
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

In [11]:
# Display the rounded test-set predictions (the entire list is shown)
predictions

[0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [12]:
# Compare the rounded predictions with the true test labels and report the
# accuracy as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy,))

Accuracy: 73.20%


Inference: Using XGBM with the parameters n_estimators = 50, max_depth = 3 and learning_rate = 0.1 gives the best accuracy of 73.20%

#### Light GBM ~ Light Gradient Boosting Machine

#### Installing the LGBM library
!pip install lightgbm

In [13]:
#loading the Required library
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [14]:
# Load the claimants dataset again for the LightGBM section.
# The CSV location can be overridden through the CLAIMANTS_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original local path is used unchanged.
DATA_PATH = os.environ.get('CLAIMANTS_CSV', 'C:/Users/Akaash/Downloads/claimants.csv')
dataset = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check the load
dataset.head()

Unnamed: 0,CASENUM,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,5,0,0.0,1.0,0.0,50.0,34.94
1,3,1,1.0,0.0,0.0,18.0,0.891
2,66,1,0.0,1.0,0.0,5.0,0.33
3,70,0,0.0,1.0,1.0,31.0,0.037
4,96,1,0.0,1.0,0.0,30.0,0.038


In [15]:
# Summarise column dtypes and non-null counts to spot columns with missing (NA) values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CASENUM   1340 non-null   int64  
 1   ATTORNEY  1340 non-null   int64  
 2   CLMSEX    1328 non-null   float64
 3   CLMINSUR  1299 non-null   float64
 4   SEATBELT  1292 non-null   float64
 5   CLMAGE    1151 non-null   float64
 6   LOSS      1340 non-null   float64
dtypes: float64(5), int64(2)
memory usage: 73.4 KB


Inference: The "CASENUM" column is not relevant, as it is just a serial number for each record. The dataset also has NA values, so the column and the rows containing NA values are dropped.

In [16]:
# Drop the CASENUM column (treated here as a plain serial number, not a
# feature) and then every row that contains a missing value.
# `dataset` is re-bound instead of being mutated in place, and
# errors="ignore" tolerates CASENUM already being gone, so re-running this
# cell no longer fails with a KeyError.
dataset = dataset.drop(columns=["CASENUM"], errors="ignore").dropna()

# Final dataset preview
dataset.head()

Unnamed: 0,ATTORNEY,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS
0,0,0.0,1.0,0.0,50.0,34.94
1,1,1.0,0.0,0.0,18.0,0.891
2,1,0.0,1.0,0.0,5.0,0.33
3,0,0.0,1.0,1.0,31.0,0.037
4,1,0.0,1.0,0.0,30.0,0.038


In [17]:
# Separate the target from the predictors:
# the first column is the label (Y), every remaining column is a feature (X).
Y, X = dataset.iloc[:, 0], dataset.iloc[:, 1:]

In [18]:
# Hold out 25% of the rows as the test set; random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.25,
)

In [19]:
# Importing the required Library
import lightgbm as lgb
from lightgbm import LGBMClassifier
# Wrap the training features and labels in a LightGBM Dataset for the native training API
d_train = lgb.Dataset(data=x_train, label=y_train)

#### Grid Search CV

In [20]:
# GridSearchCV runs the exhaustive, cross-validated hyper-parameter search
from sklearn.model_selection import GridSearchCV

# Base LightGBM classifier: gradient-boosted decision trees, binary objective
clf = lgb.LGBMClassifier(objective='binary', boosting_type='gbdt')

# Candidate values to try: 3 x 3 x 4 x 2 = 72 combinations
param_grid = [
    {
        'n_estimators': [70, 80, 100],
        'num_leaves': [5, 10, 15],
        'max_depth': [4, 5, 6, 7],
        'learning_rate': [0.1, 0.2],
    }
]

# 5-fold cross-validation, fitted on the training split only
gsv = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
gsv.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=LGBMClassifier(objective='binary'),
             param_grid=[{'learning_rate': [0.1, 0.2],
                          'max_depth': [4, 5, 6, 7],
                          'n_estimators': [70, 80, 100],
                          'num_leaves': [5, 10, 15]}])

In [21]:
# Show the best hyper-parameter combination found by the 5-fold grid search
# together with its cross-validation score
gsv.best_params_ , gsv.best_score_ 

({'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 70, 'num_leaves': 15},
 0.724929785661493)

In [22]:
# Hyper-parameters for the native LightGBM training call.
# learning_rate, num_leaves and max_depth match the grid-search result above.
# NOTE(review): sub_feature and min_data were not part of the searched grid -
# confirm they are intended, since the cross-validated score was obtained without them.
params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 15,
    'min_data': 50,
    'max_depth': 4,
}

In [23]:
# Train on the LightGBM Dataset for 70 boosting rounds (n_estimators = 70)
clf = lgb.train(params=params, train_set=d_train, num_boost_round=70)

[LightGBM] [Info] Number of positive: 389, number of negative: 433
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 315
[LightGBM] [Info] Number of data points in the train set: 822, number of used features: 4
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.473236 -> initscore=-0.107158
[LightGBM] [Info] Start training from score -0.107158


In [24]:
# Predict on the held-out rows, then round every predicted value
y_pred = clf.predict(x_test)
predictions = list(map(round, y_pred))

In [25]:
# Accuracy of the rounded predictions on the test split, shown as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
100 * accuracy

72.26277372262774

Inference: Using LGBM with the parameters n_estimators = 70, max_depth = 4, num_leaves = 15 and learning_rate = 0.1 gives the best accuracy of 72.26%