# Light GBM : Implementation

In [1]:
#Importing Standard Libraries
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

#Import Light GBM
import lightgbm as lgb

In [2]:
# Import Data
data=pd.read_csv("adult.data",header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
#Naming the columns
data.columns=['age','workclass','fnlwgt','education','education-num','marital_Status'
             ,'occupation','relationship','race','sex','capital_gain','capital_loss',
             'hours_per_week','native_country','Income'] 

In [4]:
data.shape

(32561, 15)

In [5]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital_Status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
#Label Encoding of Income variable
from sklearn.preprocessing import LabelEncoder,OneHotEncoder 
l=LabelEncoder() 
l.fit(data.Income) 
data.Income= Series(l.transform(data.Income)) 

In [7]:
data.Income.value_counts()

0    24720
1     7841
Name: Income, dtype: int64

In [8]:
#One Hot Encoding of the Categorical features 
one_hot_workclass=pd.get_dummies(data.workclass) 
one_hot_education=pd.get_dummies(data.education) 
one_hot_marital_Status=pd.get_dummies(data.marital_Status) 
one_hot_occupation=pd.get_dummies(data.occupation) 
one_hot_relationship=pd.get_dummies(data.relationship) 
one_hot_race=pd.get_dummies(data.race) 
one_hot_sex=pd.get_dummies(data.sex) 
one_hot_native_country=pd.get_dummies(data.native_country)

In [9]:
one_hot_workclass

Unnamed: 0,?,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay
0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...
32556,0,0,0,0,1,0,0,0,0
32557,0,0,0,0,1,0,0,0,0
32558,0,0,0,0,1,0,0,0,0
32559,0,0,0,0,1,0,0,0,0


In [10]:
#removing categorical features 
data.drop(['workclass','education','marital_Status','occupation','relationship',
           'race','sex','native_country'],axis=1,inplace=True) 

In [11]:
#Merging one hot encoded features with our dataset 'data' 
data=pd.concat([data,one_hot_workclass,one_hot_education,one_hot_marital_Status,
                one_hot_occupation,one_hot_relationship,one_hot_race,one_hot_sex,
                one_hot_native_country],axis=1) 

In [12]:
data.shape

(32561, 109)

In [13]:
#Removing duplicate columns
_ ,i= np.unique(data.columns, return_index=True) 
data=data.iloc[:, i] 

In [14]:
data.shape

(32561, 107)

In [15]:
#Here our target variable is 'Income' with values as 1 or 0. 
#Separating our data into features dataset x and our target dataset y 
x=data.drop('Income',axis=1) 
y=data.Income 

In [16]:
#Imputing missing values in our target variable 
y.fillna(y.mode()[0],inplace=True)

In [17]:
#Now splitting our dataset into test and train 
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.3)

#LightGBM Dataset
train_data=lgb.Dataset(x_train,label=y_train)


In [18]:
#setting parameters for lightgbm
param = {'num_leaves':150, 'objective':'binary','max_depth':7,'learning_rate':.05,
         'max_bin':200} 
param['metric'] = ['auc', 'binary_logloss']

In [19]:
#training our model using light gbm
import datetime
num_round=50 
start= datetime.datetime.now() 
lgbm= lgb.train(param,train_data,num_round) 
stop= datetime.datetime.now()


In [20]:
#Execution time of the model
execution_time_lgbm = stop-start 
execution_time_lgbm

datetime.timedelta(microseconds=348068)

In [21]:
#predicting on test set
ypred2=lgbm.predict(x_test)
ypred2[0:5]

array([0.09169418, 0.02176525, 0.17843277, 0.0317519 , 0.0305792 ])

In [22]:
len(ypred2)

9769

In [23]:
#converting probabilities into 0 or 1
for i in range(0,9769): 
    if ypred2[i]>=.5:    # setting threshold to .5
        ypred2[i]=1 
    else:
        ypred2[i]=0

In [24]:
np.unique(ypred2,return_counts=True)

(array([0., 1.]), array([8165, 1604], dtype=int64))

In [25]:
y_test.value_counts()

0    7447
1    2322
Name: Income, dtype: int64

In [26]:
#calculating accuracy_score
from sklearn.metrics import accuracy_score, roc_auc_score
accuracy_lgbm = accuracy_score(ypred2,y_test) 
accuracy_lgbm


0.8648786979219981

In [27]:
#calculating roc_auc_score for Light GBM.
auc_lgbm = roc_auc_score(y_test,ypred2) 
auc_lgbm 

0.7603676083889749

**Model Training Time : 348068 microseconds**

**Accuracy_Score : 0.86**

**Roc_Auc_Score : 0.76**

# XGBoost : Implementation

In [28]:
from xgboost import XGBClassifier
Start_time=datetime.datetime.now()
model=XGBClassifier(n_estimators=100, max_depth=7)
model.fit(x_train,y_train)
Stop_time=datetime.datetime.now()
ypred=model.predict(x_test)
execution_time_xgb=Stop_time-Start_time
execution_time_xgb

datetime.timedelta(seconds=5, microseconds=247831)

In [29]:
np.unique(ypred,return_counts=True)

(array([0, 1]), array([7756, 2013], dtype=int64))

In [30]:
y_test.value_counts()

0    7447
1    2322
Name: Income, dtype: int64

In [31]:
#calculating accuracy_score
accuracy_xgb=accuracy_score(ypred,y_test)
accuracy_xgb

0.8727607738765483

In [32]:
#calculating roc_auc_score for XGBoost
auc_xgb=roc_auc_score(ypred,y_test)
auc_xgb

0.8339781876800283

**Model Training Time : 5 seconds and 247831 microseconds**

**Accuracy_Score : 0.87**

**Roc_Auc_Score : 0.83**