## Importing libraries


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from skopt import BayesSearchCV
from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

## Importing the data and preprocessing

In [2]:
train = pd.read_csv("../input/creditcardfraud/creditcard.csv")

In [3]:
train

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [4]:
train.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
# calculating correleation of each column with target class
x = train.corrwith(train["Class"]).to_dict()

In [6]:
x

{'Time': -0.012322570929245644,
 'V1': -0.10134729859508507,
 'V2': 0.09128865034461915,
 'V3': -0.19296082706741322,
 'V4': 0.13344748623900432,
 'V5': -0.09497429899144809,
 'V6': -0.043643160699963775,
 'V7': -0.18725659151429797,
 'V8': 0.019875123914794363,
 'V9': -0.0977326860740787,
 'V10': -0.21688294364102725,
 'V11': 0.1548756447439473,
 'V12': -0.26059292487721686,
 'V13': -0.004569778799461258,
 'V14': -0.3025436958044044,
 'V15': -0.004223402267856669,
 'V16': -0.19653894030401792,
 'V17': -0.32648106724371434,
 'V18': -0.11148525388904092,
 'V19': 0.03478301303651474,
 'V20': 0.020090324196975373,
 'V21': 0.04041338061057561,
 'V22': 0.0008053175052984614,
 'V23': -0.002685155740250693,
 'V24': -0.007220906715952716,
 'V25': 0.0033077055972996422,
 'V26': 0.00445539750128335,
 'V27': 0.01757972818951325,
 'V28': 0.009536040916236168,
 'Amount': 0.005631753006768537,
 'Class': 1.0}

In [7]:
del x['Class']

In [8]:
# choosing features which have a absolute correlation value greater than 0.1
features = []
for k,v in x.items():
    if abs(v)>0.1:
        print(f"{k} : {v:.2f}")
        features.append(k)

V1 : -0.10
V3 : -0.19
V4 : 0.13
V7 : -0.19
V10 : -0.22
V11 : 0.15
V12 : -0.26
V14 : -0.30
V16 : -0.20
V17 : -0.33
V18 : -0.11


In [9]:
x_train = train[features]
y_train = train['Class']

In [10]:
y_train.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [11]:
# scaling the dataset
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

In [12]:
X_train,X_val,Y_train,Y_val = train_test_split(x_train,y_train,test_size = 0.1,random_state=26,stratify=y_train)

## Bayesian Optimization

In [13]:
%%time
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5, random_seed=6,n_estimators=10000, output_process=False):
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)
    # parameters
    def lgb_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample):
        params = {'application':'binary', 'metric':'auc'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_depth))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)
                
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
        return max(cv_result['auc-mean'])
     
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.001, 0.2),
                                            'num_leaves': (25, 60),
                                            'feature_fraction': (0.1, 1),
                                            'bagging_fraction': (0.5, 1),
                                           'max_depth': (2, 20),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf':(0,100),
                                           'subsample': (0.01, 1.0)}, random_state=200)


    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_auc=[]
    for model in range(len( lgbBO.res)):
        model_auc.append(lgbBO.res[model]['target'])
    
    # return best parameters
    return lgbBO.res[pd.Series(model_auc).idxmax()]['target'],lgbBO.res[pd.Series(model_auc).idxmax()]['params']

opt_params = bayes_parameter_opt_lgb(X_train, Y_train, init_round=5, opt_round=10, n_folds=5, random_seed=6,n_estimators=10000)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y



| [0m 1       [0m | [0m 0.9757  [0m | [0m 0.9738  [0m | [0m 0.3039  [0m | [0m 0.1193  [0m | [0m 49.98   [0m | [0m 15.75   [0m | [0m 20.17   [0m | [0m 35.74   [0m | [0m 56.84   [0m | [0m 0.4615  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, num



| [0m 2       [0m | [0m 0.9755  [0m | [0m 0.9909  [0m | [0m 0.8806  [0m | [0m 0.1972  [0m | [0m 84.63   [0m | [0m 7.466   [0m | [0m 70.77   [0m | [0m 12.12   [0m | [0m 52.5    [0m | [0m 0.258   [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, number of negative: 204707
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `f



| [0m 3       [0m | [0m 0.9749  [0m | [0m 0.548   [0m | [0m 0.9491  [0m | [0m 0.1654  [0m | [0m 56.28   [0m | [0m 17.72   [0m | [0m 54.7    [0m | [0m 45.01   [0m | [0m 48.81   [0m | [0m 0.4252  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, number of negative: 204707
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 176
[LightGBM] [Info] Number of data points in the train set: 205061, number o



| [0m 4       [0m | [0m 0.9754  [0m | [0m 0.8202  [0m | [0m 0.6478  [0m | [0m 0.02198 [0m | [0m 87.62   [0m | [0m 15.66   [0m | [0m 60.78   [0m | [0m 32.93   [0m | [0m 25.93   [0m | [0m 0.8056  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, num



| [0m 5       [0m | [0m 0.9606  [0m | [0m 0.9864  [0m | [0m 0.3546  [0m | [0m 0.1302  [0m | [0m 38.59   [0m | [0m 5.378   [0m | [0m 45.14   [0m | [0m 66.6    [0m | [0m 43.11   [0m | [0m 0.8559  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, 



| [0m 6       [0m | [0m 0.9732  [0m | [0m 0.5833  [0m | [0m 0.7947  [0m | [0m 0.01122 [0m | [0m 87.46   [0m | [0m 16.52   [0m | [0m 62.81   [0m | [0m 32.9    [0m | [0m 28.46   [0m | [0m 0.2623  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, 



| [0m 7       [0m | [0m 0.9616  [0m | [0m 0.6641  [0m | [0m 0.1269  [0m | [0m 0.03582 [0m | [0m 89.56   [0m | [0m 10.83   [0m | [0m 49.19   [0m | [0m 25.61   [0m | [0m 25.25   [0m | [0m 0.2165  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 154
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, 



| [0m 8       [0m | [0m 0.9738  [0m | [0m 0.6661  [0m | [0m 0.8645  [0m | [0m 0.02065 [0m | [0m 53.05   [0m | [0m 13.98   [0m | [0m 24.85   [0m | [0m 29.26   [0m | [0m 59.79   [0m | [0m 0.3422  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 110
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, 



| [0m 9       [0m | [0m 0.9748  [0m | [0m 0.8434  [0m | [0m 0.9794  [0m | [0m 0.04131 [0m | [0m 47.13   [0m | [0m 10.14   [0m | [0m 26.87   [0m | [0m 39.19   [0m | [0m 59.33   [0m | [0m 0.5162  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, number of negative: 204707
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205061, number o



| [0m 10      [0m | [0m 0.9732  [0m | [0m 0.7199  [0m | [0m 0.6769  [0m | [0m 0.1069  [0m | [0m 54.16   [0m | [0m 17.99   [0m | [0m 27.53   [0m | [0m 43.05   [0m | [0m 52.43   [0m | [0m 0.4457  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, 



| [95m 11      [0m | [95m 0.9767  [0m | [95m 0.6651  [0m | [95m 0.9998  [0m | [95m 0.06651 [0m | [95m 42.08   [0m | [95m 11.26   [0m | [95m 23.71   [0m | [95m 29.34   [0m | [95m 51.8    [0m | [95m 0.7295  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 132
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 132
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 132
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, 



| [95m 12      [0m | [95m 0.9768  [0m | [95m 0.7353  [0m | [95m 0.5974  [0m | [95m 0.1235  [0m | [95m 34.53   [0m | [95m 11.72   [0m | [95m 23.77   [0m | [95m 36.49   [0m | [95m 55.12   [0m | [95m 0.1665  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, number of negative: 204707
You can set `force_row_wise=true` to 



| [0m 13      [0m | [0m 0.9718  [0m | [0m 0.8791  [0m | [0m 0.3765  [0m | [0m 0.02767 [0m | [0m 40.94   [0m | [0m 15.45   [0m | [0m 23.63   [0m | [0m 39.25   [0m | [0m 45.39   [0m | [0m 0.02214 [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 66
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, num



| [0m 14      [0m | [0m 0.9715  [0m | [0m 0.908   [0m | [0m 0.503   [0m | [0m 0.1761  [0m | [0m 37.28   [0m | [0m 6.481   [0m | [0m 27.48   [0m | [0m 27.63   [0m | [0m 58.99   [0m | [0m 0.5049  [0m |


[LightGBM] [Fatal] Cannot change max_bin after constructed Dataset handle.


[LightGBM] [Info] Number of positive: 354, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205060, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 355, number of negative: 204706
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198
[LightGBM] [Info] Number of data points in the train set: 205061, number of used features: 11
[LightGBM] [Info] Number of positive: 354, number of negative: 204707
You can set `force_col_wise=true` to 



| [0m 15      [0m | [0m 0.9739  [0m | [0m 0.5964  [0m | [0m 0.3617  [0m | [0m 0.05104 [0m | [0m 31.4    [0m | [0m 18.24   [0m | [0m 21.14   [0m | [0m 28.47   [0m | [0m 51.33   [0m | [0m 0.8771  [0m |
CPU times: user 5min 26s, sys: 4.1 s, total: 5min 30s
Wall time: 1min 30s


In [14]:
# optimal parameters
opt_params[1]["num_leaves"] = int(round(opt_params[1]["num_leaves"]))
opt_params[1]['max_depth'] = int(round(opt_params[1]['max_depth']))
opt_params[1]['min_data_in_leaf'] = int(round(opt_params[1]['min_data_in_leaf']))
opt_params[1]['max_bin'] = int(round(opt_params[1]['max_bin']))
opt_params[1]['objective']='binary'
opt_params[1]['metric']='auc'
opt_params[1]['is_unbalance']=True
opt_params[1]['boost_from_average']=False
opt_params=opt_params[1]
opt_params

{'bagging_fraction': 0.735299589325346,
 'feature_fraction': 0.5974236057443596,
 'learning_rate': 0.12354576365652588,
 'max_bin': 35,
 'max_depth': 12,
 'min_data_in_leaf': 24,
 'min_sum_hessian_in_leaf': 36.49206163580857,
 'num_leaves': 55,
 'subsample': 0.16645744047130134,
 'objective': 'binary',
 'metric': 'auc',
 'is_unbalance': True,
 'boost_from_average': False}

## Prediciton with best parameters and KFold Technique

In [15]:
%%time
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=26)
oof = np.zeros(len(x_train))
predictions = np.zeros(len(X_val))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train, y_train)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=y_train.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=y_train.iloc[val_idx])
    num_round = 10000
    clf = lgb.train(opt_params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 250,)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration) 
    predictions += clf.predict(X_val, num_iteration=clf.best_iteration) / folds.n_splits


Fold 0




[LightGBM] [Info] Number of positive: 443, number of negative: 255883
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 385
[LightGBM] [Info] Number of data points in the train set: 256326, number of used features: 11
Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[107]	training's auc: 0.999884	valid_1's auc: 0.991683
Fold 1
[LightGBM] [Info] Number of positive: 443, number of negative: 255883
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 385
[LightGBM] [Info] Number of data points in the train set: 256326, number of used features: 11
Training until validation scores don't improve for 250 rounds
Early stopping, best iteration is:
[42]	training's auc: 0.999858	valid_1's auc: 0.97006
Fold 2
[LightGBM] [Info] Number of positive: 443, number of negative: 255883
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 385
[LightGBM] [I

In [16]:
print("CV score: {:<8.5f}".format(roc_auc_score(y_train, oof)))

CV score: 0.96300 


In [17]:
# putting threshold as 0.5
binary_predictions = [i>0.5 for i in predictions]

## Model Report

In [18]:
# Scoring our model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score, precision_score, recall_score

# Confusion Matrix
print('Confusion Matrix')
print(confusion_matrix(Y_val, binary_predictions))
print('--'*50)

# Classification Report
print('Classification Report')
print(classification_report(Y_val, binary_predictions))


# Accuracy of our model
print('--'*50)
bayesOpt_accuracy = round(accuracy_score(Y_val, binary_predictions) * 100,8)
print('Accuracy = ', bayesOpt_accuracy,'%')


Confusion Matrix
[[28401    31]
 [    6    43]]
----------------------------------------------------------------------------------------------------
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.58      0.88      0.70        49

    accuracy                           1.00     28481
   macro avg       0.79      0.94      0.85     28481
weighted avg       1.00      1.00      1.00     28481

----------------------------------------------------------------------------------------------------
Accuracy =  99.87008883 %
