In [1]:
import numpy as np
import pandas as pd
from time import gmtime, strftime
import gc

from sklearn.model_selection import (train_test_split, GridSearchCV)
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from tqdm import tqdm
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score

In [2]:
train_df = pd.read_csv('../../../data/feature/cbasi_train.csv')
val_df = pd.read_csv('../../../data/feature/cbasi_validation.csv')
X_tr = train_df.iloc[:, :-1]
y_tr = train_df.iloc[:,-1]-1

In [3]:
params = {
    'application': 'multiclass', # for binary classification
#     'num_class' : 1, # used for multi-classes
    'boosting': 'gbdt', # traditional gradient boosting decision tree
    'num_iterations': 100, 
    'learning_rate': 0.05,
    'num_leaves': 62,
    'device': 'cpu', # you can use GPU to achieve faster learning
    'max_depth': -1, # <0 means no limit
    'max_bin': 510, # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': 5, # L1 regularization
    'lambda_l2': 10, # L2 regularization
    'metric' : 'multi_logloss',
    'subsample_for_bin': 200, # number of samples for constructing bins
    'subsample': 1, # subsample ratio of the training instance
    'colsample_bytree': 0.8, # subsample ratio of columns when constructing the tree
    'min_split_gain': 0.5, # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 5# minimum number of data needed in a leaf
}

# Initiate classifier to use
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt', 
          objective = 'multiclass', 
          n_jobs = 5, 
          num_class = 3,
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'])

# To view the default model parameters:
mdl.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq', 'num_class', 'max_bin'])

In [4]:
gridParams = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16,24],
    'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['multiclass'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.64, 0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

grid = GridSearchCV(mdl, gridParams, verbose=1, cv=4, n_jobs=-1)
# Run the grid
grid.fit(X_tr, y_tr)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 3456 candidates, totalling 13824 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 4976 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 6026 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 7176 tasks      | elapsed: 24.6min
[Parallel(n_jobs=-1)]: Done 8426 tasks      | elapsed: 29.3min
[Parallel(n_jobs=-1)]: Done 9776 tasks    

GridSearchCV(cv=4,
             estimator=LGBMClassifier(max_bin=510, min_child_samples=5,
                                      min_child_weight=1, min_split_gain=0.5,
                                      n_jobs=5, num_class=3,
                                      objective='multiclass', subsample=1,
                                      subsample_for_bin=200),
             n_jobs=-1,
             param_grid={'boosting_type': ['gbdt', 'dart'],
                         'colsample_bytree': [0.64, 0.65, 0.66],
                         'learning_rate': [0.005, 0.01], 'max_bin': [255, 510],
                         'n_estimators': [8, 16, 24],
                         'num_leaves': [6, 8, 12, 16],
                         'objective': ['multiclass'], 'random_state': [500],
                         'reg_alpha': [1, 1.2], 'reg_lambda': [1, 1.2, 1.4],
                         'subsample': [0.7, 0.75]},
             verbose=1)

{'boosting_type': 'dart', 'colsample_bytree': 0.64, 'learning_rate': 0.01, 'max_bin': 255, 'n_estimators': 16, 'num_leaves': 16, 'objective': 'multiclass', 'random_state': 500, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}
0.5884965473948525


In [6]:
params['colsample_bytree'] = grid.best_params_['colsample_bytree']
params['learning_rate'] = grid.best_params_['learning_rate'] 
params['num_class'] = 3
params['max_bin'] = grid.best_params_['max_bin']
params['num_leaves'] = grid.best_params_['num_leaves']
params['reg_alpha'] = grid.best_params_['reg_alpha']
params['reg_lambda'] = grid.best_params_['reg_lambda']
params['subsample'] = grid.best_params_['subsample']


X_train, X_valid, y_train, y_valid = train_test_split(X_tr, y_tr, test_size=0.1, random_state = 12)
    
#del X, y; gc.collect();

d_train = lgb.Dataset(X_train, label=y_train)
d_valid = lgb.Dataset(X_valid, label=y_valid) 

watchlist = [d_train, d_valid]


model = lgb.train(params, train_set=d_train, num_boost_round=1000, valid_sets=watchlist, early_stopping_rounds=50, verbose_eval=4)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 57348, number of used features: 15
[LightGBM] [Info] Start training from score -1.094540
[LightGBM] [Info] Start training from score -1.100235
[LightGBM] [Info] Start training from score -1.101074
Training until validation scores don't improve for 50 rounds
[4]	training's multi_logloss: 1.08193	valid_1's multi_logloss: 1.08234
[8]	training's multi_logloss: 1.06639	valid_1's multi_logloss: 1.06712
[12]	training's multi_logloss: 1.0519	valid_1's multi_logloss: 1.0529
[16]	training's multi_logloss: 1.03789	valid_1's multi_logloss: 1.03921
[20]	training's multi_logloss: 1.02495	valid_1's multi_logloss: 1.02661
[24]	training's multi_logloss: 1.01254	valid_1's multi_logloss: 1.01449
[28]	training's multi_logloss: 1.00046	valid_1's multi_logloss: 1.00264
[32]	training's multi_logloss: 0.988965	valid_1's multi_logloss: 0.991358
[36]	training's mul

In [7]:
X_val = val_df.iloc[:, :-1]
y_val = val_df.iloc[:, -1]-1
pred_y = model.predict(X_val)

In [8]:
def func(x):
    return x.argmax()

y_pred = np.apply_along_axis(func,1,pred_y)

In [10]:
print(f1_score(y_val, y_pred, average='macro'))

0.2462338456832607
