In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from tqdm import tqdm

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

In [3]:
# Define parameters (identify by gridsearch)
space =  {
    'boosting': 'gbdt', 
    'colsample_bytree': 1, 
    'learning_rate': hp.choice('learning_rate', [0.1, 0.03, 0.001]), 
    'max_depth': hp.choice('max_depth', range(5, 20)), 
    'min_child_samples': hp.choice('min_child_samples', range(20, 101, 10)), 
    'n_estimators': hp.choice('n_estimators', range(200, 701, 100)),
    'num_leaves': hp.choice('num_leaves', range(100, 1001, 100)),  
    'reg_alpha': hp.choice('reg_alpha', np.arange(0.0, 1.0, 0.1)), 
    'reg_lambda': hp.choice('reg_lambda', np.arange(0.0, 1.0, 0.1)), 
    'subsample': hp.choice('subsample', np.arange(0.1, 1.0, 0.1)),
    'objective': 'multiclass',
    'num_class':4,
    'verbose':1
    }


## Load Data

In [4]:
train = pd.read_csv('../data/generated/train_eng.csv')
test = pd.read_csv('../data/generated/test_eng.csv')

In [5]:
#ensure there's no null values
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

0
0


In [6]:
#extract labels & convert to categorical to have our "y"
labels_train = train['label']

#remove labels from train
train.drop(columns=['label'], inplace=True)

print(train.shape)
print(labels_train.shape)

(24840, 148)
(24840,)


Build a normalize that will apply standard scaler and PCA if required

In [7]:
def NormalizeData(train, CVorTest, PCA_comp = 0.9, ScaleCat = False):
    '''
    Normalize data using a standard scaler
    train:
        dataframe that will be use to fit and transformed by the scaler and PCA
    CVorTest:
        dataframe that will be transformed the scaler and PCA
    PCA_comp:
        Number of PCA components to keep, if None, PCA not applied
    ScaleCat:
        Scale or not the categorical columns with the standard scaler
    '''
    sc = StandardScaler()
    
    if ScaleCat:
        scale_columns = train.columns
    else:
        scale_columns = [col for col in train.columns[~train.columns.str.startswith('Cat_')]]
          
    #perform feature scaling    
    train.loc[:, scale_columns] = sc.fit_transform(train.loc[:, scale_columns]) 
    CVorTest.loc[:, scale_columns] = sc.transform(CVorTest.loc[:, scale_columns]) 
    
    if PCA_comp is None:
        return train.values, CVorTest.values
    
    pca = PCA(PCA_comp)
    train = pca.fit_transform(train)
    CVorTest = pca.transform(CVorTest)
    
    return train, CVorTest

In [8]:
train, test = NormalizeData(train, test, None)
print(train.shape)

(24840, 148)


In [9]:
#build the dataset in Lgbm format
d_train = lgb.Dataset(train, labels_train)
d_test = lgb.Dataset(test)

In [10]:
#Kaggle is evaluate on the F1 score, let's define this metric for training
def f1_eval(preds, dtrain):
    labels = dtrain.get_label()
    preds = preds.reshape(len(np.unique(labels)), -1)
    preds = preds.T.argmax(axis = 1)
    f_score = f1_score(preds, labels, average="macro")
    return 'f1_score', f_score, True

In [11]:
#wrap the model KFold valudation into a function returning the f1 score to optimize then this function
#with HyperOpt

def evaluateModel(lgb_params):
    cv_mod = lgb.cv(lgb_params, d_train, nfold=10, early_stopping_rounds = 25, feval=f1_eval)
    #print(lgb_params)
    #print(cv_mod['f1_score-mean'][-1], cv_mod['f1_score-stdv'][-1])
    lgb_params['f1_score-mean'] = cv_mod['f1_score-mean'][-1]
    lgb_params['f1_score-stdv'] = cv_mod['f1_score-stdv'][-1]
    
    with open ('./LGB_grid_search.csv', 'a+') as fp:
        fp.write(str(lgb_params) +'\n')

    return {
        'loss': - cv_mod['f1_score-mean'][-1],
        'status': STATUS_OK,
        'stats_running': STATUS_RUNNING
    }
    

In [12]:
#Hyperopt loop for optimization

import warnings
warnings.filterwarnings('ignore')

trials = Trials()

# Set algoritm parameters
algo = partial(tpe.suggest, 
               n_startup_jobs=-1)

# Seting the number of Evals
MAX_EVALS= 200

# Fit Tree Parzen Estimator
best_vals = fmin(evaluateModel, space=space, verbose=1,
                 algo=algo, max_evals=MAX_EVALS, trials=trials)

# Print best parameters
best_params = space_eval(space, best_vals)

100%|██████████| 200/200 [6:26:50<00:00, 116.05s/it, best loss: -0.9648291118427668]  


In [13]:
best_params

{'boosting': 'gbdt',
 'colsample_bytree': 1,
 'learning_rate': 0.03,
 'max_depth': 15,
 'min_child_samples': 30,
 'n_estimators': 600,
 'num_class': 4,
 'num_leaves': 200,
 'objective': 'multiclass',
 'reg_alpha': 0.0,
 'reg_lambda': 0.2,
 'subsample': 0.5,
 'verbose': 1}