In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from tqdm import tqdm

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Hyper-parameters passed to LightGBM for cross-validation and final training
lgb_params = dict(
    boosting='gbdt',
    colsample_bytree=1,
    learning_rate=0.1,
    max_depth=15,
    min_child_samples=20,
    n_estimators=500,
    num_leaves=500,
    objective='multiclass',
    num_class=4,
    reg_alpha=0.6,
    reg_lambda=0.3,
    subsample=0.7,
    verbose=1,
)

# Earlier configuration found by grid search; kept for reference only because
# its LB score was lower than the one obtained with lgb_params
lgb_params_old = dict(
    boosting='gbdt',
    colsample_bytree=1,
    learning_rate=0.03,
    max_depth=15,
    min_child_samples=30,
    n_estimators=600,
    num_class=4,
    num_leaves=200,
    objective='multiclass',
    reg_alpha=0.0,
    reg_lambda=0.2,
    subsample=0.5,
    verbose=1,
)
 

## Load Data

In [3]:
# Read the feature-engineered train and test tables from disk
train, test = (
    pd.read_csv('../data/generated/train_eng.csv'),
    pd.read_csv('../data/generated/test_eng.csv'),
)

In [4]:
# Sanity check: print the total number of missing values in each dataset
for frame in (train, test):
    print(frame.isnull().sum().sum())

0
0


In [5]:
# Split the target column off the features to obtain our "y".
# The split is guarded and does not mutate the frame in place, so re-running
# this cell is harmless: once 'label' has been removed, a second run leaves
# both `labels_train` and `train` exactly as they are instead of raising KeyError.
if 'label' in train.columns:
    labels_train = train['label']
    train = train.drop(columns=['label'])

print(train.shape)
print(labels_train.shape)

(24840, 148)
(24840,)


Build a normalization helper that applies a standard scaler and, if required, PCA

In [6]:
def NormalizeData(train, CVorTest, PCA_comp = 0.95, ScaleCat = False):
    '''
    Standardize two dataframes with a scaler fitted on the first one, then
    optionally reduce both with PCA.

    The input dataframes are never modified: all work is done on copies.

    train:
        dataframe used to fit the scaler (and the PCA); it is transformed too
    CVorTest:
        dataframe that is only transformed, using the scaler (and the PCA)
        fitted on `train`; it must contain the same columns as `train`
    PCA_comp:
        value passed to PCA as its number of components; if None, PCA is not
        applied
    ScaleCat:
        if True every column is scaled, otherwise columns whose name starts
        with 'Cat_' are left untouched

    Returns a tuple (train, CVorTest) of numpy arrays.
    '''
    # Copy first so the caller's frames keep their original values
    train = train.copy()
    CVorTest = CVorTest.copy()

    if ScaleCat:
        scale_columns = list(train.columns)
    else:
        scale_columns = [col for col in train.columns if not col.startswith('Cat_')]

    # Fitting a scaler on zero columns raises, so skip scaling in that case
    if scale_columns:
        sc = StandardScaler()
        # Statistics come from `train` only, so nothing leaks from CVorTest
        train[scale_columns] = sc.fit_transform(train[scale_columns])
        CVorTest[scale_columns] = sc.transform(CVorTest[scale_columns])

    if PCA_comp is None:
        return train.values, CVorTest.values

    # The PCA is fitted on the full scaled frame, unscaled 'Cat_' columns included
    pca = PCA(PCA_comp)
    train = pca.fit_transform(train)
    CVorTest = pca.transform(CVorTest)

    return train, CVorTest

In [7]:
# Standardize both datasets; PCA_comp=None means the PCA step is skipped
train, test = NormalizeData(train, test, PCA_comp=None)
print(train.shape)

(24840, 148)


In [8]:
# Wrap the arrays in LightGBM Dataset objects (only the train set carries labels)
d_train = lgb.Dataset(train, label=labels_train)
d_test = lgb.Dataset(test)

## Perform training

In [9]:
#Kaggle evaluates submissions on the F1 score, so define it as a training metric
def f1_eval(preds, dtrain):
    '''
    Custom LightGBM evaluation function returning the macro-averaged F1 score.

    preds:
        class scores produced by the booster, either as a flat array grouped
        by class (all scores of class 0, then class 1, ...) or as a 2-D array
        with one row per sample and one column per class
    dtrain:
        dataset being evaluated; its labels are the ground truth

    Returns the tuple (metric name, metric value, is_higher_better).
    '''
    labels = dtrain.get_label()
    preds = np.asarray(preds)

    if preds.ndim == 1:
        # Derive the layout from the number of samples, not from the distinct
        # labels present: a fold that lacks one class would otherwise be
        # reshaped with the wrong number of classes.
        preds = preds.reshape(-1, len(labels)).T

    pred_labels = preds.argmax(axis=1)
    # Ground truth goes first, as f1_score expects (y_true, y_pred)
    f_score = f1_score(labels, pred_labels, average="macro")
    return 'f1_score', f_score, True

In [10]:
# 10-fold cross-validation scored with the custom F1 metric, with an
# early-stopping patience of 25 rounds
cv_mod = lgb.cv(
    lgb_params,
    d_train,
    nfold=10,
    early_stopping_rounds=25,
    feval=f1_eval,
)

  'recall', 'true', average, warn_for)


In [11]:
# Cross-validated F1: last entry of the per-round mean F1 history
cv_f1_means = cv_mod['f1_score-mean']
cv_f1_means[-1]

0.9643972468274253

In [12]:
# Fit the final model on the whole training set. The training data doubles as
# the validation set, so the metrics logged every 100 rounds are training metrics.
mod = lgb.train(
    lgb_params,
    d_train,
    valid_sets=[d_train],
    feval=f1_eval,
    verbose_eval=100,
)



[100]	training's multi_logloss: 0.0122088	training's f1_score: 0.999967
[200]	training's multi_logloss: 0.00439959	training's f1_score: 1
[300]	training's multi_logloss: 0.00342553	training's f1_score: 1
[400]	training's multi_logloss: 0.00342497	training's f1_score: 1
[500]	training's multi_logloss: 0.00342497	training's f1_score: 1


In [13]:
# Macro F1 of the final model on the very data it was trained on
train_pred_labels = mod.predict(train).argmax(axis=1)
f1_score(labels_train, train_pred_labels, average='macro')

1.0

## Build Submission

In [14]:
# Display the shape of the test matrix that predictions are generated for
test.shape

(10647, 148)

In [15]:
# Generate predictions for the test set with the model trained on the full train set.
# NOTE(review): presumably one row per sample and one column per class, since
# later code takes argmax(axis=1) over this result — confirm.
preds = mod.predict(test)

In [16]:
# Create the submission file: the predicted label is the class with the
# highest score. The scores are deliberately NOT rounded before the argmax:
# rounding turns every row whose best score is <= 0.5 into all zeros, and
# argmax would then silently fall back to class 0 for those rows.
pred_df = pd.DataFrame(preds.argmax(axis=1), columns=['label'])
pred_df.to_csv('../data/generated/submissionLGB.csv', index=True, index_label='Id')

## Feature importance
    Rerun the cell if no graph is displayed

In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# `train` is a bare numpy array by now, so the column names are recovered
# from the original CSV.
df_train = pd.read_csv('../data/generated/train_eng.csv')

# The model was trained without the target column: drop 'label' so every
# importance value is paired with the right feature name, wherever 'label'
# sits in the file.
feature_names = [col for col in df_train.columns if col != 'label']
importances = mod.feature_importance(importance_type='split')

# zip() silently truncates on a length mismatch, which would hide a
# misalignment between names and values, so fail loudly instead.
assert len(feature_names) == len(importances), (
    f"{len(feature_names)} feature names for {len(importances)} importance values"
)

feature_imp = pd.DataFrame(sorted(zip(importances, feature_names)), columns=['Value','Feature'])

plt.figure(figsize=(20, 30))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

<Figure size 2000x3000 with 1 Axes>