In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pickle
from helpers import random_seed, target_feature, rename_cols_for_lgbm
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score


# Seed NumPy's global random number generator with `random_seed` (imported
# from helpers above) before any stochastic step runs.
np.random.seed(random_seed)

In [2]:
# Read the train/test CSVs (first CSV column becomes the index), pass each
# frame through rename_cols_for_lgbm, then separate the target column from
# the predictor columns.
train = rename_cols_for_lgbm(pd.read_csv('../data/post_fs_train.csv', index_col=0))
test = rename_cols_for_lgbm(pd.read_csv('../data/post_fs_test.csv', index_col=0))

# Targets
y_train, y_test = train[target_feature], test[target_feature]

# Predictors: everything except the target
x_train = train.drop(columns=target_feature)
x_test = test.drop(columns=target_feature)

# Gradient Boosting using the LightGBM library

For this part, we will tune the hyperparameters on a reduced dataset containing only the 10 most relevant features (ranked by mutual information score); afterwards, we will train the model on the full dataset using the discovered parameters.

In [3]:
# Load the mutual-information score table (first spreadsheet column is used
# as the index) and preview its first 10 rows.
# NOTE(review): the sheet appears to be sorted by score, highest first — confirm.
mutual_info_path = '../scores/mutual_info_score.xlsx'
data = pd.read_excel(mutual_info_path, index_col=0)
data.head(10)

Unnamed: 0,mutual_info_score
R_SVA,1.610244
R_SIGMA,1.597569
R_PRES,1.099207
R_Depth,1.09047
R_DYNHT,0.848832
R_O2Sat,0.731166
R_SALINITY,0.686451
R_O2,0.672186
R_O2_sqrt,0.669059
R_SIO3,0.641994


In [4]:
# Select the features with the highest mutual-information score.
# Rank explicitly by the score column instead of trusting the spreadsheet's
# row order, so the selection stays correct even if the file is not sorted.
n_top_features = 10
columns = data['mutual_info_score'].nlargest(n_top_features).index.tolist()
print(columns)

['R_SVA', 'R_SIGMA', 'R_PRES', 'R_Depth', 'R_DYNHT', 'R_O2Sat', 'R_SALINITY', 'R_O2', 'R_O2_sqrt', 'R_SIO3']


In [5]:
# Baseline: an LGBMRegressor with the library's default parameters,
# evaluated with 5-fold cross-validation on all predictor columns.
model = LGBMRegressor()
default_params = model.get_params()
print(pd.DataFrame.from_dict(default_params, orient='index'))

scores = cross_val_score(
    estimator=model,
    X=x_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
)
# The scorer yields negated RMSE values, so flip the sign of the mean.
print('RMSE:', -scores.mean())


                        0
boosting_type        gbdt
class_weight         None
colsample_bytree      1.0
importance_type     split
learning_rate         0.1
max_depth              -1
min_child_samples      20
min_child_weight    0.001
min_split_gain        0.0
n_estimators          100
n_jobs               None
num_leaves             31
objective            None
random_state         None
reg_alpha             0.0
reg_lambda            0.0
subsample             1.0
subsample_for_bin  200000
subsample_freq          0


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4373
[LightGBM] [Info] Number of data points in the train set: 296588, number of used features: 36
[LightGBM] [Info] Start training from score 10.988796
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4373
[LightGBM] [Info] Number of data points in the train set: 296589, number of used features: 36
[LightGBM] [Info] Start training from score 10.989167
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.9s finished


### Hyperparameter tuning using randomized search

In [6]:
# Switch read by the hyperparameter-search cell: the RandomizedSearchCV fit
# only runs when this is True.
compute_cv = True

In [7]:
# Search space for the hyperparameter search: each key is an LGBMRegressor
# parameter name and each value is the discrete list of candidates to try.
# The dict is passed to RandomizedSearchCV in the search cell.
# NOTE(review): with a seeded search, the order of the values inside each list
# may influence which combinations are drawn — avoid reordering casually; confirm.
params = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'n_estimators':[20, 40, 70, 100, 200, 80, 90],
    'max_depth': [3, 4, 6, 8, 9, 10, 20],
    'num_leaves': [5, 20, 50, 100],
    'learning_rate': [0.01, 0.05, 0.20, 0.03, 0.45, 0.85, 0.6, 0.75, 1],
    'min_child_samples': list(range(20, 500, 10)),  # 20, 30, ..., 490
    'reg_alpha': [0.001, 0.0007, 0.1, 0.03, 0.06, 0.8, 0.65, 0.3, 1],
    'reg_lambda': [0.01, 0.1, 1, 0.6, 0.006, 0.3, 0.2],
}

In [8]:
# Randomized hyperparameter search on the reduced feature set (`columns`).
#
# The best parameters and their CV score are cached on disk after a search.
# When `compute_cv` is False the cached result is loaded instead, so
# `parameters` and `score` are always defined for the cells that follow
# (previously, skipping the search left both names undefined).
best_params_path = "../models/LGBM_best_params.pkl"

model_for_grid = LGBMRegressor()
if compute_cv:
    lgbm_reg = RandomizedSearchCV(
        model_for_grid,
        params,
        scoring='neg_root_mean_squared_error',
        n_iter=18,
        random_state=random_seed,
    )
    lgbm_cv_fit = lgbm_reg.fit(x_train[columns], y_train)
    parameters = lgbm_cv_fit.best_params_
    score = lgbm_cv_fit.best_score_

    # Persist the outcome so later runs can skip the expensive search.
    with open(best_params_path, "wb") as f:
        pickle.dump({"parameters": parameters, "score": score}, f)
else:
    # Reuse the outcome of an earlier search. Only load a cache file that this
    # notebook produced itself: unpickling untrusted files is unsafe.
    try:
        with open(best_params_path, "rb") as f:
            cached_search = pickle.load(f)
    except FileNotFoundError as exc:
        raise RuntimeError(
            f"No cached search result at {best_params_path!r}; "
            "set compute_cv = True and re-run this cell to create it."
        ) from exc
    parameters = cached_search["parameters"]
    score = cached_search["score"]



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 296588, number of used features: 10
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 10.988796
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 296589, number of used features: 10
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 10.989167
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025046 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 

In [9]:
# Report the winning configuration. The search score is a negated RMSE,
# so negate it once more to show a positive error value.
best_cv_rmse = -score
print('Best parameters:', parameters)
print('RMSE:', best_cv_rmse)

Best parameters: {'reg_lambda': 0.1, 'reg_alpha': 1, 'num_leaves': 50, 'n_estimators': 70, 'min_child_samples': 50, 'max_depth': 10, 'learning_rate': 0.45, 'boosting_type': 'gbdt'}
RMSE: 0.3462813961264066


We noticed that tuning the parameters with randomized search slightly improved the model's performance.
Now, let's see whether those parameters also improve the performance of a model trained on the whole dataset.

In [10]:
# Refit on all predictor columns using the hyperparameters found by the search,
# then persist the fitted model.
#
# Fit on the DataFrame itself rather than `x_train.values`: a bare array drops
# the column names, while prediction in this notebook is done with a DataFrame.
# Keeping the names makes training and prediction inputs consistent and stores
# the feature names inside the saved model.
model_tuned = LGBMRegressor(**parameters, verbose=1)
model_tuned.fit(x_train, y_train)
with open("../models/LGBM.pkl", "wb") as f:
    pickle.dump(model_tuned, f)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4373
[LightGBM] [Info] Number of data points in the train set: 370736, number of used features: 36
[LightGBM] [Info] Start training from score 10.987534


# Make predictions on the test set

In [11]:

# Predict on the test predictors and report RMSE and R2 against the test target.
y_predict = model_tuned.predict(x_test)

test_mse = mean_squared_error(y_test, y_predict)
test_r2 = r2_score(y_test, y_predict)
print('RMSE:', test_mse ** 0.5)  # square root of the MSE
print('R2', test_r2)

RMSE: 0.23946326868452752
R2 0.996671606819053
