In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pickle
from helpers import random_seed, target_feature, rename_cols_for_lgbm
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score


# Seed NumPy's global random state with the seed value imported from helpers.
np.random.seed(seed=random_seed)

In [None]:
# Read the train and test CSVs (first column is the index) and pass each frame
# through the project's rename_cols_for_lgbm helper.
train = rename_cols_for_lgbm(pd.read_csv('../data/post_fs_train.csv', index_col=0))
test = rename_cols_for_lgbm(pd.read_csv('../data/post_fs_test.csv', index_col=0))

# Separate the predictors (x_*) from the target column (y_*).
x_train, y_train = train.drop(columns=target_feature), train[target_feature]
x_test, y_test = test.drop(columns=target_feature), test[target_feature]

# Gradient Boosting using the LightGBM library

For this part we will tune the hyperparameters on a reduced dataset containing only the 10 most relevant features (ranked by mutual information score), and later we will train the model on the full dataset using the discovered parameters.

In [None]:
# Per-feature mutual information scores, read from the Excel sheet with its
# first column as the index; the cell displays the first ten rows.
# NOTE(review): "top 10" assumes the sheet is already sorted by score - confirm.
data = pd.read_excel('../scores/mutual_info_score.xlsx', index_col=0)
data.head(n=10)

In [None]:
# Keep the index labels of the first ten rows of the score table; these are
# used further down to subset x_train for the hyper-parameter search.
top_feature_names = data.index.values[:10]
columns = list(top_feature_names)
print(columns)

In [None]:
# Baseline: LGBMRegressor with default parameters, evaluated with 5-fold
# cross-validation on the full training set.
model = LGBMRegressor()
default_params = model.get_params()
print(pd.DataFrame.from_dict(default_params, orient='index'))

# 'neg_root_mean_squared_error' already yields -RMSE for each fold, so the mean
# only needs its sign flipped; no additional square root must be applied.
scores = cross_val_score(model, x_train, y_train, scoring='neg_root_mean_squared_error', cv=5, verbose=1)
print('RMSE:', -np.mean(scores))


RMSE with default parameters for whole dataset is 0.344

### Hyperparameter tuning using randomised search

In [None]:
# Switch for the randomized hyper-parameter search below: the search is only run when this is True.
# NOTE(review): later cells read `parameters` and `score`, which are only assigned inside that search branch.
compute_cv = True

In [None]:
# Candidate values for the randomized search; each key is an LGBMRegressor
# constructor argument and each list holds the values that may be sampled.
params = {
    # boosting algorithm variants
    'boosting_type': ['gbdt', 'goss', 'dart'],
    # ensemble size and tree shape
    'n_estimators': [20, 40, 70, 100, 200, 80, 90],
    'max_depth': [3, 4, 6, 8, 9, 10, 20],
    'num_leaves': [5, 20, 50, 100],
    # step size
    'learning_rate': [0.01, 0.05, 0.20, 0.03, 0.45, 0.85, 0.6, 0.75, 1],
    # 20, 30, ..., 490
    'min_child_samples': [*range(20, 500, 10)],
    # L1 / L2 regularisation strengths
    'reg_alpha': [0.001, 0.0007, 0.1, 0.03, 0.06, 0.8, 0.65, 0.3, 1],
    'reg_lambda': [0.01, 0.1, 1, 0.6, 0.006, 0.3, 0.2],
}

In [None]:
# Randomized search over `params` (18 sampled candidates, scored by negated RMSE),
# fitted only on the reduced feature set listed in `columns`.
# NOTE(review): when compute_cv is False, `parameters` and `score` are never assigned here.
model_for_grid = LGBMRegressor()
if compute_cv:
    lgbm_reg = RandomizedSearchCV(
        estimator=model_for_grid,
        param_distributions=params,
        scoring='neg_root_mean_squared_error',
        n_iter=18,
        random_state=random_seed,
    )
    lgbm_cv_fit = lgbm_reg.fit(x_train[columns], y_train)
    # best_score_ is on the negated-RMSE scale because of the scoring string above
    parameters, score = lgbm_cv_fit.best_params_, lgbm_cv_fit.best_score_
    

In [None]:
# Report the winning configuration and its cross-validated error.
print('Best parameters:', parameters)
# the search was scored with 'neg_root_mean_squared_error', so flip the sign to get RMSE
best_rmse = -score
print('RMSE:', best_rmse)

After tuning the parameters with the randomised search, the model's performance improved slightly.
Now, let's see whether those parameters also improve the performance of a model trained on the whole data set.

In [None]:
# Refit on the full feature set using the hyper-parameters found by the search,
# then persist the fitted model to disk.
model_tuned = LGBMRegressor(**parameters, verbose=1)
# Fit on the DataFrame itself (not `.values`) so the model keeps the column
# names and is consistent with the DataFrame that is later passed to predict().
model_tuned.fit(x_train, y_train)
with open("../models/LGBM.pkl", "wb") as f:
    pickle.dump(model_tuned, f)

# Make predictions on the test set

In [None]:

# Score the tuned model on the test split: RMSE (square root of the MSE) and R2.
y_predict = model_tuned.predict(x_test)
test_mse = mean_squared_error(y_test, y_predict)
print('RMSE:', test_mse ** 0.5)
print('R2', r2_score(y_test, y_predict))