In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pickle
from helpers import random_seed, target_feature, rename_cols_for_lgbm
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score


# Seed NumPy's global random number generator with the shared `random_seed`
# imported from `helpers`, so NumPy-driven randomness is repeatable across runs.
# NOTE(review): this seeds NumPy only; estimators that take their own
# `random_state` are presumably unaffected -- confirm if full determinism matters.
np.random.seed(random_seed)

In [None]:
# Load the post-feature-selection splits (first CSV column is the index) and
# pass each frame through the project's LightGBM column-renaming helper.
train = rename_cols_for_lgbm(pd.read_csv('../data/post_fs_train.csv', index_col=0))
test = rename_cols_for_lgbm(pd.read_csv('../data/post_fs_test.csv', index_col=0))

# Separate predictors (x_*) from the target column(s) (y_*) for both splits.
x_train, y_train = train.drop(target_feature, axis=1), train[target_feature]
x_test, y_test = test.drop(target_feature, axis=1), test[target_feature]

In [None]:
# Read the saved mutual-information score sheet (first column as index)
# and show its first ten rows as the cell output.
data = pd.read_excel(io='../scores/mutual_info_score.xlsx', index_col=0)
data.head(n=10)

In [None]:
# Keep the predictor column names as a plain list and print them for reference.
columns = x_train.columns.tolist()
print(columns)

In [None]:
# Wrap both splits as LightGBM Dataset objects; free_raw_data=False asks
# LightGBM to keep the raw frames attached to the Dataset after construction.
train_set = lgb.Dataset(x_train, label=y_train, free_raw_data=False)
test_set = lgb.Dataset(x_test, label=y_test, free_raw_data=False)


In [None]:
# Baseline: an LGBMRegressor with library defaults, evaluated with 5-fold
# cross-validation on the training split.
model = LGBMRegressor()
default_params = model.get_params()
print(default_params)
scores = cross_val_score(
    model,
    x_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
)
# 'neg_root_mean_squared_error' already yields the *negated RMSE* for each fold,
# so the mean only needs its sign flipped. Taking an extra square root here
# (as the previous version did) would report sqrt(RMSE) instead of RMSE.
print('RMSE:', -np.mean(scores))


### Hyperparameter tuning using randomized search (RandomizedSearchCV)

In [None]:
# Switch for the randomized search cell below: when True the search is re-run
# and its result is written to '../models/LGBM_CV.pkl'; when False that cell is
# skipped and the notebook relies on the pickle already present on disk.
compute_cv = False

In [None]:
# Candidate values for each LGBMRegressor hyperparameter; this dict is passed
# to RandomizedSearchCV as the search space.
# NOTE(review): the search uses a fixed random_state, so changing the contents
# or order of these lists will presumably change which combinations are drawn.
params = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'n_estimators':[10, 20, 40, 70, 100],
    'max_depth': [3, 4, 6, 8, 10],
    'num_leaves': [5, 20, 50, 100],
    'learning_rate': [0.01, 0.05, 0.20, 0.03, 0.45, 0.85, 0.6, 0.75, 1],
    # 20, 30, ..., 490
    'min_child_samples': list(range(20, 500, 10)),
    'reg_alpha': [0.001, 0.0007, 0.1, 0.03, 0.06, 0.8, 0.65, 0.3, 1],
    'reg_lambda': [0.01, 0.1, 1, 0.6, 0.006, 0.3, 0.2],
}

In [None]:
if compute_cv:
    # Draw 15 random settings from `params` and rank them by negated RMSE.
    lgbm_reg = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=15,
        scoring='neg_root_mean_squared_error',
        random_state=random_seed,
    )
    lgbm_cv_fit = lgbm_reg.fit(x_train, y_train)

    # Persist the fitted search object so later runs can skip the search.
    with open('../models/LGBM_CV.pkl', 'wb') as out_file:
        pickle.dump(lgbm_cv_fit, out_file)

In [None]:
# Read the saved randomized-search result back from disk into `cv`.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# was produced by this project.
with open('../models/LGBM_CV.pkl', 'rb') as in_file:
    cv = pickle.load(in_file)

### Train the model

Train the model on the best parameters and save it.

In [None]:
# Negate the stored best score before printing it as an RMSE.
# NOTE(review): assumes the pickled search was scored with
# 'neg_root_mean_squared_error', as in the search cell -- confirm.
best_cv_rmse = -cv.best_score_
print('RMSE:', best_cv_rmse)
print(cv.best_params_)

In [None]:
# Build a fresh regressor from the best hyperparameters found by the search
# and fit it on the training split.
best_params = cv.best_params_
lgbm_reg = LGBMRegressor(**best_params)
lgbm_fit = lgbm_reg.fit(x_train, y_train)

In [None]:
# Serialize the fitted model to disk.
with open("../models/LGBM.pkl", "wb") as model_file:
    pickle.dump(lgbm_fit, model_file)

### Check prediction

In [None]:
# Predict on the test split, then print RMSE (square root of the MSE)
# followed by the R^2 score.
y_predict = lgbm_fit.predict(x_test)
test_rmse = mean_squared_error(y_test, y_predict) ** 0.5
test_r2 = r2_score(y_test, y_predict)
print(test_rmse, test_r2)