# Imports
**Other**
* CPU monitoring in terminal:  
```bash
top
```
* GPU monitoring in terminal:  
```bash
pip install gpustat
watch -c gpustat -cp --color
```

In [1]:
import os
import sys
import pickle
import json
import pandas as pd
# to save results to data directory
module_path = '..'
if module_path not in sys.path:
    sys.path.insert(1, module_path)
# increase displayed columns in jupyter notebook
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 300)

In [27]:

import lightgbm_optimizer as lgbmo

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

import lightgbm as lgb

import lime
import shap
from lime.lime_tabular import LimeTabularExplainer
from eli5.sklearn import PermutationImportance
from eli5 import explain_weights, explain_weights_df,explain_prediction_df

from pytorch_widedeep.utils import LabelEncoder

from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, classification_report
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, RobustScaler, StandardScaler

import matplotlib.pyplot as plt
from typing import Union
from copy import copy
from datetime import datetime
from time import time
import importlib

# increase displayed columns in jupyter notebook
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 300)

# temporarily remove deprecation warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

**helper functions**

In [None]:
def intsec(list1, list2):
    """Simple intesection of two lists.

    Args:
        list1 (list): list1
        list2 (list): list2

    Returns:
        list (list): intersection of lists
    """
    return list(set.intersection(set(list1), set(list2)))


def scale(data_pd, non_scale_cols, scaler_sk='Standard'):
    """Procedure to scale the dataset except the given list of columns.

    Args:
        data_pd (obj): pandas dataframe
        non_scale_cols (list): columns to not scale
        scaler_sk (str, sklearn.peprocessing obj): type of scaler from['Standard', 'Yeo-Johnson',
        'Robust', 'MinMax'] or already fitted scaler

    Returns:
        tuple (tuple): data_pd_scaled (obj): scaled pandas dataframe\n
        scaler_sk (obj): sklearn scaler object
    """
    non_scale_cols = intsec(data_pd.columns.values, non_scale_cols)
    data_pd_toscale = data_pd.drop(columns=non_scale_cols)
    if type(scaler_sk) is str:
        if scaler_sk == 'Standard':
            scaler_sk = StandardScaler()
        elif scaler_sk == 'Yeo-Johnson':
            scaler_sk = PowerTransformer(method='yeo-johnson')
        elif scaler_sk == 'Robust':
            scaler_sk = RobustScaler()
        elif scaler_sk == 'MinMax':
            scaler_sk = MinMaxScaler()
        scaler_sk.fit(data_pd_toscale)
    # if 'sklearn.peprocessing' in str(type(scaler_sk)):

    data_pd_scaled = pd.DataFrame(scaler_sk.transform(data_pd_toscale),
                                  columns=data_pd_toscale.columns.values)
    data_pd_scaled[non_scale_cols] = data_pd[non_scale_cols].copy()
    return data_pd_scaled, scaler_sk


def rmse(y_true, ypred):
    return mean_squared_error(y_true, ypred, squared=False)

## Set (i)identifier and which columns are (ii)numerical and (iii)categorical

In [4]:
column_types = json.load(open('./#datasets/column_types.json', 'r'))

identifier = column_types['identifier']
cat_cols = column_types['categorical']

target = column_types['target']

random_state = 1

unclosed file <_io.TextIOWrapper name='../data/column_types_im.json' mode='r' encoding='UTF-8'>


# Dataset

In [5]:
data_raw = pd.read_pickle('./#datasets/data.pkl')

**Fill NA - 0 for numerical and 'NA' for categorical**

In [6]:
# categorical
data_raw[cat_cols] = data_raw[cat_cols].fillna('NA')
data_raw[cat_cols] = data_raw[cat_cols].astype(str)
# non-categorical
non_cat_cols = data_raw.drop(columns=cat_cols).columns.tolist()
data_raw[non_cat_cols] = data_raw[non_cat_cols].fillna(0)

**Constant columns**

In [1]:
const_cols = data_raw.columns[data_raw.nunique() == 1].values.tolist()
data_raw.drop(columns=const_cols, inplace=True)
print('Dropped constant columns:\n{}'.format(const_cols))

**Train, valid, test dataset split**

In [9]:
data_train, data_valid = train_test_split(data_raw, test_size=0.2, random_state=1)
data_valid, data_test = train_test_split(data_valid, test_size=0.5, random_state=1)

data_train.reset_index(inplace=True, drop=True)
data_valid.reset_index(inplace=True, drop=True)
data_test.reset_index(inplace=True, drop=True)

**Data scale**

In [10]:
cat_cols_filtered = intsec(cat_cols, data_raw.columns.values)
data_train_scaled, Scaler = scale(data_train, cat_cols_filtered  + [target], scaler_sk='Standard')
data_valid_scaled, Scaler = scale(data_train, cat_cols_filtered  + [target], scaler_sk=Scaler)
data_test_scaled, Scaler = scale(data_train, cat_cols_filtered  + [target], scaler_sk=Scaler)

Ptran = PowerTransformer(standardize = False)
Ptran.fit(data_train_scaled.loc[data_train_scaled[target] > 0, target].values.reshape(-1, 1))

data_train_scaled.loc[data_train_scaled[target] > 0, target] = Ptran.transform(data_train_scaled.loc[data_train_scaled[target] > 0, target].values.reshape(-1, 1)).flatten()
data_valid_scaled.loc[data_valid_scaled[target] > 0, target] = Ptran.transform(data_valid_scaled.loc[data_valid_scaled[target] > 0, target].values.reshape(-1, 1)).flatten()
# no need to power-transform test target
#data_test_scaled.loc[data_train_scaled[target] > 0, target] = Ptran.transform(data_train_scaled. loc[data_train_scaled[target] > 0,target].values.reshape(-1, 1)).flatten()

**Label encode**

In [11]:
label_encoder = LabelEncoder(cat_cols_filtered)
label_encoder.fit(data_raw[cat_cols_filtered])

data_train_scaled[cat_cols_filtered] = label_encoder.transform(data_train_scaled[cat_cols_filtered])
data_valid_scaled[cat_cols_filtered] = label_encoder.transform(data_valid_scaled[cat_cols_filtered])
data_test_scaled[cat_cols_filtered] = label_encoder.transform(data_test_scaled[cat_cols_filtered])

# LightGBM

* based on 
  * [Putting ML in production II: logging and monitoring](https://towardsdatascience.com/putting-ml-in-production-ii-logging-and-monitoring-algorithms-91f174044e4e)
  * [10.1 Regression with GBMs: preparing the data](https://github.com/jrzaurin/RecoTour/blob/master/Ponpare/Chapter10_GBM_reg_Recommendations.ipynb)
  * [ml-pipeline](https://github.com/jrzaurin/ml-pipeline)
  * [tabulardl-benchmark](https://github.com/jrzaurin/tabulardl-benchmark/blob/master/run_experiments/adult/adult_lightgbm.py)

In [1]:
OPTIMIZE_WITH = "hyperopt"
RESULTS_DIR = "./"
MODELS_DIR = "./"
suffix = str(datetime.now()).replace(" ", "_").split(".")[:-1][0]
results_filename = "_".join(["lightgbm_results", suffix]) + ".pkl"
models_filename = "_".join(["lightgbm_model", suffix]) + ".pkl"

lgbtrain = lgb.Dataset(data_train_scaled.drop(columns=[target]), data_train_scaled[target], categorical_feature=cat_cols_filtered, free_raw_data=False)
lgbvalid = lgb.Dataset(data_valid_scaled.drop(columns=[target]), data_valid_scaled[target], reference=lgbtrain, free_raw_data=False)

# https://lightgbm.readthedocs.io/en/latest/Parameters.html?highlight=rmse#core-parameters
# https://neptune.ai/blog/lightgbm-parameters-guide
# https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.lightgbm.LightGBMTunerCV.html#
if OPTIMIZE_WITH == "optuna":
    optimizer = lgbmo.LGBOptimizerOptuna(objective='regression', metric='rmse', verbose=True)
elif OPTIMIZE_WITH == "hyperopt":
    optimizer = lgbmo.LGBOptimizerHyperopt(objective='regression', metric=rmse, verbose=True)

optimizer.optimize(lgbtrain, lgbvalid)

# Final TRAIN/TEST
ftrain = pd.concat([data_train_scaled, data_valid_scaled]).reset_index(drop=True)
flgbtrain = lgb.Dataset(ftrain.drop(columns=[target]), ftrain[target], categorical_feature=cat_cols_filtered, free_raw_data=False)
lgbtest = lgb.Dataset(data_test_scaled.drop(columns=[target]), data_test_scaled[target], categorical_feature=cat_cols_filtered, reference=flgbtrain, free_raw_data=False)

params = copy(optimizer.best)
params["n_estimators"] = 1000


start = time()
model = lgb.train(params, flgbtrain, valid_sets=[lgbtest], early_stopping_rounds=50, verbose_eval=True)
runtime = time() - start

data_pred = (model.predict(lgbtest.data))
data_pred = Ptran.inverse_transform(data_pred.reshape(-1, 1))
rmse_test_score = mean_squared_error(data_test_scaled[target], data_pred, squared=False)

# SAVE
# results_d = {}
# results_d["best_params"] = optimizer.best
# results_d["runtime"] = runtime
# results_d["rmse"] = rmse_test_score_reg02

# with open(RESULTS_DIR + results_filename, "wb") as f:
#     pickle.dump(results_d, f)

# with open(MODELS_DIR + model_filename, "wb") as f:
#     pickle.dump(model, f)

In [51]:
rmse_test_score

1.2356612039261006