# Imports
This notebook uses the dataset preprocessed by the following [notebook](12_PowerConverter_dataset_preprocessing.ipynb).

**notes**
* CPU monitoring in terminal:  
```bash
top
```
* GPU monitoring in terminal:  
```bash
pip install gpustat
watch -c gpustat -cp --color
```

In [1]:
import sys
import pandas as pd

# Put the parent directory on the module search path (inserted at position 1)
# unless it is already listed -- presumably so the project's `src` package
# can be imported from this notebook.
module_path = ".."
if sys.path.count(module_path) == 0:
    sys.path.insert(1, module_path)

# Let pandas show wide/long frames in the notebook without truncating them.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 300

In [2]:
import re
import tracemalloc
from copy import copy
from datetime import datetime, timedelta
from time import time
from typing import Union

import dill
import lightgbm as lgbm
import lime
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from eli5 import explain_prediction_df, explain_weights, explain_weights_df
from eli5.sklearn import PermutationImportance
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from lime.lime_tabular import LimeTabularExplainer
from pytorch_widedeep import Tab2Vec
from pytorch_widedeep.utils import LabelEncoder
from sklearn.metrics import classification_report, log_loss, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import PowerTransformer

import src
from src import common

# Start tracing Python memory allocations (tracemalloc is imported at the top
# of this cell).
tracemalloc.start()

# NOTE(review): duplicate import -- tracemalloc is already imported at the top
# of this cell.
import tracemalloc

# Ray / Ray Tune pieces used by the hyperparameter search further down.
import ray
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.integration.lightgbm import TuneReportCheckpointCallback
from ray.tune.integration.wandb import WandbLogger
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.schedulers import AsyncHyperBandScheduler

# NOTE(review): second tracemalloc.start() in this cell; tracing was already
# started a few lines above, so this call looks redundant.
tracemalloc.start()

# temporarily remove deprecation warnings
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

OSError: dlopen(/Users/abeattie/miniconda/envs/FIREMAN-project/lib/python3.7/site-packages/lightgbm/lib_lightgbm.so, 0x0006): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Users/abeattie/miniconda/envs/FIREMAN-project/lib/python3.7/site-packages/lightgbm/lib_lightgbm.so
  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file)

# Dataset

**identifiers**

In [None]:
# --- Experiment switches ---------------------------------------------------
TASK = "multiclass"  # (or "binary")
EMBEDDING = False  # when True, the categorical-feature transformation cell is used
RANDOM_STATE = 1  # seed passed to both train_test_split calls

# --- Split sizes -------------------------------------------------------------
TEST_SIZE_TRAIN = 0.2  # share of all rows held out from the training split
TEST_SIZE_VALID = 0.5  # share of that hold-out that becomes the test split

# --- Column roles, read from the dataset's JSON description ------------------
column_types = common.json_load("#datasets/Colab_PowerConverter/column_types.json")
target, measurement_label = (
    column_types[key] for key in ("target", "measurement_label")
)

In [None]:
# Load the pickled dataset.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle("#datasets/Colab_PowerConverter/dataset.pkl")

In [None]:
# Exclude the "Single-Phase_Sensor_Fault" measurement (it apparently did not
# contain a fault?) and renumber the remaining rows from 0.
df = df.loc[df[measurement_label] != "Single-Phase_Sensor_Fault"].reset_index(drop=True)

In [None]:
# Map every measurement label to its own class id (1, 2, ... in order of first
# appearance), then overwrite the target of that measurement's rows that are
# currently flagged 1 with the class id. Rows with any other target value are
# left untouched.
fault_dict = {
    label: code
    for code, label in enumerate(df[measurement_label].unique(), start=1)
}
for label, code in fault_dict.items():
    flagged_rows = (df[measurement_label] == label) & (df[target] == 1)
    df.loc[flagged_rows, target] = code

In [None]:
# Class imbalance check: number of rows per distinct value of the target column
df[target].value_counts()

In [None]:
# Display the measurement-label -> class-id mapping
fault_dict

# Preprocessing

In [None]:
# Remove the measurement-label column before splitting/scaling (presumably it
# must not be used as a model feature -- confirm).
# Reassigning with errors="ignore" instead of an in-place drop keeps this cell
# re-runnable: a second execution no longer raises KeyError because the column
# is already gone.
df = df.drop(columns=[measurement_label], errors="ignore")

In [None]:
# First split: hold out TEST_SIZE_TRAIN of all rows, stratified on the target.
df_train, df_valid = train_test_split(
    df,
    test_size=TEST_SIZE_TRAIN,
    stratify=df[target],
    random_state=RANDOM_STATE,
)
# Second split: divide that hold-out into the validation and test parts,
# again stratified on the target.
df_valid, df_test = train_test_split(
    df_valid,
    test_size=TEST_SIZE_VALID,
    stratify=df_valid[target],
    random_state=RANDOM_STATE,
)

# Renumber each split from 0 so the shuffled original index is discarded.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Scale the three splits with the project helper common.scale.
# NOTE(review): presumably [target] lists the columns to leave unscaled and
# 'Standard' selects a standard scaler that is fitted on the training split --
# confirm against src/common.
df_train_scaled, Scaler = common.scale(df_train, [target], scaler_sk='Standard')
# The object returned by the first call is passed back in for the validation
# and test splits (presumably so they reuse the scaling fitted on train --
# TODO confirm). `Scaler` is rebound by each call.
df_valid_scaled, Scaler = common.scale(df_valid, [target], scaler_sk=Scaler)
df_test_scaled, Scaler = common.scale(df_test, [target], scaler_sk=Scaler)

## Categorical features transformation

In [None]:
# Produce the *_scaled_enc frames consumed by the LightGBM cells.
# With EMBEDDING disabled they are plain copies of the scaled splits; otherwise
# the categorical columns are either label encoded or replaced by pretrained
# entity embeddings.
if EMBEDDING:
    # NOTE(review): `cat_cols` (the categorical column names) is not defined
    # anywhere in this notebook; it must be set before enabling EMBEDDING.
    CAT_FEATURE_TRANSFORMATION = "Entity Embedding"
    if CAT_FEATURE_TRANSFORMATION == "Label Encoding":
        # Fit the encoder on the training split only, then apply the same
        # mapping to every split.
        label_encoder = LabelEncoder(cat_cols)
        label_encoder.fit(df_train_scaled[cat_cols])

        df_train_scaled_enc = df_train_scaled.copy()
        df_valid_scaled_enc = df_valid_scaled.copy()
        df_test_scaled_enc = df_test_scaled.copy()

        df_train_scaled_enc[cat_cols] = label_encoder.transform(
            df_train_scaled_enc[cat_cols]
        )
        df_valid_scaled_enc[cat_cols] = label_encoder.transform(
            df_valid_scaled_enc[cat_cols]
        )
        df_test_scaled_enc[cat_cols] = label_encoder.transform(
            df_test_scaled_enc[cat_cols]
        )

    elif CAT_FEATURE_TRANSFORMATION == "Entity Embedding":
        # using pretrained embedding from pytorch-widedeep model and its tab_preprocessor
        # NOTE(review): dill.load can execute arbitrary code while unpickling;
        # only load these files if they come from a trusted source.
        with open("dl_entity_emb_model_" + TASK + ".dill", "rb") as f:
            model = dill.load(f)
        with open("dl_entity_emb_model_tab_preprocessor_" + TASK + ".dill", "rb") as f:
            tab_preprocessor = dill.load(f)

        t2v = Tab2Vec(model=model, tab_preprocessor=tab_preprocessor, return_dataframe=True)
        # The target column name lives in `target` (read from column_types).
        df_train_scaled_enc, df_train_y = t2v.transform(
            df_train_scaled, target_col=target
        )
        df_valid_scaled_enc, df_valid_y = t2v.transform(
            df_valid_scaled, target_col=target
        )
        df_test_scaled_enc, df_test_y = t2v.transform(
            df_test_scaled, target_col=target
        )
        # Re-attach the target so the frames have the same layout as in the
        # non-embedding path.
        df_train_scaled_enc[target] = df_train_y
        df_valid_scaled_enc[target] = df_valid_y
        df_test_scaled_enc[target] = df_test_y

        # Collect the generated embedding columns, grouped per original
        # categorical column. A literal prefix test is used: a pattern such as
        # `cat_col + "*"` would only repeat the last character of the name and
        # therefore also match unrelated columns.
        cols_list = list(df_test_scaled_enc.columns)
        cat_cols_emb = []
        for cat_col in cat_cols:
            cat_cols_emb.extend(
                col for col in cols_list if str(col).startswith(cat_col)
            )
else:
    df_train_scaled_enc = df_train_scaled.copy()
    df_valid_scaled_enc = df_valid_scaled.copy()
    df_test_scaled_enc = df_test_scaled.copy()

# LightGBM

In [None]:
# df_train_scaled_enc = df_train_scaled_enc.sample(100000)
# df_valid_scaled_enc = df_valid_scaled_enc.sample(30000)

In [None]:
# Number of distinct target values; passed to LightGBM as the class count in
# the configuration cell.
# NOTE(review): this equals the required class count only if the labels are
# the consecutive integers 0..K-1 -- verify against the value counts.
NUM_CLASSES = df[target].nunique()
NUM_CLASSES

## Prepare Dataset, metric and objective functions

In [None]:
# Base LightGBM parameters shared by the baseline model and the Ray Tune search.
config = {}
if TASK in ("binary", "multiclass"):
    config["objective"] = TASK

# Metric reported to Ray Tune: LightGBM's default log-loss for the objective.
if TASK == "multiclass":
    # The class count is only valid for multiclass objectives; LightGBM rejects
    # a value other than 1 for non-multiclass training.
    config["num_classes"] = NUM_CLASSES
    ray_metric = "multi_logloss"
elif TASK == "binary":
    ray_metric = "binary_logloss"

# Columns LightGBM should treat as categorical. Only label-encoded columns
# qualify; entity embeddings and the non-embedding path are purely numeric.
if EMBEDDING and CAT_FEATURE_TRANSFORMATION == "Label Encoding":
    # NOTE(review): `cat_cols_f` is not defined anywhere in this notebook --
    # define it (or use the encoded column list) before enabling this path.
    lgb_cat_cols = cat_cols_f
else:
    lgb_cat_cols = []


def _to_lgb_dataset(frame, reference=None):
    """Wrap one split in a lightgbm Dataset.

    Features are all columns of ``frame`` except the target column; the target
    column becomes the label. Raw data is kept (``free_raw_data=False``) so it
    can be read back through ``Dataset.data`` at prediction time.
    """
    return lgbm.Dataset(
        frame.drop(columns=[target]),
        frame[target],
        categorical_feature=lgb_cat_cols,
        reference=reference,
        free_raw_data=False,
    )


# TRAIN/VALID pair used during hyperparameter search
lgbtrain = _to_lgb_dataset(df_train_scaled_enc)
lgbvalid = _to_lgb_dataset(df_valid_scaled_enc, reference=lgbtrain)

# Final TRAIN/TEST: train on train + validation rows, evaluate on test
ftrain = pd.concat([df_train_scaled_enc, df_valid_scaled_enc]).reset_index(
    drop=True
)
flgbtrain = _to_lgb_dataset(ftrain)
lgbtest = _to_lgb_dataset(df_test_scaled_enc, reference=flgbtrain)

## Train model

In [None]:
%%time
# Baseline model: train LightGBM with the untuned `config` parameters.
# NOTE(review): flgbtrain is built from the concatenated train + validation
# splits, so the metric reported on lgbvalid is computed on rows the model was
# trained on -- confirm this is intended.
model = lgbm.train(
    config,
    flgbtrain,
    valid_sets=[lgbvalid],
    valid_names=[""],
    #feval=feval,
    #fobj=fobj,
    #callbacks=[log_evaluation()],
    )

## Prediction & Evaluation

In [None]:
# Turn the raw model output for the test features into hard class labels.
if TASK == "binary":
    # round the model output to the nearest integer
    res = np.rint(model.predict(lgbtest.data))
elif TASK == "multiclass":
    # pick the column (class) with the highest score in every row
    res = np.argmax(model.predict(lgbtest.data), axis=1)

# Side-by-side table of predictions and true labels for the evaluation cell.
result = pd.DataFrame(
    {
        "predicted": res,
        "ground_truth": df_test[target].values,
    }
)

In [None]:
# classification_report takes (y_true, y_pred) in that order; passing the
# predictions first would swap precision and recall in the printed table.
print(
    "Classification report:\n{}".format(
        classification_report(result["ground_truth"], result["predicted"])
    )
)

## With Ray Tune

In [None]:
# Wall-clock start of the hyperparameter search (read again in the
# "Train best params model" cell).
start = time()

# Search space. Each entry must be the Ray Tune sampler itself: a trailing
# comma after the call would wrap it in a one-element tuple, which is not a
# valid value for the parameter.
# config["eta"] = tune.loguniform(1e-4, 1e-1)
# config["subsample"] = tune.uniform(0.5, 1.0)
config["max_depth"] = tune.randint(1, 9)  # upper bound is exclusive
# config["wandb"]["project"] = "GBM_classifier"
# config["wandb"]["api_key_file"] = "../data/wandb_api.key"
# config["wandb"]["log_config"] = True


def training_function(config, train, valid):
    """Train one LightGBM model for a single Ray Tune trial.

    Parameters
    ----------
    config : dict
        Parameters sampled by Ray Tune for this trial; a copy is passed to
        ``lgbm.train``.
    train, valid : lightgbm.Dataset
        Training and validation data, injected via ``tune.with_parameters``.

    The validation metric named by ``ray_metric`` is reported back to Ray Tune
    by ``TuneReportCheckpointCallback``; the function itself returns nothing.
    """
    lgbm_config = config.copy()
    # lgbm_config.pop("wandb")
    # NOTE(review): the validation set is registered under an empty name;
    # confirm the key the callback reports matches `ray_metric`.
    lgbm.train(
        lgbm_config,
        train,
        valid_sets=[valid],
        valid_names=[""],
        callbacks=[
            TuneReportCheckpointCallback(
                {
                    ray_metric: ray_metric,
                }
            )
        ],
    )


# Asynchronous HyperBand scheduler minimising `ray_metric` over training
# iterations.
asha_scheduler = AsyncHyperBandScheduler(
    time_attr="training_iteration",
    metric=ray_metric,
    mode="min",
    max_t=100,
    grace_period=10,
    reduction_factor=3,
    brackets=1,
)

analysis = tune.run(
    tune.with_parameters(training_function, train=lgbtrain, valid=lgbvalid),
    # resources_per_trial={"cpu": 4, "gpu": 0},
    num_samples=2,
    progress_reporter=JupyterNotebookReporter(overwrite=True),
    scheduler=asha_scheduler,
    config=config,
    # loggers=DEFAULT_LOGGERS + (WandbLogger,),
)

In [None]:
# Inspect the per-trial results collected by the Ray Tune run
analysis.trial_dataframes

### Train best params model

In [None]:
# Time elapsed since `start` was set at the top of the Ray Tune cell.
runtime = time() - start
print("Optimization time:\n{}".format(runtime))

# Work on a copy so the configuration stored inside `analysis` is not modified.
params = copy(analysis.get_best_config(ray_metric, "min"))
# The "wandb" entry only exists when the wandb settings are enabled in the
# search config; the default avoids a KeyError when it is absent.
params.pop("wandb", None)
# params["n_estimators"] = 1000

# Retrain on train + validation rows with the best parameters and monitor the
# metric on the test set.
start = time()
model = lgbm.train(
    params,
    flgbtrain,
    valid_sets=[lgbtest],
    callbacks=[lgbm.log_evaluation(show_stdv=False)],
)
runtime = time() - start
# timedelta renders the elapsed seconds as H:MM:SS.
print("Final model training time:\n{}".format(str(timedelta(seconds=runtime))))

### Tensorboard visualization

In [None]:
# NOTE(review): this import sits outside the notebook's main import cell.
from tensorboard import notebook

# List the TensorBoard instances known to this environment
notebook.list()

In [None]:
# Load the TensorBoard notebook extension and open TensorBoard on ~/ray_results
# (presumably the directory where Ray Tune wrote the trial logs -- confirm).
%load_ext tensorboard
%tensorboard --logdir ~/ray_results