In [None]:
import os
import pickle
import sys
import warnings

import numpy as np
import optuna
import optuna.integration.lightgbm as lgb
import pandas as pd
import xarray as xr
from IPython.display import clear_output
from optuna.integration import LightGBMPruningCallback
from scipy.misc import derivative
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             log_loss, roc_auc_score, roc_curve)
from sklearn.model_selection import StratifiedKFold, train_test_split

warnings.simplefilter(action='ignore', category=FutureWarning)
sys.path.append(os.path.join(os.getcwd(), ".."))
import shared_utils.utils_data as utils_data
from Metrics.Wrapper_main_function import (compute_metrics,
                                           save_metrics_to_xarray)

# Absolute location of the formatted dataset inside the development container
# (presumably Parquet files, judging by the directory name — TODO confirm).
path_formatted_glasgow = "/workspaces/maitrise/data/20221006_physio_quality/set-a/dataParquet"
# URL form of the same directory; it is handed to utils_data.format_data_to_xarray
# further down when the cached ECG dataset is missing.
# NOTE(review): the path already starts with "/", so this evaluates to
# "file:////workspaces/..." (four slashes) — confirm the loader expects that.
path_petastorm = f"file:///{path_formatted_glasgow}"

In [None]:
def sigmoid(x):
    """
    Logistic function 1 / (1 + exp(-x)).

    Parameters:
    -----------
    x: float or numpy.ndarray
        raw score(s); arrays are processed element-wise

    Returns:
    --------
    float or numpy.ndarray with values between 0 and 1
    """
    return 1./(1. +  np.exp(-x))

def focal_loss_lgb(y_pred, dtrain, alpha, gamma):
    """
    Focal Loss objective for lightgbm (gradient and hessian w.r.t. the raw score).

    The derivatives are evaluated in closed form instead of by finite
    differences: scipy.misc.derivative is deprecated (removed from recent
    SciPy releases) and a second-order central difference with dx=1e-6 is
    dominated by floating-point round-off.

    Parameters:
    -----------
    y_pred: numpy.ndarray
        array with the raw (pre-sigmoid) predictions
    dtrain: lightgbm.Dataset
        dataset providing the labels through ``get_label()``; labels are
        assumed to be binary (0 or 1)
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf

    Returns:
    --------
    grad, hess: numpy.ndarray
        first and second derivative of the focal loss for every sample
    """
    a, g = alpha, gamma
    t = np.asarray(dtrain.get_label(), dtype=float)
    x = np.asarray(y_pred, dtype=float)

    # Signed label: +1 for the positive class, -1 for the negative class.
    y = 2.0 * t - 1.0
    # Class weight: alpha for positives, (1 - alpha) for negatives.
    a_t = a * t + (1.0 - a) * (1.0 - t)

    # p_t = probability assigned to the true class = sigmoid(y * x).
    # exp(-logaddexp(0, -z)) is an overflow-free way of writing sigmoid(z).
    p_t = np.exp(-np.logaddexp(0.0, -y * x))
    # Keep p_t strictly inside (0, 1) so log(p_t) and 0 * log(0) never yield inf/NaN.
    eps = np.finfo(float).eps
    p_t = np.clip(p_t, eps, 1.0 - eps)

    log_pt = np.log(p_t)
    q = 1.0 - p_t

    # FL = -a_t * q**g * log(p_t)  and  dp_t/dx = y * p_t * q
    v = g * p_t * log_pt + p_t - 1.0
    grad = a_t * y * q ** g * v
    hess = a_t * p_t * q ** g * (q * (g * log_pt + g + 1.0) - g * v)
    return grad, hess

def lgb_focal_f1_score(preds, lgbDataset):
    """
    F1 evaluation metric for models trained with a custom loss.

    With a custom loss the booster returns raw scores, so they are passed
    through a sigmoid to obtain probabilities and then thresholded at 0.5.

    Parameters:
    -----------
    preds: numpy.ndarray
        array with the raw predictions
    lgbDataset: lightgbm.Dataset
        dataset providing the true labels through ``get_label()``

    Returns:
    --------
    tuple (eval_name, eval_result, is_higher_better)
    """
    probabilities = sigmoid(preds)
    hard_labels = (probabilities > 0.5).astype(int)
    targets = lgbDataset.get_label()
    return 'f1', f1_score(targets, hard_labels), True

def focal_loss_lgb_eval_error(y_pred, dtrain, alpha, gamma):
    """
    Mean focal loss, usable as a lightgbm evaluation function.

    Probabilities are clipped away from exactly 0 and 1: without clipping a
    confidently *correct* sample evaluates ``0 * log(0)`` which is NaN and
    poisons the mean used for early stopping.

    Parameters:
    -----------
    y_pred: numpy.ndarray
        array with the raw (pre-sigmoid) predictions
    dtrain: lightgbm.Dataset
        dataset providing the labels through ``get_label()``
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf

    Returns:
    --------
    tuple (eval_name, eval_result, is_higher_better)
    """
    a, g = alpha, gamma
    y_true = np.asarray(dtrain.get_label(), dtype=float)
    raw = np.asarray(y_pred, dtype=float)

    # Overflow-free sigmoid, then kept strictly inside (0, 1) for the logs below.
    eps = np.finfo(float).eps
    p = np.clip(np.exp(-np.logaddexp(0.0, -raw)), eps, 1.0 - eps)

    class_weight = a * y_true + (1 - a) * (1 - y_true)
    # (1 - p_t) ** gamma down-weights samples that are already well classified.
    modulating = (1 - (y_true * p + (1 - y_true) * (1 - p))) ** g
    log_likelihood = y_true * np.log(p) + (1 - y_true) * np.log(1 - p)
    loss = -class_weight * modulating * log_likelihood
    # (eval_name, eval_result, is_higher_better)
    return 'focal_loss', np.mean(loss), False

In [None]:
# Built-in metrics tried before switching to the custom focal loss:
# metric_name = "binary_logloss"
# metric_name = 'auc'


def focal_loss(x, y):
    """Training objective: focal loss with alpha=0.25 and gamma=2."""
    return focal_loss_lgb(x, y, alpha=0.25, gamma=2)


def focal_loss_error(y_pred, dtrain):
    """Evaluation function: mean focal loss with alpha=0.25 and gamma=2."""
    return focal_loss_lgb_eval_error(y_pred, dtrain, alpha=0.25, gamma=2)


# Name reported by focal_loss_error; also used to look up the CV history.
metric_name = "focal_loss"

# Parameters shared by every trial and by the final model.
# "is_unbalance": True was tried and is currently disabled.
param_fixed = dict(metric=metric_name, verbosity=-1)
        
def objective(trial, X, y):
    """
    Optuna objective: cross-validated focal loss of a LightGBM model.

    Parameters:
    -----------
    trial: optuna.trial.Trial
        trial used to sample the hyper-parameters
    X: array-like
        feature matrix
    y: array-like
        labels

    Returns:
    --------
    float
        last recorded mean validation score of ``metric_name`` across folds
    """
    d_train = lgb.Dataset(X, label=y)
    param_grid = {
        # "n_estimators": trial.suggest_categorical("n_estimators", [10,50,100,500, 1000]),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and samples from the same log-uniform range.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5, log=True),
        # "num_leaves": trial.suggest_int("num_leaves", 20, 1000, step=20),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        # "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        # "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq > 0;
        # bagging_freq is not set anywhere here, so this value is presumably
        # ignored — TODO confirm and add bagging_freq if bagging is wanted.
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95),
    }
    param = param_fixed | param_grid

    lcv = lgb.cv(
        param,
        d_train,
        num_boost_round=300,
        callbacks=[
            lgb.early_stopping(30),
            # lgb.log_evaluation(0)
        ],
        fobj=focal_loss,
        feval=focal_loss_error,
        # verbose_eval=False
    )
    return lcv[f"{metric_name}-mean"][-1]

In [None]:
save_path = "/workspaces/maitrise/results"
name_method = ["Corr_interlead","Corr_intralead","wPMF","SNRECG","HR","Kurtosis","Flatline","TSD"]

# exist_ok=True replaces the separate existence check, which could race with
# another process creating the directory in between.
os.makedirs(save_path, exist_ok=True)

# Cached artefacts looked up in save_path.
metrics_file = os.path.join(save_path, "quality_metrics.nc")
ecg_file = os.path.join(save_path, "ecg_data.nc")

if os.path.exists(metrics_file):
    # Quality metrics were already computed: reuse them.
    ds_metrics = xr.load_dataset(metrics_file)
else:
    print("Computing metrics")
    if os.path.exists(ecg_file):
        ds_data = xr.load_dataset(ecg_file)
    else:
        # No cached ECG dataset either: build it from the raw data location.
        ds_data = utils_data.format_data_to_xarray(path_petastorm, save_path)

    ds_metrics = save_metrics_to_xarray(ds_data, name_method, save_path, verbose = True)

In [None]:
# Keep only labelled recordings; ids left with missing values are dropped.
ds_filtered = ds_metrics.where(ds_metrics.data_quality != "unlabeled").dropna(dim = "id")

np_metrics = ds_filtered.quality_metrics.values
metrics_names = ds_filtered.metric_name.values.tolist()

# Encode labels as acceptable -> 0, unacceptable -> 1.
# The encoding is built from boolean masks instead of assigning into the array
# returned by .values: that array may be shared with ds_filtered, and the
# in-place version relied on string-to-int casting.
quality_raw = np.asarray(ds_filtered.data_quality.values)
is_acceptable = quality_raw == "acceptable"
is_unacceptable = quality_raw == "unacceptable"
if not np.all(is_acceptable | is_unacceptable):
    raise ValueError(
        "data_quality contains labels other than 'acceptable' / 'unacceptable'"
    )
np_label = is_unacceptable.astype(int)

In [None]:
# Collapse axis 1 of the metrics array (presumably the lead axis — TODO confirm)
# into two summary statistics per metric: its minimum and its mean.
per_metric_min = np_metrics.min(axis=1)
per_metric_mean = np_metrics.mean(axis=1)
X = np.concatenate((per_metric_min, per_metric_mean), axis=-1)

# Column names follow the concatenation order: all "_min" columns, then all "_mean".
metric_name_merged = [
    f"{name}_{stat}" for stat in ("min", "mean") for name in metrics_names
]
df_X = pd.DataFrame(X, columns=metric_name_merged)

y = np_label
df_y = pd.DataFrame(np_label, columns=["y"])

In [None]:
# Hold out 20 % of the samples for the final evaluation.
train_x, test_x, train_y, test_y = train_test_split(df_X, y, test_size=0.2, random_state=1234)

# Seed the sampler (TPE is Optuna's default) so that re-running the notebook
# explores the same sequence of hyper-parameters.
study = optuna.create_study(
    direction="minimize",
    study_name="LGBM Classifier",
    sampler=optuna.samplers.TPESampler(seed=1234),
)
func = lambda trial: objective(trial, train_x, train_y)
# optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(func, n_trials=100)

# Drop the per-trial log from the cell output.
clear_output()

In [None]:
# Report the metric that was actually optimised (the message used to say
# "binary_logloss" although the study minimises the focal loss).
print(f"\tBest value {metric_name}: {study.best_value:.5f}")
print("\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

In [None]:
import lightgbm as lgbm

# Split the training data once more to get a validation set for early stopping.
train2_x, val_x, train2_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=1234)
d_train = lgb.Dataset(train2_x, label=train2_y)
d_val = lgb.Dataset(val_x, label=val_y)
param = param_fixed | study.best_params

# num_boost_round matches the cap used by lgb.cv during tuning (300); leaving
# it unset falls back to LightGBM's default of 100 rounds, so the final model
# could stop earlier than the configuration that was scored by the study.
gbm = lgbm.train(
    param,
    d_train,
    num_boost_round=300,
    valid_sets=d_val,
    callbacks=[lgb.early_stopping(30), lgb.log_evaluation(0)],
    fobj=focal_loss,
    feval=focal_loss_error,
)

# np.save(os.path.join(path_results,'test_set_x'), test_x.values)
# np.save(os.path.join(path_results,'test_set_y'), test_y)

# With a custom objective the booster returns raw scores: map them to
# probabilities before thresholding.
raw_scores = gbm.predict(test_x)
preds = sigmoid(raw_scores)
pred_labels = (preds > 0.5).astype('int')
score = classification_report(test_y, pred_labels)

clear_output()

In [None]:
# Text report built by classification_report from test_y and the thresholded predictions.
print(score)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# labels=[0, 1] pins the row/column order (0 = acceptable, 1 = unacceptable) so
# the display labels stay aligned even if one class is absent from the test
# split or from the predictions.
cm = confusion_matrix(test_y, pred_labels, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels = ["acceptable", "unacceptable"])
disp.plot()
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve,auc

# Area under the ROC curve, computed once and reused for the legend.
roc_auc = roc_auc_score(test_y, preds)
print(roc_auc)

fpr, tpr, _ = roc_curve(test_y, preds)
prec,rec,_ = precision_recall_curve(test_y,preds)

# --- ROC curve ---
plt.clf()
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc = "best")
plt.grid()
plt.title('ROC curve')
plt.show()

# --- Precision-recall curve ---
pr_auc = auc(rec, prec)
plt.figure()
plt.plot(rec, prec, label=f"AUC = {pr_auc:.2f}")
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid()
plt.legend(loc = "best")
plt.title('PR curve')
plt.show()

In [None]:
# save model with pickle
# with open(os.path.join(path_results,'lgb_classifier.pkl'), 'wb') as fout:
#     pickle.dump(gbm, fout)