In [None]:
from runner.data_utils.base_datamodule import DataModule

from sklearn.datasets import fetch_openml
import os
import pickle
from types import SimpleNamespace
from typing import Tuple, List
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

class DiabetesDataModule(DataModule):
    """Data module that serves the OpenML dataset with data id 37 ("diabetes").

    The parent ``DataModule`` is initialised with the name ``'diabetes'``.
    """

    def __init__(self) -> None:
        super().__init__('diabetes')

    def load_data(self) -> Tuple[pd.DataFrame, pd.Series]:
        """Fetch the dataset and integer-encode its target.

        Returns:
            A ``(data, label)`` pair: the feature frame as delivered by
            ``fetch_openml`` and a ``pd.Series`` holding the target after
            ``LabelEncoder.fit_transform``.
        """
        # as_frame=True pins the return type to pandas objects; prepare_data
        # reads ``data.columns``, which a plain ndarray would not provide.
        # Downloads are cached under ./data_cache (relative to the working dir).
        diabetes = fetch_openml(data_id=37, data_home='./data_cache', as_frame=True)

        data = diabetes.data

        # Encode the target classes as consecutive integers.
        le = LabelEncoder()
        label = pd.Series(le.fit_transform(diabetes.target))

        return data, label

    def prepare_data(self) -> Tuple[pd.DataFrame, np.ndarray, List[str], List[str]]:
        """Return the features, labels and column groupings.

        Returns:
            ``(data, labels, numeric_cols, categorical_cols)`` where ``labels``
            is the NumPy array behind the encoded label series,
            ``numeric_cols`` holds every column name converted to ``str`` and
            ``categorical_cols`` is always empty (all columns are treated as
            numeric).
        """
        data, label = self.load_data()

        categorical_cols: List[str] = []
        numeric_cols = [str(col) for col in data.columns]

        return data, label.values, numeric_cols, categorical_cols

In [None]:
# Fraction of the samples held out as the test split; prepare_data() passes it
# to train_test_split as ``test_size``.
test_size = 0.2
# Seed used as ``random_state`` for the train/test split and copied into
# config.experiment.random_seed by prepare_config().
random_seed = 42

In [None]:
from sklearn.model_selection import train_test_split
from runner.data_utils.kamir  import KamirDataModule
def prepare_data() -> Tuple[pd.DataFrame, np.ndarray, pd.DataFrame, np.ndarray, List[str], List[str]]:
    """Load the diabetes data and split it into stratified train/test parts.

    Reads the notebook-level ``test_size`` and ``random_seed`` settings.

    Returns:
        ``(X_train, y_train, X_test, y_test, continuous_cols, categorical_cols)``
        where the ``X_*`` values are row subsets of the feature frame, the
        ``y_*`` values are the matching label arrays, and the two column lists
        are passed through unchanged from ``DiabetesDataModule.prepare_data``.
    """
    datamodule = DiabetesDataModule()
    # Alternative data source, kept for reference:
    # import yaml
    # with open('data_config/6M_mortality.yaml', 'rb') as f:
    #     config = yaml.load(f, Loader=yaml.FullLoader)
    # datamodule = KamirDataModule("binary", SimpleNamespace(**config))
    data, label, continuous_cols, categorical_cols = datamodule.prepare_data()

    # Split row positions (stratified on the label) and use them to index both
    # the frame and the label array, so the two stay aligned.
    train_idx, test_idx = train_test_split(
        np.arange(len(label)),
        test_size=test_size,
        random_state=random_seed,
        stratify=label,
    )

    X_train, y_train = data.iloc[train_idx], label[train_idx]
    X_test, y_test = data.iloc[test_idx], label[test_idx]

    return X_train, y_train, X_test, y_test, continuous_cols, categorical_cols

In [None]:
import importlib

def prepare_config(target_config: str) -> SimpleNamespace:
    """Fetch a named config from ``runner.config`` and tailor it to this notebook.

    Args:
        target_config: Name of the attribute to read from the
            ``runner.config`` module.

    Returns:
        The very object found under that name, after the attributes below
        have been assigned on it in place (no copy is made).
    """
    config = getattr(importlib.import_module('runner.config'), target_config)

    # Two output classes for the binary task.
    config.model.out_dim = 2

    # Short-hands for the nested namespaces that receive most settings.
    experiment = config.experiment
    optuna_cfg = experiment.optuna

    experiment.metric = "accuracy_score"
    experiment.metric_params = []
    experiment.data_config = "diabetes"
    optuna_cfg.direction = 'maximize'
    experiment.random_seed = random_seed
    experiment.task = "binary"

    optuna_cfg.n_trials = 1

    experiment.calibrator = "HistogramBinning"

    experiment.KFold = 1
    experiment.early_stopping_patience = 1

    dice_cfg = config.dice
    dice_cfg.backend = "sklearn"
    dice_cfg.desired_class = 0

    experiment.fast_dev_run = True
    return config

In [None]:
# Build the train/test splits and column lists, then fetch and customise the
# config stored under the name 'xgb_config' in runner.config.
data, label, X_test, y_test, continuous_cols, categorical_cols = prepare_data()
# continuous_cols = list(map(str, data.columns))
# data.columns = continuous_cols
config = prepare_config('xgb_config')

In [None]:
from runner.runner import Runner
def prepare_runner(config: SimpleNamespace, X: pd.DataFrame, y: np.array, continuous_cols: List[str], categorical_cols: List[str]) -> Runner:
    """Construct a ``Runner`` for the model class named in the config.

    Args:
        config: Configuration object; ``config.model.model_class`` names an
            attribute of the ``runner.models`` module.
        X: Feature frame handed to the runner.
        y: Labels handed to the runner.
        continuous_cols: Names of the continuous feature columns.
        categorical_cols: Names of the categorical feature columns.

    Returns:
        The newly created ``Runner``.
    """
    models_module = importlib.import_module('runner.models')
    return Runner(
        config=config,
        model_class=getattr(models_module, config.model.model_class),
        X=X,
        y=y,
        continuous_cols=continuous_cols,
        categorical_cols=categorical_cols,
    )

In [None]:
from runner.models import BaseModel
from runner.misc.eval_metric import EvalMetric
from sklearn.metrics import f1_score, recall_score, accuracy_score, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score, recall_score, average_precision_score
from typing import Type

class DiabetesEvalMetric(EvalMetric):
    """Prints a set of binary-classification metrics for a model on test data."""

    def eval(self, model: Type[BaseModel], X_test: pd.DataFrame, y_test: np.array):
        """Score ``model`` on ``(X_test, y_test)`` and print the results.

        Args:
            model: Object exposing ``predict_proba(X)``; the result is indexed
                as a 2-D array whose column 1 is the positive-class score.
            X_test: Test features passed to ``predict_proba``.
            y_test: True labels, encoded as 0/1.

        Returns:
            None. The metrics are only printed.
        """
        preds_proba = model.predict_proba(X_test)
        preds = preds_proba.argmax(1)

        f1 = f1_score(y_test, preds)
        roc = roc_auc_score(y_test, preds_proba[:, 1])
        # Specificity is the recall of the negative class: flip both vectors.
        specificity = recall_score(np.logical_not(y_test), np.logical_not(preds))
        sensitivity = recall_score(y_test, preds)
        accuracy = accuracy_score(y_test, preds)
        pr_auc = average_precision_score(y_test, preds_proba[:, 1])

        # labels=[0, 1] forces a 2x2 matrix even when only one class occurs in
        # y_test/preds; otherwise the 4-way unpacking below would fail.
        tn, fp, fn, tp = confusion_matrix(y_test, preds, labels=[0, 1]).ravel()

        def _safe_ratio(numerator, denominator):
            # Report NaN instead of dividing by zero when a class is never predicted.
            return numerator / denominator if denominator else float("nan")

        ppv = _safe_ratio(tp, tp + fp)
        npv = _safe_ratio(tn, tn + fn)

        print("F1 Score: %.4f" % f1)
        print("ROC AUC Score: %.4f" % roc)
        print("Specificity Score: %.4f" % specificity)
        print("Sensitivity Score: %.4f" % sensitivity)
        print("Accuracy Score: %.4f" % accuracy)
        print("Precision Recall AUC Score: %.4f" % pr_auc)
        print("PPV Score: %.4f" % ppv)
        print("NPV Score: %.4f" % npv)

In [None]:
# Build the runner on the training split returned by prepare_data().
runner = prepare_runner(config, data, label, continuous_cols, categorical_cols)

# NOTE(review): presumably fits the configured model on the data given above —
# confirm against Runner.train.
runner.train()

# NOTE(review): presumably sets up the calibrator named in
# config.experiment.calibrator — confirm against Runner.init_calibrator.
runner.init_calibrator()
# Evaluate on the held-out split using the metric printer defined above.
runner.test(X_test, y_test, DiabetesEvalMetric())

In [None]:
# Copy of the test features that is handed to runner.dice, so X_test itself
# is not the object passed in.
dice_d = X_test.copy()

In [None]:
# NOTE(review): presumably generates DiCE counterfactual explanations for the
# rows of dice_d using the config.dice settings — confirm against Runner.dice.
dice = runner.dice(dice_d)

In [None]:
# Show the result returned by runner.dice in tabular form.
dice.visualize_as_dataframe()

In [None]:
import lime
import lime.lime_tabular
# LIME explainer built from the training features. random_state is pinned to
# the notebook seed so the sampled perturbations (and hence the explanation)
# are reproducible; feature names are passed as a plain list.
explainer = lime.lime_tabular.LimeTabularExplainer(
    data.values,
    feature_names=list(data.columns),
    class_names=["0", "1"],
    verbose=True,
    mode='classification',
    discretize_continuous=True,
    random_state=random_seed,
)

In [None]:
# Explain the first test row. explain_instance expects a 1-D NumPy array, so
# pass the row's values rather than the label-indexed pandas Series (LIME
# indexes the row by integer position).
exp = explainer.explain_instance(X_test.iloc[0].values, runner.predict_proba, num_features=10)
exp.show_in_notebook(show_table=True)

In [None]:
# Write the LIME explanation to temp.html in the current working directory.
exp.save_to_file('temp.html')

In [None]:
# NOTE(review): runner.shap presumably returns a SHAP explainer for the
# trained model — confirm against Runner.shap. The returned object is then
# called on the test features to obtain the values plotted below.
shap_explainer = runner.shap(X_test)
shap_values = shap_explainer(X_test)

In [None]:
import shap
# Waterfall plot for the first entry of shap_values.
shap.plots.waterfall(shap_values[0])

In [None]:
# Beeswarm plot over all of shap_values.
shap.plots.beeswarm(shap_values)