In [1]:
from runner.data_utils.base_datamodule import DataModule

from sklearn.datasets import fetch_openml
import os
import pickle
from types import SimpleNamespace
from typing import Tuple, List
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

class DiabetesDataModule(DataModule):
    def __init__(self
        ) -> None:
        super().__init__('diabetes')
    
    def load_data(self) -> Tuple[pd.DataFrame, pd.Series]:
        diabetes = fetch_openml(data_id = 37, data_home='./data_cache')

        data = diabetes.data
        
        le = LabelEncoder()
        label = pd.Series(le.fit_transform(diabetes.target))

        return data, label
    
    def prepare_data(self) -> Tuple[pd.DataFrame, pd.Series, List[str], List[str]]:

        data, label = self.load_data()

        categorical_cols = []
        
        numeric_cols = list(map(str, data.columns))
            
        return data, label.values, numeric_cols, categorical_cols

In [2]:
test_size = 0.2
random_seed = 42

In [3]:
from sklearn.model_selection import train_test_split

def prepare_data() -> Tuple[pd.DataFrame, np.array, pd.DataFrame, np.array]:
    datamodule = DiabetesDataModule()
    
    data, label, continuous_cols, categorical_cols = datamodule.prepare_data()

    train_idx, test_idx, _, _ = train_test_split(np.arange(len(label)).reshape((-1, 1)), label, test_size=test_size, random_state=random_seed, stratify=label)
    train_idx, test_idx = train_idx.ravel(), test_idx.ravel()

    X_test, y_test = data.iloc[test_idx], label[test_idx]
    data, label = data.iloc[train_idx], label[train_idx]
    
    return data, label, X_test, y_test, continuous_cols, categorical_cols

In [4]:
import importlib

def prepare_config(target_config: str) -> SimpleNamespace:
    configlib = importlib.import_module('runner.config')
    config = getattr(configlib, target_config)
    
    config.model.out_dim = 2
    
    config.experiment.metric = "accuracy_score"
    config.experiment.metric_params = []
    config.experiment.data_config = "diabetes"
    config.experiment.optuna.direction = 'maximize'
    config.experiment.random_seed = random_seed
    config.experiment.task = "binary"
    
    config.experiment.optuna.n_trials = 1
    
    config.experiment.calibrator = "HistogramBinning"
    
    config.experiment.KFold = 1
    config.experiment.early_stopping_patience = 1
    config.dice.backend = "sklearn"
    config.dice.desired_class = 1
    return config

In [5]:
data, label, X_test, y_test, continuous_cols, categorical_cols = prepare_data()
continuous_cols = list(map(str, data.columns))
data.columns = continuous_cols
config = prepare_config('mlp_config')

In [6]:
from runner.runner import Runner
def prepare_runner(config: SimpleNamespace, X: pd.DataFrame, y: np.array, continuous_cols: List[str], categorical_cols: List[str]) -> Runner:
    modellib = importlib.import_module('runner.models')
    model_class = getattr(modellib, config.model.model_class)

    runner = Runner(config = config, model_class=model_class, X=X, y = y, continuous_cols=continuous_cols, categorical_cols=categorical_cols)
    
    return runner



In [7]:
from runner.models import BaseModel
from runner.misc.eval_metric import EvalMetric
from sklearn.metrics import f1_score, recall_score, accuracy_score, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score, recall_score, average_precision_score
from typing import Type

class DiabetesEvalMetric(EvalMetric):
    def eval(self, model: Type[BaseModel], X_test: pd.DataFrame, y_test: np.array):
        preds_proba = model.predict_proba(X_test)
        preds = preds_proba.argmax(1)
        
        f1 = f1_score(y_test, preds)
        roc = roc_auc_score(y_test, preds_proba[:, 1])
        specificity = recall_score(np.logical_not(y_test) , np.logical_not(preds))
        sensitivity = recall_score(y_test, preds)
        accuracy = accuracy_score(y_test, preds)
        pr_auc = average_precision_score(y_test, preds_proba[:, 1])
        tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
        ppv = tp / (tp + fp)
        npv = tn / (tn + fn)
        
        print("F1 Score: %.4f" % f1)
        print("ROC AUC Score: %.4f" % roc)
        print("Specificity Score: %.4f" % specificity)
        print("Sensitivity Score: %.4f" % sensitivity)
        print("Accuracy Score: %.4f" % accuracy)
        print("Precision Recall AUC Score: %.4f" % pr_auc)
        print("PPV Score: %.4f" % ppv)
        print("NPV Score: %.4f" % npv)

In [8]:
runner = prepare_runner(config, data, label, continuous_cols, categorical_cols)

runner.train()

runner.init_calibrator()
runner.test(X_test, y_test, DiabetesEvalMetric())

2023-10-06 23:33:03.144559: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[I 2023-10-06 23:33:04,334] A new study created in memory with name: no-name-63230beb-179e-4835-a3b6-a6fa12a67d49
2023-10-06 23:33:04,349 - {pytorch_tabular.tabular_model:105} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-10-06 23:33:04,367 - {pytorch_tabular.tabular_model:473} - INFO - Preparing the DataLoaders
2023-10-06 23:33:04,367 - {pytorch_tabular.tabular_datamodule:290} - INFO - Setting up the datamodule for classification task
2023-10-06 23:33:04,377 - {pytorch_tabular.tabular_model:521} - INFO - Preparing the Model: CategoryEmbeddingModel
2023-10-06 23:33:04,392 - {pytorch_tabular.tabular_model:268} - INFO - Preparing the Trainer
  rank_zero_depr

Output()

2023-10-06 23:34:24,196 - {pytorch_tabular.tabular_model:584} - INFO - Training the model completed


Output()

<class 'pandas.core.frame.DataFrame'> (123, 8)


[I 2023-10-06 23:34:26,093] Trial 0 finished with value: 0.6260162601626016 and parameters: {'embedding_dropout': 0.0749080237694725, 'layers': '128-64-32', 'activation': 'ReLU', 'learning_rate': 0.043322189674169266}. Best is trial 0 with value: 0.6260162601626016.
2023-10-06 23:34:26,104 - {pytorch_tabular.tabular_model:105} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-10-06 23:34:26,119 - {pytorch_tabular.tabular_model:473} - INFO - Preparing the DataLoaders
2023-10-06 23:34:26,120 - {pytorch_tabular.tabular_datamodule:290} - INFO - Setting up the datamodule for classification task
2023-10-06 23:34:26,129 - {pytorch_tabular.tabular_model:521} - INFO - Preparing the Model: CategoryEmbeddingModel
2023-10-06 23:34:26,144 - {pytorch_tabular.tabular_model:268} - INFO - Preparing the Trainer
  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPU

Best Parameters
{'embedding_dropout': 0.0749080237694725, 'layers': '128-64-32', 'activation': 'ReLU', 'learning_rate': 0.043322189674169266}


Output()

2023-10-06 23:34:40,385 - {pytorch_tabular.tabular_model:584} - INFO - Training the model completed


Output()

<class 'pandas.core.frame.DataFrame'> (123, 8)


Output()

Validation Score: 0.6260
<class 'pandas.core.frame.DataFrame'> (123, 8)


Output()

Uncalibrated ECE : 0.22770648581975297
Calibrated ECE : 4.197184605290226e-17
<class 'pandas.core.frame.DataFrame'> (154, 8)


F1 Score: 0.6906
ROC AUC Score: 0.8146
Specificity Score: 0.6300
Sensitivity Score: 0.8889
Accuracy Score: 0.7208
Precision Recall AUC Score: 0.6900
PPV Score: 0.5647
NPV Score: 0.9130


In [9]:
runner.dice(X_test)

  0%|          | 0/154 [00:00<?, ?it/s]




ValueError: ('Target', 'target', 'present in query instance')

In [None]:
del data["target"]

In [10]:
import lime
import lime.lime_tabular
explainer = lime.lime_tabular.LimeTabularExplainer(data.values, feature_names=data.columns, class_names=["0", "1"], verbose=True, mode='classification', discretize_continuous=True)

In [11]:
data.shape

(614, 8)

In [12]:
X_test.drop(['target'], axis=1).iloc[0].index.values

array(['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age'],
      dtype=object)

In [18]:
runner.model.model.predict(X_test.drop(['target'], axis=1).iloc[0].values)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [13]:
exp = explainer.explain_instance(X_test.drop(['target'], axis=1).iloc[0], runner.predict_proba, num_features=10)
exp.show_in_notebook(show_table=True)

<class 'numpy.ndarray'> (5000, 8)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
type(X_test.drop(['target'], axis=1).iloc[0]) == pd.Series

True

In [None]:
temp = pd.DataFrame(X_test.drop(['target'], axis=1).iloc[0].values.reshape((1, -1)), columns = list(map(str, X_test.drop(['target'], axis=1).iloc[0].index.values)))

In [None]:
temp['target'] = [0 for i in range(len(temp))]

In [None]:
exp.save_to_file('temp.html')