In [16]:
from main_workflow import load_predictor,split_data,SequenceDataPreparer,prepare_dataloaders,load_autoencoder
import pandas as pd
from data_preparation.collators import pad_collate_fn 




In [2]:
from config import ( # Your config file
    RAW_DATA_PATH, NO_MISSINGS_ENCODED_PATH, DIAG_EMBEDDINGS_PATH, DIAG_LABEL_ENCODER_PATH, LABEL_ENCODERS_PATH,
    ICD9_HIERARCHY_PATH, ICD9_CHAPTERS_PATH, SPACY_MODEL_NAME, MISSING_VALUES,
    DROP_COLUMNS, ONE_HOT_COLUMNS, ORDINAL_MAPPINGS, TREATMENT_COLUMNS,
    TREATMENT_MAPPING, LABEL_ENCODING,

    LOG_FILE, RANDOM_SEED, PATIENT_ID_COL, TEST_SPLIT_SIZE, VALIDATION_SPLIT_SIZE,
    OTHER_EMBEDDING_DIM, HIDDEN_DIM, NUM_RNN_LAYERS, DROPOUT, USE_GRU, USE_ATTENTION,
    ATTENTION_DIM, AE_BATCH_SIZE, AE_EPOCHS, PREDICTOR_EPOCHS,
    LEARNED_EMB_COLS, FINETUNE_DIAG_EMBEDDINGS, PRECOMPUTED_EMB_COLS,AE_OPTIMIZER,
    AE_LEARNING_RATE,AE_WEIGHT_DECAY, AE_SCHEDULER_FACTOR, AE_SCHEDULER_PATIENCE,
    AE_EARLY_STOPPING_PATIENCE, PREDICTOR_OPTIMIZER, PREDICTOR_LEARNING_RATE,
    MODELS_DIR, PREDICTOR_EARLY_STOPPING_PATIENCE, PREDICTOR_SCHEDULER_FACTOR,
    PREDICTOR_SCHEDULER_PATIENCE, PREDICTOR_WEIGHT_DECAY, PREDICTOR_FINETUNE_ENCODER,
    SCALER_PATH, ISOLATION_FOREST_PATH, IF_N_ESTIMATORS, IF_CONTAMINATION,
    OUTLIER_MODE, VISIT_ERROR_PERCENTILE,
    FINAL_ENCODED_DATA_PATH, ENCOUNTER_ID_COL, TARGET_COL, NUMERICAL_FEATURES,
    OHE_FEATURES_PREFIX, ICD9_HIERARCHY_PATH, ICD9_CHAPTERS_PATH,
    MAX_SEQ_LENGTH,  AE_MODEL_LOAD_PATH, PREDICTOR_MODEL_LOAD_PATH, RESULTS_DIR
)

In [3]:
df_final = pd.read_csv('../data/diabetic_data_no_na_diag.csv', low_memory=False)

df_raw_ids = pd.read_csv('../data/diabetic_data.csv', usecols=['encounter_id', 'patient_nbr'])
# Ensure indices align before assigning
df_final = df_final.reset_index(drop=True)
df_raw_ids = df_raw_ids.reset_index(drop=True)
df_final['encounter_id'] = df_raw_ids['encounter_id']
df_final['patient_nbr'] = df_raw_ids['patient_nbr']

df_final.reset_index(drop=True, inplace=True) # Ensure clean index



In [4]:
df_train, df_val, df_test = split_data(df_final)

[2025-04-30 08:46:59] {main_workflow.py:101} INFO - --- Splitting Data (Patient Level - Revised Index Handling) ---
[2025-04-30 08:46:59] {main_workflow.py:109} INFO - Total rows before split: 101766
[2025-04-30 08:46:59] {main_workflow.py:110} INFO - Total unique patients: 71518
[2025-04-30 08:46:59] {main_workflow.py:121} INFO - Test set created: 15108 rows, 10728 patients.
[2025-04-30 08:46:59] {main_workflow.py:138} INFO - Train set created: 71395 rows, 50062 patients.
[2025-04-30 08:46:59] {main_workflow.py:139} INFO - Validation set created: 15263 rows, 10728 patients.
[2025-04-30 08:46:59] {main_workflow.py:140} INFO - --- Data Splitting Complete ---


In [5]:
# 4. Prepare DataLoaders
data_preparer = SequenceDataPreparer(
    patient_id_col=PATIENT_ID_COL, timestamp_col=ENCOUNTER_ID_COL, target_col=TARGET_COL,
    numerical_features=NUMERICAL_FEATURES, ohe_feature_prefixes=OHE_FEATURES_PREFIX,
    learned_emb_cols=LEARNED_EMB_COLS, precomputed_emb_cols=PRECOMPUTED_EMB_COLS,
    max_seq_length=MAX_SEQ_LENGTH, scaler_path=SCALER_PATH
)
# Need a sample batch to determine dims for loading AE if not training
# Prepare loaders *before* deciding whether to train or load AE
train_loader, val_loader = prepare_dataloaders(data_preparer, df_train, df_val, AE_BATCH_SIZE)
sample_batch_for_build = next(iter(train_loader)) # Get a sample batch

[2025-04-30 08:46:59] {sequence_preparer.py:48} INFO - SequenceDataPreparer initialized. Max length: 50
[2025-04-30 08:46:59] {sequence_preparer.py:116} INFO - Scaler loaded from c:\Users\lukag\OneDrive\Desktop\Universidad\3ero\cuadrimestre2\PAID\github\IDSS-for-Diabetes-Readmission-Prediction\models\scaler.pkl
[2025-04-30 08:46:59] {main_workflow.py:160} INFO - --- Preparing Sequences and DataLoaders ---
[2025-04-30 08:46:59] {sequence_preparer.py:60} INFO - Identified 17 OHE columns.
[2025-04-30 08:46:59] {sequence_preparer.py:141} INFO - Transforming DataFrame (71395 rows) into sequences.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[2025-04-30 08:47:38] {sequence_preparer.py:199} INFO - Created 50062 sequences for 50062 patients.
[2025-04-30 08:47:38] {sequence_preparer.py:141} INFO - Transforming DataFrame (15263 rows) into sequences.
[2025-04-30 08:47:46] {sequence_preparer.py:199} INFO - Created 10728 sequences for 10728 patients.
[2025-04-30 08:47:46] {main_workflow.py:176} INFO - Train and Validation DataLoaders created.
[2025-04-30 08:47:46] {main_workflow.py:177} INFO - --- Sequence Preparation Complete ---


In [6]:
ae_model_load_path = AE_MODEL_LOAD_PATH # Load path from config
trained_ae = load_autoencoder(ae_model_load_path, sample_batch_for_build)

predictor_model_load_path = PREDICTOR_MODEL_LOAD_PATH # Load path from config
trained_predictor = load_predictor(predictor_model_load_path, sample_batch_for_build) 

[2025-04-30 08:48:47] {main_workflow.py:202} INFO - --- Loading Pre-trained Autoencoder from c:\Users\lukag\OneDrive\Desktop\Universidad\3ero\cuadrimestre2\PAID\github\IDSS-for-Diabetes-Readmission-Prediction\models\autoencoder_best.pth ---
[2025-04-30 08:48:47] {model_builder.py:10} INFO - Building AE model architecture from config...
[2025-04-30 08:48:47] {embeddings.py:32} INFO - Initialized learned embedding for 'discharge_disposition_id' (Vocab: 26, Dim: 10)
[2025-04-30 08:48:47] {embeddings.py:32} INFO - Initialized learned embedding for 'admission_source_id' (Vocab: 17, Dim: 10)
[2025-04-30 08:48:47] {embeddings.py:45} INFO - Initialized precomputed embedding for 'diag_1' (Shape: torch.Size([916, 8]), Finetune: True)
[2025-04-30 08:48:47] {embeddings.py:45} INFO - Initialized precomputed embedding for 'diag_2' (Shape: torch.Size([916, 8]), Finetune: True)
[2025-04-30 08:48:47] {embeddings.py:45} INFO - Initialized precomputed embedding for 'diag_3' (Shape: torch.Size([916, 8]), 

  artifact = torch.load(path, map_location=device)


In [11]:
from analysis.predictor_inference import Predictor
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [17]:
import torch
import torch.nn as nn
from typing import Dict
class ShapModel(nn.Module):
    def __init__(self, predictor_model):
        super().__init__()
        self.pred = predictor_model.eval()

    def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
        # batch ya viene de pad_collate_fn:
        #   batch['features'] → el dict interno
        #   batch['mask']     → Tensor(batch, seq_len)
        logits = self.pred({
            'features': batch['features'],
            'mask': batch['mask']
        })                                     # → (batch, seq_len, n_classes)
        probs  = torch.softmax(logits, dim=-1)  # → (batch, seq_len, n_classes)
        return probs[:, -1, :]                  # → (batch, n_classes)


In [18]:
# Número de ejemplos de fondo
N_BG = 50
bg_batches = []
count = 0

for batch in train_loader:
    # batch: dict{'num_ohe', 'learned_labels', 'precomputed_labels', 'mask', ...}
    # Reconstruye el campo 'features' que espera tu predictor:
    features = {
      'num_ohe':            batch['num_ohe'],            # (b, seq_len, d₁)
      'learned_labels':     batch['learned_labels'],     # dict de (b, seq_len)
      'precomputed_labels': batch['precomputed_labels'], # dict de (b, seq_len)
    }
    bg_batches.append({
      'features': features,
      'mask':     batch['mask']                         # (b, seq_len)
    })
    count += batch['num_ohe'].size(0)
    if count >= N_BG:
        break

# Aplana la lista de batches a una lista de ejemplos
# y recorta a exactamente N_BG:
bg_list = []
for b in bg_batches:
    batch_size = b['mask'].size(0)
    for i in range(batch_size):
        # extrae el i-ésimo ejemplar de cada tensor
        single = {
          'features': {
            'num_ohe':            b['features']['num_ohe'][i:i+1],
            'learned_labels':     {col: t[i:i+1] for col,t in b['features']['learned_labels'].items()},
            'precomputed_labels': {col: t[i:i+1] for col,t in b['features']['precomputed_labels'].items()},
          },
          'mask': b['mask'][i:i+1]
        }
        bg_list.append(single)
        if len(bg_list) == N_BG:
            break
    if len(bg_list) == N_BG:
        break

# Convierte la lista a un único batch concatenado
def collate_bg(list_of_dicts):
    # reutiliza tu pad_collate_fn desplegando
    #   cada dict como si fuese un batch de tamaño 1
    return pad_collate_fn([
      {
        'features': {
          **{'num_ohe': ds['features']['num_ohe'][0]},
          **{'learned_labels': {col: ds['features']['learned_labels'][col][0] for col in ds['features']['learned_labels']}},
          **{'precomputed_labels': {col: ds['features']['precomputed_labels'][col][0] for col in ds['features']['precomputed_labels']}}
        },
        'targets': torch.zeros(1, dtype=torch.long),  # dummy
        'length': ds['mask'].shape[1],
        'patient_id': None
      }
      for ds in list_of_dicts
    ])

bg_collated = collate_bg(bg_list)
# bg_collated['features'] es el dict con tensores (50, seq_len, …)
# bg_collated['mask']     es (50, seq_len)

# Pásalo a device
for k,v in bg_collated['features'].items():
    if isinstance(v, torch.Tensor):
        bg_collated['features'][k] = v.to(device)
for col in bg_collated['features']['learned_labels']:
    bg_collated['features']['learned_labels'][col] = bg_collated['features']['learned_labels'][col].to(device)
for col in bg_collated['features']['precomputed_labels']:
    bg_collated['features']['precomputed_labels'][col] = bg_collated['features']['precomputed_labels'][col].to(device)
bg_collated['mask'] = bg_collated['mask'].to(device)


KeyError: 0

In [None]:
import shap

shap_model     = ShapModel(predictor.model).to(device)
shap_explainer = shap.DeepExplainer(shap_model, bg_collated)


In [None]:
# 1. Transforma df_test
feature_seqs_test, test_target_seqs, test_pids = data_preparer.transform(df_test)
# 2. Toma solo el primero
dataset_test = PatientSequenceDataset(feature_seqs_test, test_target_seqs, test_pids)
batch_first  = pad_collate_fn([dataset_test[0]])
# 3. Llévalo a device
for k,v in batch_first['features'].items():
    batch_first['features'][k] = v.to(device)

batch_first['mask'] = batch_first['mask'].to(device)

X_batch = batch_first


In [None]:
shap_values = shap_explainer.shap_values(X_batch)


In [None]:
# Clase predicha
probs_first = shap_model(X_batch).cpu().detach().numpy()  # (1, n_classes)
pred_cls    = int(probs_first.argmax(axis=1)[0])

# Valores SHAP de la clase predicha
sv = shap_values[pred_cls][0]                             # (seq_len, n_feats)

# Importancia media por característica
mean_abs = sv.abs().mean(axis=0)                          # (n_feats,)
feat_names = data_preparer.feature_cols

for name, imp in zip(feat_names, mean_abs):
    print(f"{name:20s}: {imp:.4f}")


In [None]:
shap.initjs()
shap.force_plot(
    shap_explainer.expected_value[pred_cls],
    sv,
    X_batch['features']['num_ohe'][0].cpu().numpy(),
    feature_names=feat_names,
    matplotlib=True
)
