In [1]:
from main_workflow import load_predictor,split_data,SequenceDataPreparer,prepare_dataloaders,load_autoencoder
import pandas as pd
from data_preparation.collators import pad_collate_fn 




[2025-05-12 08:15:24] {logging_config.py:23} INFO - Logging configured.
[2025-05-12 08:15:24] {main_workflow.py:61} INFO - Using device: cpu


In [2]:
from config import ( # Your config file
    RAW_DATA_PATH, NO_MISSINGS_ENCODED_PATH, DIAG_EMBEDDINGS_PATH, DIAG_LABEL_ENCODER_PATH, LABEL_ENCODERS_PATH,
    ICD9_HIERARCHY_PATH, ICD9_CHAPTERS_PATH, SPACY_MODEL_NAME, MISSING_VALUES,
    DROP_COLUMNS, ONE_HOT_COLUMNS, ORDINAL_MAPPINGS, TREATMENT_COLUMNS,
    TREATMENT_MAPPING, LABEL_ENCODING,

    LOG_FILE, RANDOM_SEED, PATIENT_ID_COL, TEST_SPLIT_SIZE, VALIDATION_SPLIT_SIZE,
    OTHER_EMBEDDING_DIM, HIDDEN_DIM, NUM_RNN_LAYERS, DROPOUT, USE_GRU, USE_ATTENTION,
    ATTENTION_DIM, AE_BATCH_SIZE, AE_EPOCHS, PREDICTOR_EPOCHS,
    LEARNED_EMB_COLS, FINETUNE_DIAG_EMBEDDINGS, PRECOMPUTED_EMB_COLS,AE_OPTIMIZER,
    AE_LEARNING_RATE,AE_WEIGHT_DECAY, AE_SCHEDULER_FACTOR, AE_SCHEDULER_PATIENCE,
    AE_EARLY_STOPPING_PATIENCE, PREDICTOR_OPTIMIZER, PREDICTOR_LEARNING_RATE,
    MODELS_DIR, PREDICTOR_EARLY_STOPPING_PATIENCE, PREDICTOR_SCHEDULER_FACTOR,
    PREDICTOR_SCHEDULER_PATIENCE, PREDICTOR_WEIGHT_DECAY, PREDICTOR_FINETUNE_ENCODER,
    SCALER_PATH, ISOLATION_FOREST_PATH, IF_N_ESTIMATORS, IF_CONTAMINATION,
    OUTLIER_MODE, VISIT_ERROR_PERCENTILE,
    FINAL_ENCODED_DATA_PATH, ENCOUNTER_ID_COL, TARGET_COL, NUMERICAL_FEATURES,
    OHE_FEATURES_PREFIX, ICD9_HIERARCHY_PATH, ICD9_CHAPTERS_PATH,
    MAX_SEQ_LENGTH,  AE_MODEL_LOAD_PATH, PREDICTOR_MODEL_LOAD_PATH, RESULTS_DIR
)

In [3]:
df_final = pd.read_csv('../data/diabetic_data_no_na_diag.csv', low_memory=False)

df_raw_ids = pd.read_csv('../data/diabetic_data.csv', usecols=['encounter_id', 'patient_nbr'])
# Ensure indices align before assigning
df_final = df_final.reset_index(drop=True)
df_raw_ids = df_raw_ids.reset_index(drop=True)
df_final['encounter_id'] = df_raw_ids['encounter_id']
df_final['patient_nbr'] = df_raw_ids['patient_nbr']

df_final.reset_index(drop=True, inplace=True) # Ensure clean index



In [4]:
df_train, df_val, df_test = split_data(df_final)

[2025-05-12 08:15:24] {main_workflow.py:101} INFO - --- Splitting Data (Patient Level - Revised Index Handling) ---
[2025-05-12 08:15:24] {main_workflow.py:109} INFO - Total rows before split: 101766
[2025-05-12 08:15:24] {main_workflow.py:110} INFO - Total unique patients: 71518
[2025-05-12 08:15:24] {main_workflow.py:121} INFO - Test set created: 15108 rows, 10728 patients.
[2025-05-12 08:15:24] {main_workflow.py:138} INFO - Train set created: 71395 rows, 50062 patients.
[2025-05-12 08:15:24] {main_workflow.py:139} INFO - Validation set created: 15263 rows, 10728 patients.
[2025-05-12 08:15:24] {main_workflow.py:140} INFO - --- Data Splitting Complete ---


In [5]:
# 4. Prepare DataLoaders
data_preparer = SequenceDataPreparer(
    patient_id_col=PATIENT_ID_COL, timestamp_col=ENCOUNTER_ID_COL, target_col=TARGET_COL,
    numerical_features=NUMERICAL_FEATURES, ohe_feature_prefixes=OHE_FEATURES_PREFIX,
    learned_emb_cols=LEARNED_EMB_COLS, precomputed_emb_cols=PRECOMPUTED_EMB_COLS,
    max_seq_length=MAX_SEQ_LENGTH, scaler_path=SCALER_PATH
)
# Need a sample batch to determine dims for loading AE if not training
# Prepare loaders *before* deciding whether to train or load AE
train_loader, val_loader = prepare_dataloaders(data_preparer, df_train, df_val, AE_BATCH_SIZE)
sample_batch_for_build = next(iter(train_loader)) # Get a sample batch

[2025-05-12 08:15:27] {sequence_preparer.py:48} INFO - SequenceDataPreparer initialized. Max length: 50
[2025-05-12 08:15:27] {sequence_preparer.py:116} INFO - Scaler loaded from c:\Users\lukag\OneDrive\Desktop\Universidad\3ero\cuadrimestre2\PAID\github\IDSS-for-Diabetes-Readmission-Prediction\models\scaler.pkl
[2025-05-12 08:15:27] {main_workflow.py:160} INFO - --- Preparing Sequences and DataLoaders ---
[2025-05-12 08:15:27] {sequence_preparer.py:60} INFO - Identified 17 OHE columns.
[2025-05-12 08:15:27] {sequence_preparer.py:141} INFO - Transforming DataFrame (71395 rows) into sequences.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


[2025-05-12 08:15:46] {sequence_preparer.py:199} INFO - Created 50062 sequences for 50062 patients.
[2025-05-12 08:15:46] {sequence_preparer.py:141} INFO - Transforming DataFrame (15263 rows) into sequences.
[2025-05-12 08:15:51] {sequence_preparer.py:199} INFO - Created 10728 sequences for 10728 patients.
[2025-05-12 08:15:51] {main_workflow.py:176} INFO - Train and Validation DataLoaders created.
[2025-05-12 08:15:51] {main_workflow.py:177} INFO - --- Sequence Preparation Complete ---


In [6]:
ae_model_load_path = AE_MODEL_LOAD_PATH # Load path from config
trained_ae = load_autoencoder(ae_model_load_path, sample_batch_for_build)

predictor_model_load_path = PREDICTOR_MODEL_LOAD_PATH # Load path from config
trained_predictor = load_predictor(predictor_model_load_path, sample_batch_for_build) 

[2025-05-12 08:16:38] {main_workflow.py:202} INFO - --- Loading Pre-trained Autoencoder from c:\Users\lukag\OneDrive\Desktop\Universidad\3ero\cuadrimestre2\PAID\github\IDSS-for-Diabetes-Readmission-Prediction\models\autoencoder_best.pth ---
[2025-05-12 08:16:38] {model_builder.py:10} INFO - Building AE model architecture from config...
[2025-05-12 08:16:38] {embeddings.py:32} INFO - Initialized learned embedding for 'discharge_disposition_id' (Vocab: 26, Dim: 10)
[2025-05-12 08:16:38] {embeddings.py:32} INFO - Initialized learned embedding for 'admission_source_id' (Vocab: 17, Dim: 10)
[2025-05-12 08:16:38] {embeddings.py:45} INFO - Initialized precomputed embedding for 'diag_1' (Shape: torch.Size([916, 8]), Finetune: True)
[2025-05-12 08:16:38] {embeddings.py:45} INFO - Initialized precomputed embedding for 'diag_2' (Shape: torch.Size([916, 8]), Finetune: True)
[2025-05-12 08:16:38] {embeddings.py:45} INFO - Initialized precomputed embedding for 'diag_3' (Shape: torch.Size([916, 8]), 

  artifact = torch.load(path, map_location=device)


In [8]:
# 1. Toma 50–100 secuencias de entrenamiento
feature_seqs, _, _ = data_preparer.transform(df_train)
# 2. Aplica pad_collate_fn para obtener un batch
batch_bg = pad_collate_fn(feature_seqs[:50])  
# 3. Extrae únicamente los features y la máscara
X_bg = batch_bg['features'].numpy()       # shape (50, seq_len, feat_dim)
mask_bg = batch_bg['mask'].numpy().astype(bool)  # shape (50, seq_len)


[2025-05-12 08:29:37] {sequence_preparer.py:141} INFO - Transforming DataFrame (71395 rows) into sequences.
[2025-05-12 08:29:57] {sequence_preparer.py:199} INFO - Created 50062 sequences for 50062 patients.


TypeError: list indices must be integers or slices, not str