In [None]:
# Environment setup: install the packages this notebook relies on into the
# kernel's environment (%pip targets the running kernel, unlike !pip).
# pandas is held below 3.0.0 before the rest of the stack is installed.
%pip install "pandas<3.0.0"
# Imputation toolchain imported by the cells below (pypots, benchpots, pygrinder).
%pip install pypots benchpots pygrinder --upgrade
# NOTE(review): scikit-learn is not imported directly in the visible cells;
# presumably a transitive/runtime requirement - confirm before removing.
%pip install scikit-learn   
# NOTE(review): presumably needed for notebook progress-bar widgets - confirm.
%pip install --upgrade jupyter ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Bibliotecas com algumas coisas interessantes já implementadas

In [2]:
import numpy as np
from pygrinder import mcar, calc_missing_rate
from benchpots.datasets import preprocess_physionet2012
from pypots.nn.functional import calc_mae

  from .autonotebook import tqdm as notebook_tqdm


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



### PhysioNet 2012 é um dataset bem comum nos benchmarks e comparações de imputação

In [3]:

# Load and preprocess the PhysioNet 2012 dataset through benchpots.
# subset="set-a" selects subset A of the dataset.
# rate=0.1 artificially removes about 10% of the observed values; the loader's
# log reports this masking for the validation and test splits.
data = preprocess_physionet2012(subset="set-a", rate=0.1)

# Incomplete series for each split (original gaps plus the artificial ones).
train_X = data["train_X"]
val_X = data["val_X"]
test_X = data["test_X"]

# Arrays are laid out as (samples, time steps, features); e.g. 10 patients
# monitored for 48 hours on 2 variables would give (10, 48, 2).
print(train_X.shape, val_X.shape, sep="\n")

# Fraction of missing entries in the training split.
print(f"Temos {calc_missing_rate(train_X):.1%} valores faltantes em train_X")

# Training only needs the incomplete series.
train_set = dict(X=train_X)

# Validation carries both the incomplete series and the unmasked originals;
# X_ori is the ground truth used to score the model and pick the best checkpoint.
val_set = dict(X=val_X, X_ori=data["val_X_ori"])

# The test input holds only the incomplete series; the model fills in the gaps.
test_set = dict(X=test_X)

# Unmasked test values, kept aside as evaluation targets.
test_X_ori = data["test_X_ori"]

# Mask of the artificially removed entries: positions that are NaN in test_X
# but not in test_X_ori (element-wise XOR of the two NaN maps).
indicating_mask = np.logical_xor(np.isnan(test_X), np.isnan(test_X_ori))




2026-02-18 14:55:12 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2026-02-18 14:55:12 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2026-02-18 14:55:12 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2026-02-18 14:55:12 [INFO]: Loaded successfully!
  X = X.groupby("RecordID").apply(apply_func)
2026-02-18 14:55:17 [INFO]: 22943 values masked out in the val set as ground truth, take 9.94% of the original observed values
2026-02-18 14:55:17 [INFO]: 29131 values masked out in the test set as ground truth, take 10.04% of the original observed values
2026-02-18 14:55:17 [INFO]: Total sample number: 3997
2026-02-18 14:55:17 [INFO]: Training set size: 2557 (63.97%)
2026-02-18 14:55:17 [INFO]: Validation set size: 640 (16.01%)
2026-02-18 14:55:17 [INFO

(2557, 48, 37)
(640, 48, 37)
Temos 79.8% valores faltantes em train_X


In [4]:
# SAITS (Self-Attention-based Imputation for Time Series)
from pypots.imputation import SAITS

# Hyperparameters used for this run:
#   n_steps / n_features  - taken from the training array's 2nd and 3rd axes
#   n_layers=2            - number of transformer layers
#   d_model=256           - embedding dimension
#   n_heads=4             - parallel attention heads
#   d_k=64, d_v=64        - key / value dimensions in the attention mechanism
#   d_ffn=128             - feed-forward network dimension
#   dropout=0.1           - dropout rate used for regularisation
#   epochs=5              - training epochs
saits = SAITS(
    n_steps=train_X.shape[1],
    n_features=train_X.shape[2],
    n_layers=2,
    d_model=256,
    n_heads=4,
    d_k=64,
    d_v=64,
    d_ffn=128,
    dropout=0.1,
    epochs=5,
)

# Train on the incomplete series; the validation set drives checkpoint selection.
saits.fit(train_set, val_set)

# Fill every gap in the test split - both the originally missing entries and
# the artificially removed ones.
imputation = saits.impute(test_set)

# Mean absolute error, restricted by the mask to the artificially removed
# entries. NaNs in the targets are replaced by 0 via nan_to_num.
mae_saits = calc_mae(
    imputation,
    np.nan_to_num(test_X_ori),
    indicating_mask,
)

print(f"MAE: {mae_saits:.4f}")

# Persist the trained model.
saits.save("save_it_here/saits_physionet2012.pypots", overwrite=True)

# Example of how to load the saved model back.
saits.load("save_it_here/saits_physionet2012.pypots")

2026-02-18 14:55:17 [INFO]: No given device, using default device: cpu
2026-02-18 14:55:17 [INFO]: Using customized MAE as the training loss function.
2026-02-18 14:55:17 [INFO]: Using customized MSE as the validation metric function.
2026-02-18 14:55:17 [INFO]: SAITS initialized with the given hyperparameters, the number of trainable parameters: 1,378,358
2026-02-18 14:55:21 [INFO]: Epoch 001 - training loss (MAE): 0.9551, validation MSE: 0.3758
2026-02-18 14:55:28 [INFO]: Epoch 002 - training loss (MAE): 0.6574, validation MSE: 0.3406
2026-02-18 14:55:35 [INFO]: Epoch 003 - training loss (MAE): 0.5966, validation MSE: 0.3263
2026-02-18 14:55:42 [INFO]: Epoch 004 - training loss (MAE): 0.5616, validation MSE: 0.3190
2026-02-18 14:55:48 [INFO]: Epoch 005 - training loss (MAE): 0.5352, validation MSE: 0.3028
2026-02-18 14:55:48 [INFO]: Finished training. The best model is from epoch#5.
2026-02-18 14:55:49 [INFO]: Saved the model to save_it_here/saits_physionet2012.pypots
2026-02-18 14:5

MAE: 0.3046


In [5]:
# BRITS (Bidirectional Recurrent Imputation for Time Series)
from pypots.imputation import BRITS

# Hyperparameters used for this run:
#   n_steps / n_features  - taken from the training array's 2nd and 3rd axes
#   rnn_hidden_size=256   - hidden size of the bidirectional RNN
#   epochs=5              - training epochs
brits = BRITS(
    n_steps=train_X.shape[1],
    n_features=train_X.shape[2],
    rnn_hidden_size=256,
    epochs=5,
)

# Train on the incomplete series, then impute the test split.
brits.fit(train_set, val_set)
imputation = brits.impute(test_set)

# MAE over the artificially removed entries only (selected by the mask).
mae_brits = calc_mae(
    imputation,
    np.nan_to_num(test_X_ori),
    indicating_mask,
)

print(f"MAE: {mae_brits:.4f}")

# Persist the trained model.
brits.save("save_it_here/brits_physionet2012.pypots", overwrite=True)

2026-02-18 14:55:49 [INFO]: No given device, using default device: cpu
2026-02-18 14:55:49 [INFO]: Using customized MAE as the training loss function.
2026-02-18 14:55:49 [INFO]: Using customized MSE as the validation metric function.
2026-02-18 14:55:49 [INFO]: BRITS initialized with the given hyperparameters, the number of trainable parameters: 729,584
2026-02-18 14:56:03 [INFO]: Epoch 001 - training loss (MAE): 1.0950, validation MSE: 0.4412
2026-02-18 14:56:17 [INFO]: Epoch 002 - training loss (MAE): 0.8598, validation MSE: 0.3800
2026-02-18 14:56:28 [INFO]: Epoch 003 - training loss (MAE): 0.7915, validation MSE: 0.3389
2026-02-18 14:56:42 [INFO]: Epoch 004 - training loss (MAE): 0.7480, validation MSE: 0.3136
2026-02-18 14:56:54 [INFO]: Epoch 005 - training loss (MAE): 0.7181, validation MSE: 0.2977
2026-02-18 14:56:54 [INFO]: Finished training. The best model is from epoch#5.
2026-02-18 14:56:56 [INFO]: Saved the model to save_it_here/brits_physionet2012.pypots


MAE: 0.3083


In [8]:
# GPVAE (Gaussian Process Variational Autoencoder)
from pypots.imputation import GPVAE

# Hyperparameters used for this run:
#   n_steps / n_features  - taken from the training array's 2nd and 3rd axes
#   latent_size=64        - dimension of the autoencoder's latent space
#   epochs=5              - training epochs
gpvae = GPVAE(
    n_steps=train_X.shape[1],
    n_features=train_X.shape[2],
    latent_size=64,
    epochs=5,
)

gpvae.fit(train_set, val_set)

# The imputation result has an extra singleton axis at position 1, dropped
# here so its shape lines up with the targets and the mask.
# NOTE(review): presumably this is the sampling dimension - confirm.
imputation = gpvae.impute(test_set).squeeze(axis=1)

# MAE over the artificially removed entries only (selected by the mask).
mae_gpvae = calc_mae(
    imputation,
    np.nan_to_num(test_X_ori),
    indicating_mask,
)
print(f"MAE: {mae_gpvae:.4f}")

# Persist the trained model.
gpvae.save("save_it_here/gpvae_physionet2012.pypots", overwrite=True)

2026-02-18 15:02:17 [INFO]: No given device, using default device: cpu
2026-02-18 15:02:17 [INFO]: GPVAE initialized with the given hyperparameters, the number of trainable parameters: 34,533
2026-02-18 15:02:24 [INFO]: Epoch 001 - training loss (default): 72875.8758, validation loss: 42923.9221
2026-02-18 15:02:34 [INFO]: Epoch 002 - training loss (default): 40456.4156, validation loss: 39639.6799
2026-02-18 15:02:43 [INFO]: Epoch 003 - training loss (default): 39477.0707, validation loss: 39337.9111
2026-02-18 15:02:53 [INFO]: Epoch 004 - training loss (default): 39328.5746, validation loss: 39254.5598
2026-02-18 15:02:59 [INFO]: Epoch 005 - training loss (default): 39278.6070, validation loss: 39218.6219
2026-02-18 15:02:59 [INFO]: Finished training. The best model is from epoch#5.
2026-02-18 15:03:01 [INFO]: Saved the model to save_it_here/gpvae_physionet2012.pypots


MAE: 0.6204


In [14]:
# TimeMixer++
from pypots.imputation import TimeMixerPP

# Hyperparameters used for this run:
#   n_steps / n_features  - taken from the training array's 2nd and 3rd axes
#   n_layers=1            - number of model layers
#   d_model=16            - embedding dimension
#   d_ffn=16              - feed-forward network dimension
#   top_k=3               - NOTE(review): previously described as a
#                           nearest-neighbour count; the PyPOTS docs appear to
#                           define it as the number of top-k amplitudes used to
#                           select periods - confirm
#   n_heads=1             - parallel attention heads
#   n_kernels=4           - number of kernels (exact role to be confirmed
#                           against the PyPOTS docs)
#   epochs=5              - training epochs
timemixerpp = TimeMixerPP(
    n_steps=train_X.shape[1],
    n_features=train_X.shape[2],
    n_layers=1,
    d_model=16,
    d_ffn=16,
    top_k=3,
    n_heads=1,
    n_kernels=4,
    epochs=5,
)

# Train on the incomplete series, then impute the test split.
timemixerpp.fit(train_set, val_set)
imputation = timemixerpp.impute(test_set)

# MAE over the artificially removed entries only (selected by the mask).
mae_timemixerpp = calc_mae(
    imputation,
    np.nan_to_num(test_X_ori),
    indicating_mask,
)
print(f"MAE: {mae_timemixerpp:.4f}")

# Persist the trained model.
timemixerpp.save("save_it_here/timemixerpp_physionet2012.pypots", overwrite=True)

2026-02-18 16:13:50 [INFO]: No given device, using default device: cpu
2026-02-18 16:13:50 [INFO]: Using customized MAE as the training loss function.
2026-02-18 16:13:50 [INFO]: Using customized MSE as the validation metric function.
2026-02-18 16:13:50 [INFO]: TimeMixerPP initialized with the given hyperparameters, the number of trainable parameters: 88,147
2026-02-18 16:14:29 [INFO]: Epoch 001 - training loss (MAE): 0.4404, validation MSE: 0.6866
2026-02-18 16:15:08 [INFO]: Epoch 002 - training loss (MAE): 0.2487, validation MSE: 0.6950
2026-02-18 16:15:47 [INFO]: Epoch 003 - training loss (MAE): 0.1567, validation MSE: 0.7558
2026-02-18 16:16:23 [INFO]: Epoch 004 - training loss (MAE): 0.1239, validation MSE: 0.8056
2026-02-18 16:17:02 [INFO]: Epoch 005 - training loss (MAE): 0.1054, validation MSE: 0.8116
2026-02-18 16:17:02 [INFO]: Finished training. The best model is from epoch#1.
2026-02-18 16:17:08 [INFO]: Saved the model to save_it_here/timemixerpp_physionet2012.pypots


MAE: 0.5862


In [15]:
# TimeMixer
from pypots.imputation import TimeMixer

# Hyperparameters used for this run:
#   n_steps / n_features  - taken from the training array's 2nd and 3rd axes
#   n_layers=1            - number of model layers
#   d_model=16            - embedding dimension
#   d_ffn=16              - feed-forward network dimension
#   top_k=3               - NOTE(review): previously described as a
#                           nearest-neighbour count; the PyPOTS docs appear to
#                           define it as the number of top-k amplitudes used to
#                           select periods - confirm
#   epochs=5              - training epochs
#
# The model is bound to its own name, `timemixer`. It used to be assigned to
# `timemixerpp`, which silently replaced the TimeMixer++ model trained in the
# previous cell and made later references to that name ambiguous.
timemixer = TimeMixer(
    n_steps=train_X.shape[1],
    n_features=train_X.shape[2],
    n_layers=1,
    d_model=16,
    d_ffn=16,
    top_k=3,
    epochs=5,
)

# Train on the incomplete series, then impute the test split.
timemixer.fit(train_set, val_set)
imputation = timemixer.impute(test_set)

# MAE over the artificially removed entries only (selected by the mask).
mae_timemixer = calc_mae(imputation, np.nan_to_num(test_X_ori), indicating_mask)
print(f"MAE: {mae_timemixer:.4f}")

# Persist the trained model.
timemixer.save("save_it_here/timemixer_physionet2012.pypots", overwrite=True)

2026-02-18 16:21:23 [INFO]: No given device, using default device: cpu
2026-02-18 16:21:23 [INFO]: Using customized MAE as the training loss function.
2026-02-18 16:21:23 [INFO]: Using customized MSE as the validation metric function.
2026-02-18 16:21:23 [INFO]: TimeMixer initialized with the given hyperparameters, the number of trainable parameters: 10,581
2026-02-18 16:21:24 [INFO]: Epoch 001 - training loss (MAE): 0.6829, validation MSE: 0.8131
2026-02-18 16:21:25 [INFO]: Epoch 002 - training loss (MAE): 0.6271, validation MSE: 0.7892
2026-02-18 16:21:26 [INFO]: Epoch 003 - training loss (MAE): 0.6166, validation MSE: 0.7804
2026-02-18 16:21:26 [INFO]: Epoch 004 - training loss (MAE): 0.6148, validation MSE: 0.7766
2026-02-18 16:21:27 [INFO]: Epoch 005 - training loss (MAE): 0.6124, validation MSE: 0.7757
2026-02-18 16:21:27 [INFO]: Finished training. The best model is from epoch#5.
2026-02-18 16:21:27 [INFO]: Saved the model to save_it_here/timemixer_physionet2012.pypots


MAE: 0.6059


In [17]:
# TEFN
from pypots.imputation import TEFN

# Only the sequence geometry and the epoch count are set explicitly; every
# other hyperparameter is left at the library default.
tefn = TEFN(
    n_steps=train_X.shape[1],
    n_features=train_X.shape[2],
    epochs=5,
)

# Train on the incomplete series, then impute the test split.
tefn.fit(train_set, val_set)
imputation = tefn.impute(test_set)

# MAE over the artificially removed entries only (selected by the mask).
mae_tefn = calc_mae(
    imputation,
    np.nan_to_num(test_X_ori),
    indicating_mask,
)
print(f"MAE: {mae_tefn:.4f}")

# Persist the trained model.
tefn.save("save_it_here/tefn_physionet2012.pypots", overwrite=True)

2026-02-18 16:25:56 [INFO]: No given device, using default device: cpu
2026-02-18 16:25:56 [INFO]: Using customized MAE as the training loss function.
2026-02-18 16:25:56 [INFO]: Using customized MSE as the validation metric function.
2026-02-18 16:25:56 [INFO]: TEFN initialized with the given hyperparameters, the number of trainable parameters: 3,455
2026-02-18 16:25:56 [INFO]: Epoch 001 - training loss (MAE): 2.0163, validation MSE: 1.9419
2026-02-18 16:25:57 [INFO]: Epoch 002 - training loss (MAE): 1.5170, validation MSE: 1.2304
2026-02-18 16:25:57 [INFO]: Epoch 003 - training loss (MAE): 1.1644, validation MSE: 0.8371
2026-02-18 16:25:57 [INFO]: Epoch 004 - training loss (MAE): 0.9159, validation MSE: 0.6231
2026-02-18 16:25:58 [INFO]: Epoch 005 - training loss (MAE): 0.7506, validation MSE: 0.5039
2026-02-18 16:25:58 [INFO]: Finished training. The best model is from epoch#5.
2026-02-18 16:25:58 [INFO]: Saved the model to save_it_here/tefn_physionet2012.pypots


MAE: 0.4152


In [None]:
# TimeLLM
from pypots.imputation import TimeLLM

# Hyperparameters used for this run:
#   n_steps / n_features  - taken from the training array's 2nd and 3rd axes
#   n_layers=2            - number of layers
#   llm_model_type="GPT2" - LLM backbone. Per the PyPOTS TimeLLM docs the
#                           accepted identifiers are "LLaMA", "GPT2" and "BERT"
#                           and the match is case-sensitive, so the lowercase
#                           "gpt2" used previously would be rejected when the
#                           model is constructed.
#   patch_size=16, patch_stride=8 - patching of the input sequence
#   d_llm=768             - hidden size of the LLM backbone (must agree with
#                           the chosen backbone)
#   d_model=128, d_ffn=256, n_heads=4, dropout=0.1 - model dimensions
#   domain_prompt_content - text describing the dataset, passed to the model
#   epochs=5              - training epochs
timellm = TimeLLM(
    n_steps=train_X.shape[1],
    n_features=train_X.shape[2],
    n_layers=2,
    llm_model_type="GPT2",
    patch_size=16,
    patch_stride=8,
    d_llm=768,
    d_model=128,
    d_ffn=256,
    n_heads=4,
    dropout=0.1,
    domain_prompt_content="PhysioNet 2012",
    epochs=5,
)

# Train on the incomplete series, then impute the test split.
timellm.fit(train_set, val_set)
imputation = timellm.impute(test_set)

# MAE over the artificially removed entries only (selected by the mask).
mae_timellm = calc_mae(imputation, np.nan_to_num(test_X_ori), indicating_mask)
print(f"MAE: {mae_timellm:.4f}")

# Persist the trained model.
timellm.save("save_it_here/timellm_physionet2012.pypots", overwrite=True)

TypeError: TimeLLM.__init__() missing 10 required positional arguments: 'n_layers', 'llm_model_type', 'patch_size', 'patch_stride', 'd_llm', 'd_model', 'd_ffn', 'n_heads', 'dropout', and 'domain_prompt_content'