## Running example from the readme


In [1]:
%load_ext autoreload
%autoreload 2

In [92]:
# Data preprocessing. Tedious, but PyPOTS can help.
import numpy as np
from sklearn.preprocessing import StandardScaler
from pygrinder import mcar
from pypots.data import load_specific_dataset
data = load_specific_dataset('physionet_2012')  # PyPOTS will automatically download and extract it.


2024-11-07 16:04:01 [INFO]: Loading the dataset physionet_2012 with TSDB (https://github.com/WenjieDu/Time_Series_Data_Beans)...
2024-11-07 16:04:01 [INFO]: Starting preprocessing physionet_2012...
2024-11-07 16:04:01 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2024-11-07 16:04:01 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2024-11-07 16:04:01 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2024-11-07 16:04:01 [INFO]: Loaded successfully!
2024-11-07 16:04:12 [INFO]: 68864 values masked out in the val set as ground truth, take 9.98% of the original observed values
2024-11-07 16:04:12 [INFO]: 86076 values masked out in the test set as ground truth, take 9.92% of the original observed values
2024-11-07 16:04:12 [INFO]: Total sample number: 11

In [93]:
X = data['train_X']
scaler = data['scaler']
X_shape = X.shape
X = scaler.transform(X.reshape(-1,X_shape[2])).reshape(X_shape)

## normalize
normalize = True
if normalize and False:
    latent_dim = X.shape[2]
    mean, std = np.zeros((1,1,latent_dim)), np.ones((1,1,latent_dim))
    for l in range(latent_dim):
        X_l = X[:,:,l][X[:,:,l]==X[:,:,l]] 
        if len(X_l) > 0:   
            mean[:,:,l] = X_l.mean()    
            std[:,:,l] = X_l.std()   
    X = (X - mean) / std

from sklearn.preprocessing import QuantileTransformer
rng = np.random.RandomState(0)
qt = QuantileTransformer(n_quantiles=10, random_state=0, output_distribution = 'normal')

if normalize:
    n_batch, n_obs, latent_dim = X.shape
    for l in range(latent_dim):
        X_l = X[:,:,l][X[:,:,l]==X[:,:,l]] 
        qt.fit(X_l.reshape(-1,1))
        
        X[:,:,l] = qt.transform(X[:,:,l].reshape(-1,1)).reshape(n_batch, n_obs)


X_ori = X  # keep X_ori for validation
X = mcar(X, 0.1)  # randomly hold out 10% observed values as ground truth
dataset = {"X": X}  # X for model input
print(X.shape)  # (11988, 48, 37), 11988 samples and each sample has 48 time steps, 37 features

(7671, 48, 37)


In [94]:

# Model training. This is PyPOTS showtime.
from pypots.imputation import GP_VAE
from pypots.utils.metrics import calc_mae
from pypots.optim.adam import Adam
from torch.optim.lr_scheduler import LRScheduler

gpvae = GP_VAE(n_steps = 48, 
            n_features = 37, 
            latent_size = 16, 
            epochs = 5, 
            batch_size = 8,
            beta = .001, 
            K = 10,  
            encoder_sizes = (64, 64), 
            decoder_sizes = (64, 64),
            optimizer = Adam(weight_decay = 1e-4) #, lr_scheduler = LRScheduler
            )
# Here I use the whole dataset as the training set because ground truth is not visible to the model, you can also split it into train/val/test sets
gpvae.fit(dataset)  # train the model on the dataset

2024-11-07 16:04:13 [INFO]: No given device, using default device: cpu
2024-11-07 16:04:13 [INFO]: GP_VAE initialized with the given hyperparameters, the number of trainable parameters: 16,485


Model dimensions is: 
GpvaeEncoder(
  (net): Sequential(
    (0): Linear(in_features=37, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): ReLU()
  )
  (mu_layer): Linear(in_features=64, out_features=16, bias=True)
  (logvar_layer): Linear(in_features=64, out_features=16, bias=True)
)
GpvaeDecoder(
  (net): Sequential(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=37, bias=True)
  )
)


2024-11-07 16:04:20 [ERROR]: ❌ Exception: probability tensor contains either `inf`, `nan` or element < 0


RuntimeError: Training got interrupted. Model was not trained. Please investigate the error printed above.

In [95]:
import gpytorch
with gpytorch.settings.cholesky_jitter(1e-4):
    gpvae.fit_kernel(dataset)  # train the model on the dataset


fitting kernel
torch.Size([8, 48])
torch.Size([8, 8, 8]) torch.Size([8, 48])
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/louis/miniforge3/envs/pypots-env/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/7s/75p3d4f145qfrrzfq5vskssc0000gn/T/ipykernel_11367/1211517127.py", line 3, in <module>
    gpvae.fit_kernel(dataset)  # train the model on the dataset
  File "/Users/louis/Documents/Phd/Code_pour_manuscript/PyPOTS/pypots/imputation/gp_ae/model.py", line 514, in fit_kernel
    self._fit_kernel(training_loader)
  File "/Users/louis/Documents/Phd/Code_pour_manuscript/PyPOTS/pypots/imputation/gp_ae/model.py", line 484, in _fit_kernel
    self.gp.fit_kernel(training_loader)
  File "/Users/louis/Documents/Phd/Code_pour_manuscript/PyPOTS/pypots/imputation/gp_ae/gp_model.py", line 171, in fit_kernel
    loss = -self.mll[j](out, z_mu[:,:,j])
  File "/Users/louis/miniforge3/envs/pypots-env/lib/python3.9/site-packages/gpytorch/module.py", line 31, in __cal

In [None]:
gpvae.gp._fit_kernel

<pypots.imputation.gp_ae.model.ProbabilisticGP at 0x2a16fb100>

In [None]:
imputation = gpvae.impute(dataset)  # impute the originally-missing values and artificially-missing values
indicating_mask = np.isnan(X) ^ np.isnan(X_ori)  # indicating mask for imputation error calculation
mae = calc_mae(imputation, np.nan_to_num(X_ori), indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)
gpvae.save("save_it_here/gpvae_physionet2012.pypots")  # save the model for future use
gpvae.load("save_it_here/gpvae_physionet2012.pypots")  # reload the serialized model file for following imputation or training

AttributeError: '_GP_VAE' object has no attribute 'encode'

In [None]:

# Model training. This is PyPOTS showtime.
from pypots.imputation import GPVAE
from pypots.utils.metrics import calc_mae
gpvae = GPVAE(n_steps=48, n_features=37, latent_size = 12, epochs=10)
# Here I use the whole dataset as the training set because ground truth is not visible to the model, you can also split it into train/val/test sets
gpvae.fit(dataset)  # train the model on the dataset
imputation = gpvae.impute(dataset)  # impute the originally-missing values and artificially-missing values
indicating_mask = np.isnan(X) ^ np.isnan(X_ori)  # indicating mask for imputation error calculation
mae = calc_mae(imputation, np.nan_to_num(X_ori), indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)
gpvae.save("save_it_here/gpvae_physionet2012.pypots")  # save the model for future use
gpvae.load("save_it_here/gpvae_physionet2012.pypots")  # reload the serialized model file for following imputation or training

2024-10-27 12:50:27 [INFO]: No given device, using default device: cpu
2024-10-27 12:50:27 [INFO]: GPVAE initialized with the given hyperparameters, the number of trainable parameters: 21,065
2024-10-27 12:50:50 [INFO]: Epoch 001 - training loss: 11284.5031
2024-10-27 12:51:14 [INFO]: Epoch 002 - training loss: 9194.3747
2024-10-27 12:51:40 [INFO]: Epoch 003 - training loss: 9184.6626
2024-10-27 12:52:04 [INFO]: Finished training. The best model is from epoch#3.


AssertionError: shape of `predictions` and `targets` must match, but got (7671, 1, 48, 37) and (7671, 48, 37)

In [None]:

# Model training. This is PyPOTS showtime.
from pypots.imputation import SAITS
from pypots.utils.metrics import calc_mae
gpvae = SAITS(n_steps=48, n_features=37, n_layers=2, d_model=256, n_heads=4, d_k=64, d_v=64, d_ffn=128, dropout=0.1, epochs=10)
# Here I use the whole dataset as the training set because ground truth is not visible to the model, you can also split it into train/val/test sets
gpvae.fit(dataset)  # train the model on the dataset
imputation = gpvae.impute(dataset)  # impute the originally-missing values and artificially-missing values
indicating_mask = np.isnan(X) ^ np.isnan(X_ori)  # indicating mask for imputation error calculation
mae = calc_mae(imputation, np.nan_to_num(X_ori), indicating_mask)  # calculate mean absolute error on the ground truth (artificially-missing values)
gpvae.save("save_it_here/saits_physionet2012.pypots")  # save the model for future use
gpvae.load("save_it_here/saits_physionet2012.pypots")  # reload the serialized model file for following imputation or training

2024-10-09 14:06:48 [INFO]: No given device, using default device: cpu
2024-10-09 14:06:48 [INFO]: SAITS initialized with the given hyperparameters, the number of trainable parameters: 1,378,358


(7671, 48, 37)


2024-10-09 14:08:43 [INFO]: Epoch 001 - training loss: 1.2981
2024-10-09 14:10:22 [INFO]: Epoch 002 - training loss: 0.3698
2024-10-09 14:11:53 [INFO]: Epoch 003 - training loss: 0.3274
2024-10-09 14:13:24 [INFO]: Epoch 004 - training loss: 0.3104
2024-10-09 14:14:56 [INFO]: Epoch 005 - training loss: 0.3024
2024-10-09 14:16:25 [INFO]: Epoch 006 - training loss: 0.2977
2024-10-09 14:18:08 [INFO]: Epoch 007 - training loss: 0.2921
2024-10-09 14:19:42 [INFO]: Epoch 008 - training loss: 0.2838
2024-10-09 14:21:32 [INFO]: Epoch 009 - training loss: 0.2714
2024-10-09 14:23:10 [INFO]: Epoch 010 - training loss: 0.2671
2024-10-09 14:23:10 [INFO]: Finished training. The best model is from epoch#10.
2024-10-09 14:23:43 [INFO]: Successfully created the given path save_it_here
2024-10-09 14:23:43 [INFO]: Saved the model to save_it_here/saits_physionet2012.pypots
2024-10-09 14:23:43 [INFO]: Model loaded successfully from save_it_here/saits_physionet2012.pypots
