In [1]:
import h5py

# Inspect the layout of the HDF5 dataset file: the top-level groups, the
# members of the 'val' and 'train' groups, and the shape/dtype of 'val/X'.
# The context manager closes the handle as soon as the summary is printed, so
# no open file object is left behind in the kernel (the dataset classes below
# open the file themselves when they need it).
with h5py.File('data/datasets.h5', 'r') as hf:
    print(
        list(hf.keys()), '\n',
        hf['val'].keys(), '\n',
        hf['val']['X'], '\n',
        hf['train'].keys(),
    )

['empirical_mean_for_GRUD', 'test', 'train', 'val'] 
 <KeysViewHDF5 ['X', 'X_hat', 'indicating_mask', 'missing_mask']> 
 <HDF5 dataset "X": shape (4, 9, 51), type "<f4"> 
 <KeysViewHDF5 ['X']>


In [2]:
from torch.utils.data import Dataset, DataLoader, RandomSampler
import numpy as np
import torch

class LoadDataset(Dataset):
    """Shared base for the HDF5-backed datasets defined in this notebook.

    The base class only records where the data lives and the per-sample
    dimensions; it does not read anything itself. Subclasses open the file
    and provide ``__len__`` / ``__getitem__``.
    """

    def __init__(self, file_path, seq_len, feature_num):
        """Store the file location and the sample dimensions.

        Args:
            file_path: location of the HDF5 file (subclasses hand it to
                ``h5py.File``).
            seq_len: number of time steps per sample.
            feature_num: number of features per time step.
        """
        super().__init__()
        self.file_path, self.seq_len, self.feature_num = file_path, seq_len, feature_num
        
        
class LoadValTestDataset(LoadDataset):
    """Dataset over one pre-masked split (e.g. ``'val'``) of the HDF5 file.

    The split's ``X``, ``X_hat``, ``missing_mask`` and ``indicating_mask``
    arrays are read fully into memory at construction time. NaNs in ``X`` and
    ``X_hat`` are replaced by 0; the two masks are kept as stored.
    """

    def __init__(self, file_path, set_name, seq_len, feature_num):
        """Read every array of the ``set_name`` group from ``file_path``."""
        super().__init__(file_path, seq_len, feature_num)

        # Slicing with [:] materialises each dataset as a numpy array, so the
        # file can be closed as soon as the block ends.
        with h5py.File(self.file_path, 'r') as hf:
            group = hf[set_name]
            self.X = group['X'][:]
            self.X_hat = group['X_hat'][:]
            self.missing_mask = group['missing_mask'][:]
            self.indicating_mask = group['indicating_mask'][:]

        # Missing entries are stored as NaN; replace them with 0.
        self.X = np.nan_to_num(self.X)
        self.X_hat = np.nan_to_num(self.X_hat)

    def __len__(self):
        """Number of samples in the split."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(idx, X_hat, missing_mask, X, indicating_mask)`` as tensors.

        The index is wrapped with ``torch.tensor``; the four arrays are
        converted to float32 tensors.
        """

        def as_float_tensor(array):
            # Select the sample first, then cast, exactly one row at a time.
            return torch.from_numpy(array[idx].astype('float32'))

        return (
            torch.tensor(idx),
            as_float_tensor(self.X_hat),
            as_float_tensor(self.missing_mask),
            as_float_tensor(self.X),
            as_float_tensor(self.indicating_mask),
        )
    
    
    
    
class LoadTrainDataset(LoadDataset):
    """Training split of the HDF5 file, optionally with on-the-fly masking.

    Only ``train/X`` is read from the file. When ``masked_imputation_task`` is
    true, every ``__getitem__`` call hides a random fraction
    (``artificial_missing_rate``) of the currently observed values so the
    model can be trained to reconstruct them.
    """

    def __init__(self, file_path, seq_len, feature_num, masked_imputation_task, artificial_missing_rate=0.1):
        """Load ``train/X`` into memory.

        Args:
            file_path: location of the HDF5 file.
            seq_len: number of time steps per sample.
            feature_num: number of features per time step.
            masked_imputation_task: whether ``__getitem__`` should artificially
                mask observed values and return the 5-tuple variant.
            artificial_missing_rate: fraction of observed values hidden per
                sample; only used (and validated) when
                ``masked_imputation_task`` is true. Defaults to 0.1, the value
                that used to be hard-coded.

        Raises:
            ValueError: if masking is enabled and the rate is not in (0, 1).
        """
        super(LoadTrainDataset, self).__init__(file_path, seq_len, feature_num)
        self.masked_imputation_task = masked_imputation_task
        if masked_imputation_task:
            if not 0 < artificial_missing_rate < 1:
                raise ValueError('artificial_missing_rate should be greater than 0 and less than 1')
            self.artificial_missing_rate = artificial_missing_rate

        with h5py.File(self.file_path, 'r') as hf:  # read data from h5 file
            self.X = hf['train']['X'][:]

    def __len__(self):
        """Number of training samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Build one training sample.

        Returns:
            With masking enabled:
                ``(idx, X_hat, missing_mask, X, indicating_mask)`` where
                ``X_hat`` is the input with extra values hidden,
                ``missing_mask`` is 1 where ``X_hat`` is observed, ``X`` is the
                original sample and ``indicating_mask`` is 1 exactly at the
                artificially hidden positions. NaNs are replaced by 0 and the
                arrays are reshaped to ``(seq_len, feature_num)``.
            Otherwise:
                ``(idx, X, missing_mask)`` with NaNs in ``X`` replaced by 0.
            All arrays are returned as float32 tensors.
        """
        X = self.X[idx]

        if not self.masked_imputation_task:
            # Without the masked imputation task there is no need to
            # artificially mask out observed values.
            missing_mask = (~np.isnan(X)).astype(np.float32)
            X = np.nan_to_num(X)
            return (
                torch.tensor(idx),
                torch.from_numpy(X.astype('float32')),
                torch.from_numpy(missing_mask.astype('float32')),
            )

        X = X.reshape(-1)
        # Keep the observed positions as an integer ndarray: converting to a
        # list would turn the "nothing observed" case into a float-typed empty
        # array that cannot be used for indexing.
        observed = np.where(~np.isnan(X))[0]
        n_to_mask = round(len(observed) * self.artificial_missing_rate)
        if n_to_mask:
            # replace=False guarantees n_to_mask *distinct* positions; the
            # default (with replacement) can pick the same position several
            # times and therefore hide fewer values than requested.
            masked = np.random.choice(observed, n_to_mask, replace=False)
        else:
            masked = observed[:0]

        X_hat = np.copy(X)
        X_hat[masked] = np.nan  # mask values selected by indices
        missing_mask = (~np.isnan(X_hat)).astype(np.float32)
        # Observed in X but not in X_hat == artificially hidden.
        indicating_mask = ((~np.isnan(X)) ^ (~np.isnan(X_hat))).astype(np.float32)
        X = np.nan_to_num(X)
        X_hat = np.nan_to_num(X_hat)

        # reshape into time series
        shape = (self.seq_len, self.feature_num)
        X = X.reshape(shape)
        X_hat = X_hat.reshape(shape)
        missing_mask = missing_mask.reshape(shape)
        indicating_mask = indicating_mask.reshape(shape)

        return (
            torch.tensor(idx),
            torch.from_numpy(X_hat.astype('float32')),
            torch.from_numpy(missing_mask.astype('float32')),
            torch.from_numpy(X.astype('float32')),
            torch.from_numpy(indicating_mask.astype('float32')),
        )
    
# Smoke test: build both datasets straight from the HDF5 file, wrap each in a
# single-sample DataLoader and compare the shape of the model input (element 1
# of every sample tuple) between the validation and the training pipeline.
val = LoadValTestDataset(file_path='data/datasets.h5', set_name='val', seq_len=9, feature_num=51)
loader_val = DataLoader(val, batch_size=1)

train = LoadTrainDataset('data/datasets.h5', 9, 51, masked_imputation_task=True)
loader_train = DataLoader(train, batch_size=1)

(next(iter(loader_val))[1].shape, next(iter(loader_train))[1].shape)

  from .autonotebook import tqdm as notebook_tqdm


(torch.Size([1, 9, 51]), torch.Size([1, 9, 51]))

In [3]:
class UnifiedDataLoader:
    """Builds the training and validation ``DataLoader`` pair for one HDF5 file."""

    def __init__(self, dataset_path, seq_len, feature_num, batch_size, num_workers=4, masked_imputation_task=True,
                 train_num_samples=200, val_num_samples=50):
        """
        dataset_path: path of the h5 dataset file (handed to the dataset classes, which open it with h5py);
        seq_len: sequence length, i.e. time steps;
        feature_num: num of features, i.e. feature dimensionality;
        batch_size: size of mini batch;
        num_workers: num of subprocesses for data loading;
        masked_imputation_task: whether the training set returns data for the masked imputation task;
        train_num_samples: how many samples the random training sampler draws per pass (default 200);
        val_num_samples: how many samples the random validation sampler draws per pass (default 50);
        """
        self.dataset_path = dataset_path
        self.seq_len = seq_len
        self.feature_num = feature_num
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.masked_imputation_task = masked_imputation_task
        self.train_num_samples = train_num_samples
        self.val_num_samples = val_num_samples
        # Populated by get_train_val_dataloader(); defined here so the
        # attributes always exist, even before the loaders are built.
        self.train_dataset, self.train_loader = None, None
        self.val_dataset, self.val_loader = None, None

    def get_train_val_dataloader(self):
        """Create the datasets and loaders and return ``(train_loader, val_loader)``.

        Both loaders draw indices with replacement through a ``RandomSampler``
        limited to ``train_num_samples`` / ``val_num_samples`` draws, and drop
        the last incomplete batch.
        """
        self.train_dataset = LoadTrainDataset(
            self.dataset_path, self.seq_len, self.feature_num, self.masked_imputation_task
        )
        self.val_dataset = LoadValTestDataset(self.dataset_path, 'val', self.seq_len, self.feature_num)

        train_sampler = torch.utils.data.RandomSampler(
            self.train_dataset, replacement=True, num_samples=self.train_num_samples
        )
        val_sampler = torch.utils.data.RandomSampler(
            self.val_dataset, replacement=True, num_samples=self.val_num_samples
        )

        self.train_loader = DataLoader(
            self.train_dataset, self.batch_size, num_workers=self.num_workers, sampler=train_sampler, drop_last=True
        )
        self.val_loader = DataLoader(
            self.val_dataset, self.batch_size, num_workers=self.num_workers, sampler=val_sampler, drop_last=True
        )

        return self.train_loader, self.val_loader
    
    
# Smoke test for the wrapper class: build both loaders with single-sample
# batches and display the shape of the model input (element 1 of the batch
# tuple) coming out of each of them.
ud = UnifiedDataLoader(dataset_path='data/datasets.h5', seq_len=9, feature_num=51, batch_size=1)
tr, val = ud.get_train_val_dataloader()

(next(iter(tr))[1].shape, next(iter(val))[1].shape)

(torch.Size([1, 9, 51]), torch.Size([1, 9, 51]))

In [9]:
from SAITS_model import SAITS
import torch.optim as optim

# Hyper-parameters forwarded verbatim to the SAITS constructor.
# d_time / d_feature use the same values (9, 51) as the seq_len / feature_num
# passed to the data loader below.
model_args = {
    'device': 'cuda',
    'MIT': True,
    'n_groups': 5,
    'n_group_inner_layers': 1,
    'd_time': 9,
    'd_feature': 51,
    'dropout': 0.0,
    'd_model': 256,
    'd_inner': 512,
    'n_head': 8,
    'd_k': 32,
    'd_v': 32,
    'input_with_mask': True,
    'diagonal_attention_mask': True,
    'param_sharing_strategy': 'inner_group',
}

model = SAITS(**model_args)
# Move the model whenever a GPU is available. The former additional check on
# torch.cuda.is_initialized() only reports whether CUDA has already been
# touched in this process, so on a fresh kernel it could silently skip the
# move and leave the model on the CPU.
if torch.cuda.is_available():
    model = model.to('cuda')

optimizer = optim.Adam(model.parameters(), lr=0.000682774550436755)

unified_dataloader = UnifiedDataLoader('data/datasets.h5', seq_len=9, feature_num=51, batch_size=2)
train_dataloader, val_dataloader = unified_dataloader.get_train_val_dataloader()


In [13]:
# Forward pass over every batch of the training loader. Switching to training
# mode and moving the model to the GPU do not depend on the batch, so they
# are done once up front instead of on every iteration.
model.train()
model.to('cuda')

for data in train_dataloader:
    # Each batch is (indices, X, missing_mask, X_holdout, indicating_mask);
    # move all five tensors to the GPU before handing them to the model.
    indices, X, missing_mask, X_holdout, indicating_mask = (tensor.to('cuda') for tensor in data)
    inputs = {
        'indices': indices,
        'X': X,
        'missing_mask': missing_mask,
        'X_holdout': X_holdout,
        'indicating_mask': indicating_mask,
    }
    results = model(inputs, 'train')
    # NOTE: the optimisation step is currently disabled, so this loop only
    # runs forward passes and does not update the model weights.
    #results = result_processing(results, args)
    #optimizer.zero_grad()
    #results['total_loss'].backward()
    #optimizer.step()

# Display the model output for the last processed batch.
results

{'imputed_data': tensor([[[ 0.0000e+00,  0.0000e+00,  2.4332e+02,  2.6400e+01,  3.4030e+02,
            7.3000e+01,  4.6600e+00,  1.2000e+00,  1.3700e+01,  4.9000e-01,
            2.5000e-01,  2.5000e+01,  3.0000e+01,  3.2070e+02,  2.6140e+02,
            9.3300e+02,  1.1000e+02, -1.5349e+00,  2.8873e+00, -9.8850e+00,
           -8.6182e+00, -5.9854e+00, -1.3297e+01, -9.3981e+00,  1.4324e+01,
            3.1930e+01, -2.7893e+01,  1.5711e+01, -3.2754e+01, -3.4161e+00,
            3.5652e+00, -2.1237e+01,  7.2518e+00,  2.1809e+00, -2.5737e+01,
            1.5099e+01,  1.2411e+00,  8.8086e+00, -2.7516e+01, -2.5688e+01,
            8.6853e+00,  1.1588e+01,  4.9058e-02, -2.7654e+01, -1.0823e+01,
           -7.1812e+00,  3.0640e+00, -3.1697e+00, -1.4642e+01, -6.6812e+00,
            2.2464e+01],
          [ 1.0900e+00,  2.6680e+01,  0.0000e+00, -1.2276e+01,  8.6000e+01,
            2.0700e+01,  4.1500e+00,  1.0400e+00, -5.3223e+00,  3.1000e-01,
            6.0000e-01,  3.3000e+01,  4.6000e+0