In [1]:
#dataset.to_csv('dataset.csv',index=False)

In [10]:
import pickle
from datetime import datetime
from typing import Union

import lightning as L
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
# Load the example bundle. The pickle is unpacked into four names; presumably
# the data frame followed by three lists of column names (confirm against the
# code that wrote 'example.pkl').
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# it on files from a trusted source.
with open('example.pkl', 'rb') as f:
    dataset,cat_var_past,cat_var_fut,num_var_past = pickle.load(f)
# No numerical variables known in the future for this example.
num_var_fut = []
# No grouping column by default: the data is treated as a single series.
group = None
# Name of the time column and list of target columns used below.
time_var = 'time'
target = ['signal']

In [11]:
# Build a two-series toy data set: tag a copy of the rows as group 2, tag the
# original rows as group 1, then stack both frames.
# NOTE(review): this cell is not idempotent - `dataset` is reassigned, so
# running it twice stacks the data again. pd.concat is called without
# ignore_index, so the resulting index contains every label twice.
dataset2 = dataset.copy()
dataset2['group'] = 2
dataset['group'] = 1 
dataset = pd.concat([dataset, dataset2])

1.0

In [18]:

def extend_time_df(x:pd.DataFrame,freq:Union[str,int],group:Union[str,None]=None,global_minmax:bool=False)-> pd.DataFrame:
    """Build the complete, hole-free time grid spanned by ``x``.

    The returned frame holds only the ``time`` column (plus the ``group``
    column when grouping). Merging the real data onto it exposes missing
    timestamps as rows without values.

    Args:
        x (pd.DataFrame): dataframe containing the column ``time`` (and the
            column named by ``group`` when it is given).
        freq (str or int): step of the grid. An integer produces an integer
            grid; anything else is forwarded to ``pd.date_range`` as ``freq``.
        group (str or None): if not None, one grid is generated per distinct
            value of this column. Default None.
        global_minmax (bool): if True every group spans the global minimum and
            maximum of ``time`` instead of its own range. Only used when
            ``group`` is given.

    Returns:
        pd.DataFrame: a dataframe whose ``time`` column ranges from the minimum
        to the maximum of ``x.time`` (both included) with step ``freq``.

    Raises:
        ValueError: if ``freq`` is an integer that is not strictly positive.
    """
    integer_step = isinstance(freq, (int, np.integer))
    if integer_step and freq <= 0:
        raise ValueError("freq must be a strictly positive integer")

    def _grid(start, stop):
        # Single place where the axis is generated, so the grouped and
        # ungrouped paths cannot diverge.
        if integer_step:
            # `stop + 1` keeps the maximum in the grid, matching
            # pd.date_range, which includes both end points.
            return np.arange(start, stop + 1, freq)
        return pd.date_range(start, stop, freq=freq)

    if group is None:
        return pd.DataFrame({'time': _grid(x['time'].min(), x['time'].max())})

    if global_minmax:
        lower, upper = x['time'].min(), x['time'].max()
        bounds = [(key, lower, upper) for key in x[group].unique()]
    else:
        # sort=False keeps the groups in order of first appearance.
        per_group = x.groupby(group, sort=False)['time'].agg(['min', 'max'])
        bounds = list(zip(per_group.index, per_group['min'], per_group['max']))

    frames = [
        pd.DataFrame({group: key, 'time': _grid(lower, upper)})
        for key, lower, upper in bounds
    ]
    if not frames:
        # pd.concat refuses an empty list; return an empty grid instead.
        return pd.DataFrame({group: [], 'time': []})
    return pd.concat(frames, ignore_index=True)


In [7]:

class PandasTSDataSet:
    """In-memory time-series container that returns one whole series as tensors.

    The constructor keeps the requested numerical columns, label-encodes the
    categorical ones, optionally removes duplicated timestamps and fills holes
    in the time axis, flags every row as valid/invalid and records the length
    of each series.

    Args:
        df: input dataframe.
        time_var: name of the time column (integer or datetime values).
        cat_var_past: categorical columns known in the past.
        cat_var_fut: categorical columns known in the future.
        num_var_past: numerical columns known in the past.
        num_var_fut: numerical columns known in the future.
        target: list of target columns.
        group: optional column identifying the series; None means one series.
        metadata: optional dict that is filled in place with the dataset
            metadata; a fresh dict is created when omitted.
        check_holes_and_duplicates: drop duplicated timestamps and add one row
            per missing timestamp.
        check_past: add the targets to the past numerical variables.

    Raises:
        TypeError: if holes are checked and the time values are neither
            datetimes nor integer valued.
    """

    def __init__(self, df,time_var, cat_var_past, cat_var_fut, num_var_past, num_var_fut,target, group = None, metadata=None,check_holes_and_duplicates=True,check_past=True):
        # A `{}` default would be shared by every instance built without an
        # explicit dict, so the default is None and a dict is created here.
        metadata = {} if metadata is None else metadata

        numeric_vars = self._ordered_union(num_var_past, num_var_fut, target)
        categorical_vars = self._ordered_union(cat_var_past, cat_var_fut)
        key_cols = [time_var] if group is None else [group, time_var]

        df_norm = df[numeric_vars + [time_var]].copy()

        # Encoders for categorical columns, since tensors are returned.
        self.label_encoders = {}
        for col in categorical_vars:
            self.label_encoders[col] = LabelEncoder()
            df_norm[col] = self.label_encoders[col].fit_transform(df[col])
        if group is not None:
            # OrdinalEncoder only accepts 2-D input, hence df[[group]].
            self.label_encoders[group] = OrdinalEncoder()
            encoded = self.label_encoders[group].fit_transform(df[[group]])
            df_norm[group] = np.asarray(encoded).ravel().astype('int64')

        if check_holes_and_duplicates:
            df_norm = df_norm.drop_duplicates(subset=key_cols, keep='first', ignore_index=True)
            print(df_norm.shape)

        # Differences are only meaningful on a time-ordered axis; with groups
        # they are taken inside every series, not just the first one.
        df_norm = df_norm.sort_values(by=key_cols).reset_index(drop=True)
        differences = self._time_steps(df_norm, time_var, group)

        if check_holes_and_duplicates:
            freq = self._infer_freq(df_norm[time_var].iloc[0], differences)
            metadata['freq'] = freq
            if differences.nunique() > 1:
                # extend_time_df works on a column literally named 'time'.
                grid = extend_time_df(df_norm.rename(columns={time_var: 'time'}), freq, group)
                grid = grid.rename(columns={'time': time_var})
                df_norm = grid.merge(df_norm, how='left', on=key_cols)
                print(df_norm.shape)
        else:
            # Care: there can be holes in the data.
            metadata['freq'] = differences.min()

        # A row is invalid when none of its numerical columns carries a value,
        # which is what a timestamp added by the hole filling looks like.
        if numeric_vars:
            df_norm['valid'] = df_norm[numeric_vars].notna().any(axis=1).astype(int)
        else:
            df_norm['valid'] = 1
        # Rows added by the hole filling have no category; -1 keeps the columns
        # integer typed so they can still be turned into long tensors.
        for col in categorical_vars:
            df_norm[col] = df_norm[col].fillna(-1).astype('int64')

        if check_past:
            num_var_past = self._ordered_union(num_var_past, target)

        order = [time_var] if group is None else [time_var, group]
        # drop=True: the previous index must not become a data column.
        self.df = df_norm.sort_values(by=order).reset_index(drop=True)

        metadata['target'] =  target
        metadata['num_var_past'] = num_var_past
        metadata['num_var_fut'] = num_var_fut
        metadata['cat_var_past'] = cat_var_past
        metadata['cat_var_fut'] = cat_var_fut
        metadata['past_variables'] = self._ordered_union(cat_var_past, num_var_past)
        metadata['future_variables'] = self._ordered_union(cat_var_fut, num_var_fut, target)
        metadata['time_var'] =time_var
        metadata['group'] = group
        self.metadata = metadata

        # Length of each series, keyed by (encoded) group id, or 0 when ungrouped.
        if group is None:
            self.lengths = {0: self.df.shape[0]}
        else:
            counts = self.df.groupby(group)[time_var].count()
            self.lengths = {int(key): int(count) for key, count in counts.items()}

    @staticmethod
    def _ordered_union(*sequences):
        """Concatenate the sequences dropping repeats, keeping first-seen order."""
        return list(dict.fromkeys(item for seq in sequences for item in seq))

    @staticmethod
    def _time_steps(frame, time_var, group):
        """Consecutive time differences, computed inside each series."""
        if group is None:
            steps = frame[time_var].diff()
        else:
            steps = frame.groupby(group)[time_var].diff()
        return steps.dropna()

    @staticmethod
    def _infer_freq(first_time, differences):
        """Smallest time step as a Timedelta (datetime axis) or int (integer axis)."""
        is_datetime = isinstance(first_time, datetime)
        if not is_datetime and int(first_time) != first_time:
            raise TypeError("time must be integer or datetime")
        if differences.empty:
            # A single timestamp per series: no step can be measured.
            return None
        smallest = differences.min()
        return pd.to_timedelta(smallest) if is_datetime else int(smallest)

    def get_total_len(self):
        """Total number of rows over all series."""
        return sum(self.lengths.values())

    def get_id_ts_by_idx(self,idx):
        """Map a flat row index to ``(series position, rows before that series)``.

        Raises:
            IndexError: if ``idx`` is not smaller than the total length.
        """
        # Cumulative end offset of every series; the cumsum has to run over the
        # individual lengths, not over their total.
        ends = np.cumsum(list(self.lengths.values()))
        hits = np.where(ends > idx)[0]
        if hits.size == 0:
            raise IndexError(f"index {idx} is out of range for {self.get_total_len()} rows")
        position = int(hits[0])
        return position, 0 if position == 0 else int(ends[position - 1])

    ## this is the most delicate part
    def __getitem__(self, idx):
        """Return the whole series ``idx`` as a dict of tensors (no metadata).

        Without a group column ``idx`` is ignored and the full frame is used;
        otherwise ``idx`` is compared with the encoded group column.
        """
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        group_col = self.metadata['group']
        if group_col is None:
            df = self.df
        else:
            df = self.df[self.df[group_col] == idx]

        # Timestamps are not returned: they cannot be cast to a tensor.
        result = {
            'y': torch.tensor(df[self.metadata['target']].values,dtype=torch.float32,device=device),
            'x_past': torch.tensor(df[self.metadata['num_var_past']].values,dtype=torch.float32,device=device),
            'is_valid': torch.tensor(df['valid'].values,dtype=torch.int,device=device),
        }
        if group_col is not None:
            result['group'] = torch.tensor(idx,device=device,dtype=torch.long)
        if len(self.metadata['num_var_fut'])>0:
            result['num_var_fut'] = torch.tensor(df[self.metadata['num_var_fut']].values,dtype=torch.float32,device=device)
        # Each categorical block is gated on its own variable list.
        if len(self.metadata['cat_var_past'])>0:
            result['cat_var_past'] = torch.tensor(df[self.metadata['cat_var_past']].values,dtype=torch.long,device=device)
        if len(self.metadata['cat_var_fut'])>0:
            result['cat_var_fut'] = torch.tensor(df[self.metadata['cat_var_fut']].values,dtype=torch.long,device=device)
        return result

    def get_metadata(self):
        """Return the metadata dict filled by the constructor."""
        return self.metadata

# Layer D2



In [8]:
# Configuration handed to the windowing dataset. Only 'past_steps' and
# 'future_steps' are read by the code in this notebook; the remaining entries
# are presumably meant for the split/scaling logic that is still to be written.
data_module_metadata = {
    'perc_train': 0.7,
    'perc_valid': 0.1,
    'range_train': None,
    'range_validation': None,
    'range_test': None,
    'shift': 0,
    'starting_point': None,
    'skip_step': 1,
    'past_steps': 16,
    'future_steps': 16,
    'scaler': 'sklearn.preprocessing.StandardScaler()',
}

In [11]:
# Build the dataset layer from the example frame, treated as a single series
# (no group column is passed), with duplicate/hole checks enabled.
ds = PandasTSDataSet(
    df=dataset,
    time_var='time',
    cat_var_past=cat_var_past,
    cat_var_fut=cat_var_fut,
    num_var_past=num_var_past,
    num_var_fut=num_var_fut,
    target=target,
    group=None,
    metadata={},
    check_holes_and_duplicates=True,
    check_past=True,
)
        

(5000, 6)


In [12]:
from torch.utils.data import Dataset, DataLoader

In [14]:
from torch.utils.data.dataloader import default_collate
import torch
def my_collate(batch):
    """Collate a batch after discarding the entries that are ``None``.

    The number of surviving samples is printed before they are handed to
    ``default_collate``.
    """
    kept = [sample for sample in batch if sample is not None]
    print(len(kept))
    return default_collate(kept)

class MyDataset(Dataset):
    """Sliding-window view over a series container.

    Every flat index addresses one position inside one series; the sample is
    made of the ``past_steps`` rows before that position (keys containing
    ``'_past'``) and the ``future_steps`` rows starting at it (``'y'`` and keys
    containing ``'_fut'``). Positions whose window is incomplete or contains an
    invalid row yield ``None`` and are expected to be dropped by the collate
    function.

    Args:
        data: object exposing ``lengths`` (dict series id -> number of rows),
            ``metadata`` and ``__getitem__(series_id)`` returning a dict of
            tensors that includes ``'is_valid'``.
        metadata: dict with the integer entries ``'past_steps'`` and
            ``'future_steps'``.
    """

    def __init__(self,data,metadata)->None:
        self.data = data
        self.metadata = metadata
        self.metadata_dataset = data.metadata

    def _usable_lengths(self):
        """Per series, the number of positions that leave room for the future window."""
        future_steps = self.metadata['future_steps']
        return [
            (series_id, max(int(length) - future_steps, 0))
            for series_id, length in self.data.lengths.items()
        ]

    def _locate(self, idx):
        """Translate a flat index into ``(series id, position inside the series)``.

        The same per-series counts drive ``__len__``, so each series gets
        exactly its own share of the flat indices.
        """
        remaining = int(idx)
        if remaining >= 0:
            for series_id, usable in self._usable_lengths():
                if remaining < usable:
                    return series_id, remaining
                remaining -= usable
        raise IndexError(f"index {idx} is out of range for a dataset of length {len(self)}")

    def __len__(self):
        return sum(usable for _, usable in self._usable_lengths())

    ## this getitem calls the getitem of the data as desired
    def __getitem__(self, idxs):
        """Return the sample at flat index ``idxs`` or ``None`` if it is unusable.

        Raises:
            IndexError: if ``idxs`` is outside ``[0, len(self))``.
        """
        series_id, position = self._locate(idxs)
        past_steps = self.metadata['past_steps']
        future_steps = self.metadata['future_steps']
        start, stop = position - past_steps, position + future_steps
        if start < 0:
            # Not enough history; checked explicitly rather than relying on
            # how a negative slice start happens to wrap around.
            return None

        # NOTE: this materialises the whole series for every sample.
        series = self.data[series_id]
        # The validity of the window does not depend on the key, so it is
        # evaluated once; a short or partly invalid window fails the sum.
        if int(series['is_valid'][start:stop].sum()) != past_steps + future_steps:
            return None

        sample = {}
        for key, values in series.items():
            if '_past' in key:
                sample[key] = values[start:position]
            elif '_fut' in key or key == 'y':
                sample[key] = values[position:stop]
            # Any other entry (e.g. 'is_valid', the scalar 'group') is left out
            # and never sliced: a 0-dim tensor cannot be sliced.
        return sample if sample else None

In [15]:
# Batch the windowed samples. The custom collate function is required because
# MyDataset returns None for unusable positions and my_collate filters those
# out before calling default_collate.
dl =  DataLoader(
            MyDataset(ds,data_module_metadata), ## changed with respect to the initial vignette: the standard dataset cannot be used directly
            batch_size=32,
            collate_fn=my_collate
        )

In [16]:
import numpy as np

# Smoke test: iterate once over the loader and print the shape of every target
# batch. In the recorded output the shapes are (samples, 16, 1); the first
# batch holds 16 samples instead of 32.
for x in dl:
    
    print(x['y'].shape)  # it works!

16
torch.Size([16, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
32
torch.Size([32, 16, 1])
3

In [None]:
## Work in progress: the cells below are unfinished sketches

In [None]:

class DecoderEncoderDataModule(L.LightningDataModule):
    """Lightning data module wrapping a series container into windowed loaders.

    Args:
        d1_dataset: series container handed to ``MyDataset``.
        batch_size: number of positions drawn per batch.
        num_workers: number of loader worker processes.
        metadata: windowing/training configuration (needs ``'past_steps'`` and
            ``'future_steps'``). It is distinct from the metadata of the data
            itself and is what the model reads from the data module.
    """

    def __init__(self, d1_dataset, batch_size=32, num_workers=4,metadata=None):
        super().__init__()
        self.d1_dataset = d1_dataset
        self.batch_size = batch_size
        self.num_workers = num_workers
        # Training metadata as passed by the caller; it is NOT replaced by the
        # metadata of the data during setup().
        self.metadata = metadata

    def setup(self, stage=None):
        """Stage hook; nothing is prepared yet (work in progress)."""
        if stage == 'fit' or stage is None:
            # Open questions: this is clear for a single series (e.g. the
            # pandas loader) but not for chunks of the same series. Do we
            # assume there are no holes in the data, or check it at runtime?
            # The window indexes could be precomputed here.
            pass

        if stage == 'test' or stage is None:
            # Any test-specific setup
            pass

    def train_dataloader(self):
        """Return the training loader over the windowed dataset.

        Raises:
            ValueError: if no metadata was given to the constructor.
        """
        if self.metadata is None:
            raise ValueError("metadata with 'past_steps' and 'future_steps' is required to build the dataloader")
        # NOTE(review): the series container may create CUDA tensors inside
        # __getitem__; that can fail in worker processes - confirm before
        # using num_workers > 0 on a GPU machine.
        return DataLoader(
            MyDataset(self.d1_dataset, self.metadata),
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            # MyDataset yields None for unusable positions; my_collate drops them.
            collate_fn=my_collate,
        )

# Layer M
class DecoderEncoderModel(L.LightningModule):
    """Skeleton of the model layer; it only picks up the data module metadata."""

    def __init__(self):
        super().__init__()
        self.metadata = None

    def setup(self, stage=None):
        """Copy the metadata exposed by the trainer's data module."""
        datamodule = self.trainer.datamodule
        self.metadata = datamodule.metadata

        # Initialize layer T model using metadata

    def forward(self, x):
        """Placeholder for the forward logic: nothing is computed yet."""
        return None