In [2]:
import pandas as pd
import torch

# Load the raw bar data. The `date` column is parsed into datetimes, and the
# `Unnamed: 0` column (if present; errors='ignore' makes the drop a no-op otherwise)
# is discarded.
df = pd.read_csv('common_10s_20231112213000.csv', parse_dates=["date"]).drop(['Unnamed: 0'], axis=1, errors='ignore')

In [3]:
# Sanity check of the loaded frame: column dtypes / memory footprint, then the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20009502 entries, 0 to 20009501
Data columns (total 9 columns):
 #   Column    Dtype         
---  ------    -----         
 0   ticker    object        
 1   date      datetime64[ns]
 2   open      float64       
 3   high      float64       
 4   low       float64       
 5   close     float64       
 6   average   float64       
 7   volume    int64         
 8   barcount  int64         
dtypes: datetime64[ns](1), float64(5), int64(2), object(1)
memory usage: 1.3+ GB


Unnamed: 0,ticker,date,open,high,low,close,average,volume,barcount
0,SPY,2023-09-01 09:30:00,453.17,453.2,452.89,452.9,453.064,397591,944
1,SPY,2023-09-01 09:30:10,452.91,453.01,452.89,452.95,452.956,77825,584
2,SPY,2023-09-01 09:30:20,452.95,453.09,452.95,453.02,453.034,47865,312
3,SPY,2023-09-01 09:30:30,453.01,453.13,452.9,452.91,453.001,53428,408
4,SPY,2023-09-01 09:30:40,452.92,453.1,452.91,453.03,452.981,65112,423


In [4]:
# Group by ticker and remove the ticker label.
# Result: a list with one DataFrame per distinct ticker; each frame keeps the
# rows' original index values and every column except `ticker`.
df_bytickers = [ticker.drop(columns=['ticker']) for _, ticker in df.groupby(df.ticker)]
print(f"There is {len(df_bytickers)} dataframes grouped by tickers")
# Inspect the first per-ticker frame as a spot check.
df_bytickers[0].info()
df_bytickers[0].head()

There is 55 dataframes grouped by tickers
<class 'pandas.core.frame.DataFrame'>
Index: 363960 entries, 8268907 to 19377701
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   date      363960 non-null  datetime64[ns]
 1   open      363960 non-null  float64       
 2   high      363960 non-null  float64       
 3   low       363960 non-null  float64       
 4   close     363960 non-null  float64       
 5   average   363960 non-null  float64       
 6   volume    363960 non-null  int64         
 7   barcount  363960 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 25.0 MB


Unnamed: 0,date,open,high,low,close,average,volume,barcount
8268907,2023-03-27 09:30:00,13.96,14.03,13.95,14.0,13.978,246106,158
8268908,2023-03-27 09:30:10,14.0,14.02,14.0,14.02,14.015,11512,23
8268909,2023-03-27 09:30:20,14.02,14.02,14.01,14.02,14.018,13960,23
8268910,2023-03-27 09:30:30,14.02,14.03,14.01,14.01,14.013,18475,93
8268911,2023-03-27 09:30:40,14.02,14.03,14.0,14.03,14.02,32586,63


In [5]:
# Grouping by day, and reformatting the date to be time of day instead of datetime.
# Step 1: split every per-ticker frame into one frame per calendar day
#         (df_bydate[ticker_idx][day_idx]).
df_bydate = [[date for _, date in dates.groupby(dates['date'].dt.date)] for dates in df_bytickers]
# Step 2: replace the `date` column by the time of day expressed in fractional hours
#         (hour + minute/60 + second/3600); all other columns pass through unchanged.
df_bydate = [[date.apply(lambda x: x.dt.hour + x.dt.minute/60 + x.dt.second/3600 if x.name in ['date'] else x) for date in ticker] for ticker in df_bydate]
# Only the first ticker's day count is reported, hence "roughly".
print(f"There is roughly {len(df_bydate[0])} dataframes that correspond to days for each ticker")
df_bydate[0][0].info()
df_bydate[0][0].head()

There is roughly 156 dataframes that correspond to days for each ticker
<class 'pandas.core.frame.DataFrame'>
Index: 2340 entries, 8268907 to 8271246
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      2340 non-null   float64
 1   open      2340 non-null   float64
 2   high      2340 non-null   float64
 3   low       2340 non-null   float64
 4   close     2340 non-null   float64
 5   average   2340 non-null   float64
 6   volume    2340 non-null   int64  
 7   barcount  2340 non-null   int64  
dtypes: float64(6), int64(2)
memory usage: 164.5 KB


Unnamed: 0,date,open,high,low,close,average,volume,barcount
8268907,9.5,13.96,14.03,13.95,14.0,13.978,246106,158
8268908,9.502778,14.0,14.02,14.0,14.02,14.015,11512,23
8268909,9.505556,14.02,14.02,14.01,14.02,14.018,13960,23
8268910,9.508333,14.02,14.03,14.01,14.01,14.013,18475,93
8268911,9.511111,14.02,14.03,14.0,14.03,14.02,32586,63


In [6]:
# Names of the columns of the raw frame that contain at least one exact zero.
columns_with_zeros = df.eq(0).any()[lambda x: x].keys().values
# Small offset added to those columns before computing relative changes.
# NOTE(review): presumably this is meant to avoid a division by zero in
# `pct_change`; a zero followed by a non-zero value will still produce a very
# large relative change — confirm this is acceptable downstream.
eps = 1e-16
df_deltas = [[date.apply(lambda x: x + eps if x.name in columns_with_zeros else x) for date in ticker] for ticker in df_bydate]
# Turn every column except `date` into bar-over-bar relative changes, then drop
# the first row of each day (it has no previous bar to compare against).
df_deltas = [[date.apply(lambda x: x.pct_change() if x.name not in ['date'] else x).iloc[1:] for date in ticker] for ticker in df_deltas]

In [7]:
# Spot check of the first (ticker, day) frame after the relative-change transform.
df_deltas[0][0].info()
df_deltas[0][0].head()

<class 'pandas.core.frame.DataFrame'>
Index: 2339 entries, 8268908 to 8271246
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      2339 non-null   float64
 1   open      2339 non-null   float64
 2   high      2339 non-null   float64
 3   low       2339 non-null   float64
 4   close     2339 non-null   float64
 5   average   2339 non-null   float64
 6   volume    2339 non-null   float64
 7   barcount  2339 non-null   float64
dtypes: float64(8)
memory usage: 164.5 KB


Unnamed: 0,date,open,high,low,close,average,volume,barcount
8268908,9.502778,0.002865,-0.000713,0.003584,0.001429,0.002647,-0.953223,-0.85443
8268909,9.505556,0.001429,0.0,0.000714,0.0,0.000214,0.212648,0.0
8268910,9.508333,0.0,0.000713,0.0,-0.000713,-0.000357,0.323424,3.043478
8268911,9.511111,0.0,0.0,-0.000714,0.001428,0.0005,0.763789,-0.322581
8268912,9.513889,0.0,0.000713,0.001429,0.0,0.000428,-0.620236,-0.142857


In [8]:
# Sum of the `average` column (bar-over-bar relative changes) for every
# (ticker, day) frame, flattened into a single list.
daily_sums = [days.loc[:, 'average'].sum() for tickers in df_deltas for days in tickers]
# Absolute value of the mean of those per-day sums.
daily_average = abs(sum(daily_sums) / len(daily_sums))
# Column names shared by the per-day frames (taken from the first one).
features = df_deltas[0][0].columns.values
print(daily_average)
print(features)

5.892697979016721e-05
['date' 'open' 'high' 'low' 'close' 'average' 'volume' 'barcount']


In [9]:
import itertools
# Flatten the nested [ticker][day] structure into one flat list of per-day frames.
data = list(itertools.chain.from_iterable(df_deltas))
# Keep only the days that contain exactly 2339 rows, so every retained frame
# has the same length.
data_2339 = [frame for frame in data if len(frame) == 2339]

In [10]:
import torch
import numpy as np
from torch.utils.data import TensorDataset, random_split

# Make float64 the default floating-point dtype for newly created tensors.
# `torch.set_default_tensor_type` is deprecated; `torch.set_default_dtype` is the
# supported replacement for selecting the default dtype.
torch.set_default_dtype(torch.float64)

class StockDataset(TensorDataset):
    """
    Sliding-window dataset over a list of equally long per-day DataFrames.

    Every item is a window of `known_interval_in_tens_of_seconds` consecutive rows
    (the "past") followed by `predict_interval_in_tens_of_seconds` rows (the
    "future") taken from a single day. Each frame must have a `date` column; that
    column is returned as the time feature and all remaining columns as the values.

    Args:
        data: Indexable sequence of DataFrames, one per day.
        known_interval_in_tens_of_seconds: Number of rows in the past window.
        predict_interval_in_tens_of_seconds: Number of rows in the future window.
        daily_interval_in_tens_of_seconds: Number of rows assumed per day frame.

    Raises:
        ValueError: If the past and future windows together are longer than a day.
    """

    def __init__(self, data, known_interval_in_tens_of_seconds=720, predict_interval_in_tens_of_seconds=180, daily_interval_in_tens_of_seconds=2339):
        window_length = known_interval_in_tens_of_seconds + predict_interval_in_tens_of_seconds
        if window_length > daily_interval_in_tens_of_seconds:
            # Without this check the dataset would report a negative length.
            raise ValueError(
                f"known + predict interval ({window_length}) exceeds the daily interval"
                f" ({daily_interval_in_tens_of_seconds})"
            )
        self.data = data
        self.known_interval_in_tens_of_seconds = known_interval_in_tens_of_seconds
        self.predict_interval_in_tens_of_seconds = predict_interval_in_tens_of_seconds
        # Number of window start positions used per day.
        self.daily_length = daily_interval_in_tens_of_seconds - window_length
        self.length = len(data) * self.daily_length

    def __len__(self):
        """Total number of windows across all days."""
        return self.length

    def __getitem__(self, index):
        """
        Return the window with flat index `index`.

        Returns:
            dict with numpy arrays under the keys `past_values`, `future_values`,
            `past_time_features` and `future_time_features`.
        """
        # Flat index -> (which day, start row inside that day).
        list_idx = index // self.daily_length
        df_idx = index % self.daily_length
        known_df_idx = df_idx + self.known_interval_in_tens_of_seconds
        predict_df_idx = known_df_idx + self.predict_interval_in_tens_of_seconds

        # Look the day up and split it into value / time columns once,
        # instead of repeating the selection for each of the four outputs.
        day = self.data[list_idx]
        values = day.loc[:, day.columns != 'date']
        times = day[['date']]

        return {
            "past_values": values.iloc[df_idx:known_df_idx].values,
            "future_values": values.iloc[known_df_idx:predict_df_idx].values,
            "past_time_features": times.iloc[df_idx:known_df_idx].values,
            "future_time_features": times.iloc[known_df_idx:predict_df_idx].values,
        }


  _C._set_default_tensor_type(t)


In [13]:
# Build the windowed dataset from the equally long day frames (default window sizes),
# then print the number of windows and the shape of one item's past time features.
dataset = StockDataset(data_2339)
print(len(dataset))
print(dataset[0]['past_time_features'].shape)

12250207
(720, 1)


In [11]:
# Randomly partition the windows into train / eval / test subsets by fraction.
# The fourth subset (`l`) is the remainder that is not used by the three named
# splits; the name is kept as-is in case later cells refer to it.
# A seeded generator makes the partition reproducible across kernel restarts.
train_ds, eval_ds, test_ds, l = torch.utils.data.random_split(
    dataset,
    [0.4, 0.0005, 0.2, 0.3995],
    generator=torch.Generator().manual_seed(42),
)

In [12]:
from transformers import InformerConfig, InformerForPrediction, Trainer, TrainingArguments, DefaultDataCollator
from transformers.utils import is_sagemaker_mp_enabled
from torch import nn

# Load pretrained Informer forecasting weights.
# NOTE(review): "forecasting_model_v5" is presumably a local checkpoint directory
# produced by an earlier training run — confirm it exists before Run All.
model = InformerForPrediction.from_pretrained("forecasting_model_v5")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [13]:
from transformers import PreTrainedModel, PretrainedConfig
class BoundEstimatorConfig(PretrainedConfig):
    """
    Configuration holding time-series, encoder-architecture and Informer-attention
    settings, plus a `num_labels` field.

    The set of fields matches what `from_informer_config` copies from an existing
    configuration object, so an instance can either be built from keyword arguments
    or populated from another config.

    Args:
        prediction_length: Length of the prediction horizon.
        context_length: Length of the context window; falls back to
            `prediction_length` when falsy.
        distribution_output, loss, scaling: Stored unchanged.
        input_size: Number of target variables; used to derive `feature_size`.
        lags_sequence: Lag indices; defaults to `[1, 2, 3, 4, 5, 6, 7]`.
        num_dynamic_real_features, num_static_real_features,
        num_static_categorical_features, num_time_features: Feature counts that
            contribute to `feature_size`.
        cardinality: One entry per static categorical feature; `[0]` when unused.
        embedding_dimension: One entry per static categorical feature; derived from
            `cardinality` as `min(50, (cat + 1) // 2)` when not given.
        d_model, encoder_ffn_dim, encoder_attention_heads, encoder_layers:
            Encoder architecture sizes.
        is_encoder_decoder: Forwarded to `PretrainedConfig.__init__`.
        activation_function, dropout, encoder_layerdrop, attention_dropout,
        activation_dropout, num_parallel_samples, init_std, use_cache: Stored unchanged.
        attention_type, sampling_factor, distil: Informer-specific settings.
        num_labels: Number of output labels (default 3).

    Raises:
        ValueError: If `cardinality` or `embedding_dimension` is provided with a
            length different from `num_static_categorical_features`.
    """

    model_type = "informer"
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
        "num_hidden_layers": "encoder_layers",
    }

    def __init__(
        self,
        prediction_length=None,
        context_length=None,
        distribution_output="student_t",
        loss="nll",
        input_size=1,
        lags_sequence=None,
        scaling="mean",
        num_dynamic_real_features=0,
        num_static_real_features=0,
        num_static_categorical_features=0,
        num_time_features=0,
        cardinality=None,
        embedding_dimension=None,
        d_model=64,
        encoder_ffn_dim=32,
        encoder_attention_heads=2,
        encoder_layers=2,
        is_encoder_decoder=False,
        activation_function="gelu",
        dropout=0.05,
        encoder_layerdrop=0.1,
        attention_dropout=0.1,
        activation_dropout=0.1,
        num_parallel_samples=100,
        init_std=0.02,
        use_cache=True,
        # Informer arguments
        attention_type="prob",
        sampling_factor=5,
        distil=True,
        num_labels=3,
        **kwargs,
    ):
        # time series specific configuration
        self.prediction_length = prediction_length
        self.context_length = context_length or prediction_length
        self.distribution_output = distribution_output
        self.loss = loss
        self.input_size = input_size
        self.num_time_features = num_time_features
        self.lags_sequence = lags_sequence if lags_sequence is not None else [1, 2, 3, 4, 5, 6, 7]
        self.scaling = scaling
        self.num_dynamic_real_features = num_dynamic_real_features
        self.num_static_real_features = num_static_real_features
        self.num_static_categorical_features = num_static_categorical_features

        # set cardinality
        if cardinality and num_static_categorical_features > 0:
            if len(cardinality) != num_static_categorical_features:
                raise ValueError(
                    "The cardinality should be a list of the same length as `num_static_categorical_features`"
                )
            self.cardinality = cardinality
        else:
            self.cardinality = [0]

        # set embedding_dimension
        if embedding_dimension and num_static_categorical_features > 0:
            if len(embedding_dimension) != num_static_categorical_features:
                raise ValueError(
                    "The embedding dimension should be a list of the same length as `num_static_categorical_features`"
                )
            self.embedding_dimension = embedding_dimension
        else:
            self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality]

        self.num_parallel_samples = num_parallel_samples

        # Transformer architecture configuration
        # (must come after the feature counts above: `_number_of_features` reads them)
        self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features
        self.d_model = d_model
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop

        self.activation_function = activation_function
        self.init_std = init_std

        self.output_attentions = False
        self.output_hidden_states = False

        self.use_cache = use_cache

        # Informer
        self.attention_type = attention_type
        self.sampling_factor = sampling_factor
        self.distil = distil
        self.num_labels = num_labels

        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)

    @property
    def _number_of_features(self) -> int:
        """Number of non-lag input features that contribute to `feature_size`."""
        return (
            sum(self.embedding_dimension)
            + self.num_dynamic_real_features
            + self.num_time_features
            + self.num_static_real_features
            + self.input_size * 2  # the log1p(abs(loc)) and log(scale) features
        )

    def from_informer_config(self, config):
        """
        Overwrite this configuration's fields with the values of `config`.

        Every field is copied verbatim from `config`, except `num_labels`, which is
        always set to 3 regardless of the source configuration. The instance is
        modified in place and nothing is returned.

        Args:
            config: Configuration object exposing the attributes read below.
        """
        # time series specific configuration
        self.prediction_length = config.prediction_length
        self.context_length = config.context_length
        self.distribution_output = config.distribution_output
        self.loss = config.loss
        self.input_size = config.input_size
        self.num_time_features = config.num_time_features
        self.lags_sequence = config.lags_sequence
        self.scaling = config.scaling
        self.num_dynamic_real_features = config.num_dynamic_real_features
        self.num_static_real_features = config.num_static_real_features
        self.num_static_categorical_features = config.num_static_categorical_features

        # set cardinality
        self.cardinality = config.cardinality

        # set embedding_dimension
        self.embedding_dimension = config.embedding_dimension

        self.num_parallel_samples = config.num_parallel_samples

        # Transformer architecture configuration
        self.feature_size = config.feature_size
        self.d_model = config.d_model
        self.encoder_attention_heads = config.encoder_attention_heads
        self.encoder_ffn_dim = config.encoder_ffn_dim
        self.encoder_layers = config.encoder_layers

        self.dropout = config.dropout
        self.attention_dropout = config.attention_dropout
        self.activation_dropout = config.activation_dropout
        self.encoder_layerdrop = config.encoder_layerdrop

        self.activation_function = config.activation_function
        self.init_std = config.init_std

        self.output_attentions = config.output_attentions
        # Copy the matching field; this previously read `config.output_attentions`.
        self.output_hidden_states = config.output_hidden_states

        self.use_cache = config.use_cache

        # Informer
        self.attention_type = config.attention_type
        self.sampling_factor = config.sampling_factor
        self.distil = config.distil
        self.num_labels = 3


In [14]:
from typing import List, Optional, Tuple, Union
from transformers.modeling_outputs import BaseModelOutput
from transformers.activations import ACT2FN

class InformerFeatureEmbedder(nn.Module):
    """
    Embeds several categorical features and concatenates the results.

    One `nn.Embedding` table is created per categorical feature.

    Args:
        cardinalities (`list[int]`):
            Number of categories of each categorical feature.
        embedding_dims (`list[int]`):
            Embedding size of each categorical feature.
    """

    def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None:
        super().__init__()

        self.num_features = len(cardinalities)
        tables = []
        for num_categories, width in zip(cardinalities, embedding_dims):
            tables.append(nn.Embedding(num_categories, width))
        self.embedders = nn.ModuleList(tables)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Embed each slice of the last dimension of `features` and concatenate along the last dimension."""
        if self.num_features <= 1:
            # single feature: the whole tensor is the only slice
            pieces = [features]
        else:
            # split the last dimension into `num_features` equally sized chunks
            pieces = torch.chunk(features, self.num_features, dim=-1)

        embedded = []
        for table, piece in zip(self.embedders, pieces):
            embedded.append(table(piece.squeeze(-1)))
        return torch.cat(embedded, dim=-1)


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->Informer
class InformerStdScaler(nn.Module):
    """
    Standardizes data along dimension `dim`: the weighted mean is subtracted and the result is divided by the
    weighted standard deviation.
    Args:
        dim (`int`):
            Dimension along which mean and standard deviation are computed; must be strictly positive.
        keepdim (`bool`, *optional*, defaults to `False`):
            Whether the reduced dimension is kept (with length 1) in the returned statistics.
        minimum_scale (`float`, *optional*, defaults to 1e-5):
            Constant added to the variance before taking the square root, so the scale is never zero.
    """

    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
        super().__init__()
        if dim <= 0:
            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
        self.dim = dim
        self.keepdim = keepdim
        self.minimum_scale = minimum_scale

    @torch.no_grad()
    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return `(standardized_data, loc, scale)` for `data` weighted by `weights`."""
        # total weight along `dim`, floored at 1 to avoid dividing by zero
        total_weight = weights.sum(self.dim, keepdim=self.keepdim).clamp_min(1.0)
        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / total_weight

        centered = data - loc
        variance = ((centered * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / total_weight
        scale = torch.sqrt(variance + self.minimum_scale)
        return centered / scale, loc, scale


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->Informer
class InformerMeanScaler(nn.Module):
    """
    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
    accordingly.
    Args:
        dim (`int`):
            Dimension along which to compute the scale.
        keepdim (`bool`, *optional*, defaults to `True`):
            Controls whether to retain dimension `dim` (of length 1) in the returned scale tensor, or suppress it.
        default_scale (`float`, *optional*, defaults to `None`):
            Default scale that is used for elements that have no observations. If `None`, we use the scale of the
            batch.
        minimum_scale (`float`, *optional*, defaults to 1e-10):
            Default minimum possible scale that is used for any item.
    """

    def __init__(
        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
    ):
        super().__init__()
        self.dim = dim
        self.keepdim = keepdim
        self.minimum_scale = minimum_scale
        self.default_scale = default_scale

    @torch.no_grad()
    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Scale `data` by its mean absolute observed value along `dim`.

        Returns:
            `(scaled_data, loc, scale)` where `loc` is a zero tensor shaped like `scale`, matching the three-tensor
            return of the other scalers.
        """
        # The reduced dimension is always kept here so that `scale` broadcasts against `data` below;
        # it is only squeezed away at the very end when `keepdim` is False.
        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
        num_observed = observed_indicator.sum(self.dim, keepdim=True)

        scale = ts_sum / torch.clamp(num_observed, min=1)

        # If `default_scale` is provided, we use it, otherwise we use the scale
        # of the batch.
        if self.default_scale is None:
            batch_sum = ts_sum.sum(dim=0)
            batch_observations = torch.clamp(num_observed.sum(0), min=1)
            default_scale = torch.squeeze(batch_sum / batch_observations)
        else:
            default_scale = self.default_scale * torch.ones_like(scale)

        # apply default scale where there are no observations
        scale = torch.where(num_observed > 0, scale, default_scale)

        # ensure the scale is at least `self.minimum_scale`
        scale = torch.clamp(scale, min=self.minimum_scale)
        scaled_data = data / scale

        if not self.keepdim:
            scale = scale.squeeze(dim=self.dim)

        return scaled_data, torch.zeros_like(scale), scale


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->Informer
class InformerNOPScaler(nn.Module):
    """
    Identity scaler: returns the data untouched together with a scale of 1 and a location of 0 reduced along `dim`.
    Args:
        dim (`int`):
            Dimension along which the (constant) scale and location are reduced.
        keepdim (`bool`, *optional*, defaults to `False`):
            Whether the reduced dimension is kept (with length 1) in the returned scale and location.
    """

    def __init__(self, dim: int, keepdim: bool = False):
        super().__init__()
        self.dim = dim
        self.keepdim = keepdim

    def forward(
        self, data: torch.Tensor, observed_indicator: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return `(data, loc, scale)`; `observed_indicator` is accepted but not used."""
        unit = torch.ones_like(data, requires_grad=False)
        origin = torch.zeros_like(data, requires_grad=False)
        # averaging a constant tensor only serves to drop / collapse dimension `dim`
        scale = unit.mean(dim=self.dim, keepdim=self.keepdim)
        loc = origin.mean(dim=self.dim, keepdim=self.keepdim)
        return data, loc, scale


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average
def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
    """
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`. `None` averages over all elements; `0` is honoured as
            the first dimension.
    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
    """
    if weights is None:
        return input_tensor.mean(dim=dim)

    # Entries with zero weight are replaced by 0 so that NaNs in them cannot leak into the sum.
    weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))

    # Compare against None explicitly: a plain truthiness test would treat `dim=0`
    # as "no dim" and silently reduce over every axis.
    if dim is None:
        total = weighted_tensor.sum()
        sum_weights = weights.sum()
    else:
        total = weighted_tensor.sum(dim=dim)
        sum_weights = weights.sum(dim=dim)

    # Floor the denominator at 1 to avoid dividing by zero when nothing is observed.
    return total / torch.clamp(sum_weights, min=1.0)


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll
def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
    """
    Negative log likelihood of `target` under the distribution `input`.
    """
    log_likelihood = input.log_prob(target)
    return -log_likelihood


# Copied from transformers.models.bart.modeling_bart._make_causal_mask
def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
    mask_cond = torch.arange(mask.size(-1))
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->Informer
class InformerSinusoidalPositionalEmbedding(nn.Embedding):
    """Fixed (non-trainable) sinusoidal positional embedding table."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
        # `padding_idx` is accepted for signature compatibility but is not forwarded.
        super().__init__(num_positions, embedding_dim)
        self.weight = self._init_weight(self.weight)

    @staticmethod
    def _init_weight(out: nn.Parameter) -> nn.Parameter:
        """
        Fill `out` in place with sinusoidal values. Features are not interleaved: the sine features occupy the
        first half of each row and the cosine features the second half.
        """
        n_pos, dim = out.shape
        # one divisor per feature column; consecutive column pairs share a frequency
        divisors = [np.power(10000, 2 * (col // 2) / dim) for col in range(dim)]
        angles = np.array([[row / divisor for divisor in divisors] for row in range(n_pos)])

        # gradients must be disabled before writing into the parameter in place
        out.requires_grad = False
        half = (dim + 1) // 2  # first column of the cosine half (sine half gets the extra column when dim is odd)
        out[:, 0:half] = torch.FloatTensor(np.sin(angles[:, 0::2]))
        out[:, half:] = torch.FloatTensor(np.cos(angles[:, 1::2]))
        out.detach_()
        return out

    @torch.no_grad()
    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor:
        """`input_ids_shape` is expected to be [bsz x seqlen]; returns the embeddings of `seqlen` consecutive
        positions starting at `past_key_values_length`."""
        _, seq_len = input_ids_shape[:2]
        first = past_key_values_length
        positions = torch.arange(first, first + seq_len, dtype=torch.long, device=self.weight.device)
        return super().forward(positions)


# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesValueEmbedding with TimeSeries->Info
class InformerValueEmbedding(nn.Module):
    """Bias-free linear projection from `feature_size` input features to `d_model` dimensions."""

    def __init__(self, feature_size, d_model):
        super().__init__()
        self.value_projection = nn.Linear(feature_size, d_model, bias=False)

    def forward(self, x):
        """Project the last dimension of `x` to `d_model`."""
        projected = self.value_projection(x)
        return projected


# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Informer
class InformerAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.

    The module can act as self-attention (no `key_value_states`) or as cross-attention
    (`key_value_states` given), and can reuse cached key/value projections passed in
    `past_key_value`.

    Args:
        embed_dim: Size of the input and output features; must be divisible by `num_heads`.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention probabilities during training.
        is_decoder: When True, `forward` returns the current key/value projections as a cache.
        bias: Whether the four linear projections use a bias term.

    Raises:
        ValueError: If `embed_dim` is not divisible by `num_heads`.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # queries are multiplied by 1/sqrt(head_dim) before the dot product
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int) -> torch.Tensor:
        # Split the feature dimension into heads:
        # (bsz, seq_len, embed_dim) -> (bsz, num_heads, seq_len, head_dim)
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Input shape: Batch x Time x Channel

        Args:
            hidden_states: Query source, shape (bsz, tgt_len, embed_dim).
            key_value_states: Key/value source for cross-attention; when `None` the layer
                attends over `hidden_states` itself.
            past_key_value: Cached `(key_states, value_states)` to reuse or extend.
            attention_mask: Additive mask of shape (bsz, 1, tgt_len, src_len).
            layer_head_mask: Per-head multiplier of shape (num_heads,).
            output_attentions: Whether to return the attention weights.

        Returns:
            `(attn_output, attn_weights_reshaped, past_key_value)`. `attn_weights_reshaped` is
            `None` unless `output_attentions` is set. `past_key_value` is the current
            `(key_states, value_states)` pair when `is_decoder` is True, otherwise the value
            that was passed in.

        Raises:
            ValueError: If the attention weights, attention mask, head mask or attention
                output do not have the expected shape.
        """

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
        # is checking that the `sequence_length` of the `past_key_value` is the same as
        # the provided `key_value_states` to support prefix tuning
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        # Fold the head dimension into the batch dimension so a single bmm handles all heads.
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            # the mask is additive and is broadcast over the head dimension
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            # scale each head's (already normalized) weights by its mask entry
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        # dropout is only active while the module is in training mode
        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Undo the head folding: (bsz * num_heads, tgt_len, head_dim) -> (bsz, tgt_len, num_heads, head_dim)
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)

        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned across GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class InformerProbSparseAttention(nn.Module):
    """Multi-head ProbSparse attention.

    Instead of computing softmax attention for every query, the layer scores all queries against a random sample
    of keys, keeps only the `u` queries with the largest sparsity measurement (the "active" queries) and computes
    attention for those alone. Every other ("lazy") query position receives a default context built directly from
    the values. This mitigates the quadratic compute and memory requirements of vanilla attention.

    Args:
        embed_dim: Model width; must be divisible by `num_heads`.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention probabilities.
        is_decoder: When `True`, `forward` returns the key/value states as a cache and the default context is the
            cumulative sum of the values instead of their mean.
        sampling_factor: Multiplier used when sizing the key sample and the number of active queries.
        bias: Whether the four linear projections carry a bias term.

    Raises:
        ValueError: If `embed_dim` is not divisible by `num_heads`.
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        sampling_factor: int = 5,
        bias: bool = True,
    ):
        super().__init__()
        self.factor = sampling_factor
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # Queries are pre-multiplied by 1/sqrt(head_dim) in `forward`.
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """View `tensor` as `(bsz, seq_len, num_heads, head_dim)` and return it as a contiguous
        `(bsz, num_heads, seq_len, head_dim)` tensor."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Run ProbSparse attention. Input shape: Batch x Time x Channel.

        Args:
            hidden_states: Source of the queries, of shape `(bsz, tgt_len, embed_dim)`. Also the source of keys
                and values unless `key_value_states` is given.
            key_value_states: When provided, the layer acts as cross-attention and keys/values are projected from
                this tensor instead.
            past_key_value: Cached `(key_states, value_states)` pair; dimension 2 of each is treated as the time
                axis. Reused as-is for cross-attention, concatenated with the new states for self-attention.
            attention_mask: Additive mask of shape `(bsz, 1, tgt_len, src_len)`, added to the scores before the
                softmax.
            layer_head_mask: Per-head multiplier of shape `(num_heads,)` applied to the attention probabilities.
            output_attentions: Whether to also return the attention probabilities.

        Returns:
            A tuple `(attn_output, attn_weights, past_key_value)`. `attn_output` has shape
            `(bsz, tgt_len, embed_dim)`. `attn_weights` has shape `(bsz, num_heads, u, src_len)` (only the `u`
            selected queries have rows) or is `None` when `output_attentions` is false. `past_key_value` is the
            `(key_states, value_states)` pair when `is_decoder` is set, otherwise the argument is passed through.

        Raises:
            ValueError: If the attention scores, the attention mask, the head mask or the final output do not
                have the expected size.
        """

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
        # is checking that the `sequence_length` of the `past_key_value` is the same as
        # the provided `key_value_states` to support prefix tuning
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # reuse k,v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states, value_states)

        # Fold the head axis into the batch axis so that `torch.bmm` can be used below:
        # (bsz, num_heads, time, head_dim) -> (bsz * num_heads, time, head_dim)
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        key_states_time_length = key_states.size(1)  # L_K
        log_key_states_time_length = np.ceil(np.log1p(key_states_time_length)).astype("int").item()  # log_L_K

        query_states_time_length = query_states.size(1)  # L_Q
        log_query_states_time_length = np.ceil(np.log1p(query_states_time_length)).astype("int").item()  # log_L_Q

        # u_part: number of key positions sampled for the sparsity estimate (capped at L_K)
        # u: number of queries that receive real attention (capped at L_Q)
        u_part = min(self.factor * query_states_time_length * log_key_states_time_length, key_states_time_length)
        u = min(self.factor * log_query_states_time_length, query_states_time_length)

        if key_states_time_length > 0:
            # key positions are drawn uniformly at random, so the same position can appear more than once;
            # the same sample is shared by every batch element and head
            index_sample = torch.randint(0, key_states_time_length, (u_part,))
            k_sample = key_states[:, index_sample, :]
        else:
            k_sample = key_states

        queries_keys_sample = torch.bmm(query_states, k_sample.transpose(1, 2))  # Q_K_sampled

        # find the Top_k query with sparsity measurement
        if u > 0:
            # M = (max over the sampled keys) - (sum over the sampled keys) / L_K, one value per query
            sparsity_measurement = queries_keys_sample.max(dim=-1)[0] - torch.div(
                queries_keys_sample.sum(dim=-1), key_states_time_length
            )  # M
            # indices (not values) of the u largest measurements, per (batch, head) row
            top_u_sparsity_measurement = sparsity_measurement.topk(u, sorted=False)[1]  # M_top

            # calculate q_reduce: query_states[:, top_u_sparsity_measurement]
            # `dim_for_slice` is a column of row indices that broadcasts against the (rows, u) index matrix
            dim_for_slice = torch.arange(query_states.size(0)).unsqueeze(-1)
            q_reduce = query_states[dim_for_slice, top_u_sparsity_measurement]
        else:
            q_reduce = query_states
            top_u_sparsity_measurement = None

        # Use q_reduce to calculate attention weights
        attn_weights = torch.bmm(q_reduce, key_states.transpose(1, 2))

        src_len = key_states.size(1)
        if attn_weights.size() != (bsz * self.num_heads, u, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, u, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            # broadcast the mask over heads, then fold heads into the batch axis like the scores
            prob_mask = attention_mask.expand(bsz, self.num_heads, tgt_len, src_len).reshape(
                bsz * self.num_heads, tgt_len, src_len
            )

            if top_u_sparsity_measurement is not None:
                # keep only the mask rows that belong to the selected queries
                dim_for_slice = torch.arange(prob_mask.size(0)).unsqueeze(-1)
                prob_mask = prob_mask[dim_for_slice, top_u_sparsity_measurement, :]

            attn_weights = attn_weights.view(bsz, self.num_heads, u, src_len) + prob_mask.view(
                bsz, self.num_heads, u, src_len
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, u, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, u, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, u, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, u, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        # attention output for the u selected queries only: (bsz * num_heads, u, head_dim)
        attn_output = torch.bmm(attn_probs, value_states)

        # calculate context for updating the attn_output, based on:
        # https://github.com/zhouhaoyi/Informer2020/blob/ac59c7447135473fb2aafeafe94395f884d5c7a5/models/attn.py#L74
        if self.is_decoder:
            # default context = running sum of the values along the time axis
            # NOTE(review): this has one row per key position, so the size check further down only passes when
            # the key length equals `tgt_len` - confirm cached step-by-step decoding is not expected here.
            context = value_states.cumsum(dim=-2)
        else:
            # default context = mean of the values, repeated for every query position;
            # `.clone()` materialises the expanded view so it can be written to below
            v_mean_dim_time = value_states.mean(dim=-2)
            context = (
                v_mean_dim_time.unsqueeze(dim=1)
                .expand(bsz * self.num_heads, query_states_time_length, v_mean_dim_time.size(-1))
                .clone()
            )

        if top_u_sparsity_measurement is not None:
            # update context: copy the attention output to the context at top_u_sparsity_measurement index
            dim_for_slice = torch.arange(context.size(0)).unsqueeze(-1)
            context[dim_for_slice, top_u_sparsity_measurement, :] = attn_output
            attn_output = context

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # unfold the heads again and move them next to head_dim: (bsz, tgt_len, num_heads, head_dim)
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)

        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
        # partitioned across GPUs when using tensor-parallelism.
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


# source: https://github.com/zhouhaoyi/Informer2020/blob/main/models/encoder.py
class InformerConvLayer(nn.Module):
    """Distilling block applied along the time axis.

    Pipeline: circular `Conv1d` (kernel 3, same channel count) -> `BatchNorm1d` -> `ELU` ->
    `MaxPool1d` (kernel 3, stride 2, padding 1).

    Args:
        c_in: Channel count of the input; the block keeps it unchanged.
    """

    def __init__(self, c_in):
        super().__init__()
        conv_options = {"kernel_size": 3, "padding": 1, "padding_mode": "circular"}
        self.downConv = nn.Conv1d(c_in, c_in, **conv_options)
        self.norm = nn.BatchNorm1d(num_features=c_in)
        self.activation = nn.ELU()
        self.maxPool = nn.MaxPool1d(3, stride=2, padding=1)

    def forward(self, x):
        """Apply the block to a `(batch, time, channels)` tensor.

        The input is permuted to channels-first for the 1-D layers and the result is transposed back, so the
        output is `(batch, pooled_time, channels)`.
        """
        channels_first = x.permute(0, 2, 1)
        pooled = self.maxPool(self.activation(self.norm(self.downConv(channels_first))))
        return pooled.transpose(1, 2)


class InformerEncoderLayer(nn.Module):
    """One encoder block: self-attention followed by a two-layer feed-forward network.

    Both sub-blocks are wrapped as dropout -> residual add -> LayerNorm. The attention module is
    `InformerProbSparseAttention` when `config.attention_type == "prob"` and `InformerAttention` otherwise.
    """

    def __init__(self, config: InformerConfig):
        super().__init__()
        self.embed_dim = config.d_model

        use_prob_sparse = config.attention_type == "prob"
        # Keyword arguments shared by both attention flavours.
        shared_attn_kwargs = {
            "embed_dim": self.embed_dim,
            "num_heads": config.encoder_attention_heads,
            "dropout": config.attention_dropout,
        }
        if use_prob_sparse:
            self.self_attn = InformerProbSparseAttention(
                sampling_factor=config.sampling_factor, **shared_attn_kwargs
            )
        else:
            self.self_attn = InformerAttention(**shared_attn_kwargs)
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)

        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout

        ffn_dim = config.encoder_ffn_dim
        self.fc1 = nn.Linear(self.embed_dim, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: torch.FloatTensor,
        layer_head_mask: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
        """
        # --- self-attention sub-block ---
        attn_input = hidden_states
        attn_out, attn_weights, _ = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        attn_out = nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
        hidden_states = self.self_attn_layer_norm(attn_input + attn_out)

        # --- feed-forward sub-block ---
        ffn_input = hidden_states
        ffn_out = self.activation_fn(self.fc1(ffn_input))
        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=self.training)
        ffn_out = nn.functional.dropout(self.fc2(ffn_out), p=self.dropout, training=self.training)
        hidden_states = self.final_layer_norm(ffn_input + ffn_out)

        # Half precision only: if inf/nan values appeared, clamp to just below the largest finite fp16 value.
        if hidden_states.dtype == torch.float16:
            has_overflow = torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
            if has_overflow:
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
    
class BoundEstimatorPreTrainedModel(PreTrainedModel):
    """Common base for the bound-estimator models.

    Binds `BoundEstimatorConfig` to the `PreTrainedModel` machinery and supplies the weight-initialisation
    scheme and the gradient-checkpointing hook shared by its subclasses.
    """

    config_class = BoundEstimatorConfig
    base_model_prefix = "model"
    main_input_name = "past_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialise `module` in place.

        Linear, Conv1d and Embedding weights are drawn from a normal distribution with mean 0 and standard
        deviation `config.init_std`; biases and the embedding row at `padding_idx` are zeroed. Other module
        types are left untouched.
        """
        std = self.config.init_std
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Switch gradient checkpointing on or off for `module`.

        The flag is set on every module that declares a `gradient_checkpointing` attribute. Matching on the
        attribute rather than on `InformerDecoder` alone is what lets `InformerEncoder`, which declares and
        reads this flag in its own `forward`, actually be toggled.
        """
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value
            
class InformerEncoder(BoundEstimatorPreTrainedModel):
    """
    Informer encoder consisting of *config.encoder_layers* self attention layers with distillation layers. Each
    attention layer is an [`InformerEncoderLayer`]. When `config.distil` is set, every layer except the last is
    followed by an [`InformerConvLayer`].
    Args:
        config: BoundEstimatorConfig
    """

    def __init__(self, config: BoundEstimatorConfig):
        super().__init__(config)

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop
        # flag read in `forward` to decide whether layers are run through `torch.utils.checkpoint`
        self.gradient_checkpointing = False
        if config.prediction_length is None:
            raise ValueError("The `prediction_length` config needs to be specified.")

        self.value_embedding = InformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model)
        self.embed_positions = InformerSinusoidalPositionalEmbedding(
            config.context_length + config.prediction_length, config.d_model
        )
        self.layers = nn.ModuleList([InformerEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        # `conv_layers` always has one entry per encoder layer so that it can be zipped with `self.layers`
        # in `forward`; a `None` entry means "no distillation after this layer".
        if config.distil:
            self.conv_layers = nn.ModuleList(
                    [InformerConvLayer(config.d_model) for _ in range(config.encoder_layers - 1)]
                )
            self.conv_layers.append(None)
        else:
            self.conv_layers = [None] * config.encoder_layers

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            inputs_embeds (`torch.FloatTensor`):
                Input features per time step. They are passed through `self.value_embedding` and summed with the
                positional embedding before the first layer. Although the parameter defaults to `None`, it is
                required in practice: the method calls `inputs_embeds.size()` unconditionally.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:
            A `BaseModelOutput` with `last_hidden_state`, `hidden_states` and `attentions`, or, when `return_dict`
            is false, a tuple holding those of the three values that are not `None`, in that order.

        Raises:
            ValueError: If `head_mask` does not have one row per encoder layer.
        """
        # fall back to the config defaults for every flag the caller left as None
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        hidden_states = self.value_embedding(inputs_embeds)
        embed_pos = self.embed_positions(inputs_embeds.size())

        hidden_states = self.layernorm_embedding(hidden_states + embed_pos)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            if head_mask.size()[0] != (len(self.layers)):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                    f" {head_mask.size()[0]}."
                )

        for idx, (encoder_layer, conv_layer) in enumerate(zip(self.layers, self.conv_layers)):
            # the state recorded here is the *input* of layer `idx`
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
                # a skipped layer leaves `hidden_states` untouched and contributes `None` attentions
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:

                    # the closure appends `output_attentions` as the last positional argument of the layer call
                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs, output_attentions)

                        return custom_forward

                    layer_outputs = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(encoder_layer),
                        hidden_states,
                        attention_mask,
                        (head_mask[idx] if head_mask is not None else None),
                    )
                    if conv_layer is not None:
                        output = torch.utils.checkpoint.checkpoint(conv_layer, layer_outputs[0])
                        layer_outputs = (output,) + layer_outputs[1:]
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask,
                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                        output_attentions=output_attentions,
                    )
                    if conv_layer is not None:
                        # distillation: replace the layer output by its convolved and pooled version
                        output = conv_layer(layer_outputs[0])
                        layer_outputs = (output,) + layer_outputs[1:]

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # finally record the output of the last layer as well
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

In [15]:
!pip3 install --force-reinstall --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121

Looking in indexes: https://download.pytorch.org/whl/nightly/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/nightly/cu121/torch-2.2.0.dev20231210%2Bcu121-cp311-cp311-linux_x86_64.whl (755.3 MB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/nightly/cu121/torchvision-0.17.0.dev20231210%2Bcu121-cp311-cp311-linux_x86_64.whl (7.1 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/nightly/cu121/torchaudio-2.2.0.dev20231210%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
Collecting filelock (from torch)
  Using cached https://download.pytorch.org/whl/nightly/filelock-3.9.0-py3-none-any.whl (9.7 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Using cached https://download.pytorch.org/whl/nightly/typing_extensions-4.8.0-py3-none-any.whl (31 kB)
Collecting sympy (from torch)
  Using cached https://download.pytorch.org/whl/nightly/sympy-1.11.1-py3-none-any.whl (6.5 MB)
Collecting networkx (from torch)
  Using cached http

In [36]:
from transformers import PreTrainedModel, PretrainedConfig
import torch.nn.functional as F
import math
import copy
            
class BoundEstimatorModel(BoundEstimatorPreTrainedModel):
    """Classifier head on top of an Informer encoder.

    The input window is scaled and expanded with lagged, time and static features, run through
    `InformerEncoder`, reduced to one scalar per encoder time step by a position-specific two-layer MLP
    (`ff_1[i]` -> ReLU -> `ff_2[i]` -> ReLU) and finally mapped to `config.num_labels` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        # Only the literal `True` is treated as an alias for "mean". Testing plain truthiness here would also
        # match "std" and make the branch below unreachable.
        if config.scaling == "mean" or config.scaling is True:
            self.scaler = InformerMeanScaler(dim=1, keepdim=True)
        elif config.scaling == "std":
            self.scaler = InformerStdScaler(dim=1, keepdim=True)
        else:
            self.scaler = InformerNOPScaler(dim=1, keepdim=True)

        # transformer encoder and mask initializer
        self.encoder = InformerEncoder(config)
        # one small MLP per time step: d_model -> 32 -> 1
        self.ff_1 = nn.ModuleList([nn.Linear(config.d_model, 32) for _ in range(config.context_length)])
        self.ff_2 = nn.ModuleList([nn.Linear(32, 1) for _ in range(config.context_length)])
        # `classifier` is used when the encoder keeps all `context_length` steps, `classifier2` otherwise.
        # NOTE(review): `classifier2` is sized for ceil(context_length / 2) steps, which presumably corresponds
        # to a single distilling step - confirm for configurations with more than two encoder layers.
        self.classifier2 = nn.Linear(math.ceil(config.context_length/2), config.num_labels)
        self.classifier = nn.Linear(config.context_length, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def from_informer_model(self, model):
        """Adopt the encoder and scaler of `model`.

        The modules are assigned, not copied, so they stay shared with `model`.
        """
        self.encoder = model.encoder
        self.scaler = model.scaler

    @property
    def _past_length(self):
        """Number of past steps required: the context window plus the largest configured lag."""
        return self.config.context_length + max(self.config.lags_sequence)

    def get_lagged_subsequences(
        self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0
    ):
        """Collect one window of `sequence` per configured lag.

        Args:
            sequence: Tensor whose dimension 1 is the time axis.
            subsequences_length: Length of every extracted window.
            shift: Value subtracted from every entry of `config.lags_sequence` before slicing.

        Returns:
            The windows stacked along a new last dimension. The window for lag `l` ends `l` steps before the end
            of `sequence` (or at its very end when `l` is not positive).

        Raises:
            ValueError: If the largest lag plus the window length exceeds the time length of `sequence`.
        """
        sequence_length = sequence.shape[1]
        indices = [lag - shift for lag in self.config.lags_sequence]

        if max(indices) + subsequences_length > sequence_length:
            raise ValueError(
                f"lags cannot go further than history length, found lag {max(indices)} "
                f"while history length is only {sequence_length}"
            )

        lagged_values = []
        for lag_index in indices:
            begin_index = -lag_index - subsequences_length
            end_index = -lag_index if lag_index > 0 else None
            lagged_values.append(sequence[:, begin_index:end_index, ...])
        return torch.stack(lagged_values, dim=-1)

    def create_network_inputs(
        self,
        past_values: torch.Tensor,
        past_time_features: torch.Tensor,
        past_observed_mask = None,
        future_values = None,
        future_time_features = None,
    ):
        """Build the per-time-step feature tensor fed to the encoder.

        Args:
            past_values: Past target values; dimension 1 is the time axis.
            past_time_features: Time features aligned with `past_values`.
            past_observed_mask: Mask aligned with `past_values`; defaults to all ones.
            future_values: Optional future targets, concatenated after `past_values` along the time axis.
            future_time_features: Time features for the future steps; used only when `future_values` is given.

        Returns:
            A tuple `(transformer_inputs, loc, scale, static_feat)`: the concatenation of lagged values, static
            features and time features along the last dimension, followed by the scaler's location and scale and
            the static features derived from them.

        Raises:
            ValueError: If the lagged sequence and the time features disagree on the number of time steps.
        """
        # time feature
        time_feat = (
            torch.cat(
                (
                    past_time_features[:, self._past_length - self.config.context_length :, ...],
                    future_time_features,
                ),
                dim=1,
            )
            if future_values is not None
            else past_time_features[:, self._past_length - self.config.context_length :, ...]
        )

        # target
        if past_observed_mask is None:
            past_observed_mask = torch.ones_like(past_values)

        # scaling statistics are computed on the context window only
        context = past_values[:, -self.config.context_length :]
        observed_context = past_observed_mask[:, -self.config.context_length :]
        _, loc, scale = self.scaler(context, observed_context)

        inputs = (
            (torch.cat((past_values, future_values), dim=1) - loc) / scale
            if future_values is not None
            else (past_values - loc) / scale
        )

        # static features
        log_abs_loc = loc.abs().log1p() if self.config.input_size == 1 else loc.squeeze(1).abs().log1p()
        log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log()
        static_feat = torch.cat((log_abs_loc, log_scale), dim=1)

        # repeat the static features for every time step
        expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1)

        # all features
        features = torch.cat((expanded_static_feat, time_feat), dim=-1)

        # lagged features
        subsequences_length = (
            self.config.context_length + self.config.prediction_length
            if future_values is not None
            else self.config.context_length
        )
        lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length)
        lags_shape = lagged_sequence.shape
        # flatten everything after the time axis into a single feature axis
        reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1)

        if reshaped_lagged_sequence.shape[1] != time_feat.shape[1]:
            raise ValueError(
                f"input length {reshaped_lagged_sequence.shape[1]} and time feature lengths {time_feat.shape[1]} does not match"
            )

        # transformer inputs
        transformer_inputs = torch.cat((reshaped_lagged_sequence, features), dim=-1)

        return transformer_inputs, loc, scale, static_feat

    def forward(
        self,
        past_values: torch.Tensor,
        past_time_features: torch.Tensor,
        past_observed_mask: torch.Tensor,
        future_values = None,
        future_time_features = None,
        decoder_attention_mask = None,
        head_mask = None,
        decoder_head_mask = None,
        cross_attn_head_mask= None,
        past_key_values = None,
        output_hidden_states= None,
        output_attentions = None,
        use_cache = None,
        return_dict = None,
    ):
        """Compute the class logits for a batch of windows.

        Args:
            past_values, past_time_features, past_observed_mask, future_values, future_time_features:
                Forwarded to `create_network_inputs`.
            head_mask: Forwarded to the encoder.
            output_hidden_states, output_attentions, return_dict: Fall back to the config values when `None`.
            decoder_attention_mask, decoder_head_mask, cross_attn_head_mask, past_key_values, use_cache:
                Accepted but not used by this method.

        Returns:
            The logits tensor of shape `(batch, config.num_labels)` when `return_dict` is true. Otherwise the
            tuple `(logits, *encoder_outputs, loc, scale, static_feat)`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_inputs, loc, scale, static_feat = self.create_network_inputs(
            past_values=past_values,
            past_time_features=past_time_features,
            past_observed_mask=past_observed_mask,
            future_values=future_values,
            future_time_features=future_time_features,
        )
        # only the context window is encoded
        enc_input = transformer_inputs[:, : self.config.context_length, ...]
        encoder_outputs = self.encoder(
            inputs_embeds=enc_input,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Position 0 holds the last hidden state in both return forms of the encoder, so integer indexing
        # works whether `return_dict` is true or false (attribute access would fail on the tuple form).
        last_hidden_state = encoder_outputs[0]
        encoded_length = last_hidden_state.size(1)

        per_step = []
        for i in range(encoded_length):
            hidden = nn.functional.relu(self.ff_1[i](last_hidden_state[:, i, :]))
            per_step.append(nn.functional.relu(self.ff_2[i](hidden)))
        # (batch, encoded_length, 1) -> (batch, encoded_length). Only the trailing axis is squeezed so that a
        # batch of one keeps its batch dimension.
        x = torch.stack(per_step, dim=1).squeeze(-1)

        if encoded_length == self.config.context_length:
            outputs = self.classifier(x)
        else:
            outputs = self.classifier2(x)

        if not return_dict:
            return (outputs,) + tuple(encoder_outputs) + (loc, scale, static_feat)

        return outputs

In [37]:
# Build the bound estimator from the Informer `model` created in an earlier cell.
# `from_informer_config` is defined outside this cell; presumably it copies the Informer hyper-parameters
# into the new config - confirm against its definition.
config = BoundEstimatorConfig()
config.from_informer_config(model.config)
boundModel = BoundEstimatorModel(config)
# `from_informer_model` assigns the Informer model's encoder and scaler to `boundModel` without copying them,
# so both models share these modules from here on.
boundModel.from_informer_model(model.model)

In [42]:
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Trainer settings: 2 epochs, per-device batch size 8 for training and 16 for evaluation, 500 warm-up steps,
# weight decay 0.01, step-based evaluation, logging every 250 steps, no external reporting integration.
# Output directory is ./resultsv3 and the logging directory is ./logs.
training_args = TrainingArguments(
    output_dir='./resultsv3',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='steps',
    report_to="none",
    logging_steps=250)


class TimeSerieDataCollator:
    """Collator that adds an all-ones `past_observed_mask` to every sample before default collation."""

    def __init__(self):
        self.default_data_collator = DefaultDataCollator()

    def __call__(self, batch):
        """Add the mask to each sample of `batch` in place and return the default collator's result.

        The mask has the same shape as the sample's `past_values`.
        """
        for sample in batch:
            observed = torch.ones(sample["past_values"].shape)
            sample.update(past_observed_mask=observed)
        return self.default_data_collator(batch)

def compute_metrics(eval_pred):
    """Compute weighted classification metrics, overall and for non-zero labels.

    Args:
        eval_pred: Two-item iterable. The first item is ignored; the second is
            a mapping with ``"actual"`` (per-class scores, arg-maxed over the
            last axis to obtain predictions) and ``"expected"`` (reference
            labels).

    Returns:
        dict with accuracy and weighted precision / recall / F1 over all
        samples, the same four metrics restricted to samples whose label is
        not 0 (``nz_*`` keys), and the fraction of non-zero predictions and
        of non-zero labels.
    """
    def weighted_prf(truth, guess):
        # Weighted precision / recall / F1; the support value is dropped.
        scores = precision_recall_fscore_support(truth, guess, average='weighted', warn_for=())
        return scores[0], scores[1], scores[2]

    _ignored, payload = eval_pred
    y_pred = np.argmax(payload["actual"], -1)
    y_true = payload["expected"]

    # Restrict to the samples whose reference label is non-zero.
    keep = y_true != 0
    y_true_nz = y_true[keep]
    y_pred_nz = y_pred[keep]

    overall_precision, overall_recall, overall_f1 = weighted_prf(y_true, y_pred)
    subset_precision, subset_recall, subset_f1 = weighted_prf(y_true_nz, y_pred_nz)
    overall_accuracy = accuracy_score(y_true, y_pred)
    subset_accuracy = accuracy_score(y_true_nz, y_pred_nz)

    return {
        'accuracy': overall_accuracy,
        'f1': overall_f1,
        'precision': overall_precision,
        'recall': overall_recall,
        "nonzeros_pred": np.count_nonzero(y_pred) / len(y_pred),
        "nonzeros_labels": np.count_nonzero(y_true) / len(y_true),
        'nz_precision': subset_precision,
        'nz_recall': subset_recall,
        'nz_f1': subset_f1,
        'nz_accuracy': subset_accuracy,
    }


class StockTrainer(Trainer):
    """Trainer that derives class labels (0, 1 or 2) from ``future_values``.

    Labels are not read from the batch; both the training loss and the
    evaluation step compute them from each sample's ``future_values`` using
    the notebook-level globals ``features`` and ``daily_average``.
    """

    @staticmethod
    def _future_labels(future_values):
        """Return one CPU ``long`` label per sample of ``future_values``.

        For every sample a running sum of the "average" column is accumulated
        step by step. Whenever the running sum is at or above
        ``0.01 + daily_average`` the label becomes 2; otherwise, whenever it
        is at or below ``-(0.01 + daily_average)`` the label becomes 1. The
        label therefore reflects the most recent step at which the running
        sum was outside the band, and stays 0 if that never happened.

        Args:
            future_values: Tensor indexed as ``[sample, step, column]``.

        Returns:
            1-D ``torch.long`` tensor on the CPU, one entry per sample.
        """
        # Resolved once per batch rather than once per time step.
        # NOTE(review): the ``- 1`` offset presumably accounts for a leading
        # entry of ``features`` that is absent from ``future_values`` — confirm.
        column = features.tolist().index("average") - 1
        upper = 0.01 + daily_average
        lower = -(0.01 + daily_average)

        # Single device-to-host copy of the one column that is needed,
        # instead of one ``.cpu()`` call per element.
        increments = future_values[:, :, column].detach().cpu()

        labels = torch.zeros((future_values.size(0))).long()
        running = torch.zeros((future_values.size(0)))
        for i, row in enumerate(increments):
            for step in row:
                running[i] += step
                if running[i] >= upper:
                    labels[i] = 2
                elif running[i] <= lower:
                    labels[i] = 1
        return labels

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Cross-entropy between the model output and the derived labels.

        Args:
            model: Model called as ``model(**inputs)``; its output is used
                directly as the logits tensor.
            inputs: Batch mapping; must contain ``"future_values"``.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted and ignored, so the method can also
                be called by Trainer versions that pass this keyword.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        outputs = model(**inputs)
        # Follow the logits' device rather than ``model.device`` so this also
        # works when the model object passed in does not expose ``.device``.
        labels = self._future_labels(inputs["future_values"]).to(outputs.device)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Run one evaluation step.

        Returns:
            ``(None, [], payload)`` where ``payload["actual"]`` holds the
            softmax of the model output over dim 1 (on CPU) and
            ``payload["expected"]`` the derived labels. The loss slot is
            always ``None`` (the training log shows "No log" for the
            validation loss); ``prediction_loss_only`` and ``ignore_keys``
            are not used.
        """
        labels = self._future_labels(inputs["future_values"])
        with torch.no_grad():
            outputs = model(**inputs)
        return (None, list(), { "actual": F.softmax(outputs, dim=1).detach().cpu(), "expected": labels })


# Wire the bound-estimator model, the training arguments, the datasets, the
# mask-adding collator and the metric function into the custom trainer.
# `train_ds` and `eval_ds` are defined outside this cell.
trainer = StockTrainer(
    model=boundModel,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=TimeSerieDataCollator(),
    compute_metrics=compute_metrics,)

In [None]:
# Run training. The table below is the logged output, one row per 250 steps;
# "Validation Loss" reads "No log" because prediction_step returns None as loss.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Nonzeros Pred,Nonzeros Labels,Nz Precision,Nz Recall,Nz F1,Nz Accuracy
250,0.3762,No log,0.911035,0.868623,0.829985,0.911035,0.0,0.088965,0.0,0.0,0.0,0.0
500,0.3837,No log,0.911035,0.868623,0.829985,0.911035,0.0,0.088965,0.0,0.0,0.0,0.0
750,0.3278,No log,0.911035,0.868623,0.829985,0.911035,0.0,0.088965,0.0,0.0,0.0,0.0
1000,0.3182,No log,0.911035,0.868623,0.829985,0.911035,0.0,0.088965,0.0,0.0,0.0,0.0
1250,0.3662,No log,0.911035,0.868623,0.829985,0.911035,0.0,0.088965,0.0,0.0,0.0,0.0
1500,0.3394,No log,0.911035,0.868623,0.829985,0.911035,0.0,0.088965,0.0,0.0,0.0,0.0
1750,0.3169,No log,0.913157,0.887039,0.863509,0.913157,0.030362,0.088965,0.250528,0.122936,0.164936,0.122936
2000,0.3301,No log,0.914136,0.883739,0.859095,0.914136,0.019752,0.088965,0.2338,0.082569,0.122039,0.082569
2250,0.3329,No log,0.912014,0.878771,0.864608,0.912014,0.013222,0.088965,0.394233,0.047706,0.084528,0.047706
2500,0.292,No log,0.91332,0.881339,0.862163,0.91332,0.015834,0.088965,0.314908,0.06422,0.10367,0.06422


In [None]:
# Persist the trained model under the relative path "boundEstimatorModel".
trainer.save_model("boundEstimatorModel")

In [None]:
# Reload the saved model from the same relative path into a separate variable.
boundModel2 = BoundEstimatorModel.from_pretrained("boundEstimatorModel")