In [1]:
import pandas as pd

# Load the 10-second bar data with "date" parsed as datetimes, and discard the
# leftover "Unnamed: 0" index column when the CSV contains one.
df = (
    pd.read_csv('common_10s_20231112213000.csv', parse_dates=["date"])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [2]:
# Print the schema and memory usage of the loaded frame, then preview its first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20009502 entries, 0 to 20009501
Data columns (total 9 columns):
 #   Column    Dtype         
---  ------    -----         
 0   ticker    object        
 1   date      datetime64[ns]
 2   open      float64       
 3   high      float64       
 4   low       float64       
 5   close     float64       
 6   average   float64       
 7   volume    int64         
 8   barcount  int64         
dtypes: datetime64[ns](1), float64(5), int64(2), object(1)
memory usage: 1.3+ GB


Unnamed: 0,ticker,date,open,high,low,close,average,volume,barcount
0,SPY,2023-09-01 09:30:00,453.17,453.2,452.89,452.9,453.064,397591,944
1,SPY,2023-09-01 09:30:10,452.91,453.01,452.89,452.95,452.956,77825,584
2,SPY,2023-09-01 09:30:20,452.95,453.09,452.95,453.02,453.034,47865,312
3,SPY,2023-09-01 09:30:30,453.01,453.13,452.9,452.91,453.001,53428,408
4,SPY,2023-09-01 09:30:40,452.92,453.1,452.91,453.03,452.981,65112,423


In [3]:
# Split the full frame into one DataFrame per ticker symbol. The "ticker"
# column is dropped from every piece since it is constant within a group.
df_bytickers = [
    ticker_frame.drop(columns=['ticker'])
    for _symbol, ticker_frame in df.groupby('ticker')
]
print(f"There is {len(df_bytickers)} dataframes grouped by tickers")

# Sanity-check the first per-ticker frame.
df_bytickers[0].info()
df_bytickers[0].head()

There is 55 dataframes grouped by tickers
<class 'pandas.core.frame.DataFrame'>
Index: 363960 entries, 8268907 to 19377701
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   date      363960 non-null  datetime64[ns]
 1   open      363960 non-null  float64       
 2   high      363960 non-null  float64       
 3   low       363960 non-null  float64       
 4   close     363960 non-null  float64       
 5   average   363960 non-null  float64       
 6   volume    363960 non-null  int64         
 7   barcount  363960 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(2)
memory usage: 25.0 MB


Unnamed: 0,date,open,high,low,close,average,volume,barcount
8268907,2023-03-27 09:30:00,13.96,14.03,13.95,14.0,13.978,246106,158
8268908,2023-03-27 09:30:10,14.0,14.02,14.0,14.02,14.015,11512,23
8268909,2023-03-27 09:30:20,14.02,14.02,14.01,14.02,14.018,13960,23
8268910,2023-03-27 09:30:30,14.02,14.03,14.01,14.01,14.013,18475,93
8268911,2023-03-27 09:30:40,14.02,14.03,14.0,14.03,14.02,32586,63


In [4]:
def _to_time_of_day(day_frame):
    """Return a copy of ``day_frame`` whose ``date`` column is the time of day in hours.

    The timestamp 09:30:10 becomes ``9 + 30/60 + 10/3600``. All other columns
    (and their dtypes) are left untouched.
    """
    stamps = day_frame['date']
    return day_frame.assign(
        date=stamps.dt.hour + stamps.dt.minute / 60 + stamps.dt.second / 3600
    )


# Group every per-ticker frame by calendar day and, in the same pass, replace
# the datetime in "date" with the fractional hour of the day. The result is a
# nested list indexed as df_bydate[ticker][day].
df_bydate = [
    [
        _to_time_of_day(day_frame)
        for _day, day_frame in ticker_frame.groupby(ticker_frame['date'].dt.date)
    ]
    for ticker_frame in df_bytickers
]
print(f"There is roughly {len(df_bydate[0])} dataframes that correspond to days for each ticker")
df_bydate[0][0].info()
df_bydate[0][0].head()

There is roughly 156 dataframes that correspond to days for each ticker
<class 'pandas.core.frame.DataFrame'>
Index: 2340 entries, 8268907 to 8271246
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      2340 non-null   float64
 1   open      2340 non-null   float64
 2   high      2340 non-null   float64
 3   low       2340 non-null   float64
 4   close     2340 non-null   float64
 5   average   2340 non-null   float64
 6   volume    2340 non-null   int64  
 7   barcount  2340 non-null   int64  
dtypes: float64(6), int64(2)
memory usage: 164.5 KB


Unnamed: 0,date,open,high,low,close,average,volume,barcount
8268907,9.5,13.96,14.03,13.95,14.0,13.978,246106,158
8268908,9.502778,14.0,14.02,14.0,14.02,14.015,11512,23
8268909,9.505556,14.02,14.02,14.01,14.02,14.018,13960,23
8268910,9.508333,14.02,14.03,14.01,14.01,14.013,18475,93
8268911,9.511111,14.02,14.03,14.0,14.03,14.02,32586,63


In [5]:
# Names of the raw-frame columns that contain at least one exact zero.
columns_with_zeros = df.eq(0).any()[lambda x: x].keys().values
# Offset added to those columns so a percentage change never divides by zero.
# NOTE(review): a zero bar becomes `eps`, so the change out of it is on the
# order of value / eps (very large) rather than inf - confirm such magnitudes
# are acceptable downstream.
eps = 1e-16


def _to_pct_deltas(day_frame):
    """Convert one day of bars to bar-over-bar percentage changes.

    Columns listed in ``columns_with_zeros`` are first shifted by ``eps``.
    Every column except ``date`` is then replaced by its ``pct_change()``;
    ``date`` is kept as is. The first row, which has no predecessor, is dropped.
    """
    shifted = day_frame.assign(
        **{name: day_frame[name] + eps for name in day_frame.columns if name in columns_with_zeros}
    )
    deltas = shifted.assign(
        **{name: shifted[name].pct_change() for name in shifted.columns if name != 'date'}
    )
    return deltas.iloc[1:]


# Same nesting as df_bydate: df_deltas[ticker][day].
df_deltas = [
    [_to_pct_deltas(day_frame) for day_frame in ticker_days]
    for ticker_days in df_bydate
]

In [6]:
# Inspect the first (ticker, day) frame after the percentage-change transform.
df_deltas[0][0].info()
df_deltas[0][0].head()

<class 'pandas.core.frame.DataFrame'>
Index: 2339 entries, 8268908 to 8271246
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      2339 non-null   float64
 1   open      2339 non-null   float64
 2   high      2339 non-null   float64
 3   low       2339 non-null   float64
 4   close     2339 non-null   float64
 5   average   2339 non-null   float64
 6   volume    2339 non-null   float64
 7   barcount  2339 non-null   float64
dtypes: float64(8)
memory usage: 164.5 KB


Unnamed: 0,date,open,high,low,close,average,volume,barcount
8268908,9.502778,0.002865,-0.000713,0.003584,0.001429,0.002647,-0.953223,-0.85443
8268909,9.505556,0.001429,0.0,0.000714,0.0,0.000214,0.212648,0.0
8268910,9.508333,0.0,0.000713,0.0,-0.000713,-0.000357,0.323424,3.043478
8268911,9.511111,0.0,0.0,-0.000714,0.001428,0.0005,0.763789,-0.322581
8268912,9.513889,0.0,0.000713,0.001429,0.0,0.000428,-0.620236,-0.142857


In [7]:
# Sum the 'average' percentage changes of every (ticker, day) frame, then take
# the magnitude of the mean of those per-day sums.
daily_sums = [
    day_frame.loc[:, 'average'].sum()
    for ticker_days in df_deltas
    for day_frame in ticker_days
]
daily_average = abs(sum(daily_sums) / len(daily_sums))

# Column names of the per-day frames, read from the first one.
features = df_deltas[0][0].columns.values

print(daily_average)
print(features)

5.892697979016721e-05
['date' 'open' 'high' 'low' 'close' 'average' 'volume' 'barcount']


In [8]:
import itertools
# Flatten the ticker -> day nesting into a single list of per-day frames.
data = list(itertools.chain.from_iterable(df_deltas))
# Keep only the days that have exactly 2339 rows.
data_2339 = [day_frame for day_frame in data if len(day_frame) == 2339]

In [9]:
import torch
import numpy as np
from torch.utils.data import TensorDataset, random_split

# Make float64 the default floating-point dtype for newly created tensors.
# `torch.set_default_tensor_type` is deprecated; `set_default_dtype` is the
# supported way to request double precision.
torch.set_default_dtype(torch.float64)

class StockDataset(TensorDataset):
    """Sliding-window dataset over a list of per-day DataFrames.

    Every day contributes ``daily_length`` consecutive windows. The window that
    starts at row ``s`` of a day is split into a "known" part, rows
    ``[s, s + known)``, and a "predict" part, rows
    ``[s + known, s + known + predict)``. Items are numbered day by day, so
    item ``i`` is window ``i % daily_length`` of day ``i // daily_length``.

    Args:
        data: Sized sequence of DataFrames, one per day. Each frame needs a
            ``time_column`` column; all other columns are returned as values.
        known_interval_in_tens_of_seconds: Number of rows in the known part.
        predict_interval_in_tens_of_seconds: Number of rows in the predict part.
        daily_interval_in_tens_of_seconds: Number of rows assumed per day.
        time_column: Name of the column returned as the time feature.

    Raises:
        ValueError: If the two intervals leave no room for a window within a
            day, or if a frame is too short to hold the last window.
    """

    def __init__(self, data, known_interval_in_tens_of_seconds=720, predict_interval_in_tens_of_seconds=180, daily_interval_in_tens_of_seconds=2339, time_column='date'):
        # TensorDataset.__init__ is not called: this class keeps DataFrames in
        # `self.data` and overrides both __len__ and __getitem__, so nothing
        # here relies on `self.tensors`.
        window_rows = known_interval_in_tens_of_seconds + predict_interval_in_tens_of_seconds
        daily_length = daily_interval_in_tens_of_seconds - window_rows
        if daily_length <= 0:
            raise ValueError(
                f"known + predict interval ({window_rows}) must be shorter than "
                f"the daily interval ({daily_interval_in_tens_of_seconds})"
            )

        # The last window of a day starts at row `daily_length - 1` and ends
        # (exclusively) at this row, so every frame needs at least this many rows.
        required_rows = daily_length - 1 + window_rows
        for position, frame in enumerate(data):
            if len(frame) < required_rows:
                raise ValueError(
                    f"frame {position} has {len(frame)} rows, "
                    f"but at least {required_rows} are required"
                )

        self.data = data
        self.time_column = time_column
        self.known_interval_in_tens_of_seconds = known_interval_in_tens_of_seconds
        self.predict_interval_in_tens_of_seconds = predict_interval_in_tens_of_seconds
        self.daily_length = daily_length
        self.length = len(data) * self.daily_length

    def __len__(self):
        """Return the total number of windows across all days."""
        return self.length

    def __getitem__(self, index):
        """Return the known/predict arrays for window ``index``.

        Returns:
            dict with NumPy arrays ``past_values`` and ``future_values`` (all
            columns except the time column) and ``past_time_features`` and
            ``future_time_features`` (the time column, kept two-dimensional).
        """
        day_idx, start = divmod(index, self.daily_length)
        known = self.known_interval_in_tens_of_seconds
        stop = start + known + self.predict_interval_in_tens_of_seconds

        # Slice the rows of the window first so the column selection below only
        # copies `known + predict` rows instead of the whole day.
        window = self.data[day_idx].iloc[start:stop]
        values = window.loc[:, window.columns != self.time_column].values
        times = window[[self.time_column]].values

        return {
            "past_values": values[:known],
            "future_values": values[known:],
            "past_time_features": times[:known],
            "future_time_features": times[known:],
        }


In [10]:
# Build the windowed dataset from the 2339-row days and print the shape of the
# first item's past time features as a quick check.
dataset = StockDataset(data_2339)
print(dataset[0]['past_time_features'].shape)

(720, 1)


In [11]:
# Hold out a tiny fraction (0.001 %) of the windows for evaluation. The seeded
# generator makes the split identical on every run, so metrics from different
# runs are computed on the same evaluation windows.
train_ds, eval_ds = torch.utils.data.random_split(
    dataset,
    [0.99999, 0.00001],
    generator=torch.Generator().manual_seed(42),
)

In [12]:
from transformers import InformerConfig, InformerForPrediction, Trainer, TrainingArguments, DefaultDataCollator
from transformers.utils import is_sagemaker_mp_enabled
from evaluate import load
from torch import nn

# Forecast-accuracy metrics, both loaded in their "multilist" configuration.
mase_metric = load("evaluate-metric/mase", "multilist")
smape_metric = load("evaluate-metric/smape", "multilist")

# Upper bound applied to each per-sample MASE / sMAPE value in compute_metrics
# before the values are averaged.
alpha = 2

# NOTE(review): `configuration` is not passed to the model below, which is
# loaded from a checkpoint; within this notebook it is never used.
# input_size=7 equals the number of non-date columns and prediction_length=180
# equals StockDataset's default predict interval. context_length=720-7 is
# presumably the 720-row known window minus room for lagged inputs - TODO confirm.
configuration = InformerConfig(
    context_length=720-7,
    prediction_length=180,
    input_size=7,
    num_time_features=1)

# Continue from the previously saved "forecasting_model_v4" weights
# (presumably a local directory written by an earlier run - verify).
model = InformerForPrediction.from_pretrained("forecasting_model_v4")

# Two epochs, evaluation once per epoch, a log entry every 10,000 steps and no
# external experiment tracker.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='epoch',
    report_to="none",
    logging_steps=10_000)


class TimeSerieDataCollator:
    """Collator that marks every past value as observed, then defers to the default collator."""

    def __init__(self):
        self.default_data_collator = DefaultDataCollator()

    def __call__(self, batch):
        """Add an all-ones ``past_observed_mask`` to each sample and collate the batch.

        The mask has the same shape as the sample's ``past_values``. Each
        sample dict in ``batch`` is updated in place.
        """
        for sample in batch:
            mask = torch.ones(sample["past_values"].shape)
            sample.update({'past_observed_mask': mask})
        return self.default_data_collator(batch)

def compute_metrics(eval_pred):
    """Average the capped per-sample MASE and sMAPE over the evaluation set.

    Args:
        eval_pred: Pair whose second element is a mapping with the entries
            ``"expected"`` (reference values), ``"actual"`` (forecasts) and
            ``"train"`` (past values used as the MASE training series), each
            indexed by sample along the first axis.

    Returns:
        dict with the mean ``mase`` and ``smape``, where every per-sample
        score is capped at ``alpha`` before averaging. Both entries are NaN
        when there are no samples, so an empty evaluation set does not abort
        the run with a ZeroDivisionError.
    """
    _, labels = eval_pred
    references = np.asarray(labels["expected"])
    forecasts = np.asarray(labels["actual"])
    histories = np.asarray(labels["train"])

    mase_scores = []
    smape_scores = []
    for reference, forecast, history in zip(references, forecasts, histories):
        mase_result = mase_metric.compute(predictions=forecast, references=reference, training=history)
        smape_result = smape_metric.compute(predictions=forecast, references=reference)
        mase_scores.append(min(mase_result['mase'], alpha))
        smape_scores.append(min(smape_result['smape'], alpha))

    if not mase_scores:
        return {'mase': float('nan'), 'smape': float('nan')}

    # Each mean is divided by the length of its own score list.
    return {
        'mase': sum(mase_scores) / len(mase_scores),
        'smape': sum(smape_scores) / len(smape_scores),
    }

class StockTrainer(Trainer):
    """Trainer that trains on the model's own loss and evaluates on generated forecasts."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the loss reported by the model.

        Args:
            model: Model to call with ``inputs`` as keyword arguments.
            inputs: Mapping of model inputs.
            return_outputs: When true, also return the encoder's last hidden state.
            num_items_in_batch: Accepted and ignored; newer Trainer versions
                pass this keyword to ``compute_loss``.

        Returns:
            The loss, or ``(loss, encoder_last_hidden_state)`` when
            ``return_outputs`` is true.
        """
        outputs = model(**inputs)
        loss = outputs.loss
        return (loss, outputs.encoder_last_hidden_state) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Generate forecasts for one batch and package them for ``compute_metrics``.

        ``future_values`` is removed from ``inputs`` before generation and
        returned as the reference. ``prediction_loss_only`` and ``ignore_keys``
        are not used.

        Returns:
            ``(None, [], results)`` where ``results`` maps ``"expected"`` to the
            reference values, ``"actual"`` to the generated sequences averaged
            over dimension 1, and ``"train"`` to the past values.

        Raises:
            NotImplementedError: Under SageMaker model parallelism. The
                previous branch for that case referenced undefined names and
                could only fail with a NameError.
        """
        if is_sagemaker_mp_enabled():
            raise NotImplementedError(
                "StockTrainer.prediction_step does not support SageMaker model parallelism"
            )

        results = {"expected": inputs.pop("future_values", None)}
        with torch.no_grad():
            with self.compute_loss_context_manager():
                outputs = model.generate(**inputs)
        results.update({
            "actual": torch.mean(outputs.sequences.detach().cpu(), dim=1),
            "train": inputs["past_values"].detach().cpu(),
        })
        # No loss and no logits are reported; the metrics read everything from `results`.
        return (None, list(), results)

# Wire the loaded model, training arguments, train/eval splits, the masking
# collator and the metric function into the custom trainer.
trainer = StockTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=TimeSerieDataCollator(),
    compute_metrics=compute_metrics,)

In [13]:
# Run training; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Mase,Smape
1,22.4968,No log,0.739452,1.803197
2,21.3004,No log,0.759617,1.790564


TrainOutput(global_step=3062522, training_loss=22.3127311392404, metrics={'train_runtime': 77167.3618, 'train_samples_per_second': 317.494, 'train_steps_per_second': 39.687, 'total_flos': 1.038580199224848e+17, 'train_loss': 22.3127311392404, 'epoch': 2.0})

In [14]:
# Save the trained model as "forecasting_model_v5" (this run started from "forecasting_model_v4").
trainer.save_model("forecasting_model_v5")