In [2]:
!pip install tables



In [3]:
import pandas as pd
import numpy as np

# Fixed seed so the synthetic dataset (and everything trained on it) is reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

# 2,000 instrument ids and the monthly periods Jan 2018 .. Dec 2021 (48 months).
uids = np.arange(1, 2001)
periods = pd.period_range(start="2018-01", end="2021-12", freq="M")

# Intraday grid: 5-minute stamps from 09:30 to 16:00 inclusive (79 per day), also
# expressed as seconds since midnight so they can be added to a date as a timedelta.
times = pd.date_range("09:30", "16:00", freq="5min").time
seconds = np.array([t.hour*3600 + t.minute*60 for t in times])

# Column layout is identical for every month: features F1..F9 followed by the target T.
cols = [f"F{i}" for i in range(1, 10)] + ["T"]

# One DataFrame per month, in chronological order.
monthly_dfs = []

for p in periods:
    # 21 consecutive calendar days starting on the first day of the month.
    days = pd.date_range(start=p.start_time.normalize(), periods=21, freq="D")

    # Cartesian product day x intraday stamp, built as two aligned flat arrays.
    all_dates = np.repeat(days.values, len(times))
    all_seconds = np.tile(seconds, len(days))
    dt_index = pd.to_datetime(all_dates.astype("datetime64[D]")) + pd.to_timedelta(all_seconds, unit="s")

    # Full (DateTime, UID) grid: 21 * 79 * 2000 = 3,318,000 rows per month.
    midx = pd.MultiIndex.from_product([dt_index, uids], names=["DateTime", "UID"])

    # Standard-normal noise for all ten columns, drawn from the seeded generator.
    data = rng.standard_normal((len(midx), 10))
    df = pd.DataFrame(data, index=midx, columns=cols)
    monthly_dfs.append(df)

In [4]:
# Write every frame in `monthly_dfs` to its own HDF5 shard. The file name is derived
# from the first entry of the frame's DateTime index level, formatted as YYYY-MM.
for month_df in monthly_dfs:
    first_stamp = month_df.index.levels[0][0]
    month = first_stamp.strftime("%Y-%m")  # e.g. "2018-01"
    out_path = f"data_{month}.h5"
    # mode="w" overwrites an existing shard; format="table" writes a PyTables Table,
    # and data_columns asks for UID to be stored as an on-disk queryable column.
    month_df.to_hdf(out_path, key="data", mode="w", format="table", data_columns=["UID"])

In [9]:
# 1) Imports & Hyperparams
import glob
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import IterableDataset, DataLoader

# ---- Paths and training hyperparameters (edit here) ----
DATA_DIR = "/content"    # directory that holds the data_YYYY-MM.h5 shards
WINDOW_SIZE = 5 * 78     # rows per input window (= 390); modify as needed
STEP = 1                 # stride between consecutive window starts
BATCH_SIZE = 256
NUM_WORKERS = 4          # DataLoader worker processes
LR = 1e-3                # optimizer learning rate
EPOCHS = 3

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [6]:
# 2) Dataset: streams each HDF5 shard, one UID at a time, sliding windows
class IntradayDataset(IterableDataset):
    """Stream sliding-window samples from a list of HDF5 shards.

    Each shard is read from its ``"data"`` key, grouped by index level 1 (the UID
    level), and every group is cut into windows of ``window_size`` consecutive rows.
    A sample is ``(x, y)`` where ``x`` holds the first nine columns of the window
    (float32, shape ``(window_size, 9)``) and ``y`` is column 9 of the window's
    last row (float32 scalar tensor).

    Args:
        h5_paths: sequence of HDF5 file paths; files are split round-robin across
            DataLoader workers.
        window_size: number of rows per window, must be >= 1.
        step: stride between consecutive window starts, must be >= 1.

    Raises:
        ValueError: if ``window_size`` or ``step`` is smaller than 1.
    """

    def __init__(self, h5_paths, window_size, step=1):
        # Reject values that would otherwise fail late inside a worker process
        # (step=0 breaks range()) or silently index row -1 (window_size=0).
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        if step < 1:
            raise ValueError(f"step must be >= 1, got {step}")
        self.h5_paths    = h5_paths
        self.window_size = window_size
        self.step        = step

    def _windows(self, arr):
        """Yield ``(x, y)`` tensor pairs for every window of one 2-D series.

        ``arr`` is a NumPy array with at least ten columns; nothing is yielded
        when it has fewer than ``window_size`` rows.
        """
        # Cast once per series instead of once per window; astype copies, so the
        # resulting tensor never aliases the caller's (possibly read-only) buffer.
        series = torch.from_numpy(arr.astype("float32"))
        last_start = series.shape[0] - self.window_size
        for start in range(0, last_start + 1, self.step):
            end = start + self.window_size
            # clone() so each sample owns its storage, independent of its neighbours
            yield series[start:end, :9].clone(), series[end - 1, 9].clone()

    def __iter__(self):
        worker = torch.utils.data.get_worker_info()
        wid, n_w = (worker.id, worker.num_workers) if worker else (0, 1)

        # simple sharding of files across workers: worker k takes files k, k+n, k+2n, ...
        my_files = list(self.h5_paths)[wid::n_w]

        for path in my_files:
            # The whole frame is loaded into memory, so the store can be closed right
            # away; the context manager also closes it if reading raises.
            with pd.HDFStore(path, mode="r") as store:
                df = store["data"]

            # group by UID (inner index level) and emit that UID's windows
            for _, sub in df.groupby(level=1):
                yield from self._windows(sub.values)

In [10]:
# 3) DataLoader & Dataset instantiation
# Collect the monthly shards in chronological (lexicographic) order.
h5_paths = sorted(glob.glob(f"{DATA_DIR}/data_20*.h5"))
if not h5_paths:
    # Fail here with a clear message instead of training on an empty loader.
    raise FileNotFoundError(f"No data_20*.h5 shards found in {DATA_DIR!r}")

dataset  = IntradayDataset(h5_paths, window_size=WINDOW_SIZE, step=STEP)

loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    # Pinned host memory only helps (and only avoids a warning) when copying to a GPU.
    pin_memory=(DEVICE.type == "cuda"),
)
if NUM_WORKERS > 0:
    # prefetch_factor is only valid with worker processes; passing it with
    # num_workers=0 makes DataLoader raise.
    loader_kwargs["prefetch_factor"] = 2

loader   = DataLoader(dataset, **loader_kwargs)

In [11]:
# Display the shard paths that the glob picked up, as a sanity check.
h5_paths

['/content/data_2018-01.h5',
 '/content/data_2018-02.h5',
 '/content/data_2018-03.h5',
 '/content/data_2018-04.h5',
 '/content/data_2018-05.h5',
 '/content/data_2018-06.h5',
 '/content/data_2018-07.h5',
 '/content/data_2018-08.h5',
 '/content/data_2018-09.h5',
 '/content/data_2018-10.h5',
 '/content/data_2018-11.h5',
 '/content/data_2018-12.h5',
 '/content/data_2019-01.h5',
 '/content/data_2019-02.h5',
 '/content/data_2019-03.h5',
 '/content/data_2019-04.h5',
 '/content/data_2019-05.h5',
 '/content/data_2019-06.h5',
 '/content/data_2019-07.h5',
 '/content/data_2019-08.h5',
 '/content/data_2019-09.h5',
 '/content/data_2019-10.h5',
 '/content/data_2019-11.h5',
 '/content/data_2019-12.h5',
 '/content/data_2020-01.h5',
 '/content/data_2020-02.h5',
 '/content/data_2020-03.h5',
 '/content/data_2020-04.h5',
 '/content/data_2020-05.h5',
 '/content/data_2020-06.h5',
 '/content/data_2020-07.h5',
 '/content/data_2020-08.h5',
 '/content/data_2020-09.h5',
 '/content/data_2020-10.h5',
 '/content/dat

In [12]:
# 4) LSTM model definition
class LSTMRegressor(nn.Module):
    """Stacked LSTM followed by a linear head that emits one scalar per sequence.

    Args:
        input_size: number of features per time step.
        hidden_size: width of each LSTM layer's hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, input_size=9, hidden_size=128, num_layers=2, dropout=0.1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map ``x`` of shape [batch, seq_len, features] to predictions of shape [batch]."""
        hidden_seq = self.lstm(x)[0]       # [batch, seq_len, hidden]
        final_step = hidden_seq[:, -1]     # hidden state at the last time step
        prediction = self.fc(final_step)   # [batch, 1]
        return prediction.squeeze(1)

In [13]:
# 5) Training loop
# Model, optimizer and loss for scalar regression.
model = LSTMRegressor().to(DEVICE)
opt   = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    i = 0  # number of batches seen this epoch; stays 0 if the loader yields nothing
    for i, (x, y) in enumerate(loader, 1):
        # non_blocking lets the host-to-device copy overlap with compute when the
        # batch sits in pinned memory; otherwise it behaves like a normal copy.
        x = x.to(DEVICE, non_blocking=True)
        y = y.to(DEVICE, non_blocking=True)

        opt.zero_grad()
        pred = model(x)
        loss = loss_fn(pred, y)
        loss.backward()
        opt.step()

        # Running mean of the per-batch losses, reported every 50 batches.
        total_loss += loss.item()
        if i % 50 == 0:
            print(f"Epoch {epoch} | Batch {i} | Avg Loss {(total_loss/i):.6f}")

    if i == 0:
        # Without this guard the summary line below would fail on an undefined or
        # zero batch count, which hides the real problem.
        raise RuntimeError("The DataLoader produced no batches; check the shard paths and window size.")

    print(f"End Epoch {epoch} | Avg Loss {total_loss/i:.6f}")

Epoch 1 | Batch 50 | Avg Loss 1.008062
Epoch 1 | Batch 100 | Avg Loss 1.003806


KeyboardInterrupt: 

In [None]:
# 6) Save your model
# Persist only the learned parameters (the state_dict), not the module object itself.
trained_weights = model.state_dict()
torch.save(trained_weights, "lstm_regressor.pth")
print("Done!")