## Test out the DataLoader

In [21]:
import polars as pl
from features.DataLoader import DataLoader
import logging

In [22]:
data_path = "../data/raw"
time_id = 5
logger = logging.getLogger(__name__)
book_example = DataLoader.load_data_for_time_id(f'{data_path}/book_train.parquet/stock_id=0', time_id, logger)
trade_example =  DataLoader.load_data_for_time_id(f'{data_path}/trade_train.parquet/stock_id=0', time_id, logger)


2025-08-19 18:13:39,435 [INFO] Loading data for time_id
2025-08-19 18:13:39,479 [INFO] Parquet is getting read
2025-08-19 18:13:39,480 [INFO] shape: (5, 10)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ time_id ┆ seconds_in ┆ bid_price ┆ ask_price ┆ … ┆ bid_size1 ┆ ask_size1 ┆ bid_size2 ┆ ask_size2 │
│ ---     ┆ _bucket    ┆ 1         ┆ 1         ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ i16     ┆ ---        ┆ ---       ┆ ---       ┆   ┆ i32       ┆ i32       ┆ i32       ┆ i32       │
│         ┆ i16        ┆ f32       ┆ f32       ┆   ┆           ┆           ┆           ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 5       ┆ 0          ┆ 1.001422  ┆ 1.002301  ┆ … ┆ 3         ┆ 226       ┆ 2         ┆ 100       │
│ 5       ┆ 1          ┆ 1.001422  ┆ 1.002301  ┆ … ┆ 3         ┆ 100       ┆ 2         ┆ 100       │
│ 5       ┆ 5          ┆ 1.001422  

In [23]:
book_example.head()

time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
i16,i16,f32,f32,f32,f32,i32,i32,i32,i32
5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100
5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100
5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100
5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100
5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100


In [24]:
trade_example.head()

time_id,seconds_in_bucket,price,size,order_count
i16,i16,f32,i32,i16
5,21,1.002301,326,12
5,46,1.002778,128,4
5,50,1.002818,55,1
5,57,1.003155,121,5
5,68,1.003646,4,1


In [25]:
# Check for duplicate seconds_in_bucket in book_example
print(book_example.filter(
    pl.col("seconds_in_bucket").is_duplicated()
))

# Check for duplicate seconds_in_bucket in trade_example
print(trade_example.filter(
    pl.col("seconds_in_bucket").is_duplicated()
))

shape: (0, 10)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ time_id ┆ seconds_in ┆ bid_price ┆ ask_price ┆ … ┆ bid_size1 ┆ ask_size1 ┆ bid_size2 ┆ ask_size2 │
│ ---     ┆ _bucket    ┆ 1         ┆ 1         ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ i16     ┆ ---        ┆ ---       ┆ ---       ┆   ┆ i32       ┆ i32       ┆ i32       ┆ i32       │
│         ┆ i16        ┆ f32       ┆ f32       ┆   ┆           ┆           ┆           ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘
shape: (0, 5)
┌─────────┬───────────────────┬───────┬──────┬─────────────┐
│ time_id ┆ seconds_in_bucket ┆ price ┆ size ┆ order_count │
│ ---     ┆ ---               ┆ ---   ┆ ---  ┆ ---         │
│ i16     ┆ i16               ┆ f32   ┆ i32  ┆ i16         │
╞═════════╪═════════

In [26]:
from features.FeatureGenerator import FeatureGenerator

feature_generator = FeatureGenerator(book_example, trade_example)
feature_generator.training_sample.head(22)


seconds_in_bucket,size,order_count,wap_last,seconds_since_last_wap,log_return,rv_30s,rv_120s,spread_last,mid_last,rel_spread_last,ofi_last,seconds_since_last_trade,last_trade_gap_geq_cap,order_intensity_60s
i64,i32,i16,f64,i64,f64,f64,f64,f32,f32,f32,f64,i64,i8,i64
0,0,0,1.001434,0,0.0,0.0,0.0,0.000879,1.001862,0.000878,-0.973799,1,0,0
1,0,0,1.001448,0,0.000014,0.000014,0.000014,0.000879,1.001862,0.000878,-0.941748,2,0,0
2,0,0,1.001448,1,0.0,0.000014,0.000014,0.000879,1.001862,0.000878,-0.941748,3,0,0
3,0,0,1.001448,2,0.0,0.000014,0.000014,0.000879,1.001862,0.000878,-0.941748,4,0,0
4,0,0,1.001448,3,0.0,0.000014,0.000014,0.000879,1.001862,0.000878,-0.941748,5,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
17,0,0,1.001448,0,0.000005,0.000017,0.000017,0.000879,1.001862,0.000878,-0.941748,15,1,0
18,0,0,1.001443,0,-0.000005,0.000018,0.000018,0.000879,1.001862,0.000878,-0.953488,15,1,0
19,0,0,1.001443,0,0.0,0.000018,0.000018,0.000879,1.001862,0.000878,-0.953488,15,1,0
20,0,0,1.001443,1,0.0,0.000018,0.000018,0.000879,1.001862,0.000878,-0.953488,15,1,0


In [27]:
assert feature_generator.training_sample.height == 600
assert feature_generator.training_sample.select(pl.col("log_return").is_finite().all()).item()
assert feature_generator.training_sample.select(pl.col("rv_30s").is_finite().all()).item()
assert feature_generator.training_sample.select(pl.col("rv_120s").is_finite().all()).item()
assert feature_generator.training_sample["seconds_since_last_wap"].max() <= 15
assert feature_generator.training_sample["seconds_since_last_trade"].max() <= 15


In [28]:
train_labels = DataLoader._csv(f'{data_path}/train.csv')

In [29]:
train_labels.head()

stock_id,time_id,target
i64,i64,f64
0,5,0.004136
0,11,0.001445
0,16,0.002168
0,31,0.002195
0,62,0.001747


In [30]:
train_labels.shape

(428932, 3)

In [31]:
428942 * 0.8

343153.60000000003

In [32]:
train_labels.group_by("stock_id").len().sort("len")

stock_id,len
i64,u32
38,3815
80,3820
75,3829
13,3829
100,3829
…,…
41,3830
101,3830
83,3830
119,3830


In [33]:
print(train_labels.group_by("stock_id").len().filter(pl.col("stock_id") == 13))

shape: (1, 2)
┌──────────┬──────┐
│ stock_id ┆ len  │
│ ---      ┆ ---  │
│ i64      ┆ u32  │
╞══════════╪══════╡
│ 13       ┆ 3829 │
└──────────┴──────┘


## Testing index.py

In [34]:
train_index = pl.read_parquet('../data/train_index.parquet')
val_index = pl.read_parquet('../data/validation_index.parquet')
evaluation_index = pl.read_parquet('../data/eval_index.parquet')

In [35]:
train_index.head()
print(train_index.shape)
print(train_index.group_by("stock_id").len().sort("len"))

(303873, 5)
shape: (112, 2)
┌──────────┬──────┐
│ stock_id ┆ len  │
│ ---      ┆ ---  │
│ i64      ┆ u32  │
╞══════════╪══════╡
│ 13       ┆ 2112 │
│ 30       ┆ 2122 │
│ 11       ┆ 2144 │
│ 26       ┆ 2173 │
│ 116      ┆ 2175 │
│ …        ┆ …    │
│ 82       ┆ 2962 │
│ 123      ┆ 2966 │
│ 122      ┆ 2966 │
│ 42       ┆ 2970 │
│ 61       ┆ 2974 │
└──────────┴──────┘


In [36]:
val_index.head()
print(val_index.shape)
print(val_index.group_by("stock_id").len().filter(pl.col("stock_id") == 13))

(41336, 5)
shape: (1, 2)
┌──────────┬─────┐
│ stock_id ┆ len │
│ ---      ┆ --- │
│ i64      ┆ u32 │
╞══════════╪═════╡
│ 13       ┆ 457 │
└──────────┴─────┘


In [37]:
evaluation_index.head()
print(evaluation_index.shape)
print(evaluation_index.group_by("stock_id").len().filter(pl.col("stock_id") == 13))

(83723, 5)
shape: (1, 2)
┌──────────┬──────┐
│ stock_id ┆ len  │
│ ---      ┆ ---  │
│ i64      ┆ u32  │
╞══════════╪══════╡
│ 13       ┆ 1260 │
└──────────┴──────┘


In [38]:
2112 + 1260 + 457

3829

In [39]:
train_index.head()[0]

stock_id,time_id,target,book_path,trade_path
i64,i64,f64,str,str
0,11,0.001445,"""data/raw/book_train.parquet/st…","""data/raw/trade_train.parquet/s…"


In [None]:
from pipeline.build import _fg_one
feature_cols = feature_generator.training_sample.columns[1:]
book_path = train_index.head()[0]["book_path"][0]
trade_path = train_index.head()[0]["trade_path"][0]
stock_id = train_index.head()[0]["stock_id"][0]
time_id = train_index.head()[0]["time_id"]
print(book_path)
print(time_id)
print(trade_path)
print(_fg_one((book_path, trade_path, stock_id, time_id, feature_cols)))

2025-08-19 18:13:39,716 [INFO] data/raw/book_train.parquet/stock_id=0
2025-08-19 18:13:39,717 [INFO] Loading data for time_id
2025-08-19 18:13:39,760 [INFO] Parquet is getting read
2025-08-19 18:13:39,763 [INFO] shape: (5, 10)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ time_id ┆ seconds_in ┆ bid_price ┆ ask_price ┆ … ┆ bid_size1 ┆ ask_size1 ┆ bid_size2 ┆ ask_size2 │
│ ---     ┆ _bucket    ┆ 1         ┆ 1         ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ i16     ┆ ---        ┆ ---       ┆ ---       ┆   ┆ i32       ┆ i32       ┆ i32       ┆ i32       │
│         ┆ i16        ┆ f32       ┆ f32       ┆   ┆           ┆           ┆           ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 11      ┆ 0          ┆ 0.999473  ┆ 1.000176  ┆ … ┆ 205       ┆ 100       ┆ 20        ┆ 30        │
│ 11      ┆ 3          ┆ 0.999473  ┆ 1.000176  ┆ … ┆ 200       ┆ 1

data/raw/book_train.parquet/stock_id=0
shape: (1,)
Series: 'time_id' [i64]
[
	11
]
data/raw/trade_train.parquet/stock_id=0
[[ 2.          2.          0.99994534 ...  0.          0.
   2.        ]
 [ 0.          0.          0.99994534 ...  1.          0.
   2.        ]
 [ 0.          0.          0.99994534 ...  2.          0.
   2.        ]
 ...
 [ 0.          0.          1.0003057  ... 15.          1.
  11.        ]
 [ 0.          0.          1.0003057  ... 15.          1.
   8.        ]
 [ 0.          0.          1.0003057  ... 15.          1.
   8.        ]]
