## Test out the DataLoader

In [1]:
import polars as pl
from features.DataLoader import DataLoader

In [2]:
data_path = "../data/raw"
time_id = 5
book_example = DataLoader.load_data_for_time_id(f'{data_path}/book_train.parquet/stock_id=0', time_id)
trade_example =  DataLoader.load_data_for_time_id(f'{data_path}/trade_train.parquet/stock_id=0', time_id)


In [3]:
book_example.head()

time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2
i16,i16,f32,f32,f32,f32,i32,i32,i32,i32
5,0,1.001422,1.002301,1.00137,1.002353,3,226,2,100
5,1,1.001422,1.002301,1.00137,1.002353,3,100,2,100
5,5,1.001422,1.002301,1.00137,1.002405,3,100,2,100
5,6,1.001422,1.002301,1.00137,1.002405,3,126,2,100
5,7,1.001422,1.002301,1.00137,1.002405,3,126,2,100


In [4]:
trade_example.head()

time_id,seconds_in_bucket,price,size,order_count
i16,i16,f32,i32,i16
5,21,1.002301,326,12
5,46,1.002778,128,4
5,50,1.002818,55,1
5,57,1.003155,121,5
5,68,1.003646,4,1


In [5]:
# Check for duplicate seconds_in_bucket in book_example
print(book_example.filter(
    pl.col("seconds_in_bucket").is_duplicated()
))

# Check for duplicate seconds_in_bucket in trade_example
print(trade_example.filter(
    pl.col("seconds_in_bucket").is_duplicated()
))

shape: (0, 10)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ time_id ┆ seconds_in ┆ bid_price ┆ ask_price ┆ … ┆ bid_size1 ┆ ask_size1 ┆ bid_size2 ┆ ask_size2 │
│ ---     ┆ _bucket    ┆ 1         ┆ 1         ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│ i16     ┆ ---        ┆ ---       ┆ ---       ┆   ┆ i32       ┆ i32       ┆ i32       ┆ i32       │
│         ┆ i16        ┆ f32       ┆ f32       ┆   ┆           ┆           ┆           ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
└─────────┴────────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘
shape: (0, 5)
┌─────────┬───────────────────┬───────┬──────┬─────────────┐
│ time_id ┆ seconds_in_bucket ┆ price ┆ size ┆ order_count │
│ ---     ┆ ---               ┆ ---   ┆ ---  ┆ ---         │
│ i16     ┆ i16               ┆ f32   ┆ i32  ┆ i16         │
╞═════════╪═════════

In [6]:
from features.FeatureGenerator import FeatureGenerator

feature_generator = FeatureGenerator(book_example, trade_example)
feature_generator.training_sample.head(22)


seconds_in_bucket,size,order_count,wap_last,seconds_since_last_wap,log_return,rv_30s,rv_120s,spread_last,mid_last,rel_spread_last,ofi_last,seconds_since_last_trade,last_trade_gap_geq_cap,order_intensity_60s
i64,i32,i16,f64,i64,f64,f64,f64,f32,f32,f32,f64,i64,i8,i64
0,0,0,1.001434,0,0.0,0.0,0.0,0.000879,1.001862,0.000878,-0.973799,1,0,0
1,0,0,1.001448,0,0.000014,0.000014,0.000014,0.000879,1.001862,0.000878,-0.941748,2,0,0
2,0,0,1.001448,1,0.0,0.000014,0.000014,0.000879,1.001862,0.000878,-0.941748,3,0,0
3,0,0,1.001448,2,0.0,0.000014,0.000014,0.000879,1.001862,0.000878,-0.941748,4,0,0
4,0,0,1.001448,3,0.0,0.000014,0.000014,0.000879,1.001862,0.000878,-0.941748,5,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
17,0,0,1.001448,0,0.000005,0.000017,0.000017,0.000879,1.001862,0.000878,-0.941748,15,1,0
18,0,0,1.001443,0,-0.000005,0.000018,0.000018,0.000879,1.001862,0.000878,-0.953488,15,1,0
19,0,0,1.001443,0,0.0,0.000018,0.000018,0.000879,1.001862,0.000878,-0.953488,15,1,0
20,0,0,1.001443,1,0.0,0.000018,0.000018,0.000879,1.001862,0.000878,-0.953488,15,1,0


In [7]:
assert feature_generator.training_sample.height == 600
assert feature_generator.training_sample.select(pl.col("log_return").is_finite().all()).item()
assert feature_generator.training_sample.select(pl.col("rv_30s").is_finite().all()).item()
assert feature_generator.training_sample.select(pl.col("rv_120s").is_finite().all()).item()
assert feature_generator.training_sample["seconds_since_last_wap"].max() <= 15
assert feature_generator.training_sample["seconds_since_last_trade"].max() <= 15


In [8]:
train_labels = DataLoader._csv(f'{data_path}/train.csv')

In [9]:
train_labels.head()

stock_id,time_id,target
i64,i64,f64
0,5,0.004136
0,11,0.001445
0,16,0.002168
0,31,0.002195
0,62,0.001747


In [10]:
train_labels.shape

(428932, 3)

In [11]:
428942 * 0.8

343153.60000000003

In [12]:
train_labels.group_by("stock_id").len().sort("len")

stock_id,len
i64,u32
38,3815
80,3820
75,3829
13,3829
100,3829
…,…
104,3830
95,3830
107,3830
122,3830
