In [1]:
import pandas as pd
import polars as pl
import numpy as np
import gc
from matplotlib import pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
class CONFIG:
    """Notebook-wide settings for building the lagged training set."""

    # Name of the responder column used as the prediction target.
    target_col = "responder_6"
    # Columns pulled from the training data to build lag features:
    # the two join keys followed by responder_0 .. responder_8.
    lag_cols_original = ["date_id", "symbol_id", *(f"responder_{i}" for i in range(9))]
    # Maps each responder column to its "<name>_lag_1" counterpart.
    lag_cols_rename = dict(
        (f"responder_{i}", f"responder_{i}_lag_1") for i in range(9)
    )
    # Validation fraction (not referenced by the cells visible in this notebook section).
    valid_ratio = 0.05
    # Rows with date_id at or below this value are filtered out when loading.
    start_dt = 1100

In [3]:
# Lazily scan the training parquet and keep only rows whose date_id is
# strictly greater than CONFIG.start_dt. The running "id" column is added in
# the select step, i.e. before the date filter is applied.
train = (
    pl.scan_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet")
    .select(
        pl.int_range(pl.len(), dtype=pl.UInt32).alias("id"),
        pl.all(),
    )
    .filter(pl.col("date_id") > CONFIG.start_dt)
)

In [4]:
# Build one-day-lagged responder features, one row per (date_id, symbol_id).
lags = (
    train
    .select(pl.col(CONFIG.lag_cols_original))
    .rename(CONFIG.lag_cols_rename)
    # Shift every row to the following date so that, after joining on
    # date_id, each row sees the previous date's responders.
    .with_columns(date_id=pl.col("date_id") + 1)
    # Collapse to a single row per key, keeping the LAST record of the
    # previous date (the original code used .first(), which contradicted the
    # stated intent). maintain_order=True keeps groups in scan order.
    # NOTE(review): "last" means last in scan order; this assumes rows are
    # ordered by time within a date -- confirm against the dataset.
    .group_by(["date_id", "symbol_id"], maintain_order=True)
    .last()
)

In [5]:
# Attach the lagged responder columns to every training row. Because this is
# an inner join, rows with no matching (date_id, symbol_id) entry in `lags`
# are dropped.
# NOTE(review): this cell rebinds `train`, so re-running it without re-running
# the load cell would join the lag columns a second time.
train = train.join(
    lags,
    how="inner",
    on=["date_id", "symbol_id"],
)

In [6]:
# Execute the lazy query built above and convert the result to a pandas
# DataFrame for the split and export steps that follow.
train_df = (
    train
    .collect()
    .to_pandas()
)

In [7]:
# Time-based split on date_id. Dates before the boundary go to training and
# the boundary date onward goes to validation. The original comparison pair
# (< 1600 / > 1600) silently dropped every row with date_id == 1600 from both
# sets; using >= for validation keeps those rows.
split_date_id = 1600
training_data = train_df.loc[train_df["date_id"] < split_date_id]
validation_data = train_df.loc[train_df["date_id"] >= split_date_id]

In [8]:
# Persist the training split as a parquet dataset partitioned by date_id.
# NOTE(review): with partition_cols the output path is a directory of files;
# re-running this cell may add files next to existing ones -- confirm and
# clear the directory first if needed.
training_data.to_parquet(
    path="training.parquet",
    partition_cols=["date_id"],  # split the data into partitions by 'date_id'
    engine="pyarrow",  # either pyarrow or fastparquet can be used
)

In [9]:
# Persist the validation split as a parquet dataset partitioned by date_id.
# NOTE(review): with partition_cols the output path is a directory of files;
# re-running this cell may add files next to existing ones -- confirm and
# clear the directory first if needed.
validation_data.to_parquet(
    path="validation.parquet",
    partition_cols=["date_id"],  # split the data into partitions by 'date_id'
    engine="pyarrow",  # either pyarrow or fastparquet can be used
)