In [8]:
%%capture

import polars as pl
import pandas as pd
import numpy as np
import re 

from gc import collect
from pprint import pprint

In [25]:
# --- Notebook-wide configuration --------------------------------------------

# Name of a responder column; presumably the prediction target
# (TODO confirm against the modelling cells).
TARGET = "responder_6"

# Seed value; presumably handed to stochastic components further down
# (TODO confirm).
RANDOM_STATE = 42

# Subtracted from the total row count to locate the row where the held-out
# (dev) portion begins, i.e. it acts as the number of trailing rows that
# anchor the train/dev boundary.
DEV_START_ID = 4_500_000

# Free-form tag identifying this iteration of the notebook.
VERSION_NUMBER = "V1_1"

# **DATA LOADING**

Here, we load the data and describe the CV scheme. We don't need to specify the sub-directory paths while importing the dataset: it is a **hive**-partitioned dataset, so polars picks up all the training partitions from the top-level train path alone. The `weight` column is important here — it is the sample weight used in our custom eval metric.

In [21]:
%%time 

# Expression yielding a 0-based UInt32 row counter, exposed as column "id".
id_col = pl.int_range(pl.len(), dtype=pl.UInt32).alias("id")
all_cols = pl.all()  # every column already present in the scanned data

# Lazily scan the training data and prepend the generated id column;
# nothing is materialized until `.collect()` is called on the LazyFrame.
file_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
train = pl.scan_parquet(file_path).select(id_col, all_cols)

all_col_names = train.collect_schema().names()
# Columns that are neither targets nor part of the selected (feature) list.
cols_of_disinterest = ("weight", "id", "date_id", "time_id", "partition_id")
target_columns, selected_columns = [], []

# Partition the schema: responder columns become targets, excluded columns
# are dropped, and everything else is kept as a selected column.
for col in all_col_names:
    if col.startswith("responder"):
        target_columns.append(col)

    # Exact-name membership rather than a prefix test: `str.startswith` with
    # this tuple would also discard any column whose name merely *begins*
    # with "id" or "weight", which is broader than the listed names.
    elif col not in cols_of_disinterest:
        selected_columns.append(col)

# Materialize only the weight column, as a Series.
sample_weight = train.select(pl.col("weight")).collect().to_series()
# NOTE: this is gc.collect (imported via `from gc import collect`), not polars'
# LazyFrame.collect; its integer return value is what the cell displays.
collect()

CPU times: user 244 ms, sys: 105 ms, total: 349 ms
Wall time: 166 ms


477

In [27]:
# Materialize only the date_id column; it is all that is needed to find the
# train/dev boundary.
date_column = train.select(pl.col("date_id")).collect()

train_length = date_column.shape[0]

# Fail fast on an impossible split instead of indexing with a negative or
# out-of-range row position further down.
if not 0 <= DEV_START_ID < train_length:
    raise ValueError(
        f"DEV_START_ID must be in [0, {train_length}); got {DEV_START_ID}"
    )

offline_train_length = train_length - DEV_START_ID

# The offline-train portion occupies row positions 0 .. offline_train_length - 1,
# so its final row sits at index offline_train_length - 1. Indexing at
# offline_train_length would read the first dev row instead (and would be out
# of bounds when DEV_START_ID == 0).
# NOTE(review): assumes rows are ordered by date_id, as the displayed frame
# suggests — confirm.
last_train_date = date_column.row(offline_train_length - 1)[0]

print(f"\n---> Last offline train date = {last_train_date}\n")

# Split on whole dates so that no single date_id straddles both sets; both
# results remain lazy until collected.
train_XY = train.filter(pl.col("date_id").le(last_train_date))
test_XY = train.filter(pl.col("date_id").gt(last_train_date))


---> Last offline train date = 1577



In [29]:
# train_XY.collect()

id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,…,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id
u32,i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64
0,0,0,1,3.889038,,,,,,0.851033,0.242971,0.2634,-0.891687,11,7,76,-0.883028,0.003067,-0.744703,,-0.169586,,-1.335938,-1.707803,0.91013,,1.636431,1.522133,-1.551398,-0.229627,,,1.378301,-0.283712,0.123196,,…,,,-0.808103,,-2.037683,0.727661,,-0.989118,-0.345213,-1.36224,,,,,,-1.251104,-0.110252,-0.491157,-1.02269,0.152241,-0.659864,,,-0.261412,-0.211486,-0.335556,-0.281498,0.738489,-0.069556,1.380875,2.005353,0.186018,1.218368,0.775981,0.346999,0.095504,0
1,0,0,7,1.370613,,,,,,0.676961,0.151984,0.192465,-0.521729,11,7,76,-0.865307,-0.225629,-0.582163,,0.317467,,-1.250016,-1.682929,1.412757,,0.520378,0.744132,-0.788658,0.641776,,,0.2272,0.580907,1.128879,,…,,,-1.625862,,-1.410017,1.063013,,0.888355,0.467994,-1.36224,,,,,,-1.065759,0.013322,-0.592855,-1.052685,-0.393726,-0.741603,,,-0.281207,-0.182894,-0.245565,-0.302441,2.965889,1.190077,-0.523998,3.849921,2.626981,5.0,0.703665,0.216683,0.778639,0
2,0,0,9,2.285698,,,,,,1.056285,0.187227,0.249901,-0.77305,11,7,76,-0.675719,-0.199404,-0.586798,,-0.814909,,-1.296782,-2.040234,0.639589,,1.597359,0.657514,-1.350148,0.364215,,,-0.017751,-0.317361,-0.122379,,…,,,-0.72542,,-2.29417,1.764551,,-0.120789,-0.063458,-1.36224,,,,,,-0.882604,-0.072482,-0.617934,-0.86323,-0.241892,-0.709919,,,0.377131,0.300724,-0.106842,-0.096792,-0.864488,-0.280303,-0.326697,0.375781,1.271291,0.099793,2.109352,0.670881,0.772828,0
3,0,0,10,0.690606,,,,,,1.139366,0.273328,0.306549,-1.262223,42,5,150,-0.694008,3.004091,0.114809,,-0.251882,,-1.902009,-0.979447,0.241165,,-0.392359,-0.224699,-2.129397,-0.855287,,,0.404142,-0.578156,0.105702,,…,,,1.313203,,-0.810125,2.939022,,3.988801,1.834661,-1.36224,,,,,,-0.697595,1.074309,-0.206929,-0.530602,4.765215,0.571554,,,-0.226891,-0.251412,-0.215522,-0.296244,0.408499,0.223992,2.294888,1.097444,1.225872,1.225376,1.114137,0.775199,-1.379516,0
4,0,0,14,0.44057,,,,,,0.9552,0.262404,0.344457,-0.613813,44,3,16,-0.947351,-0.030018,-0.502379,,0.646086,,-1.844685,-1.58656,-0.182024,,-0.969949,-0.673813,-1.282132,-1.399894,,,0.043815,-0.320225,-0.031713,,…,,,0.476195,,-0.771732,2.843421,,1.379815,0.411827,-1.36224,,,,,,-0.948601,-0.136814,-0.447704,-1.141761,0.099631,-0.661928,,,3.678076,2.793581,2.61825,3.418133,-0.373387,-0.502764,-0.348021,-3.928148,-1.591366,-5.0,-3.57282,-1.089123,-5.0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
42632909,1577,967,34,3.660226,2.519176,-2.278475,2.875026,3.315682,1.025185,-0.370229,0.653189,0.713246,0.62322,42,5,150,-0.055511,0.31459,0.247763,-0.974051,-0.860685,-0.855924,-0.199643,-0.778679,0.40343,-0.143742,0.947486,0.705441,3.107152,1.085474,-0.178259,1.341825,0.905374,-0.544969,-0.820301,-0.205174,…,-1.026273,0.075233,0.07799,-0.469423,-0.987165,-0.183636,-0.293783,0.116868,-0.23298,-0.064444,-0.333025,-0.491472,-0.516414,0.776839,-0.549226,0.021567,0.305745,0.20329,-0.15644,0.206233,0.056439,0.148967,0.050929,0.143107,0.049475,0.169593,0.058465,-0.195265,0.075032,1.955506,0.177934,0.062308,-0.024571,-0.476895,-0.210596,-0.918143,9
42632910,1577,967,35,0.955174,2.738421,-1.775275,3.060891,3.218405,0.986526,-0.601476,1.016235,0.425546,1.179295,11,7,76,1.902385,0.379721,1.662683,-0.336796,-0.352103,-0.279848,2.797304,0.098806,-0.630761,-0.006123,-0.988341,0.662364,2.1973,0.849408,0.149374,-1.596333,-1.681866,0.103714,-0.549737,-0.005916,…,1.527505,0.116301,0.491493,1.666221,-1.552811,-1.272698,-0.368834,0.155043,0.159979,-0.064444,-0.260491,-0.216017,-0.377262,2.353767,-0.48391,2.212185,0.575356,1.578336,2.030581,0.094061,1.395494,-0.20224,-0.293032,-0.099027,-0.261575,-0.051335,-0.23956,0.10242,0.389871,0.148695,0.312291,0.119847,0.074623,-0.069854,-0.004745,-0.190371,9
42632911,1577,967,36,1.52769,2.490472,-2.053605,2.967873,3.217054,1.259045,-0.335589,0.778718,0.575043,0.90865,49,7,297,1.06592,0.561817,0.633637,-1.008319,-0.518402,-0.53679,2.190968,-0.074627,-0.231568,0.077402,-0.456071,-0.230571,2.765544,1.294147,-0.367349,-1.282116,-1.233662,-0.549751,-0.410354,0.087597,…,1.360409,1.110435,-0.484696,0.753621,0.160502,-2.222699,0.178232,0.802404,1.917594,-0.064444,-0.34007,-0.165555,-0.333742,3.380794,-0.408364,1.342207,0.534511,0.904423,0.985025,1.08752,0.953389,-0.128052,-0.44821,0.080202,-0.146954,-0.13581,-0.397466,-0.525235,-0.687315,1.314439,0.641124,0.398823,0.758516,0.011844,0.038919,0.016978,9
42632912,1577,967,37,1.46837,2.370544,-2.288105,2.708388,3.043063,0.937361,-0.510386,0.7662,0.741773,0.890082,34,4,214,0.37889,0.482138,0.572032,-0.824915,-0.458182,-0.63762,-0.868831,1.803008,-0.986681,-0.251199,-0.807371,-1.207248,0.955806,-1.010138,0.471763,-0.411053,-0.924838,-0.606865,-0.906782,-0.227054,…,-1.448433,-0.09036,-0.870799,-0.279411,1.845173,0.468682,-0.075559,-0.384053,-0.34292,-0.064444,-0.356085,-0.217463,-0.374308,-1.123177,1.742532,0.2062,0.474088,0.603882,0.399289,0.240918,0.489607,-0.790253,-0.951341,-0.562744,-0.384101,-1.245237,-1.075663,-0.637954,-0.788437,1.849123,0.403603,0.220398,0.933471,-0.131771,-0.01773,-0.252235,9
