In [1]:
# Packages

import os
import polars as pl
import numpy as np 
import pandas as pd 
import glob
import dask.dataframe as dd
import pyarrow as pa
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from dask.distributed import Client
from xgboost import dask as dxgb

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Loading and concatenating the whole dataset

class LoadData:
    """Lazily load a collection of parquet files as one Polars LazyFrame.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the parquet files to scan. They are concatenated in the
        order given.
    """

    def __init__(self, file_paths):
        self.file_paths = file_paths

    def load_and_concat(self):
        """Scan every file lazily and concatenate the scans vertically.

        Returns
        -------
        pl.LazyFrame
            A lazy frame over all files; nothing is read until it is collected.

        Raises
        ------
        ValueError
            If no file paths were supplied (for example because the glob
            pattern used to build them matched nothing).
        """
        paths = list(self.file_paths)
        if not paths:
            # Fail early with an actionable message instead of letting
            # pl.concat complain about an empty list.
            raise ValueError(
                "LoadData received no parquet files to load; check the file path pattern."
            )

        # scan_parquet builds a lazy scan per file; no data is read here.
        lazy_parts = [pl.scan_parquet(path) for path in paths]

        # Concatenating LazyFrames stays lazy regardless of `rechunk`;
        # rechunk=False only skips making the result contiguous in memory.
        return pl.concat(lazy_parts, rechunk=False)
    
# Gather every partition file of the training set, in lexicographic path order
parquet_pattern = 'Data/train.parquet/*/*.parquet'
file_paths = sorted(glob.glob(parquet_pattern))

# Build the loader and expose the whole dataset as a single lazy frame
loader = LoadData(file_paths)
df_full = loader.load_and_concat()  # df_full is still a lazy frame at this point

In [3]:
# Feature engineering: per-day mean of each responder, lagged by one day, joined onto every row

# collect_schema() resolves the column names explicitly; reading LazyFrame.schema
# directly emits a PerformanceWarning.
responder_columns = [col for col in df_full.collect_schema().names() if col.startswith("responder_")]
mean_columns = [f"daily_{responder}_mean" for responder in responder_columns]
aggregations = [
    pl.col(responder).mean().alias(mean_name)
    for responder, mean_name in zip(responder_columns, mean_columns)
]

# A same-day mean of a responder is computed from the very values the model is
# asked to predict (target leakage). Sorting by date and shifting by one row makes
# each date_id carry the means of the previous available day instead; the first
# day therefore has nulls in these columns.
df_daily_means = (
    df_full.group_by("date_id")
    .agg(aggregations)
    .sort("date_id")
    .with_columns(pl.col(mean_columns).shift(1))
)

# Inner join on date_id: every row keeps its date, so the row count is unchanged.
df_full = df_full.join(df_daily_means, on="date_id")
# From here on df_full is an eager DataFrame (this cell is not re-runnable
# without re-running the loading cell first).
df_full = df_full.collect()
df_full['date_id'].describe()

  responder_columns = [col for col, dtype in df_full.schema.items() if col.startswith("responder_")]


statistic,value
str,f64
"""count""",47127338.0
"""null_count""",0.0
"""mean""",1005.479389
"""std""",445.181943
"""min""",0.0
"""25%""",679.0
"""50%""",1060.0
"""75%""",1376.0
"""max""",1698.0


In [4]:
# Splitting the whole dataset into train and validation at the 75th percentile of the date_id value

# Compute the cut-off once: the percentile needs a full NumPy copy of the date_id
# column, so evaluating it separately for each filter doubles that cost.
split_date_id = np.percentile(df_full['date_id'].to_numpy(), 75)

# Rows strictly before the cut-off are used for training, the rest for validation.
train_df = df_full.filter(pl.col('date_id') < split_date_id)
val_df = df_full.filter(pl.col('date_id') >= split_date_id)

print(train_df.shape)
print(val_df.shape)

(35309026, 101)
(11818312, 101)


In [5]:
# Materialise the validation frame as pandas, collecting it first if it is still lazy
if isinstance(val_df, pl.LazyFrame):
    val_df_pandas = val_df.collect().to_pandas()
else:
    val_df_pandas = val_df.to_pandas()

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Preview each fold: the first and last 10 row positions of both parts, then their sizes.
# NOTE(review): the split is positional, so it presumably relies on the rows being in
# chronological order - confirm.
separator = "=" * 50
for fold, (train_index, test_index) in enumerate(tscv.split(val_df_pandas), start=1):
    fold_parts = (("Train", train_index), ("Test", test_index))
    print(f"Fold {fold}")
    for part_name, part_index in fold_parts:
        print(f"{part_name} indices:", part_index[:10], "...", part_index[-10:])
    for part_name, part_index in fold_parts:
        print(f"{part_name} set size:", len(part_index))
    print(separator)

Fold 1
Train indices: [0 1 2 3 4 5 6 7 8 9] ... [1969712 1969713 1969714 1969715 1969716 1969717 1969718 1969719 1969720
 1969721]
Test indices: [1969722 1969723 1969724 1969725 1969726 1969727 1969728 1969729 1969730
 1969731] ... [3939430 3939431 3939432 3939433 3939434 3939435 3939436 3939437 3939438
 3939439]
Train set size: 1969722
Test set size: 1969718
Fold 2
Train indices: [0 1 2 3 4 5 6 7 8 9] ... [3939430 3939431 3939432 3939433 3939434 3939435 3939436 3939437 3939438
 3939439]
Test indices: [3939440 3939441 3939442 3939443 3939444 3939445 3939446 3939447 3939448
 3939449] ... [5909148 5909149 5909150 5909151 5909152 5909153 5909154 5909155 5909156
 5909157]
Train set size: 3939440
Test set size: 1969718
Fold 3
Train indices: [0 1 2 3 4 5 6 7 8 9] ... [5909148 5909149 5909150 5909151 5909152 5909153 5909154 5909155 5909156
 5909157]
Test indices: [5909158 5909159 5909160 5909161 5909162 5909163 5909164 5909165 5909166
 5909167] ... [7878866 7878867 7878868 7878869 7878870 787

In [None]:
%%time

# Training the baseline XGBoost model

# Starts a local Dask cluster with default settings; re-running this cell starts another one.
client = Client()

# Convert Polars DataFrame to Dask DataFrame (goes through an in-memory pandas copy)
train_dask_df = dd.from_pandas(train_df.to_pandas(), npartitions=10)  # Choose a suitable partition size

# Define features and target
# NOTE(review): only the target is dropped, so the other responder_* columns remain
# features. If they are not available at prediction time this leaks information -
# confirm and drop them as well.
X_train = train_dask_df.drop(columns=['responder_6'])
y_train = train_dask_df['responder_6']

# Booster parameters for the native (non-scikit-learn) training API.
# `n_estimators` is a scikit-learn wrapper argument and is ignored by
# xgboost.dask.train, which would silently fall back to its default of 10 rounds;
# the number of trees is therefore passed as `num_boost_round` below.
params = {
    "objective": "reg:squarederror",
    "max_depth": 6,
    "learning_rate": 0.1,
    "seed": 42,
}
num_boost_round = 100

# Train using XGBoost Dask interface
dtrain = dxgb.DaskDMatrix(client, X_train, y_train)
# The return value is a dict: {"booster": <trained Booster>, "history": <eval history>}.
xgb_model = dxgb.train(client, params, dtrain=dtrain, num_boost_round=num_boost_round)