In [None]:
import pandas as pd
import polars as pl
import numpy as np
import os, gc
import time
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_lightning import (LightningDataModule, LightningModule, Trainer)
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer

import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

import kaggle_evaluation.jane_street_inference_server

In [None]:
class NN(LightningModule):
    """Feed-forward regressor whose output is bounded to [-5, 5].

    For every entry of ``hidden_dims`` the network stacks
    ``BatchNorm1d -> (SiLU, skipped for the first block) -> (Dropout, while
    ``dropouts`` has an entry for that block) -> Linear``, followed by a final
    ``Linear(…, 1)`` and ``Tanh``. ``forward`` scales the tanh output by 5.

    Args:
        input_dim: number of input features.
        hidden_dims: sizes of the hidden linear layers, in order.
        dropouts: dropout probabilities; entry ``i`` is applied in block ``i``.
        lr: learning rate for the Adam optimizer.
        weight_decay: weight decay for the Adam optimizer.
    """

    def __init__(self, input_dim, hidden_dims, dropouts, lr, weight_decay):
        super().__init__()
        # Stores the constructor arguments so load_from_checkpoint can rebuild the module.
        self.save_hyperparameters()
        # NOTE: the order of the layers fixes the state-dict keys; do not reorder.
        layers = []
        in_dim = input_dim
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(in_dim))
            if i > 0:
                layers.append(nn.SiLU())
            if i < len(dropouts):
                layers.append(nn.Dropout(dropouts[i]))
            layers.append(nn.Linear(in_dim, hidden_dim))
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, 1))
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)
        self.lr = lr
        self.weight_decay = weight_decay
        # (prediction, target, weight) triples collected during a validation epoch.
        self.validation_step_outputs = []

    def forward(self, x):
        """Return predictions of shape ``x.shape[:-1]``, scaled from tanh's (-1, 1) to (-5, 5)."""
        return 5 * self.model(x).squeeze(-1)

    def _weighted_mse(self, y_hat, y, w):
        """Mean over the batch of the per-sample squared error multiplied by its weight."""
        return (F.mse_loss(y_hat, y, reduction='none') * w).mean()

    def training_step(self, batch):
        """Compute and log the weighted MSE for one ``(x, y, w)`` training batch."""
        x, y, w = batch
        loss = self._weighted_mse(self(x), y, w)
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        return loss

    def validation_step(self, batch):
        """Compute and log the weighted MSE and keep the outputs for the epoch-level R²."""
        x, y, w = batch
        y_hat = self(x)
        loss = self._weighted_mse(y_hat, y, w)
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    def on_validation_epoch_end(self):
        """Log the sample-weighted zero-mean R² over the whole validation epoch.

        The score is ``1 - sum(w * (y - y_hat)^2) / sum(w * y^2)``. Nothing is
        logged during Lightning's sanity check; the collected outputs are
        cleared in every case.
        """
        outputs = self.validation_step_outputs
        if outputs and not self.trainer.sanity_checking:
            # float64 keeps the sums stable when many float32 samples are accumulated.
            prob = torch.cat([out[0] for out in outputs]).cpu().numpy().astype(np.float64)
            y = torch.cat([out[1] for out in outputs]).cpu().numpy().astype(np.float64)
            weights = torch.cat([out[2] for out in outputs]).cpu().numpy().astype(np.float64)
            denominator = np.sum(weights * y ** 2)
            if denominator > 0:
                val_r_square = float(1.0 - np.sum(weights * (y - prob) ** 2) / denominator)
            else:
                # All targets (or all weights) are zero: the score is undefined.
                val_r_square = float('nan')
            self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
        outputs.clear()

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler that halves the LR when ``val_loss`` stalls."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5,
                                                               verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    def on_train_epoch_end(self):
        """Print every metric logged so far, formatted to five decimals."""
        if self.trainer.sanity_checking:
            return
        epoch = self.trainer.current_epoch
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that pairs each feature row with its target value."""

    def __init__(self, X_data, Y_data):
        """
        Keep references to the two indexable containers (lists or arrays).

        X_data: input features
        Y_data: target labels
        """
        self.X_data, self.Y_data = X_data, Y_data

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X_data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(features, target)`` pair of float32 tensors."""
        features, target = (
            torch.tensor(container[idx], dtype=torch.float32)
            for container in (self.X_data, self.Y_data)
        )
        return features, target



In [None]:
# Model inputs: feature_00 … feature_78 without feature_09/10/11, followed by
# the nine lagged responders responder_0_lag_1 … responder_8_lag_1.
feature_names = [
    name
    for name in (
        [f"feature_{i:02d}" for i in range(79)]
        + [f"responder_{idx}_lag_1" for idx in range(9)]
    )
    if name not in ('feature_09', 'feature_10', 'feature_11')
]

# Column names of the regression target and of the sample weight.
target_name, weight_name = 'responder_6', 'weight'

# First CUDA device when one is available, CPU otherwise.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Lightning checkpoint that the serving model is restored from.
checkpoint_path = '/kaggle/input/no-feature09-10-11/nn_train_1100_valid_1638.model (4).ckpt'

# When True, the replay loop at the end of the notebook is executed.
DEBUG = False

# Online retraining schedule, all expressed in days.
retrain_every_n_days = 30       # retrain when day_count is a multiple of this
retrain_after_n_days = 30       # no retraining before this many days were seen
retrain_last_n_days_data = 30   # number of most recent days kept as training data

# Online retraining hyper-parameters.
retrain_epochs = 1
retrain_bs = 8196
retrain_lr = 1e-4
retrain_wd = retrain_lr / 5

In [None]:
# Initialize the module-level state that predict() reads and mutates across calls.
# Training rows kept for retraining (built from cache_list when a retrain triggers).
cache = None
# Joined test batches received since the last retrain.
cache_list = []
# Number of predict() calls that arrived with a lags frame (incremented once per such call).
day_count = 0
# Restore the network from the checkpoint and move it to the selected device.
model = NN.load_from_checkpoint(checkpoint_path).to(device)
# Most recent raw lags frame as received (stored by predict, not read in it).
lags_ : pl.DataFrame | None = None
# Accumulated responder_6 ground truth extracted from every lags frame.
labels : pl.DataFrame | None = None
# Last lag row per (date_id, symbol_id); joined onto each test batch as features.
lags_last : pl.DataFrame | None = None

# Loss and optimizer used by the online retraining loop inside predict().
# The optimizer must be created after `model` because it captures its parameters.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=retrain_lr, weight_decay=retrain_wd)
# optimizer = torch.optim.Adam(model.parameters(), lr=retrain_lr)

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Score one test batch and periodically fine-tune the model on recent data.

    Steps performed on every call:
      1. When ``lags`` is given, remember the last lag row per
         ``(date_id, symbol_id)``, bump ``day_count`` and store
         ``responder_6_lag_1`` as ground truth for later retraining.
      2. Join the remembered lag rows onto ``test`` and cache the joined batch.
      3. When ``lags`` is given and the day counter hits the retrain schedule,
         rebuild the training cache, attach the ground truth and run
         ``retrain_epochs`` passes over it.
      4. Predict ``responder_6`` for ``test`` with the (possibly updated) model.

    Args:
        test: rows to score; must contain ``row_id``, the id columns and the
            raw ``feature_XX`` columns.
        lags: lagged responders, or ``None`` when no new lags arrive with this
            batch. The join in step 2 relies on the lags carrying the same
            ``date_id`` as the test batch they arrive with.

    Returns:
        A polars DataFrame with columns ``row_id`` and ``responder_6`` (clipped
        to [-5, 5]) and one row per row of ``test``.
    """
    global cache          # Declare the global cache
    global cache_list
    global day_count
    global model
    global lags_
    global labels
    global lags_last
    global loss_fn
    global optimizer

    join_keys = ["date_id", "symbol_id"]
    id_column_types = {
        'date_id': pl.Int16,
        'time_id': pl.Int16,
        'symbol_id': pl.Int16
    }
    test = test.cast(id_column_types)

    if lags is not None:
        lags_ = lags
        lags = lags.cast(id_column_types)
        # pick up last record of previous date
        lags_last = lags.group_by(join_keys, maintain_order=True).last()
        day_count += 1
        # store ground truth from previous day
        update_labels = (
            lags
            .select(["date_id", "symbol_id", "time_id", "responder_6_lag_1"])
            .rename({"responder_6_lag_1": "responder_6"})
        )
        labels = update_labels if labels is None else pl.concat([labels, update_labels], rechunk=True)

    # join lag feature
    test = test.join(lags_last, on=join_keys, how="left")

    # store data for each batch
    cache_list.append(test)

    # ======================== retrain part ====================================================
    # re-train the model on the fly every N days (only on calls that carry lags)
    if lags is not None and day_count >= retrain_after_n_days and day_count % retrain_every_n_days == 0:
        # Update the cache: 1. merge the old cache with the newly collected batches,
        # 2. keep only the most recent days.
        cache_update = pl.concat(cache_list, rechunk=True)
        cache = cache_update if cache is None else pl.concat([cache, cache_update], rechunk=True)

        # store only last time_window days
        days = np.unique(cache["date_id"].to_numpy())  # np.unique returns sorted values
        days = days[-retrain_last_n_days_data:]
        # cache_list already holds this call's test batch, so the largest date_id in the
        # cache is the current day; go one day further back to compensate.
        min_day = int(np.min(days)) - 1
        cache = cache.filter(pl.col("date_id") >= min_day)
        print('cache最小最大date_id',np.min(cache["date_id"].to_numpy()),np.max(cache["date_id"].to_numpy()))

        # filter labels
        # move data back to the previous day (we receive the lags at the same day but they
        # are the ground truth of the previous day)
        labels_ = (
            labels
            .with_columns((pl.col("date_id") - 1).alias("date_id"))
            .filter(pl.col("date_id") >= cache["date_id"].min())
        )

        # prepare data for training
        # Rows whose ground truth has not arrived yet come out of the left join with a null
        # target. They are dropped: turning the null into 0 would teach the model that the
        # target of every unlabeled row is 0.
        train_df = (
            cache
            .join(labels_, on=["date_id", "symbol_id", "time_id"], how="left")
            .filter(pl.col(target_name).is_not_null())
        )

        if train_df.height > 0:
            X_train = np.nan_to_num(train_df[feature_names].to_numpy(), nan=0.0)
            y_train = np.nan_to_num(train_df[target_name].to_numpy().flatten(), nan=0.0)
            dataloader = DataLoader(MyDataset(X_train, y_train), batch_size=retrain_bs, shuffle=True)

            # Re-train the model
            for epoch in range(retrain_epochs):
                model.train()
                for X, y in dataloader:
                    # BatchNorm1d cannot compute batch statistics from a single sample in
                    # train mode and raises; skip such a trailing mini-batch.
                    if X.size(0) < 2:
                        continue
                    pred = model(X.to(device))
                    loss = loss_fn(pred, y.to(device))
                    # Backpropagation
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()

        # empty cache list: its batches now live in `cache`
        cache_list = []

    # fill na 0
    test_input = test[feature_names].to_pandas().fillna(0)
    test_input = torch.FloatTensor(test_input.values).to(device)

    model.eval()
    with torch.no_grad():
        # float64 to match the dtype of the returned column
        preds = model(test_input).cpu().numpy().astype(np.float64)

    predictions = test.select('row_id').with_columns(
        pl.Series(
            name='responder_6',
            values=np.clip(preds, a_min=-5, a_max=5),
            dtype=pl.Float64,
        )
    )

    # The predict function must return a DataFrame
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # with columns 'row_id', 'responder_6'
    assert list(predictions.columns) == ['row_id', 'responder_6']
    # and as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [None]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Outside of a competition rerun: replay the bundled test and lags files locally.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    # Competition rerun: hand control to the server.
    inference_server.serve()

# debug

In [None]:
def makelag(date_id):
    """
    Build the lags frame that `predict` should receive on the day after `date_id`.

    The responders of `date_id` are read from the training data and stamped
    with `date_id + 1`. `predict` joins the lags onto the test batch on
    (`date_id`, `symbol_id`) and shifts the stored labels back by one day, so
    the lags must carry the date of the day they are served on. Keeping the
    original `date_id` makes that join match nothing and attaches the labels
    to the wrong day.

    Relies on the module-level `train`, `alltraindata` and `lag_sample`.

    Args:
    date_id (int): date_id of the previous day (the day the responders belong to)

    Returns:
    pl.DataFrame with the column names of `lag_sample`
    """
    responder_cols = [s for s in train.columns if "responder" in s]
    lag = (
        alltraindata
        .filter(pl.col("date_id") == date_id)
        .select(["date_id", "time_id", "symbol_id"] + responder_cols)
        # stamp the rows with the day on which they are served as lags
        .with_columns(pl.col("date_id") + 1)
        .collect()
    )
    # Positional rename to the layout of the sample lags file.
    lag.columns = lag_sample.columns

    return lag

def weighted_zero_mean_r2(y_true, y_pred, weights):
    """
    Calculate the sample weighted zero-mean R-squared score:

        1 - sum(w * (y_true - y_pred)^2) / sum(w * y_true^2)

    Inputs may be any array-likes of equal (or broadcastable) shape; they are
    converted to float64 arrays so that lists work and large float32 inputs are
    summed in double precision.

    Parameters:
    y_true (array-like): Ground-truth values for responder_6.
    y_pred (array-like): Predicted values for responder_6.
    weights (array-like): Sample weight vector.

    Returns:
    float: The weighted zero-mean R-squared score. When sum(w * y_true^2) is 0
    the division follows NumPy semantics (nan or -inf, with a RuntimeWarning).
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    weights = np.asarray(weights, dtype=np.float64)

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * y_true ** 2)

    # Returned directly: a local named `r2_score` would shadow sklearn's r2_score.
    return 1 - numerator / denominator


# Local replay of the competition protocol over the tail of the training data:
# feed predict() one (date_id, time_id) batch at a time and score the result.
if DEBUG:
    all_submission_dataframe = []
    # Sample lags file; makelag() copies its column names.
    lag_sample = pl.read_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet/date_id=0/part-0.parquet")
    alltraindata = pl.scan_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet")
    # number of trailing days to replay (rows with date_id > 1698 - nb_days)
    nb_days = 60
    train = alltraindata.filter(pl.col("date_id")>1698-nb_days).collect()
    # predict() expects a row_id column; use the row position.
    train = train.with_columns(pl.Series(range(len(train))).alias("row_id"))

    ## Step 1 The data is split by day using group_by.
    for num_days, df_per_day in train.group_by("date_id",maintain_order=True):
        
        ## Step 2 The data is split by time_id using group_by, and the lag is generated (for time_id == 0).
        
        for time_id, test in df_per_day.group_by("time_id",maintain_order=True):
            
            ## Lags are only supplied with the first batch of a day (time_id == 0);
            ## they are built from the previous day's responders.
            
            if time_id[0] == 0:
                lag = makelag(num_days[0] - 1)
            else:
                lag = None
            start_time = time.time()
            submission_dataframe = predict(test, lag)
            elapsed = time.time() - start_time
            # Report batches that took more than 30 seconds.
            if elapsed>30:
                print(num_days,time_id)
                print('耗时：{}'.format(elapsed))
            all_submission_dataframe.append(submission_dataframe)
            
    all_submission_dataframe = pl.concat(all_submission_dataframe)
    

    # Score: predictions are compared with `train` positionally, so this relies on the
    # concatenated batches being in the same row order as `train`.
    print(weighted_zero_mean_r2(train.select("responder_6").to_numpy().reshape(-1), all_submission_dataframe.select("responder_6").to_numpy().reshape(-1), train.select("weight").to_numpy().reshape(-1)))