In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl
import lightgbm as lgb
import pickle, random, os, logging,glob
import kaggle_evaluation.jane_street_inference_server
from joblib import Parallel, delayed
from sklearnex import patch_sklearn
import torch,json, math
import warnings
from concurrent.futures import ThreadPoolExecutor


# Enable the Intel scikit-learn extension and keep its logger quiet.
patch_sklearn()
logging.getLogger('sklearnex').setLevel(logging.WARNING)
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.storage")

# Mapping of "/"-joined feature-name combinations to a numeric value
# (presumably a validation score where lower is better -- confirm).
with open("/kaggle/input/feature-importance/features_information_v2.json", mode="r") as file:
    feature_importance = json.load(file)

# Ascending sort by (value, combination name); keep the first ten entries.
top_k_comb = sorted(
    feature_importance.items(),
    key=lambda entry: (entry[-1], entry[0]),
)[:10]
# The best-ranked combination supplies the model's input columns.
cols = top_k_comb[0][0].split("/")


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
def get_slopes(n):
    """Return the list of ``n`` per-head ALiBi slopes.

    For ``n`` a power of two the slopes form a geometric sequence whose first
    term equals its ratio. For any other ``n`` the sequence for the largest
    power of two below ``n`` is extended with every second slope of the
    sequence for twice that power, so the power-of-two properties the original
    paper relies on are approximately preserved.
    """
    def _geometric_slopes(count):
        # The first slope doubles as the common ratio of the sequence.
        base = (2**(-2**-(math.log2(count)-3)))
        return [base*base**k for k in range(count)]

    if not math.log2(n).is_integer():
        lower_pow2 = 2**math.floor(math.log2(n))
        # Borrow the even-indexed slopes of the next power of two for the
        # heads that do not fit in the lower power of two.
        borrowed = get_slopes(2*lower_pow2)[0::2][:n-lower_pow2]
        return _geometric_slopes(lower_pow2) + borrowed

    return _geometric_slopes(n)

def get_alibi_slope(num_heads):
    """Return the ALiBi slopes as a float tensor of shape ``(num_heads, 1, 1)``."""
    slopes = torch.Tensor(get_slopes(num_heads))
    # Two trailing singleton axes let the slopes broadcast over a 2-D grid.
    return slopes[:, None, None]

def get_alibi_bias(num_heads, seq_len):
    """Build the additive ALiBi bias of shape ``(num_heads, seq_len, seq_len)``.

    Entry ``[h, i, j]`` is ``slope[h] * (j - i)``.
    """
    positions = torch.arange(seq_len)
    # Column index minus row index: negative below the diagonal.
    offsets = positions.unsqueeze(0) - positions.unsqueeze(1)
    return get_alibi_slope(num_heads) * offsets
    
def init_weights(m):
    """Initialise a single module in place; intended for ``Module.apply``.

    ``Linear`` and ``Embedding`` weights receive Kaiming-normal initialisation
    (``fan_out`` mode, leaky-ReLU gain with ``a=0``). ``Linear`` biases, when
    present, are zeroed. Every other module type is left untouched.
    """
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.kaiming_normal_(m.weight, a=0, mode='fan_out', nonlinearity='leaky_relu')
        if m.bias is not None:
            # Fully qualified: only ``torch`` is imported in this notebook, so
            # a bare ``nn.init`` raises NameError.
            torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, torch.nn.Embedding):
        torch.nn.init.kaiming_normal_(m.weight, a=0, mode='fan_out', nonlinearity='leaky_relu')


class MultiHeadMaskAttention(torch.nn.Module):
    """Causally masked multi-head self-attention with an additive ALiBi bias.

    Queries, keys and values are projected to ``dim_out * num_heads`` features
    and split into heads. A learned positional embedding is added to the
    values, the heads are averaged (not concatenated), and the result goes
    through ``out_proj``.

    Args:
        dim_in: Feature size of the input.
        dim_out: Per-head feature size. NOTE(review): ``forward`` reshapes the
            projections with the input's feature size, so this appears to
            require ``dim_out == dim_in`` -- confirm.
        num_heads: Number of attention heads.
        window: Sequence length; sizes the positional embedding, the causal
            mask and the ALiBi bias buffers.
        bias: Whether the query/key/value projections carry a bias term.
        num_layers: Accepted but not used by this class.
    """

    def __init__(self, dim_in, dim_out, num_heads, window, bias, num_layers):
        super().__init__()
        self.num_heads = num_heads
        self.pos_emb = torch.nn.Embedding(window, dim_in)

        projected_width = dim_out*num_heads
        self.W_query = torch.nn.Linear(dim_in, projected_width, bias=bias)
        self.W_key = torch.nn.Linear(dim_in, projected_width, bias=bias)
        self.W_value = torch.nn.Linear(dim_in, projected_width, bias=bias)
        self.out_proj = torch.nn.Linear(dim_out, dim_out)

        self.attn_scale = math.sqrt(dim_in)
        self.softmax = torch.nn.Softmax(dim=-1)

        # True strictly above the diagonal: positions a query may not see.
        future_positions = torch.triu(torch.ones(window,window), diagonal=1).bool()
        self.register_buffer("mask", future_positions)
        self.register_buffer("bias", get_alibi_bias(num_heads, window))

    def forward(self, inp):
        """Attend over ``inp`` of shape ``(batch, window, emb)``; same shape out."""
        batch, window, emb = inp.shape
        pos_emb = self.pos_emb(torch.arange(window, device=inp.device))

        def split_heads(projected):
            # (batch, window, heads*emb) -> (batch, heads, window, emb)
            return projected.view(batch, window, self.num_heads, emb).transpose(1,2)

        queries = split_heads(self.W_query(inp))
        keys = split_heads(self.W_key(inp))
        # Positional information is injected into the values only.
        values = split_heads(self.W_value(inp)) + pos_emb

        scores = (queries @ keys.mT)/self.attn_scale + self.bias
        scores = scores.masked_fill(self.mask, -torch.inf)
        weights = self.softmax(scores)

        # Back to (batch, window, heads, emb), then average across heads.
        per_head = (weights @ values).transpose(1,2)
        merged = per_head.mean(dim=-2)
        # The projection output is divided by 0.01, i.e. scaled up 100x.
        return self.out_proj(merged)/0.01

class LayerNorm(torch.nn.Module):
    """Normalise the last dimension and add a learned per-feature offset.

    NOTE(review): this differs from textbook layer norm in two ways. ``eps``
    is added to the standard deviation rather than to the variance, and the
    affine step is ``norm + shift * scale`` rather than
    ``norm * scale + shift``. Both are kept as-is because the checkpoints
    loaded by this notebook were presumably trained with this exact form --
    confirm before changing.
    """

    def __init__(self, dim):
        super().__init__()
        self.eps = 1e-5
        self.shift = torch.nn.Parameter(torch.zeros(dim))
        self.scale = torch.nn.Parameter(torch.ones(dim))

    def forward(self, inp):
        """Return the normalised input plus ``shift * scale``."""
        centered = inp - inp.mean(dim=-1, keepdim=True)
        # Biased (population) variance over the feature axis.
        std = torch.sqrt(inp.var(dim=-1, keepdim=True, unbiased=False))
        normalised = centered/(std+self.eps)
        return normalised + self.shift * self.scale

class FeedForwardLayer(torch.nn.Module):
    """Two-layer MLP: expand to ``n_features * window`` units, then project back.

    Args:
        cfg: Mapping that provides ``"n_features"`` and ``"window"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["n_features"]
        hidden = width * cfg["window"]
        self.layer = torch.nn.Sequential(
            torch.nn.Linear(width, hidden),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(hidden, width),
        )

    def forward(self, inp):
        """Apply the MLP to the last dimension of ``inp``."""
        return self.layer(inp)

class TransformerBlock(torch.nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    Args:
        cfg: Mapping that provides ``"n_features"``, ``"num_heads"``,
            ``"window"``, ``"qkv_bias"``, ``"n_layers"`` and ``"drop_rate"``.
    """

    def __init__(self, cfg):
        super().__init__()
        n_features = cfg["n_features"]

        self.attention = MultiHeadMaskAttention(
            dim_in=n_features,
            dim_out=n_features,
            num_heads=cfg["num_heads"],
            window=cfg["window"],
            bias=cfg["qkv_bias"],
            num_layers=cfg["n_layers"],
        )
        self.forward_layer = FeedForwardLayer(cfg)
        self.norm1 = LayerNorm(n_features)
        self.norm2 = LayerNorm(n_features)
        self.dropout = torch.nn.Dropout(cfg["drop_rate"])

    def forward(self, inp):
        """Run both sub-layers; dropout is applied to the feed-forward branch only."""
        # Attention sub-layer with its residual connection.
        attended = self.attention(self.norm1(inp)) + inp
        # Feed-forward sub-layer with its residual connection.
        transformed = self.forward_layer(self.norm2(attended))
        return self.dropout(transformed) + attended

In [3]:
class JaneStreetModelV1(torch.nn.Module):
    """Sequence model: stacked ``TransformerBlock``s, pooling, one scalar output.

    Args:
        cfg: Mapping that provides ``"n_layers"`` and ``"n_features"``, plus
            every key ``TransformerBlock`` reads.
    """

    def __init__(self, cfg):
        super().__init__()
        n_features = cfg["n_features"]
        self.blocks = torch.nn.Sequential(
            *(TransformerBlock(cfg) for _ in range(cfg["n_layers"]))
        )
        self.norm =  LayerNorm(n_features)
        self.final_norm = LayerNorm(n_features)
        self.out_proj = torch.nn.Linear(n_features, 1, bias=False)

    def forward(self, inp):
        """Map ``(batch, window, n_features)`` input to ``(batch, 1)`` output."""
        encoded = self.blocks(self.norm(inp))
        # Average over dim 1 (the window axis), then divide by 0.1 (10x).
        pooled = encoded.mean(dim=1)/0.1
        return self.out_proj(self.final_norm(pooled))

In [4]:
class LayerNorm2(torch.nn.Module):
    """Normalise the last dimension and add a learned per-feature offset.

    Performs the same computation as ``LayerNorm`` defined earlier in this
    notebook.

    NOTE(review): ``eps`` is added to the standard deviation rather than to
    the variance, and the affine step is ``norm + shift * scale`` rather than
    ``norm * scale + shift``. Both are kept because the loaded checkpoint was
    presumably trained with this exact form -- confirm before changing.
    """

    def __init__(self, dim):
        super().__init__()
        self.eps = 1e-5
        self.shift = torch.nn.Parameter(torch.zeros(dim))
        self.scale = torch.nn.Parameter(torch.ones(dim))

    def forward(self, inp):
        """Return the normalised input plus ``shift * scale``."""
        centered = inp - inp.mean(dim=-1, keepdim=True)
        # Biased (population) variance over the feature axis.
        std = torch.sqrt(inp.var(dim=-1, keepdim=True, unbiased=False))
        normalised = centered/(std+self.eps)
        return normalised + self.shift * self.scale

class FeedForwardLayer2(torch.nn.Module):
    """MLP with a 4x hidden expansion plus a single learned offset vector.

    Args:
        dim: Size of the input and output feature dimension.
    """

    def __init__(self, dim):
        super().__init__()
        # One-row embedding table: it acts as a learned additive vector.
        self.pos_emb = torch.nn.Embedding(1, dim)
        hidden = dim * 4
        self.layer = torch.nn.Sequential(
            torch.nn.Linear(dim, hidden),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(hidden, dim),
        )

    def forward(self, inp):
        """Return ``MLP(inp)`` plus the learned offset (embedding row 0)."""
        row_zero = torch.tensor(0, device=inp.device)
        offset = self.pos_emb(row_zero)
        return self.layer(inp) + offset

class TransformerBlock2(torch.nn.Module):
    """Attention-free residual block: norm, feed-forward, dropout, skip connection.

    Args:
        cfg: Mapping that provides ``"n_features"`` and ``"drop_rate"``.
    """

    def __init__(self, cfg):
        super().__init__()
        n_features = cfg["n_features"]
        self.forward_layer = FeedForwardLayer2(n_features)
        self.norm = LayerNorm2(n_features)
        self.dropout = torch.nn.Dropout(cfg["drop_rate"])

    def forward(self, inp):
        """Return ``inp`` plus the dropped-out, rescaled feed-forward branch."""
        # The branch output is divided by 0.1 (10x) before dropout.
        branch = self.forward_layer(self.norm(inp))/0.1
        return self.dropout(branch) + inp

class JaneStreetModelV2(torch.nn.Module):
    """Row-wise model: stacked ``TransformerBlock2`` layers and a scalar head.

    Args:
        cfg: Mapping that provides ``"n_layers"``, ``"n_features"`` and
            ``"drop_rate"``.
    """

    def __init__(self, cfg):
        super().__init__()
        n_features = cfg["n_features"]
        self.blocks = torch.nn.Sequential(
            *(TransformerBlock2(cfg) for _ in range(cfg["n_layers"]))
        )
        self.norm =  LayerNorm2(n_features)
        self.final_norm = LayerNorm2(n_features)
        self.out_proj = torch.nn.Linear(n_features, 1, bias=False)

    def forward(self, inp):
        """Project ``(..., n_features)`` input to ``(..., 1)`` output."""
        encoded = self.blocks(self.norm(inp))
        return self.out_proj(self.final_norm(encoded))

In [5]:
def load_models(checkpoints, MODEL_CONFIG):
    """Build one ``JaneStreetModelV1`` per checkpoint entry, ready for inference.

    Args:
        checkpoints: Mapping of symbol key to a model ``state_dict``.
        MODEL_CONFIG: Model hyper-parameters; ``"device"`` is where each model
            is placed before its weights are loaded.

    Returns:
        Dict with the same keys as ``checkpoints`` whose values are the loaded
        models, switched to eval mode.
    """
    target_device = MODEL_CONFIG["device"]
    loaded = {}
    for symbol, state in checkpoints.items():
        net = JaneStreetModelV1(MODEL_CONFIG)
        net.to(target_device)
        net.load_state_dict(state)
        # ``eval`` returns the module itself after disabling dropout.
        loaded[symbol] = net.eval()
    return loaded

def load_model(checkpoint, MODEL_CONFIG):
    """Build a ``JaneStreetModelV2`` from a checkpoint, ready for inference.

    Args:
        checkpoint: Mapping whose ``"model_state_dict"`` entry holds the weights.
        MODEL_CONFIG: Model hyper-parameters; ``"device"`` is where the model
            is placed before its weights are loaded.

    Returns:
        The loaded model in eval mode.
    """
    net = JaneStreetModelV2(MODEL_CONFIG)
    net.to(MODEL_CONFIG["device"])
    weights = checkpoint["model_state_dict"]
    net.load_state_dict(weights)
    net.eval()
    return net

In [6]:
# Hyper-parameters for the per-symbol attention models (JaneStreetModelV1).
MODEL_CONFIG = {
    # Input width: one value per selected feature column.
    "n_features": len(cols),
    # Sequence length; sizes the positional embedding, causal mask and ALiBi
    # bias, and multiplies the feed-forward hidden width.
    "window": 4,
    # Number of attention heads per block.
    "num_heads": 3,
    # Number of stacked TransformerBlock layers.
    "n_layers": 3,
    # Dropout probability (inactive once the model is in eval mode).
    "drop_rate": 0.05,
    # Whether the query/key/value projections carry a bias term.
    "qkv_bias": False,
    # Use the GPU when one is available, otherwise fall back to the CPU.
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

# Hyper-parameters for the attention-free fallback model (JaneStreetModelV2).
MODEL_CONFIG2 = {
    # Input width: one value per selected feature column.
    "n_features": len(cols),
    # Number of stacked TransformerBlock2 layers.
    "n_layers": 6,
    # Dropout probability (inactive once the model is in eval mode).
    "drop_rate": 0.1,
    # Use the GPU when one is available, otherwise fall back to the CPU.
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

In [7]:
# Read the saved weights straight onto the configured device.
# `checkpoints` is consumed by load_models as a mapping of symbol -> state_dict;
# `checkpoint2` is consumed by load_model via its "model_state_dict" entry.
checkpoints = torch.load("/kaggle/input/llm-transformer/models.pth", weights_only=True, map_location=MODEL_CONFIG["device"])
checkpoint2 = torch.load("/kaggle/input/llm-transformer/model.pth", weights_only=True, map_location=MODEL_CONFIG2["device"])
# One JaneStreetModelV1 per symbol, plus a single JaneStreetModelV2 fallback.
models = load_models(checkpoints, MODEL_CONFIG)
model2 = load_model(checkpoint2, MODEL_CONFIG2)

In [8]:
class JaneStreetSubmissionV1():
    """Inference wrapper that routes each test row to the right model.

    Rows whose ``symbol_id`` is below ``N_KNOWN_SYMBOLS`` go to that symbol's
    model in ``model``. When ``window`` is set, the model input is a rolling
    window built from a per-symbol buffer of recent rows. All remaining rows
    go to ``model_plain`` one row at a time.

    Args:
        cols: Names of the feature columns fed to the models.
        path_name: Parquet file used to seed the per-symbol row buffers.
        agg_means: ``{column: value}`` NaN replacements for the plain model.
        nan_means: ``{symbol: {column: value}}`` NaN replacements per symbol.
        model: Mapping of symbol id to its model.
        model_plain: Model used for symbols without a dedicated model.
        window: Rolling-window length; falsy disables windowing.
        history: Number of past rows kept per symbol between calls. Values
            below ``window - 1`` (including ``None``) are raised to
            ``window - 1``, the minimum needed to form a full window.
    """

    # Symbol ids below this bound have a dedicated per-symbol model.
    N_KNOWN_SYMBOLS = 39

    def __init__(self, cols, path_name, agg_means, nan_means, model, model_plain,
                 window=None, history=None):
        self.test_columns = ["row_id",'symbol_id', 'date_id', 'time_id']
        self.columns = cols
        self.fillna = nan_means
        self.agg_means = agg_means
        self.window = window
        self.history = history
        self.model = model
        self.model_plain = model_plain

        if self.window:
            # Seed every symbol's buffer with its most recent training rows.
            self.df = self.get_history(path_name,
                                       symbols=list(range(self.N_KNOWN_SYMBOLS)),
                                       size=self.window*10)

    def get_history(self, path, symbols, size=1000):
        """Return ``{symbol: ndarray}`` holding each symbol's last ``size`` rows.

        Rows are ordered by ``(date_id, time_id)`` and restricted to the
        feature columns. A symbol absent from the file gets an empty array.
        """
        parquet_file = pl.scan_parquet(path).select(
            ['symbol_id', 'date_id', 'time_id']+self.columns
        )
        # Filter to the requested symbols, order chronologically, then keep
        # the tail of every symbol's series.
        df = {}
        data = (
            parquet_file
            .filter(pl.col("symbol_id").is_in(symbols))
            .sort(["date_id", "time_id", "symbol_id"])
            .group_by("symbol_id").tail(size)
        )
        # group_by does not promise row order, so sort again after collecting.
        data = data.collect().sort(["date_id", "time_id", "symbol_id"])
        for symbol in symbols:
            df[symbol] = data.filter(pl.col("symbol_id") == symbol).select(self.columns).to_numpy()
        return df

    def rolling_test(self, data):
        """Stack every length-``window`` run of consecutive rows of ``data``.

        Returns an array of shape ``(rows - window + 1, window, n_columns)``.
        ``sliding_window_view`` raises ``ValueError`` when ``data`` has fewer
        than ``window`` rows.
        """
        size = data.shape[0] - self.window + 1
        inputs = np.lib.stride_tricks.sliding_window_view(data,
                                                          (self.window, data.shape[1]),
                                                          axis=(0, 1)).reshape(size, self.window, data.shape[1])
        return inputs

    def numpy_fillna(self, arr, fillna_dict):
        """Return a copy of ``arr`` with NaNs replaced column by column.

        Column ``i`` uses ``fillna_dict[self.columns[i]]`` as its replacement.
        """
        arr_copy = arr.copy()  # Avoid modifying the original array
        for idx, col in enumerate(self.columns):
            arr_copy[:, idx] = np.nan_to_num(arr_copy[:, idx], nan=fillna_dict[col])
        return arr_copy

    def _build(self, test_np, symbol):
        """Turn a symbol's new rows into model inputs.

        With windowing on, the rows are appended to the symbol's buffer and
        one window is returned per new row (each window ends at that row).
        Without windowing the NaN-filled rows are returned unchanged in shape.
        """
        if not self.window:
            return self.numpy_fillna(test_np, self.fillna[symbol])

        # Keep at least window - 1 past rows so every new row closes a window.
        keep = max(self.history or 0, self.window - 1)
        past = self.df[symbol]
        # Explicit start index: a plain ``[-keep:]`` would keep everything
        # when keep == 0.
        past = past[max(len(past) - keep, 0):]
        # The buffer stores raw rows; NaNs are filled on a copy at each call.
        self.df[symbol] = np.vstack((past, test_np))
        filled = self.numpy_fillna(self.df[symbol], self.fillna[symbol])
        windows = self.rolling_test(filled)
        # One window per incoming row, so predictions line up with row ids.
        return windows[max(len(windows) - len(test_np), 0):]

    @staticmethod
    def _model_device(model):
        """Return the device of ``model``'s first parameter, or CPU if none."""
        try:
            return next(model.parameters()).device
        except (AttributeError, StopIteration):
            return torch.device("cpu")

    def _run_model(self, model, inputs):
        """Run ``model`` on ``inputs`` without gradients; return a NumPy array."""
        # Taking the device from the model keeps inputs and weights together
        # without relying on a notebook-level global.
        batch = torch.tensor(inputs, device=self._model_device(model), dtype=torch.float32)
        with torch.no_grad():
            return model(batch).detach().cpu().numpy()

    def _forecast(self, test_np, symbol, row_ids):
        """Predict for one symbol; returns ``(row_id, prediction)`` rows."""
        inputs = self._build(test_np=test_np, symbol=symbol)
        predictions = self._run_model(self.model[symbol], inputs)
        return np.column_stack((row_ids, predictions))

    def predict(self, test):
        """Predict for every row of ``test``.

        Args:
            test: Frame exposing ``select(columns).to_numpy()`` (a polars
                DataFrame in this notebook) that contains the id columns and
                the feature columns.

        Returns:
            Array of shape ``(n_rows, 2)`` with ``row_id`` and prediction.
            Rows are grouped by symbol, with known symbols first; an empty
            input yields an empty ``(0, 2)`` array.
        """
        # Column layout: row_id, symbol_id, date_id, time_id, then features.
        test_np = test.select(self.test_columns + self.columns).to_numpy()
        n_meta = len(self.test_columns)

        test_np_symbols = test_np[test_np[:, 1] < self.N_KNOWN_SYMBOLS]  # symbols with their own model
        test_np_plain = test_np[test_np[:, 1] >= self.N_KNOWN_SYMBOLS]  # symbols without one

        results = []
        for symbol in np.unique(test_np_symbols[:, 1]):
            # Select the symbol's rows once and reuse them for ids and features.
            rows = test_np_symbols[test_np_symbols[:, 1] == symbol]
            results.append(
                self._forecast(test_np=rows[:, n_meta:], symbol=symbol, row_ids=rows[:, 0])
            )

        if test_np_plain.size > 0:
            row_ids = test_np_plain[:, 0]
            plain_filled = self.numpy_fillna(test_np_plain[:, n_meta:], self.agg_means)
            predictions = self._run_model(self.model_plain, plain_filled)
            results.append(np.column_stack((row_ids, predictions)))

        if not results:
            # np.vstack rejects an empty list; return a well-shaped empty result.
            return np.empty((0, 2))
        return np.vstack(results)

In [9]:
# Training partition used to seed the per-symbol history buffers.
path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=9/part-0.parquet"

# NOTE: pickle.load can execute arbitrary code; only load files from a
# trusted dataset. Context managers close the file handles promptly.
with open('/kaggle/input/fillnans/agg_means.p', 'rb') as file:
    agg_means = pickle.load(file)
with open('/kaggle/input/fillnans/nan_means.p', 'rb') as file:
    nan_means = pickle.load(file)

# The rolling window must match the window the attention models were built
# with (their mask and bias buffers are sized by it), so read it from the
# model config instead of repeating the literal.
submit_model = JaneStreetSubmissionV1(cols=cols, path_name=path, agg_means=agg_means,nan_means=nan_means,
                                      model=models, model_plain=model2,
                                      window=MODEL_CONFIG["window"], history=15)

In [10]:
lags_ : pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pd.DataFrame:
    """Inference-server callback: score one batch of test rows.

    ``lags`` is part of the callback signature but is not used here.
    Returns a frame with the columns ``row_id`` and ``responder_6``.
    """
    scored = submit_model.predict(test)
    return pd.DataFrame(scored, columns=["row_id", "responder_6"])

In [11]:
# Wrap the prediction callback in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Interactive run: replay the bundled sample test and lag files locally.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    # Scored rerun: serve predictions to the competition gateway.
    inference_server.serve()