In [None]:
!pip install kaggle



In [None]:
# Mount Google Drive. The mount point is a relative path, so it resolves against
# the current working directory; later cells read from /content/content/MyDrive,
# i.e. they rely on the working directory being /content when this cell runs.
from google.colab import drive
drive.mount('content')

Mounted at content


In [None]:
!mkdir ~/.kaggle
!cp "/content/content/MyDrive/Kaggle Competitions/Jane Street/kaggle.json" ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive with the Kaggle CLI (the recorded output
# below shows the zip landing in /content).
!kaggle competitions download -c jane-street-real-time-market-data-forecasting

Downloading jane-street-real-time-market-data-forecasting.zip to /content
100% 11.4G/11.5G [01:35<00:00, 222MB/s]
100% 11.5G/11.5G [01:35<00:00, 129MB/s]


In [None]:
!unzip /content/jane-street-real-time-market-data-forecasting.zip

Archive:  /content/jane-street-real-time-market-data-forecasting.zip
  inflating: features.csv            
  inflating: kaggle_evaluation/__init__.py  
  inflating: kaggle_evaluation/core/__init__.py  
  inflating: kaggle_evaluation/core/base_gateway.py  
  inflating: kaggle_evaluation/core/generated/__init__.py  
  inflating: kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py  
  inflating: kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py  
  inflating: kaggle_evaluation/core/kaggle_evaluation.proto  
  inflating: kaggle_evaluation/core/relay.py  
  inflating: kaggle_evaluation/core/templates.py  
  inflating: kaggle_evaluation/jane_street_gateway.py  
  inflating: kaggle_evaluation/jane_street_inference_server.py  
  inflating: lags.parquet/date_id=0/part-0.parquet  
  inflating: responders.csv          
  inflating: sample_submission.csv   
  inflating: test.parquet/date_id=0/part-0.parquet  
  inflating: train.parquet/partition_id=0/part-0.parquet  
  inflating

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl
import tqdm, pickle, time, logging, glob, pathlib
import torch
from torch.utils.data import  Dataset, DataLoader
import json


# Feature-combination scores stored on the mounted Drive (JSON object: "a/b/c" -> score).
with open("/content/content/MyDrive/Kaggle Competitions/Jane Street/features_information.json", mode="r") as file:
    feature_importance = json.load(file)

# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that you produced yourself or otherwise trust.
with open('/content/content/MyDrive/Kaggle Competitions/Jane Street/nan_means.p', 'rb') as fp:
    nan_means = pickle.load(fp)

# Glob pattern covering every partition of the extracted training set.
path_name = "/content/train.parquet/partition_id=*/part-0.parquet"

# Instrument ids 0..38 inclusive.
symbols = list(range(39))

# Ten lowest-scoring combinations (ties broken by name); the first one supplies
# the feature columns, encoded as a "/"-separated string.
top_k_comb = sorted(
    feature_importance.items(),
    key=lambda item: (item[-1], item[0]),
)[:10]
cols = top_k_comb[0][0].split("/")

In [None]:
# Run configuration consumed by the save_symbol_data call further down.
cfg = dict(
    path_name=path_name,  # glob pattern of the training parquet files
    cols=cols,            # feature columns to export
    symbols=symbols,      # instrument ids to export
    window=4,             # number of consecutive rows per rolling window
)

In [None]:
def numpy_fillna(arr, fillna_dict, cols):
    """Return a copy of ``arr`` with the NaNs of each column replaced.

    Column ``i`` of ``arr`` is paired with ``cols[i]`` and its NaN entries are
    replaced by ``fillna_dict[cols[i]]``. The replacement goes through
    ``np.nan_to_num``, which additionally maps +/-inf to very large finite
    values.

    Args:
        arr: array indexed as ``arr[:, i]``; it is left unmodified.
        fillna_dict: mapping from column name to its NaN replacement value.
        cols: column names, ordered like the columns of ``arr``.

    Returns:
        A new array with the same shape as ``arr``.

    Raises:
        KeyError: if a name in ``cols`` is missing from ``fillna_dict``.
    """
    filled = arr.copy()  # work on a copy so the caller's array stays intact
    for position, name in enumerate(cols):
        replacement = fillna_dict[name]
        column = filled[:, position]
        filled[:, position] = np.nan_to_num(column, nan=replacement)
    return filled

def rolling_window(data, window):
    """Stack every run of ``window`` consecutive rows of ``data``.

    Args:
        data: array with rows on axis 0 and features on axis 1.
        window: number of consecutive rows in each slice.

    Returns:
        Array of shape ``(rows - window + 1, window, features)`` whose element
        ``i`` holds rows ``i`` through ``i + window - 1`` of ``data``.
    """
    n_rows = data.shape[0]
    n_windows = n_rows - window + 1
    n_features = data.shape[1]

    # Each window spans all features, so the view has a length-1 axis along the
    # feature dimension; the reshape below removes it.
    strided = np.lib.stride_tricks.sliding_window_view(
        data, (window, n_features), axis=(0, 1)
    )
    return strided.reshape(n_windows, window, n_features)

def get_numpy_from_parquet(path, cols, instrument=2):
    """Read one instrument's rows from a parquet file as a NumPy array.

    Rows whose ``symbol_id`` equals ``instrument`` are kept and ordered by
    ``date_id`` then ``time_id``.

    Args:
        path: parquet file to scan.
        cols: list of feature column names to extract.
        instrument: ``symbol_id`` value to keep.

    Returns:
        2-D array holding the columns in ``cols`` followed by ``responder_6``
        as the last column.
    """
    lazy_frame = pl.scan_parquet(path)
    # Filter while still lazy, then materialise only the matching rows.
    matching_rows = lazy_frame.filter(pl.col("symbol_id") == instrument).collect()
    chronological = matching_rows.sort(["date_id", "time_id"])
    selected = chronological.select(cols + ['responder_6'])
    return selected.to_numpy()

def get_financial_instrument(path_name, cols, instrument):
    """Load one instrument's rows from every parquet file matching ``path_name``.

    Each matching file is read with ``get_numpy_from_parquet`` and the results
    are stacked row-wise. Files are visited in natural (numeric-aware) path
    order, because ``glob.glob`` returns matches in arbitrary,
    filesystem-dependent order and would otherwise make the row order of the
    result non-deterministic.

    NOTE(review): this assumes ascending partition paths correspond to
    ascending time, so rows stay chronological across file boundaries —
    confirm against the dataset layout.

    Args:
        path_name: glob pattern selecting the parquet files.
        cols: list of feature column names; the loader appends the target column.
        instrument: ``symbol_id`` value to keep.

    Returns:
        2-D array with ``len(cols) + 1`` columns. It has zero rows (float32)
        when no file matches the pattern.
    """
    import re  # local import: only needed for the natural-sort key below

    def natural_key(path):
        # re.split with a capturing group alternates text / digit runs, so odd
        # positions are always digit runs; comparing them as ints makes
        # "partition_id=10" sort after "partition_id=9".
        pieces = re.split(r"([0-9]+)", path)
        return [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]

    # The zero-row seed preserves the (0, n_columns) float32 result for "no files".
    chunks = [np.empty((0, len(cols) + 1), dtype=np.float32)]
    for path in sorted(glob.glob(path_name), key=natural_key):
        chunks.append(get_numpy_from_parquet(path=path, cols=cols, instrument=instrument))

    # Stack once at the end instead of re-copying the accumulated array per file.
    return np.vstack(chunks)

def save_symbol_data(path_name, cols, symbols, fillna, window=4, output_dir=None):
    """Export rolling-window training arrays, one compressed file per symbol.

    For each symbol the rows are loaded with ``get_financial_instrument``, the
    last column is split off as the target, NaNs in the features are filled
    with ``numpy_fillna`` and the features are cut into windows with
    ``rolling_window``. The result is written to
    ``<output_dir>/symbol=<symbol>/window=<window>/part-0.npz`` with the arrays
    stored under the keys ``data`` and ``target``.

    Args:
        path_name: glob pattern of the source parquet files.
        cols: list of feature column names.
        symbols: iterable of instrument ids to export.
        fillna: mapping ``symbol -> {column name -> NaN replacement}``.
        window: number of consecutive rows per window.
        output_dir: directory under which the per-symbol folders are created.
            Defaults to the ``train.npy`` folder on the mounted Drive that this
            function always used before the parameter existed.

    Returns:
        None. The function only writes files.
    """
    if output_dir is None:
        output_dir = "/content/content/MyDrive/Kaggle Competitions/Jane Street/train.npy"
    output_root = pathlib.Path(output_dir)

    for symbol in tqdm.tqdm(symbols, desc="Saving financial instruments"):
        parquet_np = get_financial_instrument(path_name, cols, instrument=symbol)
        # The loader puts the target in the last column.
        features, targets = parquet_np[:, :-1], parquet_np[:, -1]
        features = numpy_fillna(features, fillna[symbol], cols=cols)
        data = rolling_window(features, window=window)
        # Window i ends at row i + window - 1, so it is paired with that row's target.
        targets = targets[window - 1:]

        path = output_root / f"symbol={symbol}" / f"window={window}"
        path.mkdir(parents=True, exist_ok=True)
        np.savez_compressed(path / "part-0.npz", data=data, target=targets)


In [None]:
# Export the windowed arrays for every symbol in cfg, filling NaNs from nan_means.
# Long-running: the recorded run below took about 56 minutes for 39 symbols.
save_symbol_data(path_name=cfg["path_name"], cols=cfg["cols"], fillna=nan_means, symbols=cfg["symbols"], window=cfg["window"])

Saving financial instruments: 100%|██████████| 39/39 [55:45<00:00, 85.78s/it]
