In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import polars as pl
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error,r2_score
import tqdm
import pickle
from sklearnex import patch_sklearn
import logging

patch_sklearn()
logging.getLogger('sklearnex').setLevel(logging.WARNING)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Get summary of missing values across the data files

In [2]:
seed = 123
np.random.seed(123)
cols= ['feature_06', 'feature_22', 'feature_24', 'feature_38', 'feature_14', 'feature_28', 'feature_05', 'feature_67', 'feature_30', 'feature_60', 'feature_20', 
       'feature_61', 'feature_23', 'feature_70', 'feature_25', 'feature_29', 'feature_36', 'feature_09', 'feature_10', 'feature_11', 'feature_69', 'feature_07', 
       'feature_72', 'feature_71']

In [3]:
path_name = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=*/part-0.parquet"

In [4]:
def get_missing_summary(path_name):
    data = {}
    total_length = 0
    for path in glob.glob(path_name):
        parquet_file = pl.scan_parquet(path).collect()
        parquet_dict = parquet_file.null_count().to_dicts()[0]
        for feat, missing in parquet_dict.items():
            data[feat] = data.get(feat, 0) + missing
        total_length += len(parquet_file)
    for feat, value in data.items():
        data[feat] = data[feat]/total_length * 100

    return pd.Series(data, name="Missing Data Summary").sort_values(ascending=False).to_frame()

def get_total_symbols(path_name):
    symbols = []
    for path in glob.glob(path_name):
        parquet_file = pd.read_parquet(path)
        symbols = list(set(symbols + parquet_file['symbol_id'].unique().tolist()))
    return symbols

def get_numpy_from_parquet(path, cols, instrument=2):
    parquet_file = pl.scan_parquet(path)
    instrument_data = parquet_file.filter(pl.col("symbol_id") == instrument).collect().sort(["date_id", "time_id"])
    return instrument_data.select(cols+['responder_6']).to_numpy()

def get_financial_instrument(path_name, emb_dim, cols, instrument):
    data = np.empty((0, emb_dim+1), dtype=np.float32)  # Start with an empty array with the correct number of columns
    for path in glob.glob(path_name):
        array_to_concat = get_numpy_from_parquet(path=path, cols=cols, instrument=instrument)
        data = np.vstack((data, array_to_concat))
    return data

def get_missing_summary_by_symbol(path_name):
    series = []
    for symbol in tqdm.tqdm(symbols, desc="Processing financial instruments"):
        parquet_file = get_financial_parquet(path_name, instrument=symbol)
        parquet_file = parquet_file.null_count()/len(parquet_file)*100
        series.append(pd.Series(parquet_file.to_dicts()[0], name=f"Symbol {symbol}").to_frame())

    return pd.concat(series, axis=1, join="inner")

def get_financial_parquet(path_name, instrument=2):
    data = []
    for path in glob.glob(path_name):
        parquet_file = pl.scan_parquet(path)
        instrument_data = parquet_file.filter(pl.col("symbol_id") == instrument).collect().sort(["date_id", "time_id"])
        data.append(instrument_data)
    return pl.concat(data, how="vertical")

def get_mean(path_name, cols):
    hash_map = {}
    for symbol in tqdm.tqdm(symbols, desc="Processing financial instruments"):
        parquet_file = get_financial_parquet(path_name, instrument=symbol)
        hash_map[symbol] = parquet_file.select(cols).mean().to_dicts()[0]
        
    return hash_map

def rolling_window(data, window):
    size = data.shape[0] - window + 1
    emb = data.shape[1]-1
    inputs = np.lib.stride_tricks.sliding_window_view(data[:, :-1], 
                                                      (window, emb), 
                                                      axis=(0, 1)).reshape(size, window*emb)
    targets = data[window-1:, -1]
    
    return inputs, targets

In [5]:
symbols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]


In [6]:
missing_summary = get_missing_summary(path_name)
# symbols = get_total_symbols(path_name)

In [7]:
print(f"There are a total of {len(symbols)} financial instruments across the data")

There are a total of 39 financial instruments across the data


## Features with no missing values

In [8]:
print(sorted(missing_summary.tail(45).index.tolist()))

['date_id', 'feature_05', 'feature_06', 'feature_07', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_20', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_28', 'feature_29', 'feature_30', 'feature_34', 'feature_35', 'feature_36', 'feature_38', 'feature_48', 'feature_49', 'feature_59', 'feature_60', 'feature_61', 'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feature_71', 'feature_72', 'responder_0', 'responder_1', 'responder_2', 'responder_3', 'responder_4', 'responder_5', 'responder_6', 'responder_7', 'responder_8', 'symbol_id', 'time_id', 'weight']


In [9]:
len(missing_summary.index) #Total Features

92

In [10]:
clean_cols = sorted(missing_summary.tail(45).index.tolist())

In [12]:
print(clean_cols)

['date_id', 'feature_05', 'feature_06', 'feature_07', 'feature_09', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_20', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_28', 'feature_29', 'feature_30', 'feature_34', 'feature_35', 'feature_36', 'feature_38', 'feature_48', 'feature_49', 'feature_59', 'feature_60', 'feature_61', 'feature_67', 'feature_68', 'feature_69', 'feature_70', 'feature_71', 'feature_72', 'responder_0', 'responder_1', 'responder_2', 'responder_3', 'responder_4', 'responder_5', 'responder_6', 'responder_7', 'responder_8', 'symbol_id', 'time_id', 'weight']


In [11]:
missing_summary

Unnamed: 0,Missing Data Summary
feature_27,17.900406
feature_21,17.900406
feature_31,17.900406
feature_26,17.900406
feature_39,9.125593
...,...
feature_36,0.000000
feature_38,0.000000
time_id,0.000000
feature_48,0.000000


#### Columns with missing values < 1%

In [12]:
fill_cols = sorted(missing_summary.loc[(missing_summary["Missing Data Summary"] < 20) & (missing_summary["Missing Data Summary"] >= 1)].index.tolist())

In [13]:
print(fill_cols)

['feature_00', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_15', 'feature_21', 'feature_26', 'feature_27', 'feature_31', 'feature_32', 'feature_33', 'feature_39', 'feature_41', 'feature_42', 'feature_44', 'feature_50', 'feature_52', 'feature_53', 'feature_55', 'feature_58', 'feature_73', 'feature_74']


In [14]:
# symbol_summary = get_missing_summary_by_symbol(path_name)

## Assess the missing values summary for each of the financial instrument

In [15]:
all_feats = sorted(missing_summary.index.tolist())[1:-12]

In [16]:
# %time nan_means = get_mean(path_name, all_feats)
# with open('/kaggle/working/nan_means.p', 'wb') as fp:
#     pickle.dump(nan_means, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# with open('/kaggle/working/agg_means.p', 'wb') as fp:
#     pickle.dump(pd.DataFrame(nan_means).mean(axis=1).to_dict(), fp, protocol=pickle.HIGHEST_PROTOCOL)

# Select best features

In [18]:
with open('/kaggle/input/fillnans/nan_means.p', 'rb') as fp:
    nan_means = pickle.load(fp)

In [19]:
import json
with open("/kaggle/input/feature-importance/features_information_v2.json", mode="r") as file:
    feature_importance = json.load(file)

top_k_comb = sorted(feature_importance.items(), key=lambda x: (x[-1], x[0]), reverse=False)[:10] #best feature combination
cols = top_k_comb[0][0].split("/")

In [20]:
def numpy_fillna(arr, fillna_dict, cols):
    arr_copy = arr.copy()  # Avoid modifying the original array
    for idx, col in enumerate(cols):
        arr_copy[:, idx] = np.nan_to_num(arr_copy[:, idx], nan=fillna_dict[col])
    return arr_copy

def rolling_window(data, window):
    size = data.shape[0] - window + 1
    emb = data.shape[1]
    inputs = np.lib.stride_tricks.sliding_window_view(data, 
                                                      (window, emb), 
                                                      axis=(0, 1)).reshape(size, window*emb)
    
    return inputs

def evaluate_model(symbol, path_name, params, cols, fillna, window=4, 
                   n_splits=5, seed=seed, shuffle=True):
    
    kfold = KFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
    scores = []
    data = get_financial_instrument(path_name, emb_dim=len(cols), cols=cols, instrument=symbol)
    windowed_data = numpy_fillna(data[:, :-1], fillna, cols=cols)
    windowed_data = rolling_window(windowed_data, window)
    targets = data[window-1:, -1]
        
    r2_train = []
    r2_test = []
    #Define cross validation
    for train_ind, test_ind in kfold.split(windowed_data):
        #Split train data for cross validation
        X_train, y_train, X_test, y_test = windowed_data[train_ind], targets[train_ind], windowed_data[test_ind], targets[test_ind]
        model = lgb.LGBMRegressor(**params)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

        #Generate predictions 
        y_preds = model.predict(X_test)
        preds = model.predict(X_train)
        r2_train.append(r2_score(y_train, preds)) #append r2_score
        r2_test.append(r2_score(y_test, y_preds))
        scores.append(mean_absolute_error(y_test, y_preds)) # append model error
    
    score = np.mean(scores)
    print(f"""Financial Instrument: {symbol}\nFeature Combination: {cols}\nMean absolute error: {score:.3f}\nR2 Score Train: {np.mean(r2_train):.2f}\nR2 Score Test: {np.mean(r2_test):.2f}""")
    print()
    
    return score

In [22]:
params = {"boosting_type": 'gbdt',"num_leaves": 77, "max_depth": 7, "colsample_bytree": 0.45, "learning_rate": 0.45,
          'min_child_samples': 20,'min_split_gain': 0.45, "n_estimators": 200,"verbose": -1, "metric": "mae",
          "force_col_wise": True # "device": "gpu"  # Enable GPU, if available
         }
best_scores = {'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50': 0.4256425408209205, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33': 0.4256577751378833, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39': 0.4257329168282258, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21': 0.4256889860553527, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73': 0.42578237985077905, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73/feature_53': 0.42552762277622824, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73/feature_53/feature_15': 0.4256377532475385, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73/feature_53/feature_15/feature_32': 0.42526072871252313, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73/feature_53/feature_15/feature_32/feature_41': 0.4259224310317046, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73/feature_53/feature_15/feature_32/feature_41/feature_55': 0.42625380089287435, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73/feature_53/feature_15/feature_32/feature_41/feature_55/feature_02': 0.426230088333463, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73/feature_53/feature_15/feature_32/feature_41/feature_55/feature_02/feature_42': 0.42629852915466265}
best_features = ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20', 'feature_14', 'feature_28', 'feature_29', 'feature_08', 'feature_25', 'feature_23', 'feature_67', 'feature_38', 'feature_61', 'feature_78', 'feature_62', 'feature_19', 'feature_36', 'feature_69', 'feature_72', 'feature_65', 'feature_09', 'feature_10', 'feature_11', 'feature_07', 'feature_60', 'feature_56', 'feature_01', 'feature_58', 'feature_31', 'feature_26', 'feature_27', 'feature_04', 'feature_74', 'feature_50', 'feature_33', 'feature_39', 'feature_21', 'feature_73', 'feature_53', 'feature_15', 'feature_32', 'feature_41', 'feature_55', 'feature_02', 'feature_42']
symbol = 1
feature_cols = sorted(cols+fill_cols)

for _ in tqdm.tqdm(range(len(best_features),len(feature_cols)), desc="Evaluating feature combinations"):
    feature_score = {}
    for col in feature_cols:
        if col not in best_features:  # Check if the column is not already evaluated
            current_features = best_features + [col]  # Create the current feature list
            feature_score["/".join(current_features)] = evaluate_model(
                symbol, path_name, params, cols=current_features, fillna=nan_means[symbol],
                window=4, n_splits=5, seed=seed, shuffle=True
            )
    
    # Select the best feature combination from the current iteration
    k, v = sorted(feature_score.items(), key=lambda v: v[-1], reverse=False)[0]
    best_scores[k] = v

    # Add the new best features to the set
    best_features.append(k.split("/")[-1])

Evaluating feature combinations:   0%|          | 0/4 [00:00<?, ?it/s]

Financial Instrument: 1
Feature Combination: ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20', 'feature_14', 'feature_28', 'feature_29', 'feature_08', 'feature_25', 'feature_23', 'feature_67', 'feature_38', 'feature_61', 'feature_78', 'feature_62', 'feature_19', 'feature_36', 'feature_69', 'feature_72', 'feature_65', 'feature_09', 'feature_10', 'feature_11', 'feature_07', 'feature_60', 'feature_56', 'feature_01', 'feature_58', 'feature_31', 'feature_26', 'feature_27', 'feature_04', 'feature_74', 'feature_50', 'feature_33', 'feature_39', 'feature_21', 'feature_73', 'feature_53', 'feature_15', 'feature_32', 'feature_41', 'feature_55', 'feature_02', 'feature_42', 'feature_00']
Mean absolute error: 0.427
R2 Score Train: 0.37
R2 Score Test: 0.25

Financial Instrument: 1
Feature Combination: ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20',

Evaluating feature combinations:  25%|██▌       | 1/4 [39:02<1:57:08, 2342.93s/it]

Financial Instrument: 1
Feature Combination: ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20', 'feature_14', 'feature_28', 'feature_29', 'feature_08', 'feature_25', 'feature_23', 'feature_67', 'feature_38', 'feature_61', 'feature_78', 'feature_62', 'feature_19', 'feature_36', 'feature_69', 'feature_72', 'feature_65', 'feature_09', 'feature_10', 'feature_11', 'feature_07', 'feature_60', 'feature_56', 'feature_01', 'feature_58', 'feature_31', 'feature_26', 'feature_27', 'feature_04', 'feature_74', 'feature_50', 'feature_33', 'feature_39', 'feature_21', 'feature_73', 'feature_53', 'feature_15', 'feature_32', 'feature_41', 'feature_55', 'feature_02', 'feature_42', 'feature_52']
Mean absolute error: 0.427
R2 Score Train: 0.37
R2 Score Test: 0.25

Financial Instrument: 1
Feature Combination: ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20',

Evaluating feature combinations:  50%|█████     | 2/4 [1:08:18<1:06:34, 1997.29s/it]

Financial Instrument: 1
Feature Combination: ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20', 'feature_14', 'feature_28', 'feature_29', 'feature_08', 'feature_25', 'feature_23', 'feature_67', 'feature_38', 'feature_61', 'feature_78', 'feature_62', 'feature_19', 'feature_36', 'feature_69', 'feature_72', 'feature_65', 'feature_09', 'feature_10', 'feature_11', 'feature_07', 'feature_60', 'feature_56', 'feature_01', 'feature_58', 'feature_31', 'feature_26', 'feature_27', 'feature_04', 'feature_74', 'feature_50', 'feature_33', 'feature_39', 'feature_21', 'feature_73', 'feature_53', 'feature_15', 'feature_32', 'feature_41', 'feature_55', 'feature_02', 'feature_42', 'feature_44', 'feature_52']
Mean absolute error: 0.427
R2 Score Train: 0.37
R2 Score Test: 0.25

Financial Instrument: 1
Feature Combination: ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05',

Evaluating feature combinations:  75%|███████▌  | 3/4 [1:28:24<27:16, 1636.18s/it]  

Financial Instrument: 1
Feature Combination: ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20', 'feature_14', 'feature_28', 'feature_29', 'feature_08', 'feature_25', 'feature_23', 'feature_67', 'feature_38', 'feature_61', 'feature_78', 'feature_62', 'feature_19', 'feature_36', 'feature_69', 'feature_72', 'feature_65', 'feature_09', 'feature_10', 'feature_11', 'feature_07', 'feature_60', 'feature_56', 'feature_01', 'feature_58', 'feature_31', 'feature_26', 'feature_27', 'feature_04', 'feature_74', 'feature_50', 'feature_33', 'feature_39', 'feature_21', 'feature_73', 'feature_53', 'feature_15', 'feature_32', 'feature_41', 'feature_55', 'feature_02', 'feature_42', 'feature_44', 'feature_52', 'feature_03']
Mean absolute error: 0.427
R2 Score Train: 0.37
R2 Score Test: 0.25



Evaluating feature combinations: 100%|██████████| 4/4 [1:38:40<00:00, 1480.02s/it]

Financial Instrument: 1
Feature Combination: ['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20', 'feature_14', 'feature_28', 'feature_29', 'feature_08', 'feature_25', 'feature_23', 'feature_67', 'feature_38', 'feature_61', 'feature_78', 'feature_62', 'feature_19', 'feature_36', 'feature_69', 'feature_72', 'feature_65', 'feature_09', 'feature_10', 'feature_11', 'feature_07', 'feature_60', 'feature_56', 'feature_01', 'feature_58', 'feature_31', 'feature_26', 'feature_27', 'feature_04', 'feature_74', 'feature_50', 'feature_33', 'feature_39', 'feature_21', 'feature_73', 'feature_53', 'feature_15', 'feature_32', 'feature_41', 'feature_55', 'feature_02', 'feature_42', 'feature_44', 'feature_52', 'feature_03', 'feature_00']
Mean absolute error: 0.428
R2 Score Train: 0.37
R2 Score Test: 0.24






In [23]:
print(best_features)

['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20', 'feature_14', 'feature_28', 'feature_29', 'feature_08', 'feature_25', 'feature_23', 'feature_67', 'feature_38', 'feature_61', 'feature_78', 'feature_62', 'feature_19', 'feature_36', 'feature_69', 'feature_72', 'feature_65', 'feature_09', 'feature_10', 'feature_11', 'feature_07', 'feature_60', 'feature_56', 'feature_01', 'feature_58', 'feature_31', 'feature_26', 'feature_27', 'feature_04', 'feature_74', 'feature_50', 'feature_33', 'feature_39', 'feature_21', 'feature_73', 'feature_53', 'feature_15', 'feature_32', 'feature_41', 'feature_55', 'feature_02', 'feature_42', 'feature_44', 'feature_52', 'feature_03', 'feature_00']


In [24]:
print(best_scores)

{'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50': 0.4256425408209205, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33': 0.4256577751378833, 'feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/featur

In [25]:
import json
with open("/kaggle/working/features_information_v3.json", mode="w") as file:
    json.dump(best_scores, file)

In [26]:
best_features = sorted(best_scores.items(), key=lambda x: (x[-1], x[0]), reverse=False)[0] #best feature combination
cols = best_features[0].split("/")
print(cols)

['feature_06', 'feature_22', 'feature_24', 'feature_37', 'feature_47', 'feature_70', 'feature_30', 'feature_05', 'feature_20', 'feature_14', 'feature_28', 'feature_29', 'feature_08', 'feature_25', 'feature_23', 'feature_67', 'feature_38', 'feature_61', 'feature_78', 'feature_62', 'feature_19', 'feature_36', 'feature_69', 'feature_72', 'feature_65', 'feature_09', 'feature_10', 'feature_11', 'feature_07', 'feature_60', 'feature_56', 'feature_01', 'feature_58', 'feature_31', 'feature_26', 'feature_27', 'feature_04', 'feature_74', 'feature_50', 'feature_33', 'feature_39', 'feature_21', 'feature_73', 'feature_53', 'feature_15', 'feature_32']


In [28]:
sorted(best_scores.items(), key=lambda x: (x[-1], x[0]), reverse=False)

[('feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/feature_73/feature_53/feature_15/feature_32',
  0.42526072871252313),
 ('feature_06/feature_22/feature_24/feature_37/feature_47/feature_70/feature_30/feature_05/feature_20/feature_14/feature_28/feature_29/feature_08/feature_25/feature_23/feature_67/feature_38/feature_61/feature_78/feature_62/feature_19/feature_36/feature_69/feature_72/feature_65/feature_09/feature_10/feature_11/feature_07/feature_60/feature_56/feature_01/feature_58/feature_31/feature_26/feature_27/feature_04/feature_74/feature_50/feature_33/feature_39/feature_21/

# Select best hyperparameter

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import polars as pl
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import tqdm
import pickle
from sklearn.metrics import r2_score
from sklearnex import patch_sklearn
import logging

patch_sklearn()
logging.getLogger('sklearnex').setLevel(logging.WARNING)

In [32]:
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=*/part-0.parquet'

with open('/kaggle/input/fillnans/nan_means.p', 'rb') as fp:
    nan_means = pickle.load(fp)

def numpy_fillna(arr, fillna_dict, cols):
    arr_copy = arr.copy()  # Avoid modifying the original array
    for idx, col in enumerate(cols):
        arr_copy[:, idx] = np.nan_to_num(arr_copy[:, idx], nan=fillna_dict[col])
    return arr_copy

def rolling_window(data, window):
    size = data.shape[0] - window + 1
    emb = data.shape[1]
    inputs = np.lib.stride_tricks.sliding_window_view(data, 
                                                      (window, emb), 
                                                      axis=(0, 1)).reshape(size, window*emb)
    
    return inputs

def evaluate_parameter(path_name, params, hyperparameters, name, cols, fillna, 
                       window=4, n_splits=5, seed=123, shuffle=True, symbol=2):
    
    kfold = KFold(n_splits=n_splits, random_state=seed, shuffle=shuffle)
    scores = []
    data = get_financial_instrument(path_name, emb_dim=len(cols), cols=cols, instrument=symbol)
        
    #Define cross validation
    for parameter in tqdm.tqdm(hyperparameters, desc="Evaluating parameter"):
        params[name] = parameter
        windowed_data, targets = data[:, :-1], data[:, -1]#rolling_window(data, window)
        windowed_data = numpy_fillna(windowed_data, fillna, cols=cols)
        
        error = []
        r2_train = []
        r2_test = []
        for train_ind, test_ind in kfold.split(windowed_data):
            #Split train data for cross validation
            X_train, y_train, X_test, y_test = windowed_data[train_ind], targets[train_ind], windowed_data[test_ind], targets[test_ind]
            model = lgb.LGBMRegressor(**params)
            model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

            #Generate predictions 
            y_preds = model.predict(X_test)
            preds = model.predict(X_train)
            r2_train.append(r2_score(y_train, preds)) #append r2_score
            r2_test.append(r2_score(y_test, y_preds))
            error.append(mean_absolute_error(y_test, y_preds)) # append model error      
        
        scores.append(np.mean(error))
        print(f"""Financial Instrument: {symbol}\n{name.capitalize()} Hyperparameter: {parameter}\nMean absolute error: {np.mean(error):.3f}\nR2 Score Train: {np.mean(r2_train):.2f}\nR2 Score Test: {np.mean(r2_test):.2f}""")
        print()
        
    print(f"Financial Instrument: {symbol}\nBest {name.capitalize()}: {hyperparameters[np.argmin(scores)]}\nError: {min(scores)}")
    return scores

In [36]:
params = {"boosting_type": 'gbdt',"num_leaves": 77, "max_depth": 7, "colsample_bytree": 0.45, "learning_rate": 0.45,
          'min_child_samples': 20,'min_split_gain': 0.45, "n_estimators": 350,"verbose": -1, "metric": "rmse",
          "force_col_wise": True # "device": "gpu"  # Enable GPU, if available
         }

hps = list(range(700, 1500, 50))
scores = evaluate_parameter(path_name=train_path, params=params, cols=cols, fillna=nan_means[2], 
                            window=4, hyperparameters=hps, name="n_estimators", n_splits=5, 
                            shuffle=True, symbol=2)

Evaluating parameter:   6%|▋         | 1/16 [10:25<2:36:22, 625.48s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 700
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  12%|█▎        | 2/16 [20:42<2:24:48, 620.64s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 750
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  19%|█▉        | 3/16 [31:11<2:15:15, 624.24s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 800
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  25%|██▌       | 4/16 [42:06<2:07:18, 636.58s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 850
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  31%|███▏      | 5/16 [52:58<1:57:41, 641.97s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 900
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  38%|███▊      | 6/16 [1:04:03<1:48:18, 649.88s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 950
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  44%|████▍     | 7/16 [1:15:18<1:38:44, 658.24s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1000
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  50%|█████     | 8/16 [1:26:43<1:28:54, 666.76s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1050
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  56%|█████▋    | 9/16 [1:38:21<1:18:55, 676.47s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1100
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  62%|██████▎   | 10/16 [1:50:10<1:08:39, 686.54s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1150
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  69%|██████▉   | 11/16 [2:02:07<57:58, 695.71s/it]  

Financial Instrument: 2
N_estimators Hyperparameter: 1200
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  75%|███████▌  | 12/16 [2:14:16<47:03, 705.99s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1250
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  81%|████████▏ | 13/16 [2:26:37<35:49, 716.44s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1300
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  88%|████████▊ | 14/16 [2:39:13<24:17, 728.57s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1350
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter:  94%|█████████▍| 15/16 [2:51:58<12:19, 739.47s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1400
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37



Evaluating parameter: 100%|██████████| 16/16 [3:04:57<00:00, 693.61s/it]

Financial Instrument: 2
N_estimators Hyperparameter: 1450
Mean absolute error: 0.502
R2 Score Train: 0.50
R2 Score Test: 0.37

Financial Instrument: 2
Best N_estimators: 700
Error: 0.5018830022376225





In [1]:
import json
with open("/kaggle/input/feature-importance/features_information.json", mode="r") as file:
    feature_importance = json.load(file)
    

In [2]:
top_k_comb = sorted(feature_importance.items(), key=lambda x: (x[-1], x[0]), reverse=False)[:10] #best feature combination

In [14]:
cols = top_k_comb[1][0].split("/")

In [17]:
print(cols)

['feature_06', 'feature_23', 'feature_24', 'feature_60', 'feature_20', 'feature_37', 'feature_05', 'feature_08', 'feature_22', 'feature_70', 'feature_47', 'feature_28', 'feature_61', 'feature_69', 'feature_29', 'feature_25', 'feature_67', 'feature_38', 'feature_72', 'feature_30', 'feature_62', 'feature_77', 'feature_09', 'feature_10', 'feature_11', 'feature_36', 'feature_65', 'feature_07', 'feature_64', 'feature_19', 'feature_18', 'feature_66']


# Train Model
#### Note: The dates in the data are not consecutive, there are skips and 92 cols

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import polars as pl
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
import tqdm
import pickle
from sklearn.metrics import r2_score
from sklearnex import patch_sklearn
import logging

patch_sklearn()
logging.getLogger('sklearnex').setLevel(logging.WARNING)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
train_path = '/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=*/part-0.parquet'

with open('/kaggle/input/fillnans/nan_means.p', 'rb') as fp:
    nan_means = pickle.load(fp)

def numpy_fillna(arr, fillna_dict, cols):
    arr_copy = arr.copy()  # Avoid modifying the original array
    for idx, col in enumerate(cols):
        arr_copy[:, idx] = np.nan_to_num(arr_copy[:, idx], nan=fillna_dict[col])
    return arr_copy

def rolling_window(data, window):
    size = data.shape[0] - window + 1
    emb = data.shape[1]
    inputs = np.lib.stride_tricks.sliding_window_view(data, 
                                                      (window, emb), 
                                                      axis=(0, 1)).reshape(size, window*emb)
    
    return inputs

def get_numpy_from_parquet(path, cols, instrument=2):
    parquet_file = pl.scan_parquet(path)
    instrument_data = parquet_file.filter(pl.col("symbol_id") == instrument).collect().sort(["date_id", "time_id"])
    #instrument_data.select(cols+['responder_6']).to_dummies(["feature_09", "feature_10", "feature_11"], drop_first=True).to_numpy()
    return instrument_data.select(cols+['responder_6']).to_numpy()

def get_financial_instrument(path_name, cols, instrument):
    data = np.empty((0, len(cols)+1), dtype=np.float32)  # Start with an empty array with the correct number of columns
    for path in glob.glob(path_name):
        array_to_concat = get_numpy_from_parquet(path=path, cols=cols, instrument=instrument)
        data = np.vstack((data, array_to_concat))
    return data

In [3]:
def train_model(path_name, cols, fillna, params, window):
    models = {}
    scores = []
    
    for symbol in tqdm.tqdm(range(39), desc="Model training"):
        data = get_financial_instrument(path_name=path_name, cols=cols, instrument=symbol) #np.load(path, allow_pickle=True)
        windowed_data, targets = data[:, :-1], data[:, -1] #rolling_window(data, window)
        windowed_data = numpy_fillna(windowed_data, fillna[symbol], cols=cols)
        windowed_data = rolling_window(windowed_data, window)
        targets = targets[window-1:]
        model = lgb.LGBMRegressor(**params)
        model.fit(windowed_data, targets)
        models[symbol] = model
        y_preds = model.predict(windowed_data)
        score = r2_score(targets, y_preds)
        scores.append(score)
        print(f"Financial Instrument: {symbol}\nTraining R2 Score: {score:.3f}")
    
    print(f"Model Training R2 Score across all financial instruments: {np.mean(scores):.3f}")
    return models, scores

In [None]:
import json

with open("/kaggle/input/feature-importance/features_information_v3.json", mode="r") as file:
    feature_importance = json.load(file)
top_k_comb = sorted(feature_importance.items(), key=lambda x: (x[-1], x[0]), reverse=False)[:10] #best feature combination
cols = top_k_comb[0][0].split("/")
params = {"boosting_type": 'gbdt',"num_leaves": 77, "max_depth": 7, "colsample_bytree": 0.45, "learning_rate": 0.45,
          'min_child_samples': 20,'min_split_gain': 0.45, "n_estimators": 700,"verbose": -1, "metric": "rmse",
          "force_col_wise": True, "random_state":123 # "device": "gpu"  # Enable GPU, if available
         }

train_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=*/part-0.parquet"
model, scores = train_model(train_path, fillna=nan_means, cols=cols, params=params, window=4)

Model training:   3%|▎         | 1/39 [02:39<1:41:13, 159.82s/it]

Financial Instrument: 0
Training R2 Score: 0.494


Model training:   5%|▌         | 2/39 [09:55<3:18:44, 322.30s/it]

Financial Instrument: 1
Training R2 Score: 0.335


Model training:   8%|▊         | 3/39 [22:03<5:04:33, 507.61s/it]

Financial Instrument: 2
Training R2 Score: 0.117


Model training:  10%|█         | 4/39 [38:49<6:50:41, 704.05s/it]

Financial Instrument: 3
Training R2 Score: -0.323


Model training:  13%|█▎        | 5/39 [49:26<6:25:18, 679.95s/it]

Financial Instrument: 4
Training R2 Score: -0.039


Model training:  15%|█▌        | 6/39 [1:09:42<7:54:08, 862.08s/it]

Financial Instrument: 5
Training R2 Score: -0.992


Model training:  18%|█▊        | 7/39 [1:24:22<7:43:02, 868.19s/it]

Financial Instrument: 6
Training R2 Score: -1.862


#### Model Training R2 Score across all financial instruments: 0.626 (window)

In [11]:
with open('/kaggle/working/models.p', 'wb') as fp:
    pickle.dump(model, fp, protocol=pickle.HIGHEST_PROTOCOL)

# Deep Learning

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl
import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import tqdm, pickle, time, logging, glob, pathlib
from sklearn.metrics import r2_score
from sklearnex import patch_sklearn
import torch, math
from torch.utils.data import  Dataset, DataLoader
from functools import lru_cache 
import matplotlib.pyplot as plt
import random

root = pathlib.Path("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet")
def get_paths(nums: list) -> list:
    return [(root/f"partition_id={i}/part-0.parquet") for i in nums] 

kfold_paths = get_paths(range(4))

In [2]:
import json
with open("/kaggle/input/feature-importance/features_information_v2.json", mode="r") as file:
    feature_importance = json.load(file)

with open('/kaggle/input/fillnans/nan_means.p', 'rb') as fp:
    nan_means = pickle.load(fp)

top_k_comb = sorted(feature_importance.items(), key=lambda x: (x[-1], x[0]), reverse=False)[:10] #best feature combination
cols = top_k_comb[0][0].split("/")

In [3]:
def numpy_fillna(arr, fillna_dict, cols):
    arr_copy = arr.copy()  # Avoid modifying the original array
    for idx, col in enumerate(cols):
        arr_copy[:, idx] = np.nan_to_num(arr_copy[:, idx], nan=fillna_dict[col])
    return arr_copy

In [4]:
class JaneStreetDatasetV2(Dataset):
    def __init__(self, paths, cols, frac=None):
        self.file_mappings = []
        self.total_rows = 0
        self.columns = cols

        for path in tqdm.tqdm(paths, desc="Loading financial instruments"):
            symbol = int(path.parts[-2].split("=")[-1])
            lazy = pl.scan_parquet(path).select(self.columns+["responder_6"])
            size = int(lazy.select(pl.len()).collect().item() * frac) if frac else lazy.select(pl.len()).collect().item()
            np.savez_compressed(f"preprocessed_symbol_{symbol}.npz", data=lazy.collect().to_numpy())
            
            self.file_mappings.append((symbol, self.total_rows, self.total_rows + size))
            self.total_rows += size

    @lru_cache(maxsize=10)
    def _load_file(self, symbol):
        preprocessed_path = f"preprocessed_symbol_{symbol}.npz"
        data = np.load(preprocessed_path,mmap_mode="r",allow_pickle=True)["data"]
        inputs = numpy_fillna(
            data[:, :-1], fillna_dict=nan_means[symbol], cols=self.columns
        )
        return inputs, data[:, -1:]
        
    def __len__(self):
        return self.total_rows

    def __getitem__(self, idx):
        for (symbol, offset, sum_k) in self.file_mappings:
            if idx < sum_k:
                idx = idx - offset
                inputs, target = self._load_file(symbol)
                return (
                    torch.tensor(inputs[idx], dtype=torch.float32),
                    torch.tensor(target[idx], dtype=torch.float32)
                )

In [5]:
def create_dataloader_v2(paths, cfg):
    dataset = JaneStreetDatasetV2(
        paths=paths, cols=cfg["cols"], frac=cfg["frac"]
    )
    dataloader = DataLoader(dataset=dataset,batch_size=cfg["batch_size"],
                            shuffle=cfg["shuffle"],num_workers=cfg["num_workers"],
                            prefetch_factor=cfg["prefetch_factor"],
                            pin_memory=cfg["pin_memory"],drop_last=cfg["drop_last"]
                           )
    return dataloader

In [16]:
cfg = {
    "cols": cols,
    "frac": None,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "batch_size": 96,
    "num_workers": 1,
    "shuffle": False,
    "pin_memory": True,
    "drop_last": False,
    "prefetch_factor": 2
}

In [17]:
def init_weights(m):
    if isinstance(m, torch.nn.Linear):
      torch.nn.init.kaiming_normal_(m.weight, a=0, mode='fan_out', nonlinearity='leaky_relu') #torch.nn.init.normal_(m.weight, mean=0.0, std=1/math.sqrt(6))
      if m.bias is not None:
        nn.init.constant_(m.bias, 0)
    elif isinstance(m, torch.nn.Embedding):
        torch.nn.init.kaiming_normal_(m.weight, a=0, mode='fan_out', nonlinearity='leaky_relu') #torch.nn.init.normal_(m.weight, mean=0.0, std=1/math.sqrt(6))


class LayerNorm(torch.nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.eps = 1e-5
        self.shift = torch.nn.Parameter(torch.zeros(dim))
        self.scale = torch.nn.Parameter(torch.ones(dim))

    def forward(self, inp):
        mean = inp.mean(dim=-1, keepdim=True)
        var = inp.var(dim=-1, keepdim=True, unbiased=False)
        inp_norm = (inp - mean)/(torch.sqrt(var)+self.eps)

        return inp_norm + self.shift * self.scale

class FeedForwardLayer(torch.nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.pos_emb = torch.nn.Embedding(1, dim)
        self.layer = torch.nn.Sequential(
            torch.nn.Linear(dim, dim * 4),
            torch.nn.LeakyReLU(), #torch.nn.GELU(approximate="tanh"),
            torch.nn.Linear(dim * 4, dim),
        )

    def forward(self, inp):
        pos_emb = self.pos_emb(torch.tensor(0, device=inp.device))
        return self.layer(inp) + pos_emb

class TransformerBlock(torch.nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.forward_layer = FeedForwardLayer(cfg["n_features"])
        self.norm = LayerNorm(cfg["n_features"])
        self.dropout = torch.nn.Dropout(cfg["drop_rate"])

    def forward(self, inp):
        shortcut = inp
        inp = self.norm(inp)
        inp = self.forward_layer(inp)/0.1
        out = self.dropout(inp)
        return out + shortcut

In [18]:
class JaneStreetModelV2(torch.nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.blocks = torch.nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        self.norm =  LayerNorm(cfg["n_features"])
        self.final_norm = LayerNorm(cfg["n_features"])
        self.out_proj = torch.nn.Linear(cfg["n_features"], 1, bias=False)

    def forward(self, inp):
        inp = self.blocks(self.norm(inp))
        inp = self.final_norm(inp)

        return self.out_proj(inp)

In [19]:
MODEL_CONFIG = {
    "n_features": len(cfg["cols"]),
    "n_layers": 6,
    "drop_rate": 0.1
}

In [20]:
def calc_loss_batch(inp, targets, device):
    inp = inp.to(device)
    targets = targets.to(device)
    outputs = model(inp)
    loss = torch.nn.functional.l1_loss(outputs, targets)

    return loss.item(), outputs

def evaluate_model(val_loader, num_batch, device, choices):
    total_loss = 0
    for i, (inp, targets) in enumerate(val_loader):
        if i < num_batch:
            with torch.no_grad():
                loss, outputs = calc_loss_batch(
                    inp=inp, targets=targets, device=device
                )
            total_loss += loss
        else:
            break
    outputs = outputs.flatten().detach().cpu().numpy()
    targets = targets.flatten().detach().cpu().numpy()
    print(f"Sample predictions range: (max:{max(outputs)}, min:{min(outputs)}) - Targets range: (max:{max(targets)}, min:{min(targets)})")
    return total_loss/num_batch

def train_epoch(train_loader, device, freq, choices, global_step,
                half_loops, start_lr, min_lr):
    last_loss, running_loss = 0., 0.
    time_step = 0.
    scores = []
    total_len = len(train_loader)

    for batch, (input_batch, targets_batch) in enumerate(train_loader):
        #compute time per step
        start_time = time.time()
        #transfer data to device
        input_batch = input_batch.to(device)
        targets_batch = targets_batch.to(device)
        #zero gradients for every batch
        optimizer.zero_grad()
        global_step += 1
        if global_step <= half_loops:
            # Apply cosine decay to halfway point
            optimizer.param_groups[0]["lr"] = min_lr + 0.5 * (start_lr - min_lr) * (1 + math.cos(math.pi * global_step / half_loops))
        else:
            # Maintain the minimum learning rate beyond halfway point
            optimizer.param_groups[0]["lr"] = min_lr
        outputs = model(input_batch)
        loss = torch.nn.functional.smooth_l1_loss(outputs, targets_batch)
        loss.backward()
        optimizer.step()
        time_step += time.time() - start_time

        running_loss += loss.item()
        if batch % freq == freq-1:
            last_loss = running_loss / freq
            print(f"Batch {batch}/{total_len} - {time_step/freq:.3f}s/step - loss: {last_loss} - lr: {optimizer.param_groups[0]['lr']}")
            # print(f"Model prediction: {outputs[choices].flatten().detach().cpu().numpy()} - Target: {targets_batch[choices].flatten().cpu().numpy()} ")
            running_loss = 0.
            time_step = 0.
            scores.append(last_loss)

    return np.mean(scores), global_step

def train_model(epochs, train_loader, freq, device, frac=4, min_lr=5e-4, num_batch=None, val_loader=None):

    global_step = -1
    total_loops = len(train_loader) * epochs
    half_loops = total_loops // frac
    start_lr = optimizer.param_groups[0]["lr"]
    history = {"loss": []}

    choices = random.choices(range(cfg["batch_size"]-10),k=3)
    for i in range(epochs):
        print(f"Epoch {i+1}/{epochs}")
        #set model to training mode
        model.train(True)
        #train model
        avg_loss, global_step = train_epoch(
            train_loader=train_loader, device=device, freq=freq, choices=choices,
            global_step=global_step, half_loops=half_loops, start_lr=start_lr, min_lr=min_lr
        )

        if val_loader:
            model.eval()
            avg_vloss = evaluate_model(
                val_loader=val_loader, num_batch=num_batch, choices=choices, device=device
            )
            history["val_loss"] = history.get('val_loss', []) + [avg_vloss]
            print(f"train loss: {avg_loss} - val loss: {avg_vloss}")
        else:
            print(f"train loss: {avg_loss}")
        history["loss"].append(avg_loss)

    return history

In [21]:
kfold_paths = get_paths(range(4))

In [76]:
cfg["frac"] = 0.1
folds = 3
kfold_paths = np.array(kfold_paths)
kfold = KFold(n_splits=folds, random_state=123, shuffle=True)
historys = {"val_loss": [], "loss": [], "val_score":[], "score":[]}
for train_path, test_path in kfold.split(kfold_paths):
    train = create_dataloader_v2(kfold_paths[train_path], cfg)
    test = create_dataloader_v2(kfold_paths[test_path], cfg)

    torch.manual_seed(11) #11 #32
    model = JaneStreetModelV2(MODEL_CONFIG)
    # model.apply(init_weights)
    model.to(cfg["device"])
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=5e-3, weight_decay=1e-4
    )
    num_epochs = 5
    history = train_model(epochs=num_epochs, train_loader=train, val_loader=test, frac=1, 
                          freq=len(train)//2, min_lr=3e-4, num_batch=len(test), device=cfg["device"])

    print(f'Epochs({num_epochs}): Train error: {np.mean(history["loss"]):.7f} - Validation Error: {np.mean(history["val_loss"]):.7f}')
    print()
    historys["val_score"].append(history["val_loss"][-1])
    historys["score"].append(history["loss"][-1])
    historys["val_loss"].append(history["val_loss"])
    historys["loss"].append(history["loss"])

print(f"""KFold Cross Validation: {folds} - Mean Train error: {np.mean(historys["score"]):.7f} - Mean Validation error: {np.mean(historys["val_score"]):.7f}""")

Loading financial instruments: 100%|██████████| 2/2 [00:35<00:00, 17.69s/it]
Loading financial instruments: 100%|██████████| 2/2 [00:35<00:00, 17.66s/it]

Epoch 1/5





Batch 3041/6085 - 0.020s/step - loss: 0.3392965414568679 - lr: 0.004885095262720301
Batch 6083/6085 - 0.020s/step - loss: 0.4026450035195947 - lr: 0.0045514751529218914
Sample predictions range: (max:-0.005906291306018829, min:-0.012672403827309608) - Targets range: (max:3.612952470779419, min:-2.362999439239502)
train loss: 0.3709707724882313 - val loss: 0.6780029000867392
Epoch 2/5
Batch 3041/6085 - 0.021s/step - loss: 0.3367704408767435 - lr: 0.004031589792924509
Batch 6083/6085 - 0.019s/step - loss: 0.4025800101370854 - lr: 0.0033766514763024125
Sample predictions range: (max:0.006581532768905163, min:-0.03425678610801697) - Targets range: (max:3.612952470779419, min:-2.362999439239502)
train loss: 0.36967522550691445 - val loss: 0.6782650322873811
Epoch 3/5
Batch 3041/6085 - 0.021s/step - loss: 0.336541765247013 - lr: 0.0026503639807414826
Batch 6083/6085 - 0.020s/step - loss: 0.4024213417760788 - lr: 0.0019242716337107002
Sample predictions range: (max:-0.006083087995648384, min:

Loading financial instruments: 100%|██████████| 3/3 [00:53<00:00, 17.69s/it]
Loading financial instruments: 100%|██████████| 1/1 [00:16<00:00, 16.01s/it]

Epoch 1/5





Batch 4685/9373 - 0.020s/step - loss: 0.3480450748209573 - lr: 0.004885055822147457
Batch 9371/9373 - 0.019s/step - loss: 0.4006757119623209 - lr: 0.004551375109805974
Sample predictions range: (max:0.033861614763736725, min:0.030985042452812195) - Targets range: (max:1.5336878299713135, min:-2.151968240737915)
train loss: 0.37436039339163907 - val loss: 0.6515785204724732
Epoch 2/5
Batch 4685/9373 - 0.020s/step - loss: 0.34585671072346413 - lr: 0.004031486505145601
Batch 9371/9373 - 0.020s/step - loss: 0.40040510519563904 - lr: 0.0033764895741494423
Sample predictions range: (max:0.023781605064868927, min:0.022822190076112747) - Targets range: (max:1.5336878299713135, min:-2.151968240737915)
train loss: 0.3731309079595516 - val loss: 0.650973986456914
Epoch 3/5
Batch 4685/9373 - 0.019s/step - loss: 0.34571542999040566 - lr: 0.0026502362981774296
Batch 9371/9373 - 0.018s/step - loss: 0.4002550813839288 - lr: 0.001924109713640261
Sample predictions range: (max:0.00827879924327135, min:0

Loading financial instruments: 100%|██████████| 3/3 [00:52<00:00, 17.65s/it]
Loading financial instruments: 100%|██████████| 1/1 [00:20<00:00, 20.09s/it]

Epoch 1/5





Batch 4564/9131 - 0.019s/step - loss: 0.31943855316919834 - lr: 0.004885057756802568
Batch 9129/9131 - 0.019s/step - loss: 0.3870529151359432 - lr: 0.004551380017004218
Sample predictions range: (max:0.02684212103486061, min:0.025023117661476135) - Targets range: (max:2.2443041801452637, min:-3.0326690673828125)
train loss: 0.35324573415257077 - val loss: 0.7287970761498895
Epoch 2/5
Batch 4564/9131 - 0.019s/step - loss: 0.3172455501976257 - lr: 0.004031491571352114
Batch 9129/9131 - 0.020s/step - loss: 0.3868061108031154 - lr: 0.003376497515297334
Sample predictions range: (max:0.017521582543849945, min:0.016856638714671135) - Targets range: (max:2.2443041801452637, min:-3.0326690673828125)
train loss: 0.3520258305003705 - val loss: 0.7283092429331597
Epoch 3/5
Batch 4564/9131 - 0.018s/step - loss: 0.31695883865340696 - lr: 0.0026502425608166524
Batch 9129/9131 - 0.019s/step - loss: 0.38665210187174015 - lr: 0.0019241176554892156
Sample predictions range: (max:0.008396630175411701, mi

KFold Cross Validation: 3 - Mean Squared error: 0.6846301 (96 batch size, 6 layers, 0.1 droprate)

In [22]:
torch.manual_seed(11) #11 #32
paths = glob.glob(str(root/"partition_id=*/part-0.parquet"))
paths = [pathlib.Path(path) for path in paths]
data_loader = create_dataloader_v2(paths, cfg)
model = JaneStreetModelV2(MODEL_CONFIG)
model.to(cfg["device"])
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-3, weight_decay=1e-4
)
num_epochs = 5
history = train_model(epochs=num_epochs, train_loader=data_loader, frac=1, min_lr=3e-4,
                      freq=len(data_loader)//50, device=cfg["device"])

print(f"""Epochs: {num_epochs}\nTrain error: {np.mean(history["loss"]):.7f}""")

Loading financial instruments: 100%|██████████| 10/10 [03:43<00:00, 22.36s/it]


Epoch 1/5
Batch 9817/490910 - 0.010s/step - loss: 0.2726002912890613 - lr: 0.0049998144992312345
Batch 19635/490910 - 0.010s/step - loss: 0.2631366495103749 - lr: 0.004999257950632049
Batch 29453/490910 - 0.010s/step - loss: 0.3321751893886355 - lr: 0.004998330442082415
Batch 39271/490910 - 0.010s/step - loss: 0.5015549261607101 - lr: 0.004997032120040717
Batch 49089/490910 - 0.010s/step - loss: 0.19788664134693962 - lr: 0.004995363189518705
Batch 58907/490910 - 0.010s/step - loss: 0.25735669867829003 - lr: 0.004993323914049116
Batch 68725/490910 - 0.010s/step - loss: 0.32372269358479866 - lr: 0.004990914615644069
Batch 78543/490910 - 0.010s/step - loss: 0.1994130615659861 - lr: 0.004988135674744213
Batch 88361/490910 - 0.010s/step - loss: 0.3652098274196504 - lr: 0.00498498753015865
Batch 98179/490910 - 0.010s/step - loss: 0.2561894743387426 - lr: 0.004981470678995655
Batch 107997/490910 - 0.010s/step - loss: 0.27337701796986774 - lr: 0.004977585676584168
Batch 117815/490910 - 0.010s/

In [24]:
torch.save({
 "model_state_dict": model.state_dict()
 },
 "/kaggle/working/model.pth"
)