# Mainly for model training

Depending on the size of your training set, you will need an [inference notebook](https://www.kaggle.com/code/regisvargas/inference-jane-street-a-beginner-s-notebook).

In [2]:
import pandas as pd
import gc
# Load the first 500,000 rows of each of the 10 training partitions and merge
# them into a single DataFrame (`sample_df`) used by the rest of the notebook.
samples = []
for i in range(10):
    file_path = f"/Users/apple/Masters/Job/kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}/part-0.parquet"
    chunk = pd.read_parquet(file_path)

    # Take the leading rows of the partition (deterministic). Use
    # `chunk.sample(n=500000, random_state=42)` instead for a random sample.
    # The `.copy()` is what actually lets memory be released: a plain row
    # slice can be a view onto the partition's buffers, which would keep every
    # full partition alive until the final concat regardless of `del chunk`.
    sample_chunk = chunk[:500000].copy()
    samples.append(sample_chunk)

# Drop the last full partition before allocating the concatenated frame.
del chunk
gc.collect()  # Forces garbage collection
sample_df = pd.concat(samples, ignore_index=True)
# The per-partition pieces are no longer needed once they are merged.
del samples
gc.collect()

0

In [3]:
# Display the first rows of the merged sample as a quick sanity check.
sample_df.head()

Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,...,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8
0,0,0,1,3.889038,,,,,,0.851033,...,-0.281498,0.738489,-0.069556,1.380875,2.005353,0.186018,1.218368,0.775981,0.346999,0.095504
1,0,0,7,1.370613,,,,,,0.676961,...,-0.302441,2.965889,1.190077,-0.523998,3.849921,2.626981,5.0,0.703665,0.216683,0.778639
2,0,0,9,2.285698,,,,,,1.056285,...,-0.096792,-0.864488,-0.280303,-0.326697,0.375781,1.271291,0.099793,2.109352,0.670881,0.772828
3,0,0,10,0.690606,,,,,,1.139366,...,-0.296244,0.408499,0.223992,2.294888,1.097444,1.225872,1.225376,1.114137,0.775199,-1.379516
4,0,0,14,0.44057,,,,,,0.9552,...,3.418133,-0.373387,-0.502764,-0.348021,-3.928148,-1.591366,-5.0,-3.57282,-1.089123,-5.0


# Prepare data

In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
# Split the sampled frame into model inputs, candidate targets and row weights.
features = sample_df.filter(regex='^feature_')
responders = sample_df.filter(regex='^responder_')
weights = sample_df['weight']

# NumPy arrays for TensorFlow / XGBoost. X holds every feature_* column; y
# holds only responder_6 and stays 2-D because it is selected with a
# one-element column list. NaN and +/-inf are all replaced by 0.0.
X = np.nan_to_num(features.to_numpy(), nan=0.0, posinf=0.0, neginf=0.0)
y = np.nan_to_num(
    responders[['responder_6']].to_numpy(), nan=0.0, posinf=0.0, neginf=0.0
)

2025-08-30 18:32:07.831769: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Selects which trained model the later cells use: the Keras model when True,
# otherwise the XGBoost model.
Is_keras = True

In [8]:
# Sequential 80/20 split: the first 80% of rows are used for training and the
# remaining rows are held out for validation. Rows are not shuffled.
train_size = int(len(X) * 0.8)

X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y[:train_size], y[train_size:]
weights_train, weights_val = weights[:train_size], weights[train_size:]

print(f"Train shapes: {X_train.shape}, {y_train.shape}, {weights_train.shape}")
print(f"Validation shapes: {X_val.shape}, {y_val.shape}, {weights_val.shape}")

Train shapes: (4000000, 79), (4000000, 1), (4000000,)
Validation shapes: (1000000, 79), (1000000, 1), (1000000,)


# Training Autoencoder for compact representation

In [9]:
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Symmetric dense autoencoder over the feature columns:
# input -> 128 -> 64 -> bottleneck (32) -> 64 -> 128 -> input.
input_dim = X_train.shape[1]  # Number of features
latent_dim = 32  # Dimension of the bottleneck layer

# Encoder half: two ReLU layers feeding a linear bottleneck.
encoder_input = layers.Input(shape=(input_dim,))
x = encoder_input
for units in (128, 64):
    x = layers.Dense(units, activation='relu')(x)
bottleneck = layers.Dense(latent_dim, activation='linear', name='bottleneck')(x)

# Decoder half: mirrors the encoder and reconstructs the input linearly.
x = bottleneck
for units in (64, 128):
    x = layers.Dense(units, activation='relu')(x)
decoder_output = layers.Dense(input_dim, activation='linear')(x)

autoencoder = models.Model(encoder_input, decoder_output, name="Autoencoder")
autoencoder.compile(optimizer="adam", loss="mse")
autoencoder.summary()

# Callbacks watch the validation reconstruction loss.
# NOTE(review): with epochs=1 below, neither callback's patience can elapse.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
    min_delta=0.00001,
)
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
    min_lr=1e-6,
)

# The autoencoder is trained to reproduce its own input.
history = autoencoder.fit(
    X_train,
    X_train,
    validation_data=(X_val, X_val),
    epochs=1,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
)

# Keep only the input -> bottleneck half and save it to disk.
encoder = models.Model(encoder_input, bottleneck, name="Encoder")
encoder.save("/Users/apple/Masters/Job/kaggle/jane-street-real-time-market-data-forecasting/Beginner.keras")

[1m125000/125000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 1ms/step - loss: 0.2611 - val_loss: 0.1243 - learning_rate: 0.0010


# XGBoost

See [Feature engineering, xgboost](https://www.kaggle.com/code/dlarionov/feature-engineering-xgboost#Part-2,-xgboost) and [🥇🥇Jane Street Baseline lgb, xgb and catboost🥇🥇](https://www.kaggle.com/code/yuanzhezhou/jane-street-baseline-lgb-xgb-and-catboost) for details.

In [None]:
# Define a learning rate schedule
def learning_rate_scheduler_xgb(epoch):
    """Return the XGBoost learning rate to use for a boosting round.

    The rate starts at 0.3 and decays as ``0.3 * 0.999 ** ln(epoch)``.

    Args:
        epoch: Boosting-round index. Values below 1 (including the 0 that a
            0-based round counter starts with) are treated as 1, because
            ``ln(0)`` is ``-inf`` and would make the rate infinite.

    Returns:
        The learning rate as a plain ``float``.
    """
    initial_rate = 0.3
    decay_rate = 0.999
    # Clamp so the logarithm is never taken of zero or a negative number.
    effective_epoch = max(epoch, 1)
    return float(initial_rate * (decay_rate ** np.log(effective_epoch)))

In [None]:
from xgboost import XGBRegressor
from xgboost import callback as xgb_callback

# Create an XGBoost model.
# `learning_rate` has to be a number, so the decaying schedule is attached via
# XGBoost's LearningRateScheduler callback instead of being passed as the
# `learning_rate` value. The callback receives 0-based boosting rounds while
# the schedule takes log(epoch), hence the shift to a 1-based epoch.
# `eval_metric` and `early_stopping_rounds` are estimator parameters; recent
# XGBoost releases no longer accept them as fit() arguments.
model_xgb = XGBRegressor(
    n_estimators=5000,
    tree_method='hist',
    max_depth=6,
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=10,
    callbacks=[
        xgb_callback.LearningRateScheduler(
            lambda epoch: learning_rate_scheduler_xgb(epoch + 1)
        )
    ],
)
# Fit on the training split; the validation split drives early stopping.
# Sample weights are not used here. To enable them, pass
# `sample_weight=weights_train` and `sample_weight_eval_set=[weights_val]`.
model_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False,
)
# The scheduler is only needed during training. Detach it so the fitted
# estimator can be pickled without a reference to notebook-local functions.
model_xgb.set_params(callbacks=None)

In [None]:
# Predict the target for the held-out validation rows with the fitted XGBoost model.
y_pred = model_xgb.predict(X_val)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
# Validation metrics for the XGBoost predictions.
# RMSE is computed as sqrt(MSE) rather than with `squared=False`: that keyword
# was deprecated and later removed from scikit-learn's mean_squared_error, and
# the square root works on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mse = rmse  # Original variable name (it always held the RMSE); kept for compatibility.
r2 = r2_score(y_val, y_pred)
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

In [None]:
import joblib
# Save the fitted XGBoost estimator to "xgboost_sklearn.pkl" in the current
# working directory.
# NOTE(review): joblib/pickle files are tied to the library versions that wrote
# them - confirm the loading environment matches, or consider XGBoost's own
# save_model() for a portable artifact.
joblib.dump(model_xgb, "xgboost_sklearn.pkl")

# Build the Autoencoder Model

Gradient Centralization for Better Training Performance

See https://keras.io/examples/vision/gradient_centralization/ for details. 

In [None]:
from keras.optimizers import RMSprop


class GCRMSprop(RMSprop):
    """RMSprop variant that applies gradient centralization.

    For every gradient with more than one dimension, the mean taken over all
    axes except the last is subtracted, so each output slice of the gradient
    has zero mean. One-dimensional gradients (e.g. biases) are left unchanged.
    """

    def get_gradients(self, loss, params):
        """Return the gradients of ``loss`` w.r.t. ``params``, centralized.

        Args:
            loss: Loss tensor, forwarded to the parent implementation.
            params: Variables to differentiate with respect to, forwarded to
                the parent implementation.

        Returns:
            A list with one (possibly centralized) gradient per parameter.

        NOTE(review): this hook only has an effect on Keras versions whose
        optimizer base class defines and calls ``get_gradients(loss, params)``
        - confirm that for the installed Keras before relying on it.
        """
        centralized = []
        for grad in super().get_gradients(loss, params):
            rank = len(grad.shape)
            if rank > 1:
                # Average over every axis except the last (output) axis.
                reduce_axes = list(range(rank - 1))
                grad = grad - tf.reduce_mean(grad, axis=reduce_axes, keepdims=True)
            centralized.append(grad)
        return centralized


optimizer = GCRMSprop(learning_rate=1e-4)

In [None]:
from tensorflow.keras.regularizers import l2
# Define the number of input and output nodes
input_dim = X.shape[1]  # Number of feature columns
output_dim = y.shape[1]  # Number of target columns (y holds only responder_6)
# Small feed-forward regressor:
# 64 linear units -> 32 GELU units -> dropout -> linear output.
model = models.Sequential([
    layers.Input(shape=(input_dim,)),  # Input layer
    layers.Dense(64, activation='linear'),
    layers.Dense(32, activation='gelu'),
    # Dropout regularizes the hidden representation, so it has to sit before
    # the output layer. Placed after it, dropout would randomly zero and
    # rescale the predictions themselves during training.
    layers.Dropout(0.2),
    layers.Dense(output_dim, activation='linear'),  # Output layer
])
model.compile(optimizer="adam", loss='mse')

# Train Autoencoder Model

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler
def step_decay(epoch):
    """Return the step-decayed learning rate for ``epoch``.

    The rate starts at 0.01 and is halved once for every 5 completed epochs.
    """
    completed_steps = epoch // 5
    return 0.01 * (0.5 ** completed_steps)
# Wrap `step_decay` in a Keras LearningRateScheduler callback.
# NOTE(review): this callback is not in the `callbacks` list passed to
# model.fit() in this notebook, so the step schedule appears to be unused -
# confirm that is intended.
lr_scheduler = LearningRateScheduler(step_decay)

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
# Multiply the learning rate by 0.5 when val_loss has stopped improving
# (patience of 2 epochs), with a floor of 1e-6. This rebinds the `reduce_lr`
# name that the autoencoder cell created earlier with patience=3.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
# Early stopping on the validation loss: an epoch only counts as an
# improvement if val_loss changes by at least 1e-5; after 10 epochs without
# one, training stops and the weights of the best epoch are restored.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.00001,
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Train the Keras regressor only when the Keras path is selected.
if Is_keras:
    # Sample weights are not passed for either the training or validation data.
    history = model.fit(
        X_train,
        y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping, reduce_lr],
    )

In [None]:
# Save the trained Keras model to the Kaggle working directory, but only when
# the Keras path is selected.
if Is_keras:
    model.save("/kaggle/working/model.keras")

# Submission

See [Jane Street RMF Demo Submission](https://www.kaggle.com/code/ryanholbrook/jane-street-rmf-demo-submission) for details.

Depending on the size of your training set, you will need an [inference notebook](https://www.kaggle.com/code/regisvargas/inference-jane-street-a-beginner-s-notebook).

In [None]:
import os
import polars as pl
import kaggle_evaluation.jane_street_inference_server

In [None]:
import polars as pl
import numpy as np
# Assuming `model` is your trained model
# Assuming features required by the model are named 'feature_00', 'feature_01', etc.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch of test rows.

    Args:
        test: Batch to score. It must contain a ``row_id`` column and the
            ``feature_*`` columns the model consumes.
        lags: Lag data for the batch, or ``None``. When given, it is stored in
            the module-level ``lags_`` variable; it is not otherwise used here.

    Returns:
        A Polars DataFrame with the columns ``row_id`` and ``responder_6`` and
        one row per row of ``test``.

    Raises:
        TypeError: If the assembled result is not a DataFrame.
        AssertionError: If the result has unexpected columns or row count.
    """
    global lags_
    if lags is not None:
        lags_ = lags

    # Model input: every feature_* column, with NaN and +/-inf replaced by 0.0.
    feature_names = [name for name in test.columns if name.startswith("feature_")]
    model_input = np.nan_to_num(
        test.select(feature_names).to_numpy(), nan=0.0, posinf=0.0, neginf=0.0
    )

    # The Keras model returns a 2-D array, so its first column is taken; the
    # XGBoost output is used as returned.
    if Is_keras:
        responder_6_predictions = model.predict(model_input)[:, 0]
    else:
        responder_6_predictions = model_xgb.predict(model_input)

    # Pair each row_id with its prediction.
    predictions = test.select("row_id").with_columns(
        pl.Series("responder_6", responder_6_predictions)
    )
    print(predictions)

    # Check the output format and length before returning.
    expected_columns = ['row_id', 'responder_6']
    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == expected_columns
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == expected_columns).all()
    else:
        raise TypeError('The predict function must return a DataFrame')

    assert len(predictions) == len(test)
    return predictions

In [None]:
# Register `predict` with the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)
# When the KAGGLE_IS_COMPETITION_RERUN environment variable is set to a
# non-empty value, serve predictions; otherwise run the local gateway against
# the test and lags parquet files from the competition input directory.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )