## Created by <a href="https://github.com/yunsuxiaozi/">yunsuxiaozi </a>  2024/10/24

#### We will use Ridge as the baseline here.

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Import Libraries</h1></span>

In [None]:
#necessary
import polars as pl#similar to pandas, but with better performance when dealing with large datasets.
import pandas as pd#read csv,parquet
import numpy as np#for scientific computation of matrices
#model
from sklearn.linear_model import Ridge
import os#Libraries that interact with the operating system
import gc
import warnings#avoid some negligible errors
#warnings.filterwarnings() installs a warning filter; the 'ignore' action suppresses all warning output.
warnings.filterwarnings('ignore')
#environment provided by competition hoster
import sys
import os

# Add the parent of the current working directory to the module search path,
# so that modules located one level up can be imported.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)  # appended last, so existing entries are searched first


import random#provides some functions for generating random numbers
#set the random seeds so that results are reproducible.
def seed_everything(seed):
    """Seed the global random generators of Python and NumPy.

    seed: value handed unchanged to both ``random.seed`` and ``np.random.seed``.
    """
    for seeder in (np.random.seed, random.seed):  #numpy first, then python built-in
        seeder(seed)
seed_everything(seed=2025)

# <span><h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spacing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Fit and Predict</h1></span>

In [None]:
# ------------------------------
# 1. Custom metric
# ------------------------------
def custom_metric(y_true, y_pred, weight):
    """Return the sample-weighted, zero-mean R^2 score.

    Computes ``1 - sum(w * (y_true - y_pred)**2) / sum(w * y_true**2)``.

    Args:
        y_true: Target values (array-like).
        y_pred: Predicted values, broadcastable against ``y_true``.
        weight: Per-sample weights, broadcastable against ``y_true``.

    Returns:
        The score as a float64 scalar: 1.0 for a perfect fit, 0.0 when the
        weighted squared error equals the weighted squared targets (e.g. an
        all-zero prediction). If the weighted sum of squared targets is zero
        the division yields nan/inf, exactly as NumPy division does.
    """
    # Coerce once: this accepts plain Python lists and forces float64 so the
    # two large reductions do not lose precision when inputs are float32.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    weight = np.asarray(weight, dtype=np.float64)

    residual_ss = np.sum(weight * (y_true - y_pred) ** 2)
    total_ss = np.sum(weight * y_true ** 2)
    return 1 - residual_ss / total_ss

# ------------------------------
# 2. Load training partitions
# ------------------------------
print("< read parquet >")
datas = []
weight_chunks = []  # one NumPy array of sample weights per partition

# NOTE(review): the path below is an absolute, machine-specific location.
for i in range(6, 10):  # partitions 6–9
    train = pl.read_parquet(
        f"/Users/apple/Masters/Job/kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}/part-0.parquet"
    )
    # Keep a random 97% of the rows (fixed seed). sample() returns the rows in
    # shuffled order, not in their original order.
    train = train.to_pandas().sample(frac=0.97, random_state=2025)

    # Store the weights as a compact array instead of extending a Python list
    # with millions of boxed scalars. copy=True because to_numpy() may return
    # a view, which would otherwise keep the frame's buffer alive.
    weight_chunks.append(train["weight"].to_numpy(copy=True))
    train.drop(["weight"], axis=1, inplace=True)
    datas.append(train)

train = pd.concat(datas)
weights = np.concatenate(weight_chunks)  # row order matches the concatenated frame
del datas, weight_chunks
gc.collect()
print(f"train.shape: {train.shape}")

# ------------------------------
# 3. Features and target
# ------------------------------
# feature_00 ... feature_78 (zero-padded to two digits)
cols = [f"feature_{i:02d}" for i in range(79)]
X = train[cols].fillna(3).values  # missing feature values are replaced by the constant 3
y = train["responder_6"].values
del train
gc.collect()

# ------------------------------
# 4. Train/validation split
# ------------------------------
# The last `split` rows are held out for validation.
# NOTE(review): because each partition was shuffled by sample() above, this tail
# is a random subset of the last-loaded partition(s), not their final rows —
# confirm this is the intended validation scheme.
split = 400000  # around 2%
train_X, test_X = X[:-split], X[-split:]
train_y, test_y = y[:-split], y[-split:]
train_weight, test_weight = weights[:-split], weights[-split:]

print(f"train_X.shape: {train_X.shape}, test_X.shape: {test_X.shape}")

# ------------------------------
# 5. Fit Ridge model
# ------------------------------
print("< fit and predict >")
# Ridge with default hyper-parameters; the sample weights are used only for
# scoring below, not for fitting.
model = Ridge()
model.fit(train_X, train_y)

train_pred = model.predict(train_X)
test_pred = model.predict(test_X)

print(f"train weighted_r2: {custom_metric(train_y, train_pred, weight=train_weight)}")
print(f"test weighted_r2: {custom_metric(test_y, test_pred, weight=test_weight)}")


In [None]:

# ------------------------------
# 6. Define predict() for test data
# ------------------------------
def predict(test_df, lags_df):
    """Score a batch of test rows with the module-level ``model``.

    test_df: frame containing ``row_id`` and every column listed in ``cols``.
    lags_df: accepted for interface compatibility; not read by this function.

    Returns a new frame with two columns, ``row_id`` and ``responder_6``;
    ``test_df`` itself is left unmodified.
    """
    # Select row_id first, then build the feature matrix (NaN -> 3) and attach
    # the model output as a new column on a copy of the row_id frame.
    return test_df[["row_id"]].assign(
        responder_6=model.predict(test_df[cols].fillna(3).values)
    )


In [None]:

# ------------------------------
# 7. Load test data
# ------------------------------
# Read the test rows and the lag rows with the fastparquet engine.
test_data = pd.read_parquet(
    "/Users/apple/Masters/Job/kaggle/jane-street-real-time-market-data-forecasting/test.parquet",
    engine="fastparquet",
)
lags_data = pd.read_parquet(
    "/Users/apple/Masters/Job/kaggle/jane-street-real-time-market-data-forecasting/lags.parquet",
    engine="fastparquet",
)

# Fix dtypes: cast date_id to int32 in both frames
for frame in (test_data, lags_data):
    frame["date_id"] = frame["date_id"].astype("int32")

# ------------------------------
# 8. Run final prediction
# ------------------------------
final_predictions = predict(test_data, lags_data)

# Show the first rows of the result under a heading
print("Prediction DataFrame:", final_predictions.head(), sep="\n")

# Save to CSV if needed (row index is not written)
final_predictions.to_csv("ridge_predictions.csv", index=False)
print("Predictions saved to ridge_predictions.csv")


< read parquet >
train.shape: (24205450, 91)
train_X.shape: (23805450, 79), test_X.shape: (400000, 79)
< fit and predict >
train weighted_r2: 0.006264686584472656
test weighted_r2: 0.003151834011077881
Prediction DataFrame:
   row_id  responder_6
0       0     0.111303
1       1     0.111303
2       2     0.111303
3       3     0.111303
4       4     0.111303
Predictions saved to ridge_predictions.csv


####  We can replace Ridge with a more complex neural network.