# Ridge with Preprocessing

In [1]:
import gc
import pandas as pd
import numpy as np
import polars as pl
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [2]:

# ------------------------------
# 1. Custom metric (weighted R², Kaggle-style)
# ------------------------------
def custom_metric(y_true, y_pred, weight):
    """Return the sample-weighted, zero-mean R² score.

    Computes ``1 - sum(w * (y - y_hat)**2) / sum(w * y**2)``. The denominator
    uses the raw weighted second moment of ``y_true`` (the mean is NOT
    subtracted), so this is not the same quantity as ``sklearn.metrics.r2_score``.

    Args:
        y_true: Array-like of ground-truth targets.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.
        weight: Array-like of per-sample weights, broadcastable against ``y_true``.

    Returns:
        The score as a NumPy scalar. If ``sum(w * y**2)`` is zero the division
        follows NumPy semantics (``inf``/``nan`` plus a runtime warning).
    """
    # Coerce so plain Python sequences work too; ndarray inputs pass through
    # unchanged (no dtype is forced, so numeric results for arrays are the same).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    weight = np.asarray(weight)

    residual_ss = np.sum(weight * (y_true - y_pred) ** 2)
    total_ss = np.sum(weight * y_true ** 2)
    return 1 - residual_ss / total_ss


In [3]:
# =========================================================
# 2. Hand-rolled Preprocessor
# =========================================================
class RobustPreprocessor(BaseEstimator, TransformerMixin):
    """
    Hand-rolled, NaN-tolerant preprocessing for dense numeric features.

    ``transform`` applies, per feature column:
    - Fill NaNs with the training median
    - Winsorization (clip to the training quantiles, in raw feature units)
    - Robust scaling ((x - median) / IQR)

    Clipping is done BEFORE scaling because ``lowers_`` / ``uppers_`` are
    quantiles of the raw training data; applying them to already-scaled values
    would clip at thresholds expressed in the wrong units. Since the scaling is
    a monotone map, clip-then-scale equals scaling and then clipping to the
    scaled quantiles.

    Parameters
    ----------
    clip_quantiles : tuple of (float, float), default=(0.01, 0.99)
        Lower and upper quantiles (as fractions in [0, 1]) used as
        winsorization bounds.

    Attributes
    ----------
    medians_ : ndarray of shape (n_features,)
        Per-feature median of the training data, ignoring NaNs.
    iqrs_ : ndarray of shape (n_features,)
        Per-feature interquartile range; exact zeros are replaced by 1e-6.
    lowers_, uppers_ : ndarray of shape (n_features,)
        Per-feature winsorization bounds in raw feature units.
    """
    def __init__(self, clip_quantiles=(0.01, 0.99)):
        self.clip_quantiles = clip_quantiles

    def fit(self, X, y=None):
        """Learn per-feature medians, IQRs and clipping bounds from ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.

        Raises
        ------
        ValueError
            If ``clip_quantiles`` is not an ordered pair within [0, 1].
        """
        X = np.asarray(X, dtype=np.float64)

        lower_q, upper_q = self.clip_quantiles
        if not 0.0 <= lower_q <= upper_q <= 1.0:
            raise ValueError(
                "clip_quantiles must satisfy 0 <= lower <= upper <= 1, "
                f"got {self.clip_quantiles!r}"
            )

        # --- NaN-safe statistics ---
        self.medians_ = np.nanmedian(X, axis=0)  # median per feature, ignoring NaNs

        # One nanpercentile call for all four cut points: the NaN-aware
        # partitioning over the data is done once instead of four times.
        q25, q75, lowers, uppers = np.nanpercentile(
            X, [25.0, 75.0, lower_q * 100, upper_q * 100], axis=0
        )
        self.iqrs_ = q75 - q25
        self.iqrs_[self.iqrs_ == 0] = 1e-6  # avoid div by zero

        # Winsorization bounds (raw feature units)
        self.lowers_ = lowers
        self.uppers_ = uppers
        return self

    def transform(self, X):
        """Return a new float64 array with imputed, winsorized, scaled features.

        The input is never modified; statistics come from ``fit``.
        """
        # np.array copies by default, so the caller's data is left untouched.
        X = np.array(X, dtype=np.float64)

        # --- Step 1: Fill NaNs with training medians ---
        mask = np.isnan(X)
        if mask.any():
            X[mask] = np.broadcast_to(self.medians_, X.shape)[mask]

        # --- Step 2: Winsorization (clip extremes, in raw units) ---
        np.clip(X, self.lowers_, self.uppers_, out=X)

        # --- Step 3: Robust scaling (median/IQR) ---
        X = (X - self.medians_) / self.iqrs_

        # --- Final safety check: replace any leftover weird values ---
        # (e.g. columns that were entirely NaN during fit have NaN statistics)
        return np.nan_to_num(X, nan=0.0, posinf=10.0, neginf=-10.0)


In [4]:
# ------------------------------
# 3. Load training partitions
# ------------------------------
print("< read parquet >")
datas = []
weight_parts = []

for i in range(6, 10):  # partitions 6–9
    part = pl.read_parquet(
        f"/Users/apple/Masters/Job/kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={i}/part-0.parquet"
    )
    # Keep a reproducible 97% random sample of each partition.
    part = part.to_pandas().sample(frac=0.97, random_state=2025)

    # pop() removes the column from the frame and returns it, so the weights
    # stay row-aligned with the sampled rows. Keeping them as NumPy arrays
    # avoids building a Python list with one boxed scalar per row.
    weight_parts.append(part.pop("weight").to_numpy())
    datas.append(part)

train = pd.concat(datas)
# Single contiguous weight vector, in the same row order as `train`.
weights = np.concatenate(weight_parts)
del datas, weight_parts
gc.collect()
print(f"train.shape: {train.shape}")


< read parquet >
train.shape: (24205450, 91)


In [5]:
# ------------------------------
# 4. Features and target
# ------------------------------
# Feature columns are named feature_00 ... feature_78 (zero-padded to two digits).
cols = [f"feature_{idx:02d}" for idx in range(79)]

# Pull the design matrix and the target out as NumPy arrays, then drop the
# DataFrame so its memory can be reclaimed.
X = train.loc[:, cols].to_numpy()
y = train["responder_6"].to_numpy()
del train
gc.collect()

0

In [6]:
# ------------------------------
# 5. Train/validation split
# ------------------------------
split = 400000  # hold out the last 400k rows for validation

# Everything except the trailing `split` rows is used for fitting;
# the trailing `split` rows form the hold-out set.
fit_rows = slice(None, -split)
holdout_rows = slice(-split, None)

train_X, train_y, train_weight = X[fit_rows], y[fit_rows], weights[fit_rows]
test_X, test_y, test_weight = X[holdout_rows], y[holdout_rows], weights[holdout_rows]

print(f"train_X.shape: {train_X.shape}, test_X.shape: {test_X.shape}")

train_X.shape: (23805450, 79), test_X.shape: (400000, 79)


In [None]:
# ------------------------------
# 6. Build preprocessing + Ridge pipeline
# ------------------------------
# Stage 1: imputation / winsorization / robust scaling with default settings.
preprocess_step = ("pre", RobustPreprocessor())

# Stage 2: Ridge regression fitted with the SAGA solver.
ridge_step = (
    "ridge",
    Ridge(alpha=1.0, solver="saga", max_iter=5000, random_state=0),
)

pipeline = Pipeline(steps=[preprocess_step, ridge_step])


: 

In [None]:
# ------------------------------
# 7. Fit model
# ------------------------------
print("< fit and predict >")

# The `ridge__` prefix routes the sample weights to the "ridge" step only.
pipeline.fit(train_X, train_y, ridge__sample_weight=train_weight)

# Predict on the fitting rows first, then on the hold-out rows.
train_pred, test_pred = (pipeline.predict(rows) for rows in (train_X, test_X))

train_score = custom_metric(train_y, train_pred, weight=train_weight)
print(f"train weighted_r2: {train_score}")

test_score = custom_metric(test_y, test_pred, weight=test_weight)
print(f"test weighted_r2: {test_score}")


< fit and predict >


In [None]:
# =========================================================
# 8. Prediction on Test Data
# =========================================================
def predict(test_df, lags_df):
    """Score one batch of test rows with the fitted module-level ``pipeline``.

    Args:
        test_df: Frame providing the columns listed in ``cols`` plus ``row_id``.
        lags_df: Accepted as part of the signature; not used by this function.

    Returns:
        A pandas DataFrame with columns ``row_id`` and ``responder_6``.
    """
    feature_matrix = test_df[cols].to_numpy(dtype=np.float64)
    scores = pipeline.predict(feature_matrix)

    submission = pd.DataFrame(
        {
            "row_id": test_df["row_id"],
            "responder_6": scores,
        }
    )
    return submission


In [None]:

# ------------------------------
# 9. Load test data
# ------------------------------
# Read the test rows and the lagged responders with the fastparquet engine.
test_data = pd.read_parquet(
    "/Users/apple/Masters/Job/kaggle/jane-street-real-time-market-data-forecasting/test.parquet",
    engine="fastparquet",
)
lags_data = pd.read_parquet(
    "/Users/apple/Masters/Job/kaggle/jane-street-real-time-market-data-forecasting/lags.parquet",
    engine="fastparquet",
)

# Cast date_id to int32 in both frames (column assignment mutates each frame in place).
for frame in (test_data, lags_data):
    frame["date_id"] = frame["date_id"].astype("int32")

NameError: name 'pd' is not defined

In [None]:
# ------------------------------
# 10. Run final prediction
# ------------------------------
final_predictions = predict(test_data, lags_data)

# Show a short preview of the predictions under a header line.
preview = final_predictions.head()
print("Prediction DataFrame:", preview, sep="\n")

# Persist the predictions without the index column.
final_predictions.to_csv("ridge_predictions_preprocessed.csv", index=False)
print("Predictions saved to ridge_predictions_preprocessed.csv")