# 1. Libraries

In [85]:
import numpy as np
import pandas as pd

from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import scipy.stats as stats

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import KFold
import optuna

from sklearn.linear_model import LinearRegression

torch.manual_seed(32)

<torch._C.Generator at 0x174fb66b0>

# 2. Data Loading

In [86]:
ypath = "../data/train.parquet"
fpath = "../data/train.parquet"

y = pd.read_parquet(ypath, columns=["label"])["label"]
X = pd.read_parquet(fpath)
X = X.drop(columns=[col for col in X.columns if col.startswith("encoded_feature_")])

print(f"Feature matrix X shape: {X.shape} (rows, columns)")
print(f"Target vector y shape: {y.shape} (rows,)")

Feature matrix X shape: (525886, 786) (rows, columns)
Target vector y shape: (525886,) (rows,)


## 3. Data Preprocessing

- **Train/validation split:**  
  - Test size = `0.2` (`20%` of samples reserved for validation).  
  - Split index: `split_idx = int(n_samples * (1 - test_size))`.  
  - Training set: `X[:split_idx]`, Validation set: `X[split_idx:]`.  

- **Feature scaling:**  
  - Applied `StandardScaler` to standardize all features.  
  - Fitted on the **training set only** (`X[:split_idx]`) to avoid data leakage.  
  - Transformed dataset stored in `data_scaled`.  

- **Evaluation function (`eval_func`):**  
  - Supports both **PyTorch models** (`torch.nn.Module`) and **sklearn-like models**.  
  - For PyTorch models: runs in evaluation mode with no gradient tracking.  
  - Computes predictions for both training and validation sets.  
  - Optionally applies inverse scaling if `scaler_y` is provided.  
  - Returns predictions (`y_train_pred`, `y_val_pred`) and prints performance metrics:  
    - **MSE** (Mean Squared Error)  
    - **MAE** (Mean Absolute Error)  
    - **R²** (Coefficient of Determination)  

In [87]:
test_size = 0.2

scaler = StandardScaler()
n_samples = len(X)
split_idx = int(n_samples * (1 - test_size))

scaler.fit(X[:split_idx])
data_scaled = scaler.transform(X)

X_train, X_val = data_scaled[:split_idx], data_scaled[split_idx:]
y_train, y_val = y.values[:split_idx], y.values[split_idx:]


def eval_func(model, X_train, y_train, X_val, y_val, scaler_y=None, device='cpu'):
    is_torch = isinstance(model, torch.nn.Module)

    if is_torch:
        model.eval()
        with torch.no_grad():
            X_train_t = torch.tensor(X_train, dtype=torch.float32).to(device)
            X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)
            
            y_train_pred = model(X_train_t).cpu().numpy().flatten()
            y_val_pred = model(X_val_t).cpu().numpy().flatten()
    else:
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

    if scaler_y is not None:
        y_train_pred = scaler_y.inverse_transform(y_train_pred.reshape(-1,1)).flatten()
        y_val_pred   = scaler_y.inverse_transform(y_val_pred.reshape(-1,1)).flatten()

    train_mse = mean_squared_error(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2  = r2_score(y_train, y_train_pred)
    
    val_mse = mean_squared_error(y_val, y_val_pred)
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_r2  = r2_score(y_val, y_val_pred)

    print(f"Training MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, R²: {train_r2:.4f}")
    print(f"Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, R²: {val_r2:.4f}")

    return y_train_pred, y_val_pred

## 4. XGBoost Optimization Study

- **Optimization framework:** Optuna with `100` trials.  
- **Objective function:** Negative R² (minimized), with hyperparameters tuned across depth, learning rate, sampling, and regularization.  
- **Best parameters identified:**  
  ```python
    {
    'objective': 'reg:pseudohubererror',
    'n_estimators': 668,
    'max_depth': 8,
    'learning_rate': 0.001958638835992153,
    'subsample': 0.8362859827100048,
    'colsample_bytree': 0.7738914240985852,
    'reg_lambda': 0.4248677586278417,
    'reg_alpha': 1.6192479443527605,
    'gamma': 2.8514417437472486,
    'min_child_weight': 9
    }



- **Best validation R²:** `0.0090`  

**Notes:**  
- Optimal configuration uses **pseudo-Huber loss** to balance squared and absolute errors.  
- Very low learning rate combined with moderately deep trees suggests reliance on incremental updates with expressive trees.  
- Low R² highlights dataset difficulty, motivating further evaluation or **ensembling** with neural models.  


In [None]:
def objective_xgb(trial):
    param = {
        "objective": trial.suggest_categorical(
            "objective", ["reg:squarederror", "reg:absoluteerror", "reg:pseudohubererror"]
        ),
        "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 5.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 3, 12),
        "random_state": 1
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(
        param,
        dtrain,
        num_boost_round=1000,
        evals=[(dval, "val")],
        early_stopping_rounds=100
    )

    
    model = XGBRegressor(**param)
    
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        verbose=False
    )
    
    preds = model.predict(X_val)
    r2 = r2_score(y_val, preds)
    
    return -r2 

In [79]:
study = optuna.create_study(direction="minimize")
study.optimize(objective_xgb, n_trials=10)

print("Best params:", study.best_params)

[I 2025-09-25 17:55:55,817] A new study created in memory with name: no-name-3c7b1780-f962-48c1-9933-20063bb610fe
[W 2025-09-25 17:55:55,819] Trial 0 failed with parameters: {'objective': 'reg:pseudohubererror', 'n_estimators': 889, 'max_depth': 6, 'learning_rate': 0.2017343044279519, 'subsample': 0.620001792885299, 'colsample_bytree': 0.5386192640347389, 'reg_lambda': 0.6100866949486046, 'reg_alpha': 0.2490701423745867, 'gamma': 3.004911378451169, 'min_child_weight': 7} because of the following error: TypeError("XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "/opt/anaconda3/envs/drw/lib/python3.11/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/f9/6rcv_50n3nddl9gbvsc0y3br0000gn/T/ipykernel_45297/1097068603.py", line 20, in objective_xgb
    model.fit(
  File "/opt/anaconda3/envs/drw/lib/python3.11/site-packag

TypeError: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'

## 5. MLP Optimization Study

- **Optimization framework:** Optuna with `100` trials.  
- **Objective function:** Negative R² (minimized), with hyperparameters tuned across architecture, optimization, and regularization.  
- **Best trial parameters (Trial 47):**  
  ```python
    {
    'num_layers': 2,
    'hidden_size': 246,
    'dropout': 0.43428311446742884,
    'lr': 0.007886692753434593,
    'optimizer': 'SGD',
    'momentum': 0.9034047811298925,
    'weight_decay': 0.020167938847955404,
    'loss_fn': 'MSE',
    'batch_size': 32
    }

- **Best validation R²:** `0.0181`  

**Notes:**  
- Optimal architecture used **two hidden layers** with substantial width (`246`) and high dropout (`~0.43`) for regularization.  
- **SGD with momentum** outperformed Adam/RMSprop, highlighting careful gradient updates.  
- **MSE loss** aligned well with the regression target.  
- Low R² emphasizes the difficulty of the dataset and the potential need for **feature engineering or ensembling**.


In [8]:
def objective_mlp(trial, X_train, y_train, X_val, y_val):

    num_layers = trial.suggest_int("num_layers", 1, 3)
    hidden_size = trial.suggest_int("hidden_size", 16, 256)
    dropout = trial.suggest_float("dropout", 0.0, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["SGD", "Adam", "RMSprop"])
    momentum = trial.suggest_float("momentum", 0.0, 0.99)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.05)
    loss_name = trial.suggest_categorical("loss_fn", ["MSE", "L1", "Huber", "SmoothL1"])
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    
    layers = []
    in_dim = X_train.shape[1]
    for _ in range(num_layers):
        layers.append(nn.Linear(in_dim, hidden_size))
        layers.append(nn.LeakyReLU(0.1))
        layers.append(nn.Dropout(dropout))
        in_dim = hidden_size
    layers.append(nn.Linear(hidden_size, 1))
    model = nn.Sequential(*layers)
    
    criterion = {
        "MSE": nn.MSELoss(),
        "L1": nn.L1Loss(),
        "Huber": nn.HuberLoss(),
        "SmoothL1": nn.SmoothL1Loss()
    }[loss_name]
    
    if optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
    elif optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
    val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=False)
    
    best_r2 = -np.inf
    patience_counter = 0
    for epoch in range(500):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(X_batch).flatten(), y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        
        model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for X_val_batch, y_val_batch in val_loader:
                y_true.append(y_val_batch.numpy())
                y_pred.append(model(X_val_batch).flatten().numpy())
        y_true = np.concatenate(y_true)
        y_pred = np.concatenate(y_pred)

        if np.isnan(y_pred).any():
            return float("inf")

        r2 = r2_score(y_true, y_pred)
        scheduler.step(1 - r2)
        
        if r2 > best_r2:
            best_r2 = r2
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter > 10:
            break
    
    return -best_r2

In [11]:
def objective(trial):
    return objective_mlp(
        trial,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val
    )

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best hyperparameters:", study.best_params)
print("Best R²:", -study.best_value)

[I 2025-09-24 21:23:53,421] A new study created in memory with name: no-name-6d3ebfc5-f0cd-475b-84a3-06691853cc93
[W 2025-09-24 21:23:56,152] Trial 0 failed with parameters: {'num_layers': 2, 'hidden_size': 200, 'dropout': 0.16887606383025155, 'lr': 0.028699035616039977, 'optimizer': 'RMSprop', 'momentum': 0.38135795469696504, 'weight_decay': 0.02185914764738904, 'loss_fn': 'Huber', 'batch_size': 32} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/anaconda3/envs/drw/lib/python3.11/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/f9/6rcv_50n3nddl9gbvsc0y3br0000gn/T/ipykernel_38299/4137366830.py", line 2, in objective
    return objective_mlp(
           ^^^^^^^^^^^^^^
  File "/var/folders/f9/6rcv_50n3nddl9gbvsc0y3br0000gn/T/ipykernel_38299/1841726020.py", line 57, in objective_mlp
    loss.backward()
  File "/opt/anaconda3/envs/dr

KeyboardInterrupt: 

# 6. Ensemble Tuning

## 6.1 XGB Model

In [88]:
params = {
    'objective': 'reg:pseudohubererror',
    'n_estimators': 668,
    'max_depth': 8,
    'learning_rate': 0.001958638835992153,
    'subsample': 0.8362859827100048,
    'colsample_bytree': 0.7738914240985852,
    'reg_lambda': 0.4248677586278417,
    'reg_alpha': 1.6192479443527605,
    'gamma': 2.8514417437472486,
    'min_child_weight': 9,
    'random_state': 1,
    "early_stopping_rounds": 100
}

# --- Train ---
xgb_model = XGBRegressor(**params)
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# --- Evaluate ---
y_pred = xgb_model.predict(X_val)
r2 = r2_score(y_val, y_pred)
print("Validation R²:", r2)

KeyboardInterrupt: 

## MLP Model

In [38]:
params = {
    'num_layers': 2,
    'hidden_size': 246,
    'dropout': 0.43428311446742884,
    'lr': 0.007886692753434593,
    'optimizer': 'SGD',
    'momentum': 0.9034047811298925,
    'weight_decay': 0.020167938847955404,
    'loss_fn': 'MSE',
    'batch_size': 32
}

# --- Build model ---
layers = []
in_dim = X_train.shape[1]
for _ in range(params['num_layers']):
    layers.append(nn.Linear(in_dim, params['hidden_size']))
    layers.append(nn.LeakyReLU(0.1))
    layers.append(nn.Dropout(params['dropout']))
    in_dim = params['hidden_size']
layers.append(nn.Linear(params['hidden_size'], 1))
mlp = nn.Sequential(*layers)

# --- Loss ---
criterion = nn.MSELoss()

# --- Optimizer ---
optimizer = optim.SGD(
    mlp.parameters(),
    lr=params['lr'],
    momentum=params['momentum'],
    weight_decay=params['weight_decay']
)

# --- Data loaders ---
train_loader = DataLoader(TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train)),
                          batch_size=params['batch_size'], shuffle=True)
val_loader = DataLoader(TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val)),
                        batch_size=params['batch_size'], shuffle=False)

# --- Training loop ---
best_r2 = -np.inf
patience_counter = 0
max_epochs = 500
patience_limit = 10

for epoch in range(max_epochs):
    mlp.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        loss = criterion(mlp(X_batch).flatten(), y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(mlp.parameters(), max_norm=1.0)
        optimizer.step()
    
    # --- Validation ---
    mlp.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            y_true.append(y_val_batch.numpy())
            y_pred.append(mlp(X_val_batch).flatten().numpy())
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    
    r2 = r2_score(y_true, y_pred)
    
    if r2 > best_r2:
        best_r2 = r2
        patience_counter = 0
        best_model_state = mlp.state_dict()
    else:
        patience_counter += 1
    
    if patience_counter > patience_limit:
        print(f"Early stopping at epoch {epoch+1}")
        break

mlp.load_state_dict(best_model_state)
print("Training complete. Best validation R²:", best_r2)

Early stopping at epoch 17
Training complete. Best validation R²: 0.010776519775390625


## 6.3 Ensembling

In [None]:
def optimize_weighted_ensemble(model1, model2, X_val, y_val, n_trials=100):

    pred1 = model1.predict(X_val)
    pred2 = model2.predict(X_val)

    def objective(trial):

        w1 = trial.suggest_float("w1", 0.0, 1.0)
        w2 = 1.0 - w1  
        ensemble_pred = w1 * pred1 + w2 * pred2
        return -r2_score(y_val, ensemble_pred)

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    best_w1 = study.best_params['w1']
    best_w2 = 1.0 - best_w1
    best_r2 = -study.best_value

    return {'w1': best_w1, 'w2': best_w2}, best_r2

In [45]:
eval_func(mlp, X_train, y_train, X_val, y_val, scaler_y=None)
eval_func(xgb_model, X_train, y_train, X_val, y_val, scaler_y=None)

Training MSE: 0.9636, MAE: 0.6131, R²: 0.0406
Validation MSE: 1.0701, MAE: 0.6957, R²: 0.0091
Training MSE: 0.9170, MAE: 0.6015, R²: 0.0870
Validation MSE: 1.0702, MAE: 0.6943, R²: 0.0090


(array([0.03814178, 0.02157603, 0.02034539, ..., 0.0219089 , 0.00476389,
        0.02487631], dtype=float32),
 array([-0.01597264,  0.00668928,  0.01268294, ...,  0.04121579,
         0.13375151,  0.16204077], dtype=float32))

# 7. Summary + Next Steps

## Model Performance Summary

Both the MLP and XGBoost models were unable to achieve really any predictive performance on this dataset. This appears to be due to several factors:

- **High noise and low signal strength:** The underlying data contains substantial randomness and lacks strong, consistent patterns, making it difficult for models to learn meaningful relationships.
- **High variance in predictions:** Individual predictions exhibited extreme variability, particularly in the tails of the distribution, further limiting predictive accuracy.

### Standalone Model Results

| Model | Train MSE | Train MAE | Train R² | Val MSE | Val MAE | Val R² |
|-------|-----------|-----------|----------|---------|---------|--------|
| XGBoost | 0.9636 | 0.6131 | 0.0406 | 1.0701 | 0.6957 | 0.0091 |
| MLP     | 0.9170 | 0.6015 | 0.0870 | 1.0702 | 0.6943 | 0.0090 |

- Predictions were mostly small in magnitude with some extreme outliers:
  - Example ranges: `[0.038, 0.022, ... 0.025]` (MLP), `[-0.016, 0.007, ... 0.162]` (XGBoost)

### Weighted Ensemble Results

Weighted averaging methods combining MLP and XGBoost **actually reduced performance**. In fact, ensemble R² scores were slightly lower than the standalone models, likely because the ensemble propagated the high variance and noise present in each model rather than mitigating it.

**Key takeaway:** The dataset’s inherent noise and weak signal fundamentally limited model performance. Standalone XGBoost and MLP performed similarly, while weighting or stacking did not provide any tangible benefits.

---

## Next Steps

To improve model performance on this challenging dataset, the following steps are recommended:

1. **Robust and intelligent feature engineering:** Explore transformations, feature interactions, and domain-specific aggregations to extract stronger signals.  
2. **Rework cross-validation and feature selection:** Implement purged and properly grouped sequential folds to prevent information leakage and improve generalization.  
3. **Interaction features:** Introduce more sophisticated interaction terms between variables to capture non-linear dependencies.  
4. **Autoencoder improvements:** Revisit the autoencoder pipeline to prevent potential data leakage, ensuring that encoded features do not inadvertently leak information from the validation set into the training set.  
5. **Alternative model architectures:** Explore ensemble or hybrid approaches carefully designed to handle high-noise, low-signal time series data.