In [None]:
import os
from pathlib import Path

# 1. Find the repo root dynamically.
# The repo root is taken to be the first folder that contains a README.md file.
# Ancestors of the working directory are checked first, nearest to farthest,
# which is what this cell has always done. Path.parents never contains the
# working directory itself, so it is appended as a last candidate: that way a
# kernel started directly in the repo root still resolves instead of failing.
_cwd = Path.cwd()
_candidates = (*_cwd.parents, _cwd)
_root = next((p for p in _candidates if (p / "README.md").exists()), None)
if _root is None:
    # Fail with an actionable message rather than a bare StopIteration.
    raise FileNotFoundError(
        f"Could not locate the repo root: no README.md found in {_cwd} "
        "or in any of its parent folders."
    )
REPO_ROOT = str(_root)

# 2. Add to sys.path so standard 'import' statements work.
# Guarded so that re-running the cell does not insert duplicate entries.
import sys
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

print(f"Repo root identified as: {REPO_ROOT}")

In [None]:
############### RUN DATA PREPROCESSING ###############

# Execute the preprocessing script that lives under the repo root, using
# IPython's %run magic. $REPO_ROOT is expanded from the notebook namespace
# (it is set by the repo-root cell), and the path is quoted because the
# "data preprocessing" folder name contains a space.
# NOTE(review): later cells read x_train / y_train / x_val / y_val / x_test /
# y_test, none of which are defined in this notebook -- presumably this script
# creates them (a plain %run leaves the script's globals in the interactive
# namespace). Confirm against the script.
%run "$REPO_ROOT/run/data preprocessing/model_boilerplate_remote.py"

In [3]:
from sklearn.multioutput import MultiOutputRegressor
from tqdm.auto import tqdm
import xgboost as xgb
import joblib

# Train one random-forest regressor per target column. The target columns are
# processed in consecutive slices of `batch_size`; every slice gets its own
# MultiOutputRegressor, and the fitted wrappers are collected in `all_models`
# in column order.
batch_size = 1000
n_targets = y_train.shape[1]
total_batches = -(-n_targets // batch_size)  # ceiling division

all_models = []

# Hyper-parameters shared by every per-target forest.
forest_params = dict(
    objective='reg:squarederror',
    random_state=888,  # same seed as the RNN and MLR runs
    n_estimators=3,    # 3 trees per target (~48k trees for ~16k targets)
    n_jobs=-1,         # all available cores (100 on the original machine)
    verbosity=0,
)

# Progress is counted in targets, not batches, so the bar total is n_targets.
bar_options = dict(
    total=n_targets,
    desc="Training XGBRF",
    unit="targets",
    bar_format="{l_bar}{bar} | {n_fmt}/{total_fmt} targets [{elapsed}<{remaining}]",
)

with tqdm(**bar_options) as progress:
    for col_lo in range(0, n_targets, batch_size):
        # The last slice may be shorter than batch_size.
        col_hi = min(col_lo + batch_size, n_targets)
        target_slice = y_train[:, col_lo:col_hi]

        est = xgb.XGBRFRegressor(**forest_params)
        model_batch = MultiOutputRegressor(est)
        model_batch.fit(x_train, target_slice)
        all_models.append(model_batch)

        # Advance by the number of targets just fitted and show the batch count.
        progress.update(col_hi - col_lo)
        progress.set_postfix({'batch': f"{len(all_models)}/{total_batches}"})

print("✓ Training completed!")

Training XGBRF:   0%|           | 0/16100 targets [00:00<?]

✓ Training completed!


In [None]:
#Per‑target validation R² scores (centered data training):
#Target 0: 0.9969
#Target 1: 0.7994
#Target 2: 0.8078
#Target 3: 0.5034
#Target 4: 0.6837
#Target 5: 0.8751
#Target 6: 0.8303
#Target 7: 0.7810
#Target 8: 0.8561
#Target 9: 0.8472
#Target 10: 0.6558
#Target 11: 0.8931
#Target 12: 0.4154
#Target 13: 0.8671
#Target 14: 0.7356
#Target 15: 0.7917
#Target 16: 0.8358
#Target 17: 0.7796
#Target 18: 0.8110
#Target 19: 0.8148
#Target 20: 0.7952
#Target 21: 0.6643
#Target 22: 0.8922
#Target 23: 0.7458
#...
#Target 16099: 0.8649
#Target 16100: 0.8747


#Per‑target validation R² scores (presumably the uncentered-data run — these values match the cell output below; confirm):
#Target 0: 0.9990
#Target 1: 0.8177
#Target 2: 0.6591
#Target 3: 0.2847
#Target 4: 0.7026
#Target 5: 0.8190
#Target 6: 0.8588
#Target 7: 0.7312
#Target 8: 0.8396
#Target 9: 0.8450
#Target 10: 0.6959
#Target 11: 0.8709
#Target 12: 0.4973
#Target 13: 0.8404
#Target 14: 0.7303
#Target 15: 0.8287
#Target 16: 0.8188
#Target 17: 0.7240
#Target 18: 0.8077
#Target 19: 0.7831
#Target 20: 0.8151
#Target 21: 0.6918
#Target 22: 0.8908
#Target 23: 0.7561
#...
#Target 16100: 0.8525

#Aggregate test R²: 0.7657
#Aggregate test Pearson's R: 0.9557


from sklearn.metrics import r2_score
import numpy as np

def _predict_all_targets(batch_models, features):
    """Predict every target column by stitching per-batch predictions together.

    Each fitted batch model predicts its own slice of target columns; the
    slices are concatenated along axis 1 in the order the models are stored,
    giving an array of shape (n_samples, n_targets).
    """
    return np.concatenate([m.predict(features) for m in batch_models], axis=1)


# Get predictions from all batch models. The per-batch lists are not kept
# around, so only one copy of each prediction matrix stays in memory.
val_pred = _predict_all_targets(all_models, x_val)    # shape: (n_samples, n_targets)
test_pred = _predict_all_targets(all_models, x_test)  # shape: (n_samples, n_targets)

# Fail early, with a readable message, if predictions and labels disagree.
assert val_pred.shape == y_val.shape, (
    f"val_pred has shape {val_pred.shape} but y_val has shape {y_val.shape}"
)

# per‑target validation R²
# A single call with multioutput='raw_values' returns one R² per column,
# replacing one r2_score call per target.
val_r2_per_target = r2_score(y_val, val_pred, multioutput='raw_values')
print("Per‑target validation R² scores:")
for i, val_r2 in enumerate(val_r2_per_target):
    print(f"Target {i}: {val_r2:.4f}")

# aggregate test R²
# With 2-D inputs r2_score defaults to the unweighted mean of the per-target R².
test_r2 = r2_score(y_test, test_pred)
print(f"\nAggregate test R²: {test_r2:.4f}")

# aggregate test Pearson's R
# Correlation over all (sample, target) pairs pooled together; np.ravel avoids
# the full copies that .flatten() would make of both matrices.
test_r_flat = np.corrcoef(np.ravel(y_test), np.ravel(test_pred))[0, 1]
print(f"Aggregate test Pearson's R: {test_r_flat:.4f}")

Per‑target validation R² scores:
Target 0: 0.9990
Target 1: 0.8177
Target 2: 0.6591
Target 3: 0.2847
Target 4: 0.7026
Target 5: 0.8190
Target 6: 0.8588
Target 7: 0.7312
Target 8: 0.8396
Target 9: 0.8450
Target 10: 0.6959
Target 11: 0.8709
Target 12: 0.4973
Target 13: 0.8404
Target 14: 0.7303
Target 15: 0.8287
Target 16: 0.8188
Target 17: 0.7240
Target 18: 0.8077
Target 19: 0.7831
Target 20: 0.8151
Target 21: 0.6918
Target 22: 0.8908
Target 23: 0.7561
Target 24: 0.7245
Target 25: 0.5431
Target 26: 0.7237
Target 27: 0.8188
Target 28: 0.7394
Target 29: 0.4825
Target 30: 0.8872
Target 31: 0.7275
Target 32: 0.8357
Target 33: 0.8227
Target 34: 0.7246
Target 35: 0.7673
Target 36: 0.8816
Target 37: 0.8732
Target 38: 0.6180
Target 39: 0.7400
Target 40: 0.7888
Target 41: 0.8380
Target 42: 0.7364
Target 43: 0.8523
Target 44: 0.8343
Target 45: 0.8481
Target 46: 0.7780
Target 47: 0.7595
Target 48: 0.7653
Target 49: 0.8900
Target 50: 0.7771
Target 51: 0.8718
Target 52: 0.8492
Target 53: 0.5860
Targe

In [4]:
# saving models and metadata

import joblib
import json
import os

# Every artifact of this cell is written below save_dir, which is resolved
# relative to the kernel's working directory and created on demand.
save_dir = 'saved_models'
os.makedirs(save_dir, exist_ok=True)

# Each save is best-effort: a failure is reported with a printed message and
# the cell carries on with the next artifact.

# Batched models list (always expected to exist at this point).
all_models_path = os.path.join(save_dir, 'all_models_batch_XGBRF[uncentered_REALFINAL].joblib')
try:
    joblib.dump(all_models, all_models_path, compress=9)
    print(f"Saved all_models -> {all_models_path}")
except Exception as err:
    print(f"Could not save all_models: {err}")

# Other model objects are saved only when they exist in the notebook namespace.
if 'model' in globals():
    model_path = os.path.join(save_dir, 'model_multioutput_v4_[uncentered_REALFINAL].joblib')
    try:
        joblib.dump(model, model_path, compress=9)
        print(f"Saved model -> {model_path}")
    except Exception as err:
        print(f"Could not save model: {err}")

if 'final_model' in globals():
    final_model_path = os.path.join(save_dir, 'final_model_optuna[uncentered_FINAL].joblib')
    try:
        joblib.dump(final_model, final_model_path, compress=9)
        print(f"Saved final_model -> {final_model_path}")
    except Exception as err:
        print(f"Could not save final_model: {err}")

# Library versions used for this run, stored next to the models as JSON.
versions_path = os.path.join(save_dir, 'model_v4_versions_uncentered_FINAL.json')
try:
    # xgboost is recorded as None when `xgb` was never imported in this kernel.
    xgb_version = xgb.__version__ if 'xgb' in globals() else None
    versions = {
        'xgboost': xgb_version,
        'scikit-learn': __import__('sklearn').__version__,
        'joblib': joblib.__version__,
    }
    with open(versions_path, 'w') as fh:
        json.dump(versions, fh)
    print(f"Saved versions -> {versions_path}")
except Exception as err:
    print(f"Could not save versions metadata: {err}")

Saved all_models -> saved_models/all_models_batch_XGBRF[uncentered_REALFINAL].joblib
Saved versions -> saved_models/model_v4_versions_uncentered_FINAL.json
