In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="darkgrid")

In [40]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [41]:
train = pd.read_csv("train.csv")



In [42]:
train.columns

Index(['Month', 'BorrowerRate_mean', 'Treasury_lag1', 'Treasury_lag2',
       'Treasury_lag3', 'Fed_lag1', 'Fed_lag2', 'Fed_lag3'],
      dtype='object')

In [43]:

train = train.sort_values("Month").reset_index(drop=True)

n = len(train)

n

70

In [44]:
lags = range(1,4)
# create borrower lags t-1..t-3(no dropping)
for k in lags:
    train[f"BorrowerRate_lag{k}"] = train["BorrowerRate_mean"].shift(k)

lag_cols = (
    [f"BorrowerRate_lag{k}" for k in lags] +
    [f"Treasury_lag{k}" for k in lags] +
    [f"Fed_lag{k}" for k in lags]
)

# sanity check: how many NaNs in the new borrower lags?
train[[f"BorrowerRate_lag{k}" for k in lags]].isna().sum()

BorrowerRate_lag1    1
BorrowerRate_lag2    2
BorrowerRate_lag3    3
dtype: int64

In [45]:
# We are using a 10-month validation window and 5 folds:
# Fold sizes: train grows 20,30,40,50,60 and val is 10 each time (last val ends at 70)
val_window = 10
n_folds = 5

min_train = 20  
print(f"n={n}, val_window={val_window}, min_train={min_train}, folds={n_folds}")



n=70, val_window=10, min_train=20, folds=5


In [46]:
# Creating the cross validation
cv_splits = []
for i in range(n_folds):
    train_end = min_train + i * val_window
    val_start = train_end
    val_end = val_start + val_window

    tr_idx = np.arange(0, train_end)
    val_idx = np.arange(val_start, val_end)
    cv_splits.append((tr_idx, val_idx))

print("Built folds:", len(cv_splits))


Built folds: 5


In [47]:
# Sanity check
rows = []
for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    rows.append({
        "fold": fold,
        "train_size": len(tr_idx),
        "val_size": len(val_idx),
        "train_start": train.loc[tr_idx[0], "Month"],
        "train_end": train.loc[tr_idx[-1], "Month"],
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
    })

cv_summary = pd.DataFrame(rows)
cv_summary

Unnamed: 0,fold,train_size,val_size,train_start,train_end,val_start,val_end
0,1,20,10,2005-11,2007-06,2007-07,2008-04
1,2,30,10,2005-11,2008-04,2008-05,2009-09
2,3,40,10,2005-11,2009-09,2009-10,2010-07
3,4,50,10,2005-11,2010-07,2010-08,2011-05
4,5,60,10,2005-11,2011-05,2011-06,2012-03


In [48]:
y = train["BorrowerRate_mean"].reset_index(drop=True)


In [49]:



# Naive Model: y_hat[t] = y[t-1] 
fold_rows = []

for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    y_val = y.iloc[val_idx]
    y_hat = y.shift(1).iloc[val_idx]  # naive(1)

    mae = mean_absolute_error(y_val, y_hat)
    rmse = np.sqrt(mean_squared_error(y_val, y_hat))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
        "val_size": len(val_idx)
    })

naive_cv = pd.DataFrame(fold_rows)
naive_cv

Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,val_start,val_end,val_size
0,1,0.004132,0.004499,41.323068,44.986417,2007-07,2008-04,10
1,2,0.008766,0.011167,87.65833,111.665766,2008-05,2009-09,10
2,3,0.008775,0.010409,87.752732,104.092705,2009-10,2010-07,10
3,4,0.013524,0.017855,135.236157,178.547273,2010-08,2011-05,10
4,5,0.007398,0.008459,73.980173,84.588266,2011-06,2012-03,10


In [50]:

naive_summary = naive_cv[["MAE_bps", "RMSE_bps"]].agg(["mean", "std", "min", "max"])
naive_summary

Unnamed: 0,MAE_bps,RMSE_bps
mean,85.190092,104.776085
std,33.790702,48.663321
min,41.323068,44.986417
max,135.236157,178.547273


In [51]:

#Naive Seasonal with Seasonal Lag 3
season_lag = 3

fold_rows = []

for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    y_val = y.iloc[val_idx]
    y_hat = y.shift(season_lag).iloc[val_idx]  # seasonal naive

    
    m = ~y_hat.isna()
    mae = mean_absolute_error(y_val[m], y_hat[m])
    rmse = np.sqrt(mean_squared_error(y_val[m], y_hat[m]))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
        "val_size": len(val_idx)
    })

season_naive_cv = pd.DataFrame(fold_rows)
season_naive_cv

Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,val_start,val_end,val_size
0,1,0.003991,0.005219,39.910048,52.19138,2007-07,2008-04,10
1,2,0.01847,0.021749,184.695621,217.49443,2008-05,2009-09,10
2,3,0.016623,0.01875,166.234311,187.496188,2009-10,2010-07,10
3,4,0.02527,0.028291,252.704838,282.913018,2010-08,2011-05,10
4,5,0.01286,0.014216,128.599381,142.158121,2011-06,2012-03,10


In [52]:
season_naive_summary = season_naive_cv[["MAE_bps", "RMSE_bps"]].agg(["mean", "std", "min", "max"])
season_naive_summary

Unnamed: 0,MAE_bps,RMSE_bps
mean,154.42884,176.450627
std,78.254925,86.251341
min,39.910048,52.19138
max,252.704838,282.913018


In [53]:

#Moving_Average
window = 3

fold_rows = []

for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    # full series up to end of validation (so we can compute rolling using only past)
    y_all = y.copy()

    # Rolling mean using only past values: shift(1) prevents using y_t itself
    y_hat_all = y_all.shift(1).rolling(window=window).mean()

    y_val = y_all.iloc[val_idx]
    y_hat = y_hat_all.iloc[val_idx]

    m = ~y_hat.isna()
    mae = mean_absolute_error(y_val[m], y_hat[m])
    rmse = np.sqrt(mean_squared_error(y_val[m], y_hat[m]))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
        "val_size": int(m.sum())
    })

rolling_cv = pd.DataFrame(fold_rows)
rolling_cv

Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,val_start,val_end,val_size
0,1,0.003628,0.004151,36.278261,41.512537,2007-07,2008-04,10
1,2,0.012591,0.015195,125.912393,151.945409,2008-05,2009-09,10
2,3,0.01167,0.013612,116.698993,136.11757,2009-10,2010-07,10
3,4,0.018411,0.021237,184.105713,212.374555,2010-08,2011-05,10
4,5,0.008278,0.010028,82.776605,100.282906,2011-06,2012-03,10


In [54]:
#Exponential Smoothing
fold_rows = []

for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    y_tr = y.iloc[tr_idx].astype(float)
    y_val = y.iloc[val_idx].astype(float)

    # Holt-Winters (additive trend)
    ets = ExponentialSmoothing(
        y_tr,
        trend="add",
        seasonal=None,          # change to "add" if you want seasonal component
                    # only relevant if seasonal is not None
    ).fit(optimized=True)

    y_hat = ets.forecast(steps=len(val_idx))
    y_hat = pd.Series(y_hat.values, index=y_val.index)

    mae = mean_absolute_error(y_val, y_hat)
    rmse = np.sqrt(mean_squared_error(y_val, y_hat))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
        "val_size": len(val_idx)
    })

ets_cv = pd.DataFrame(fold_rows)
ets_cv

Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,val_start,val_end,val_size
0,1,0.024975,0.027387,249.748015,273.866171,2007-07,2008-04,10
1,2,0.01441,0.016302,144.098136,163.018723,2008-05,2009-09,10
2,3,0.009016,0.010174,90.159714,101.741734,2009-10,2010-07,10
3,4,0.026238,0.029877,262.376738,298.770886,2010-08,2011-05,10
4,5,0.016083,0.019517,160.831176,195.16877,2011-06,2012-03,10


In [55]:
# Ridge Regression


alpha = 1

fold_rows = []
for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    tr = train.iloc[tr_idx].copy()
    va = train.iloc[val_idx].copy()

    # fold-safe drop (does not affect global train or cv_splits)
    tr = tr.dropna(subset=lag_cols + ["BorrowerRate_mean"])
    va = va.dropna(subset=lag_cols + ["BorrowerRate_mean"])

    X_tr = tr[lag_cols].astype(float)
    y_tr = tr["BorrowerRate_mean"].astype(float)
    X_val = va[lag_cols].astype(float)
    y_val = va["BorrowerRate_mean"].astype(float)

    scaler = StandardScaler()
    X_tr_s = scaler.fit_transform(X_tr)
    X_val_s = scaler.transform(X_val)

    model = Ridge(alpha=alpha, random_state=0)
    model.fit(X_tr_s, y_tr)
    y_hat = model.predict(X_val_s)

    mae = mean_absolute_error(y_val, y_hat)
    rmse = np.sqrt(mean_squared_error(y_val, y_hat))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "train_rows_used": len(tr),
        "val_rows_used": len(va),
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
    })

ridge_cv = pd.DataFrame(fold_rows)
ridge_cv

Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,train_rows_used,val_rows_used,val_start,val_end
0,1,0.007019,0.009371,70.185578,93.707854,17,10,2007-07,2008-04
1,2,0.018747,0.023553,187.472503,235.534501,27,10,2008-05,2009-09
2,3,0.007168,0.008702,71.680706,87.018631,37,10,2009-10,2010-07
3,4,0.020842,0.023905,208.421363,239.049272,47,10,2010-08,2011-05
4,5,0.007455,0.010506,74.551713,105.061581,57,10,2011-06,2012-03


In [60]:
# SARIMA
order = (1, 0, 1)
seasonal_order = (0, 0, 0, 0)

fold_rows = []
for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    tr = train.iloc[tr_idx].copy()
    va = train.iloc[val_idx].copy()

    # fold-safe drop (does not affect global train or cv_splits)
    tr = tr.dropna(subset=lag_cols + ["BorrowerRate_mean"])
    va = va.dropna(subset=lag_cols + ["BorrowerRate_mean"])
    
    
    y_tr = tr["BorrowerRate_mean"].astype(float)
    y_val = va["BorrowerRate_mean"].astype(float)



    if len(y_tr) < 10 or len(y_val) == 0:
        fold_rows.append({
            "fold": fold, "MAE": np.nan, "RMSE": np.nan,
            "MAE_bps": np.nan, "RMSE_bps": np.nan,
            "train_rows_used": len(y_tr), "val_rows_used": len(y_val),
            "val_start": train.loc[val_idx[0], "Month"],
            "val_end": train.loc[val_idx[-1], "Month"],
        })
        continue

    model = sm.tsa.statespace.SARIMAX(
        y_tr,
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False
    ).fit(disp=False)

    y_hat = model.forecast(steps=len(y_val))
    y_hat = pd.Series(np.asarray(y_hat), index=y_val.index)

    mae = mean_absolute_error(y_val, y_hat)
    rmse = np.sqrt(mean_squared_error(y_val, y_hat))
    
    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "train_rows_used": len(y_tr),
        "val_rows_used": len(y_val),
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
    })

sarima_cv = pd.DataFrame(fold_rows)
sarima_cv

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,train_rows_used,val_rows_used,val_start,val_end
0,1,0.007129,0.007924,71.292832,79.236789,17,10,2007-07,2008-04
1,2,0.013455,0.018031,134.545098,180.305076,27,10,2008-05,2009-09
2,3,0.012803,0.014194,128.028766,141.938973,37,10,2009-10,2010-07
3,4,0.033039,0.035756,330.388252,357.563812,47,10,2010-08,2011-05
4,5,0.012259,0.014116,122.591169,141.157026,57,10,2011-06,2012-03


In [61]:
# SARIMAX CV 

order = (1, 0, 1)
seasonal_order = (0, 0, 0, 0)

fold_rows = []
for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    tr = train.iloc[tr_idx].copy()
    va = train.iloc[val_idx].copy()

    tr = tr.dropna(subset=lag_cols + ["BorrowerRate_mean"])
    va = va.dropna(subset=lag_cols + ["BorrowerRate_mean"])

    # If val becomes empty (shouldn't with your fold design), skip safely
    if len(tr) < 10 or len(va) == 0:
        fold_rows.append({
            "fold": fold, "MAE": np.nan, "RMSE": np.nan,
            "MAE_bps": np.nan, "RMSE_bps": np.nan,
            "train_rows_used": len(tr), "val_rows_used": len(va),
            "val_start": train.loc[val_idx[0], "Month"],
            "val_end": train.loc[val_idx[-1], "Month"],
        })
        continue

    y_tr = tr["BorrowerRate_mean"].astype(float)
    y_val = va["BorrowerRate_mean"].astype(float)

    X_tr = tr[lag_cols].astype(float)
    X_val = va[lag_cols].astype(float)

    scaler = StandardScaler()
    X_tr_s = scaler.fit_transform(X_tr)   # fit only on fold-train
    X_val_s = scaler.transform(X_val)

    model = sm.tsa.statespace.SARIMAX(
        y_tr,
        exog=X_tr_s,
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False
    ).fit(disp=False)

    y_hat = model.forecast(steps=len(y_val), exog=X_val_s)
    y_hat = pd.Series(np.asarray(y_hat), index=y_val.index)

    mae = mean_absolute_error(y_val, y_hat)
    rmse = np.sqrt(mean_squared_error(y_val, y_hat))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "train_rows_used": len(tr),
        "val_rows_used": len(va),
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
    })

sarimax_cv = pd.DataFrame(fold_rows)
sarimax_cv

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,train_rows_used,val_rows_used,val_start,val_end
0,1,0.031399,0.047715,313.987078,477.148466,17,10,2007-07,2008-04
1,2,0.012388,0.015445,123.881058,154.453971,27,10,2008-05,2009-09
2,3,0.008125,0.009512,81.252244,95.122018,37,10,2009-10,2010-07
3,4,0.0199,0.022944,198.999986,229.439881,47,10,2010-08,2011-05
4,5,0.009061,0.010455,90.60619,104.549108,57,10,2011-06,2012-03


In [66]:
# Gradient Boosting CV  

from sklearn.ensemble import HistGradientBoostingRegressor

# assumes: train, cv_splits, lag_cols already exist
train = train.sort_values("Month").reset_index(drop=True)

fold_rows = []

for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    tr = train.iloc[tr_idx].copy()
    va = train.iloc[val_idx].copy()

    # fold-safe drop (do NOT change global train or cv_splits)
    tr = tr.dropna(subset=lag_cols + ["BorrowerRate_mean"])
    va = va.dropna(subset=lag_cols + ["BorrowerRate_mean"])

    if len(tr) < 15 or len(va) == 0:
        fold_rows.append({
            "fold": fold, "MAE_bps": np.nan, "RMSE_bps": np.nan,
            "train_rows_used": len(tr), "val_rows_used": len(va),
            "val_start": train.loc[val_idx[0], "Month"],
            "val_end": train.loc[val_idx[-1], "Month"],
        })
        continue

    X_tr = tr[lag_cols].astype(float)
    y_tr = tr["BorrowerRate_mean"].astype(float)
    X_val = va[lag_cols].astype(float)
    y_val = va["BorrowerRate_mean"].astype(float)

    # Strong regularization for small n
    gbr = HistGradientBoostingRegressor(
        loss="squared_error",
        learning_rate=0.05,
        max_depth=3,
        max_leaf_nodes=15,
        min_samples_leaf=10,
        l2_regularization=1.0,
        max_iter=1000,
        early_stopping=True,
        validation_fraction=0.2,   # internal early-stopping split FROM tr only
        n_iter_no_change=30,
        random_state=0
    )

    gbr.fit(X_tr, y_tr)
    y_hat = gbr.predict(X_val)

    mae = mean_absolute_error(y_val, y_hat)
    rmse = np.sqrt(mean_squared_error(y_val, y_hat))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "train_rows_used": len(tr),
        "val_rows_used": len(va),
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
    })

gboost_cv = pd.DataFrame(fold_rows)
gboost_cv

Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,train_rows_used,val_rows_used,val_start,val_end
0,1,0.007443,0.008086,74.425595,80.858609,17,10,2007-07,2008-04
1,2,0.010732,0.012878,107.320452,128.775723,27,10,2008-05,2009-09
2,3,0.005971,0.007739,59.707624,77.392771,37,10,2009-10,2010-07
3,4,0.034977,0.037384,349.773891,373.841528,47,10,2010-08,2011-05
4,5,0.014104,0.016885,141.037107,168.84645,57,10,2011-06,2012-03


In [70]:
import xgboost
xgboost.__version__

'2.1.4'

In [72]:
#XGBoost

import xgboost as xgb

fold_rows = []

for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    tr = train.iloc[tr_idx].copy()
    va = train.iloc[val_idx].copy()

    # fold-safe drop
    tr = tr.dropna(subset=lag_cols + ["BorrowerRate_mean"])
    va = va.dropna(subset=lag_cols + ["BorrowerRate_mean"])

    if len(tr) < 15 or len(va) == 0:
        fold_rows.append({
            "fold": fold, "MAE_bps": np.nan, "RMSE_bps": np.nan,
            "train_rows_used": len(tr), "val_rows_used": len(va),
            "val_start": train.loc[val_idx[0], "Month"],
            "val_end": train.loc[val_idx[-1], "Month"],
        })
        continue

    X_tr = tr[lag_cols].astype(float).values
    y_tr = tr["BorrowerRate_mean"].astype(float).values
    X_val = va[lag_cols].astype(float).values
    y_val = va["BorrowerRate_mean"].astype(float).values

    dtr = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)

    params = {
        "objective": "reg:squarederror",
        "eta": 0.02,
        "max_depth": 3,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 10,
        "lambda": 5.0,   # L2 regularization
        "alpha": 0.0,    # L1 regularization
        "seed": 0,
    }

    booster = xgb.train(
        params=params,
        dtrain=dtr,
        num_boost_round=5000,
        evals=[(dval, "val")],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    y_hat = booster.predict(dval)

    mae = mean_absolute_error(y_val, y_hat)
    rmse = np.sqrt(mean_squared_error(y_val, y_hat))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "train_rows_used": len(tr),
        "val_rows_used": len(va),
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
        "best_iteration": booster.best_iteration,
    })

xgb_cv = pd.DataFrame(fold_rows)
xgb_cv

Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,train_rows_used,val_rows_used,val_start,val_end,best_iteration
0,1,0.007542,0.008178,75.421936,81.77661,17,10,2007-07,2008-04,290
1,2,0.011868,0.015284,118.681343,152.835722,27,10,2008-05,2009-09,9
2,3,0.007415,0.009437,74.148485,94.365688,37,10,2009-10,2010-07,276
3,4,0.037039,0.039309,370.387138,393.086456,47,10,2010-08,2011-05,168
4,5,0.012506,0.015309,125.058424,153.089182,57,10,2011-06,2012-03,202


In [74]:
!pip -q install torch

In [76]:

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset




np.random.seed(60)
torch.manual_seed(60)


class TinyMLP(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 12),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(12, 4),
            nn.ReLU(),
            nn.Linear(4, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(-1)

fold_rows = []

for fold, (tr_idx, val_idx) in enumerate(cv_splits, start=1):
    tr = train.iloc[tr_idx].copy()
    va = train.iloc[val_idx].copy()

    # fold-safe drop only inside fold (keeps cv_splits valid)
    tr = tr.dropna(subset=lag_cols + ["BorrowerRate_mean"])
    va = va.dropna(subset=lag_cols + ["BorrowerRate_mean"])

    if len(tr) < 15 or len(va) == 0:
        fold_rows.append({
            "fold": fold, "MAE_bps": np.nan, "RMSE_bps": np.nan,
            "train_rows_used": len(tr), "val_rows_used": len(va),
            "val_start": train.loc[val_idx[0], "Month"],
            "val_end": train.loc[val_idx[-1], "Month"],
        })
        continue

    X_tr = tr[lag_cols].astype(float).values
    y_tr = tr["BorrowerRate_mean"].astype(float).values
    X_val = va[lag_cols].astype(float).values
    y_val = va["BorrowerRate_mean"].astype(float).values

    # Scale using fold-train only (no leakage)
    scaler = StandardScaler()
    X_tr = scaler.fit_transform(X_tr)
    X_val = scaler.transform(X_val)

    # Torch datasets
    X_tr_t = torch.tensor(X_tr, dtype=torch.float32)
    y_tr_t = torch.tensor(y_tr, dtype=torch.float32)
    X_val_t = torch.tensor(X_val, dtype=torch.float32)
    y_val_t = torch.tensor(y_val, dtype=torch.float32)

    loader = DataLoader(TensorDataset(X_tr_t, y_tr_t), batch_size=16, shuffle=True)

    model = TinyMLP(in_dim=X_tr.shape[1]).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-3)
    loss_fn = nn.MSELoss()

    # Early stopping (inside fold): monitor validation loss
    best_loss = float("inf")
    best_state = None
    patience = 40
    bad = 0
    max_epochs = 500

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            opt.step()

        model.eval()
        with torch.no_grad():
            val_pred = model(X_val_t.to(device))
            val_loss = loss_fn(val_pred, y_val_t.to(device)).item()

        if val_loss < best_loss - 1e-8:
            best_loss = val_loss
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    with torch.no_grad():
        y_hat = model(X_val_t.to(device)).detach().cpu().numpy()

    mae = mean_absolute_error(y_val, y_hat)
    rmse = np.sqrt(mean_squared_error(y_val, y_hat))

    fold_rows.append({
        "fold": fold,
        "MAE": mae,
        "RMSE": rmse,
        "MAE_bps": mae * 10000,
        "RMSE_bps": rmse * 10000,
        "train_rows_used": len(tr),
        "val_rows_used": len(va),
        "val_start": train.loc[val_idx[0], "Month"],
        "val_end": train.loc[val_idx[-1], "Month"],
        "epochs_ran": epoch + 1
    })

mlp_cv = pd.DataFrame(fold_rows)
mlp_cv

Unnamed: 0,fold,MAE,RMSE,MAE_bps,RMSE_bps,train_rows_used,val_rows_used,val_start,val_end,epochs_ran
0,1,0.357433,0.373485,3574.331051,3734.849,17,10,2007-07,2008-04,52
1,2,0.081718,0.084963,817.178063,849.62855,27,10,2008-05,2009-09,54
2,3,0.010873,0.012218,108.727771,122.178782,37,10,2009-10,2010-07,194
3,4,0.030903,0.03807,309.027834,380.697536,47,10,2010-08,2011-05,175
4,5,0.008129,0.010837,81.288024,108.373523,57,10,2011-06,2012-03,103


In [78]:
def summarize(df, name):
    return pd.Series({
        "model": name,
        "MAE_bps_mean": df["MAE_bps"].mean(),
        "RMSE_bps_mean": df["RMSE_bps"].mean(),
        "MAE_bps_std": df["MAE_bps"].std(),
        "RMSE_bps_std": df["RMSE_bps"].std(),
    })

summary = pd.DataFrame([
    summarize(naive_cv, "naive"),
    summarize(season_naive_cv, "seasonal_naive"),
    summarize(rolling_cv, "rolling_mean"),
    summarize(ets_cv, "ETS"),
    summarize(ridge_cv, "Ridge"),
    summarize(sarima_cv, "SARIMA"),
    summarize(sarimax_cv, "SARIMAX"),
    summarize(gboost_cv, "GBOOST"),
    summarize(xgb_cv, "XGBOOST"),
    summarize(xgb_cv, "XGBOOST"),
    summarize(mlp_cv, "MLP")

]).sort_values("MAE_bps_mean")

summary

Unnamed: 0,model,MAE_bps_mean,RMSE_bps_mean,MAE_bps_std,RMSE_bps_std
0,naive,85.190092,104.776085,33.790702,48.663321
2,rolling_mean,109.154393,128.446596,54.685853,63.256747
4,Ridge,122.462373,152.074368,69.322322,78.069406
7,GBOOST,146.452934,165.943016,117.91181,122.159866
9,XGBOOST,152.739465,175.030732,123.946811,126.21958
8,XGBOOST,152.739465,175.030732,123.946811,126.21958
1,seasonal_naive,154.42884,176.450627,78.254925,86.251341
5,SARIMA,157.369223,180.040335,99.920025,105.64047
6,SARIMAX,161.745311,212.142689,96.875018,157.420979
3,ETS,181.442756,206.513257,73.088934,80.692981
