<a href="https://colab.research.google.com/github/DavidGTeklea/BigIdeasFinal/blob/main/JustRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---- 0) Set a data folder ----
import os, glob
DATA_DIR = "/content/data/usacc"       # choose any folder you like
os.makedirs(DATA_DIR, exist_ok=True)

# ---- 1) Install Kaggle CLI ----
!pip -q install -U kaggle

# ---- 2) Upload kaggle.json (from your computer) ----
from google.colab import files
uploaded = files.upload()  # select kaggle.json
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# ---- 3) Download + unzip ----
!kaggle datasets download -d sobhanmoosavi/us-accidents -p "{DATA_DIR}" -o
!unzip -o "{DATA_DIR}/us-accidents.zip" -d "{DATA_DIR}"

# ---- 4) Pick the main CSV (largest file) ----
csvs = glob.glob(os.path.join(DATA_DIR, "*.csv"))
CSV_PATH = max(csvs, key=os.path.getsize)
print("Selected CSV_PATH:", CSV_PATH)

# ---- 5) Wire to your pipeline ----
USACC_PATH = CSV_PATH          # or: RAW_CSV = CSV_PATH
print("USACC_PATH set for builder:", USACC_PATH)

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents
License(s): CC-BY-NC-SA-4.0
Downloading us-accidents.zip to /content/data/usacc
 99% 649M/653M [00:10<00:00, 70.6MB/s]
100% 653M/653M [00:10<00:00, 65.4MB/s]
Archive:  /content/data/usacc/us-accidents.zip
  inflating: /content/data/usacc/US_Accidents_March23.csv  
Selected CSV_PATH: /content/data/usacc/US_Accidents_March23.csv
USACC_PATH set for builder: /content/data/usacc/US_Accidents_March23.csv


In [19]:
# --- one-time install for Parquet ---
!pip -q install pyarrow

import numpy as np, pandas as pd, pyarrow as pa, pyarrow.parquet as pq
from pathlib import Path

USACC_PATH = CSV_PATH  # your existing variable (raw CSV)

RAW_CSV = globals().get("USACC_PATH", None)
if RAW_CSV is None:
    raise RuntimeError("Set USACC_PATH (or RAW_CSV) to the raw CSV path first (kagglehub/CLI).")

OUT_PARQUET = "/content/US_Accidents_duration_fast.parquet"
SUBSET_FRAC = 0.02
CHUNK = 200_000

# We do NOT read End_* coords or Distance (leaky); we read End_Time only to make the label
USECOLS = [
    "ID", "Source",
    "Start_Time", "End_Time",
    "Start_Lat", "Start_Lng",
    "Street", "City", "County", "State", "Zipcode", "Country", "Timezone",
    "Weather_Timestamp",
    "Weather_Condition", "Temperature(F)", "Humidity(%)", "Visibility(mi)",
    "Wind_Speed(mph)", "Wind_Chill(F)", "Pressure(in)", "Precipitation(in)",
    "Sunrise_Sunset", "Amenity", "Bump", "Crossing", "Give_Way", "Junction",
    "No_Exit", "Railway", "Roundabout", "Station", "Stop",
    "Traffic_Calming", "Traffic_Signal", "Turning_Loop",
    # NOTE: not reading Description, End_Lat, End_Lng, Distance(mi)
]

WEATHER_COLS = [
    "Temperature(F)","Wind_Chill(F)","Humidity(%)","Pressure(in)",
    "Visibility(mi)","Wind_Speed(mph)","Precipitation(in)","Weather_Condition"
]

def build_fast_sample_duration(csv_path: str, out_parquet: str, frac: float = 0.10,
                               seed: int = 2025, chunksize: int = 200_000):
    rng = np.random.RandomState(seed)
    writer, total_in, total_kept = None, 0, 0

    for chunk in pd.read_csv(csv_path, usecols=USECOLS, chunksize=chunksize, low_memory=True):
        total_in += len(chunk)
        chunk = chunk.sort_values("Start_Time")  # Sort each chunk for timeseriessplit later

        # --- times
        st = pd.to_datetime(chunk["Start_Time"], errors="coerce", utc=True)
        et = pd.to_datetime(chunk["End_Time"],   errors="coerce", utc=True)
        wt = pd.to_datetime(chunk.get("Weather_Timestamp", pd.Series(pd.NaT, index=chunk.index)),
                            errors="coerce", utc=True)

        # --- duration (min); require positive & finite
        dur = (et - st).dt.total_seconds() / 60.0
        m = np.isfinite(dur) & (dur > 0)
        if not m.any():
            continue
        chunk = chunk.loc[m].copy()
        dur = dur[m]
        st, et, wt = st[m], et[m], wt[m]

        # --- clamp tails per chunk to stabilize training
        lo, hi = np.nanpercentile(dur, [1, 99])
        chunk["duration_min"] = np.clip(dur, lo, hi)

        # --- weather leakage guard
        mask_ok = wt.notna() & st.notna() & (wt <= st)
        if WEATHER_COLS:
            for c in WEATHER_COLS:
                if c in chunk.columns:
                    chunk.loc[~mask_ok, c] = np.nan
        lag = (st - wt).dt.total_seconds() / 60.0
        chunk["weather_lag_min"] = np.where(mask_ok, lag, np.nan)

        # --- start-time engineered parts for temporal splitting / features
        chunk["start_hour"]  = st.dt.hour.astype("Int64")
        chunk["start_wday"]  = st.dt.dayofweek.astype("Int64")
        chunk["start_month"] = st.dt.month.astype("Int64")
        chunk["start_year"]  = st.dt.year.astype("Int64")

        # --- uniform downsample
        keep = rng.rand(len(chunk)) < frac
        if not keep.any():
            continue
        sm = chunk.loc[keep].copy()

        # --- drop definite leakers / non-predictive keys before writing
        sm.drop(columns=[c for c in [
            #"Start_Time",          # keep derived parts instead
            "End_Time",            # used only for label
            "Weather_Timestamp",   # used only for guard
            "ID"                   # row key; can spuriously help
        ] if c in sm.columns], inplace=True)

        # write → Parquet
        tbl = pa.Table.from_pandas(sm, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(out_parquet, tbl.schema)
        writer.write_table(tbl)
        total_kept += len(sm)

    if writer is not None:
        writer.close()
    print(f"Stream-sampled {total_kept:,} rows out of ~{total_in:,} into {out_parquet}")

build_fast_sample_duration(RAW_CSV, OUT_PARQUET, frac=SUBSET_FRAC, seed=2025, chunksize=CHUNK)

Stream-sampled 134,541 rows out of ~7,728,394 into /content/US_Accidents_duration_fast.parquet


In [3]:
# Create a real module on disk so workers can import it
from textwrap import dedent
with open("cat_tokens.py", "w") as f:
    f.write(dedent("""
    from sklearn.base import BaseEstimator, TransformerMixin

    class CatToTokens(BaseEstimator, TransformerMixin):
        \"\"\"Turn categorical frame/array into list-of-tokens per row.
        Tokens are 'col=value' to reduce collisions for FeatureHasher.\"\"\"
        def __init__(self, prefix=True):
            self.prefix = prefix
            self.columns_ = None

        def fit(self, X, y=None):
            if hasattr(X, "columns"):
                self.columns_ = [str(c) for c in X.columns]
            else:
                n_cols = X.shape[1] if hasattr(X, "shape") else len(X[0])
                self.columns_ = [f"c{i}" for i in range(int(n_cols))]
            return self

        def transform(self, X):
            if hasattr(X, "to_numpy"):
                arr = X.astype("U").to_numpy()
            else:
                arr = X.astype("U")
            cols = self.columns_
            if self.prefix:
                return [[f"{c}={v}" for c, v in zip(cols, row)] for row in arr]
            else:
                return [list(row) for row in arr]
    """))


In [20]:
# =================== Regression prep (Duration + Hashing + MaxAbs + Caching) ===================
import os, glob, time, json, pathlib
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from joblib import Memory
import importlib, pyarrow.parquet as pq

# --------- Config ----------
SEED_SPLIT   = 2025
NJOBS        = -1                     # use all cores
DEVELOPMENT_MODE = True               # False for full run
DEV_SAMPLE_FRAC  = 0.10               # 10% during dev
BASE_OUT = Path("results/usaccidents_regression"); BASE_OUT.mkdir(parents=True, exist_ok=True)

# This must point to the DURATION parquet produced by build_fast_sample_duration(...)
USACC_PARQUET_FAST = OUT_PARQUET

# --------- Fresh cache dir (so nothing stale is reused) ----------
import shutil
shutil.rmtree("cache_usacc_reg_hash_tok_v3", ignore_errors=True)
CACHE_DIR = Path("cache_usacc_reg_hash_tok_v3"); CACHE_DIR.mkdir(exist_ok=True)
memory = Memory(location=str(CACHE_DIR), verbose=0)

# --------- Import the picklable transformer from the module you wrote in Step 1 ----------
importlib.invalidate_caches()
from cat_tokens import CatToTokens

from sklearn.preprocessing import MaxAbsScaler, FunctionTransformer
from sklearn.feature_extraction import FeatureHasher
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --------- Read Parquet (schema-only, then needed columns) ----------
print("Parquet path:", USACC_PARQUET_FAST)
pf = pq.ParquetFile(USACC_PARQUET_FAST)
df_columns = pf.schema.names
n_rows = pf.metadata.num_rows
print(f"Parquet rows: {n_rows:,} | columns: {len(df_columns)}")

LEAKY  = ["End_Time","End_Lat","End_Lng","Distance(mi)","Weather_Timestamp","Description","ID","Airport_Code"]
TARGET = "duration_min"
if TARGET not in df_columns:
    raise RuntimeError(
        f"'duration_min' not found in {USACC_PARQUET_FAST}. "
        "Point to the duration parquet (not onset) or rebuild the duration builder."
    )

needed_columns = [c for c in df_columns if c not in LEAKY]
print(f"Reading {len(needed_columns)} columns (skipping {len(LEAKY)} leaky cols)…")
df = pd.read_parquet(USACC_PARQUET_FAST, columns=needed_columns)

# --------- Optional dev sampling ----------
if DEVELOPMENT_MODE:
    df = df.sample(frac=DEV_SAMPLE_FRAC, random_state=SEED_SPLIT)
    print(f"DEV mode: {len(df):,} rows");
else:
    print(f"PROD mode: {len(df):,} rows")

# --------- y / X and temporal split ----------
y = df[TARGET].astype(float).values
X = df.drop(columns=[TARGET]).copy()

req = {"start_year","start_month","start_wday","start_hour"}
if not req.issubset(X.columns):
    raise RuntimeError("Missing start_* parts. Rebuild with the duration builder cell.")


# order_key = (X["start_year"].fillna(-1).astype(int)*10_000
#              + X["start_month"].fillna(-1).astype(int)*100
#              + X["start_wday"].fillna(-1).astype(int)*10
#              + X["start_hour"].fillna(-1).astype(int))
# order = np.argsort(order_key.values)
# cut = int(len(order)*0.80)
# tr_idx, te_idx = order[:cut], order[cut:]

# REPLACE with:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=2)  # Creates 2 folds like your 80/20 split
splits = list(tscv.split(X))
tr_idx, te_idx = splits[-1]

X_train, X_test = X.iloc[tr_idx], X.iloc[te_idx]
y_train, y_test = y[tr_idx], y[te_idx]
print(f"Train: {X_train.shape} | Test: {X_test.shape}")

# --------- Hashing + MaxAbs preprocessor (sparse) and dense variant for MLP ----------
HASH_FEATURES = 1024  # bump to 4096 if RAM allows

def make_hashing_preprocess_reg(X_train, memory=None, dense=False, n_features=HASH_FEATURES):
    cat_cols = [c for c in X_train.columns if X_train[c].dtype == "object"]
    num_cols = [c for c in X_train.columns if c not in cat_cols]

    num_pipe = Pipeline([("imp", SimpleImputer(strategy="median"))], memory=memory)
    cat_pipe = Pipeline([
        ("imp",  SimpleImputer(strategy="constant", fill_value="missing")),
        ("tok",  CatToTokens(prefix=True)),  # <— module class (picklable)
        ("hash", FeatureHasher(n_features=n_features, input_type="string", dtype=np.float32)),
    ], memory=memory)

    preprocess = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ], remainder="drop", sparse_threshold=1.0, n_jobs=NJOBS)

    steps = [("prep", preprocess), ("scale", MaxAbsScaler())]
    if dense:
        steps.append(("to_dense", FunctionTransformer(
            lambda X: X.toarray() if hasattr(X, "toarray") else X, accept_sparse=True
        )))
    return Pipeline(steps, memory=memory)

# ---- Evaluation helper (model-agnostic) ----
import pathlib, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, median_absolute_error, mean_squared_error, r2_score

def evaluate_regression(y_true, y_pred, outdir, label="model", dev_flag=False):
    outdir = pathlib.Path(outdir); outdir.mkdir(parents=True, exist_ok=True)
    mae   = float(mean_absolute_error(y_true, y_pred))
    medae = float(median_absolute_error(y_true, y_pred))
    mse   = float(mean_squared_error(y_true, y_pred)); rmse = float(np.sqrt(mse))
    r2    = float(r2_score(y_true, y_pred))
    mape  = float(np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), 1e-6, None))) * 100.0)

    res = pd.DataFrame({"y_true": y_true, "y_pred": y_pred, "resid": y_true - y_pred})
    res.to_csv(outdir / f"residuals_{label}.csv", index=False)

    plt.figure(); plt.scatter(res["y_pred"], res["resid"], s=6, alpha=0.5)
    plt.axhline(0, color="k", linestyle="--"); plt.xlabel("Predicted (min)"); plt.ylabel("Residual")
    plt.title(f"Residuals vs Prediction — {label}{' (DEV)' if dev_flag else ''}")
    plt.tight_layout(); plt.savefig(outdir / f"residuals_scatter_{label}.png", dpi=140); plt.close()

    plt.figure(); plt.hist(res["resid"], bins=50)
    plt.xlabel("Residual (min)"); plt.ylabel("Count")
    plt.title(f"Residual Histogram — {label}{' (DEV)' if dev_flag else ''}")
    plt.tight_layout(); plt.savefig(outdir / f"residuals_hist_{label}.png", dpi=140); plt.close()

    return {"mae": mae, "medae": medae, "rmse": rmse, "mape_pct": mape, "r2": r2}


shared_reg       = make_hashing_preprocess_reg(X_train, memory, dense=False, n_features=HASH_FEATURES)
shared_reg_dense = make_hashing_preprocess_reg(X_train, memory, dense=True,  n_features=HASH_FEATURES)  # for MLP

print("\nReady objects:")
print(" - X_train, y_train, X_test, y_test")
print(" - shared_reg       (sparse; hashing + MaxAbs)")
print(" - shared_reg_dense (dense; for MLP)")
print(" - BASE_OUT, DEVELOPMENT_MODE, HASH_FEATURES, NJOBS, CACHE_DIR")

Parquet path: /content/US_Accidents_duration_fast.parquet
Parquet rows: 134,541 | columns: 39
Reading 39 columns (skipping 8 leaky cols)…
DEV mode: 13,454 rows
Train: (8970, 38) | Test: (4484, 38)

Ready objects:
 - X_train, y_train, X_test, y_test
 - shared_reg       (sparse; hashing + MaxAbs)
 - shared_reg_dense (dense; for MLP)
 - BASE_OUT, DEVELOPMENT_MODE, HASH_FEATURES, NJOBS, CACHE_DIR


In [21]:
## MLP Regressor
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.base import clone
import numpy as np

# Dense preprocessor for MLP
shared_reg_dense = make_hashing_preprocess_reg(X_train, memory, dense=True, n_features=HASH_FEATURES)

base_mlp = MLPRegressor(
    hidden_layer_sizes=(64, 32),
    activation="relu",
    solver="sgd",
    learning_rate="adaptive",
    learning_rate_init=0.01,
    momentum=0.0,
    nesterovs_momentum=False,
    alpha=1e-4,
    batch_size=256,
    early_stopping=False,   # IMPORTANT during halving
    max_iter=32,            # will be increased by halving
    random_state=42,
)

wrapped = TransformedTargetRegressor(
    regressor=Pipeline([("shared", shared_reg_dense), ("mlp", base_mlp)]),
    func=np.log1p, inverse_func=np.expm1
)

param_grid = {
    "regressor__mlp__hidden_layer_sizes": [(64,)],
    "regressor__mlp__learning_rate_init": [0.01, 0.005],
    "regressor__mlp__alpha": [1e-3, 1e-2],
    "regressor__mlp__batch_size": [256],
}

inner_cv = TimeSeriesSplit(n_splits=3)
cv = inner_cv
hgs = HalvingGridSearchCV(
    estimator=wrapped,
    param_grid=param_grid,
    resource="regressor__mlp__max_iter",
    min_resources=64, max_resources=512, factor=2,
    scoring="neg_mean_absolute_error",
    cv=cv, n_jobs=NJOBS, verbose=1, refit=True, error_score="raise"
)

hgs.fit(X_train, y_train)
print("Best params:", hgs.best_params_, "Best CV MAE:", -hgs.best_score_)

# Evaluate the refit-from-halving model
pred_refit = hgs.predict(X_test)
metrics_refit = evaluate_regression(y_test, pred_refit, BASE_OUT, label="mlp_halving_refit")
print("Refit metrics:", metrics_refit)

# Optional: final polish with early stopping ON
final = clone(hgs.best_estimator_)
final.set_params(
    **{"regressor__mlp__early_stopping": True,
       "regressor__mlp__validation_fraction": 0.1,
       "regressor__mlp__max_iter": 1000}
)
final.fit(X_train, y_train)
pred_final = final.predict(X_test)
metrics_final = evaluate_regression(y_test, pred_final, BASE_OUT, label="mlp_halving_ES")
print("Final (ES) metrics:", metrics_final)

train_median = float(np.median(y_train))
baseline = evaluate_regression(y_test, np.full_like(y_test, train_median), BASE_OUT,
                               label="baseline_median", dev_flag=DEVELOPMENT_MODE)
print({"train_median_min": train_median, **baseline})


n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 4
min_resources_: 64
max_resources_: 512
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 4
n_resources: 64
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 1
n_candidates: 2
n_resources: 128
Fitting 3 folds for each of 2 candidates, totalling 6 fits
----------
iter: 2
n_candidates: 1
n_resources: 256
Fitting 3 folds for each of 1 candidates, totalling 3 fits




Best params: {'regressor__mlp__alpha': 0.01, 'regressor__mlp__batch_size': 256, 'regressor__mlp__hidden_layer_sizes': (64,), 'regressor__mlp__learning_rate_init': 0.01, 'regressor__mlp__max_iter': 256} Best CV MAE: 64.27844505370088
Refit metrics: {'mae': 60.78642139157643, 'medae': 24.67716911597712, 'rmse': 131.83372298934614, 'mape_pct': 64.02865468498165, 'r2': 0.07688712982677248}
Final (ES) metrics: {'mae': 60.31459091372482, 'medae': 23.521524682205467, 'rmse': 131.98641801732037, 'mape_pct': 64.19897343725678, 'r2': 0.07474751983905725}
{'train_median_min': 60.0, 'mae': 67.88908742194471, 'medae': 30.283333333333335, 'rmse': 144.614580133217, 'mape_pct': 61.482678500772394, 'r2': -0.1107745854854012}


In [22]:
# DT Regressor
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor

# pipeline: hashing+MaxAbs (sparse) -> DecisionTree
dt = DecisionTreeRegressor(random_state=42)

pipe_dt = TransformedTargetRegressor(
    regressor=Pipeline([("shared", shared_reg), ("dt", dt)]),
    func=np.log1p, inverse_func=np.expm1
)

# Keep the grid tiny but targeted (NO max_depth here; halving controls it)
param_grid = {
    "regressor__dt__criterion": ["absolute_error", "poisson"],  # robust losses for skewed durations
    "regressor__dt__min_samples_leaf": [100, 300],
    "regressor__dt__ccp_alpha": [0.0, 1e-4],      # light pruning
}

inner_cv = TimeSeriesSplit(n_splits=3)
cv = inner_cv

hgs_dt = HalvingGridSearchCV(
    estimator=pipe_dt,
    param_grid=param_grid,
    resource="regressor__dt__max_leaf_nodes",  # capacity budget
    min_resources=64,
    max_resources=512,
    factor=2,
    scoring="neg_mean_absolute_error",
    cv=cv,
    n_jobs=NJOBS,
    verbose=1,
    refit=True,
)

hgs_dt.fit(X_train, y_train)
print("DT best:", hgs_dt.best_params_)

# Evaluate
pred_dt = hgs_dt.predict(X_test)
metrics_dt = evaluate_regression(y_test, pred_dt, BASE_OUT, label="dt_halving", dev_flag=DEVELOPMENT_MODE)
print(metrics_dt)

# Median baseline (unchanged)
train_median = float(np.median(y_train))
baseline_metrics = evaluate_regression(
    y_test, np.full_like(y_test, train_median), BASE_OUT,
    label="baseline_median", dev_flag=DEVELOPMENT_MODE
)
print({"train_median_min": train_median, **baseline_metrics})

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 64
max_resources_: 512
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 8
n_resources: 64
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 1
n_candidates: 4
n_resources: 128
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 2
n_candidates: 2
n_resources: 256
Fitting 3 folds for each of 2 candidates, totalling 6 fits
----------
iter: 3
n_candidates: 1
n_resources: 512
Fitting 3 folds for each of 1 candidates, totalling 3 fits
DT best: {'regressor__dt__ccp_alpha': 0.0, 'regressor__dt__criterion': 'absolute_error', 'regressor__dt__min_samples_leaf': 100, 'regressor__dt__max_leaf_nodes': 512}
{'mae': 41.10987378718458, 'medae': 15.0, 'rmse': 116.09245103109221, 'mape_pct': 45.10542996030805, 'r2': 0.28417026709634874}
{'train_median_min': 60.0, 'mae': 67.88908742194471, 'medae': 30.283333333333335, 'rmse': 144.614580133217, 'mape_p

In [23]:
#Linear SVR
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.svm import LinearSVR
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
import numpy as np

inner_cv = TimeSeriesSplit(n_splits=3)
cv = inner_cv

lin_svr = LinearSVR(
    loss="squared_epsilon_insensitive",  # <-- FIX
    dual=False,                          # keep primal solver
    max_iter=64,                         # resource; halving will grow this
    random_state=42
)

pipe_lin = TransformedTargetRegressor(
    regressor=Pipeline([("shared", shared_reg), ("svm", lin_svr)]),
    func=np.log1p, inverse_func=np.expm1
)

grid_lin = {
    "regressor__svm__C":       [1e-2, 1e-1, 1, 10],
    "regressor__svm__epsilon": [0.05, 0.1, 0.2],
}

hgs_lin = HalvingGridSearchCV(
    estimator=pipe_lin,
    param_grid=grid_lin,
    resource="regressor__svm__max_iter",
    min_resources=64, max_resources=512, factor=3,
    scoring="neg_mean_absolute_error",
    cv=cv, n_jobs=NJOBS, verbose=1, refit=True
)

hgs_lin.fit(X_train, y_train)
lin_pred = hgs_lin.predict(X_test)
lin_metrics = evaluate_regression(y_test, lin_pred, BASE_OUT, label="svm_linear", dev_flag=DEVELOPMENT_MODE)
print({"svm_linear_best": hgs_lin.best_params_, **lin_metrics})


n_iterations: 2
n_required_iterations: 3
n_possible_iterations: 2
min_resources_: 64
max_resources_: 512
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 64
Fitting 3 folds for each of 12 candidates, totalling 36 fits
----------
iter: 1
n_candidates: 4
n_resources: 192
Fitting 3 folds for each of 4 candidates, totalling 12 fits
{'svm_linear_best': {'regressor__svm__C': 0.01, 'regressor__svm__epsilon': 0.05, 'regressor__svm__max_iter': 192}, 'mae': 60.30738537453757, 'medae': 24.179680592225296, 'rmse': 131.53522223840793, 'mape_pct': 65.27591639400785, 'r2': 0.08106266133295348}


In [27]:
# RBF kernel, Linear SVR
from sklearn.svm import SVR

inner_cv = TimeSeriesSplit(n_splits=3)
cv = inner_cv

rbf_svr = SVR(
    kernel="rbf",  # True RBF kernel
    max_iter=64,   # Keep halving resource
    cache_size=200  # MB for kernel cache
)

pipe_rbf = TransformedTargetRegressor(
    regressor=Pipeline([
        ("shared", shared_reg),
        ("svm", rbf_svr),
    ]),
    func=np.log1p, inverse_func=np.expm1
)

grid_rbf = {
    "regressor__svm__C":         [1e-2, 1e-1, 1, 10],
    "regressor__svm__epsilon":   [0.05, 0.1],
    "regressor__svm__gamma":     [1e-3, 1e-2, 1e-1],  # coarse log grid
}

hgs_rbf = HalvingGridSearchCV(
    estimator=pipe_rbf,
    param_grid=grid_rbf,
    resource="regressor__svm__max_iter",
    min_resources=64, max_resources=512, factor=3,
    scoring="neg_mean_absolute_error",
    cv=cv, n_jobs=NJOBS, verbose=1, refit=True
)

hgs_rbf.fit(X_train, y_train)
rbf_pred = hgs_rbf.predict(X_test)
rbf_metrics = evaluate_regression(y_test, rbf_pred, BASE_OUT, label="svm_rbf_nystroem", dev_flag=DEVELOPMENT_MODE)
print({"svm_rbf_best": hgs_rbf.best_params_, **rbf_metrics})

n_iterations: 2
n_required_iterations: 3
n_possible_iterations: 2
min_resources_: 64
max_resources_: 512
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 24
n_resources: 64
Fitting 3 folds for each of 24 candidates, totalling 72 fits
----------
iter: 1
n_candidates: 8
n_resources: 192
Fitting 3 folds for each of 8 candidates, totalling 24 fits




{'svm_rbf_best': {'regressor__svm__C': 10, 'regressor__svm__epsilon': 0.05, 'regressor__svm__gamma': 0.001, 'regressor__svm__max_iter': 192}, 'mae': 64.16335475616454, 'medae': 30.451771918099155, 'rmse': 137.52876635964827, 'mape_pct': 67.62373098952742, 'r2': -0.004590038170670496}


In [26]:
# Knn regressor

from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.base import clone
import numpy as np

inner_cv = TimeSeriesSplit(n_splits=3)
cv = inner_cv
# --- Budget guard: k must be < train fold size ---
approx_fold = int(len(X_train) * (inner_cv.get_n_splits()-1)/inner_cv.get_n_splits())  # ~ size of each training fold

K_MIN = 64
K_MAX_CAP = 256          # <-- our cap to avoid oversmoothing (was 512)
# k must be < training fold size; also ensure K_MAX >= K_MIN+1
K_MAX_SAFE = max(K_MIN + 1, min(K_MAX_CAP, approx_fold - 1))

print(f"KNN halving ladder: n_neighbors from {K_MIN} up to {K_MAX_SAFE} (factor=3)")

# Pipeline: shared sparse preprocessing -> KNN
knn = KNeighborsRegressor(
    algorithm="brute",       # required for sparse + allows many metrics
    metric="minkowski",      # grid will try p=1/2
    p=2,
    weights="distance",
    n_jobs=NJOBS,
)

pipe_knn = TransformedTargetRegressor(
    regressor=Pipeline([("shared", shared_reg), ("knn", knn)]),
    func=np.log1p, inverse_func=np.expm1
)

# Tiny, targeted grid (6 combos max)
param_grid = {
    "regressor__knn__weights": ["uniform", "distance"],
    "regressor__knn__metric": ["minkowski"],  # keep it simple & robust on sparse
    "regressor__knn__p": [1, 2],              # Manhattan vs Euclidean
    # (If you want to try cosine later: add "cosine" to metric; keep algorithm="brute")
}

hgs_knn = HalvingGridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid,
    resource="regressor__knn__n_neighbors",  # MATCHED BUDGET
    min_resources=K_MIN, max_resources=K_MAX_SAFE, factor=3,
    scoring="neg_mean_absolute_error",
    cv=cv, n_jobs=NJOBS, verbose=1, refit=True
)

hgs_knn.fit(X_train, y_train)

# Evaluate refit and (optionally) an ES-style “final” (not needed for KNN)
pred_knn = hgs_knn.predict(X_test)
metrics_knn = evaluate_regression(y_test, pred_knn, BASE_OUT,
                                  label="knn_halving_refit", dev_flag=DEVELOPMENT_MODE)

print({"knn_best_params": hgs_knn.best_params_, **metrics_knn})

# (Optional) print baseline right next to it
train_median = float(np.median(y_train))
baseline = evaluate_regression(y_test, np.full_like(y_test, train_median),
                               BASE_OUT, label="baseline_median", dev_flag=DEVELOPMENT_MODE)
print({"train_median_min": train_median, **baseline})

KNN halving ladder: n_neighbors from 64 up to 256 (factor=3)
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 64
max_resources_: 256
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 4
n_resources: 64
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 1
n_candidates: 2
n_resources: 192
Fitting 3 folds for each of 2 candidates, totalling 6 fits
{'knn_best_params': {'regressor__knn__metric': 'minkowski', 'regressor__knn__p': 1, 'regressor__knn__weights': 'distance', 'regressor__knn__n_neighbors': 192}, 'mae': 61.2234807744056, 'medae': 25.36842232045066, 'rmse': 135.3166683817466, 'mape_pct': 62.567149437542746, 'r2': 0.027466957815660442}
{'train_median_min': 60.0, 'mae': 67.88908742194471, 'medae': 30.283333333333335, 'rmse': 144.614580133217, 'mape_pct': 61.482678500772394, 'r2': -0.1107745854854012}


In [12]:
from sklearn.base import clone

best = hgs_knn.best_estimator_
ks = [128, 160, 192, 224]   # around the winner
results = {}
for k in ks:
    for w in ["distance", "uniform"]:
        m = clone(best)
        m.set_params(**{"regressor__knn__n_neighbors": k,
                        "regressor__knn__weights": w})
        m.fit(X_train, y_train)
        pred = m.predict(X_test)
        results[(k,w)] = evaluate_regression(
            y_test, pred, BASE_OUT, label=f"knn_k{k}_{w}", dev_flag=DEVELOPMENT_MODE)

# quick views
print("Top by RMSE:", sorted(results.items(), key=lambda kv: kv[1]["rmse"])[:3])
print("Top by MAE :", sorted(results.items(), key=lambda kv: kv[1]["mae"])[:3])

Top by RMSE: [((224, 'uniform'), {'mae': 55.43095211679688, 'medae': 28.429933337987237, 'rmse': 119.51176247949664, 'mape_pct': 65.17928170455433, 'r2': -0.030413198391386898}), ((224, 'distance'), {'mae': 55.41548220920418, 'medae': 28.176861991754592, 'rmse': 119.5328904780959, 'mape_pct': 65.11309007661343, 'r2': -0.03077755571531382}), ((192, 'uniform'), {'mae': 55.587651323250284, 'medae': 28.458580469436896, 'rmse': 119.61672797351062, 'mape_pct': 65.3794477875074, 'r2': -0.03222398797243908})]
Top by MAE : [((224, 'distance'), {'mae': 55.41548220920418, 'medae': 28.176861991754592, 'rmse': 119.5328904780959, 'mape_pct': 65.11309007661343, 'r2': -0.03077755571531382}), ((224, 'uniform'), {'mae': 55.43095211679688, 'medae': 28.429933337987237, 'rmse': 119.51176247949664, 'mape_pct': 65.17928170455433, 'r2': -0.030413198391386898}), ((192, 'distance'), {'mae': 55.56678513435276, 'medae': 28.561276742092748, 'rmse': 119.62873508110336, 'mape_pct': 65.31129428944827, 'r2': -0.032431