# RH/Temp/ETo XGBoost (outside-only)

One model is trained per target, using outside weather/radiation features and a time-based train/test split.

Internal air temperature now comes from the `air_temp_C` column inside `rh_et0.csv` (no `air_temp_c.csv`).


In [66]:
from __future__ import annotations
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Directory holding the input CSVs: the script's own folder when run as a file,
# otherwise (e.g. inside a notebook, where __file__ is undefined) the current
# working directory.
DATA_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path().resolve()
# Lag horizons and rolling-mean window lengths, both expressed in hours.
# build_lagged converts them to row steps using the inferred sampling rate.
HOURS_LAGS = [1, 2, 3, 6, 12, 24]
HOURS_ROLL = [1, 2, 6, 12, 24]
# Fraction of the (time-ordered) rows held out as the test set.
TEST_FRAC = 0.2
# Cap lags/rolls to avoid very long histories (approx <=4h at 10min freq).
# Horizons whose step count exceeds this cap are dropped, so at 10-minute
# sampling the 6 h, 12 h and 24 h entries above produce no features.
MAX_LAG_STEPS = 24
# Name of the parsed timestamp column used throughout the target frames.
TIMESTAMP_COL = "timestamp"
# Random seed passed to every XGBRegressor.
SEED = 42

In [67]:

# Load outside weather.
# The station exports use "-" for a missing reading. Declaring it through
# `na_values` turns it into NaN while parsing, so no DataFrame.replace pass is
# needed afterwards (newer pandas versions flag that pass with a downcasting
# FutureWarning).
rad = pd.read_csv(DATA_DIR / "bet_dagan_radiation.csv", na_values=["-"])
wx = pd.read_csv(DATA_DIR / "bet_dagan_weather.csv", na_values=["-"])
# Columns are renamed by position, so both files must keep their column order.
rc = rad.columns.tolist(); wc = wx.columns.tolist()
rad = rad.rename(columns={
    rc[0]: "station_radiation",
    rc[1]: "datetime",
    rc[2]: "diffuse_radiation_Wm2",
    rc[3]: "global_radiation_Wm2",
    rc[4]: "direct_radiation_Wm2",
})
wx = wx.rename(columns={
    wc[0]: "station_weather",
    wc[1]: "datetime",
    wc[2]: "station_pressure_hpa",
    wc[3]: "relative_humidity_pct",
    wc[4]: "air_temp_C_weather",
    wc[5]: "air_temp_max_C_weather",
    wc[6]: "air_temp_min_C_weather",
    wc[7]: "ground_temp_C",
    wc[8]: "wet_temp_C",
    wc[9]: "wind_dir_deg",
    wc[10]: "gust_dir_deg",
    wc[11]: "wind_speed_ms",
    wc[12]: "wind_speed_1m_max_ms",
    wc[13]: "wind_speed_10m_max_ms",
    wc[14]: "wind_speed_10m_max_end_time",
    wc[15]: "gust_speed_ms",
    wc[16]: "wind_dir_std_deg",
})
# Timestamps in both files are parsed day-first (DD/MM/YYYY ...).
rad["datetime"] = pd.to_datetime(rad["datetime"], dayfirst=True)
wx["datetime"] = pd.to_datetime(wx["datetime"], dayfirst=True)
def _coerce(df, ignore):
    for c in df.columns:
        if c in ignore:
            continue
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df
# Force every measurement column to numeric; identifier/time columns are skipped.
rad = _coerce(rad, ["station_radiation", "datetime"])
wx = _coerce(wx, ["station_weather", "datetime", "wind_speed_10m_max_end_time"])
# Keep only timestamps present in both files, in chronological order.
outside = pd.merge(rad, wx, on="datetime", how="inner").sort_values("datetime")

# Deltas: row-to-row change of the main drivers (first row is NaN).
for col in ["air_temp_C_weather", "relative_humidity_pct", "global_radiation_Wm2", "wind_speed_ms"]:
    outside[f"{col}_diff"] = outside[col].diff()

# Calendar/time features, plus sine/cosine encodings so that the end of a
# day/year sits next to its beginning.
outside["hour"] = outside["datetime"].dt.hour
outside["dayofyear"] = outside["datetime"].dt.dayofyear
outside["weekofyear"] = outside["datetime"].dt.isocalendar().week.astype(int)
outside["dayofyear_sin"] = np.sin(2 * np.pi * outside["dayofyear"] / 365.25)
outside["dayofyear_cos"] = np.cos(2 * np.pi * outside["dayofyear"] / 365.25)
outside["hour_sin"] = np.sin(2 * np.pi * outside["hour"] / 24)
outside["hour_cos"] = np.cos(2 * np.pi * outside["hour"] / 24)

# Interactions (simple products of two drivers).
outside["temp_rad_interaction"] = outside["air_temp_C_weather"] * outside["global_radiation_Wm2"]
outside["humidity_wind_interaction"] = outside["relative_humidity_pct"] * outside["wind_speed_ms"]

# Moisture/thermal derived features.
# Dew point uses a Magnus-form approximation with coefficients _a and _b.
_a, _b = 17.27, 237.7
# Humidity is clipped to [1, 100] % so the logarithm below stays finite.
rh_clip = outside["relative_humidity_pct"].clip(1, 100)
gamma = (_a * outside["air_temp_C_weather"] / (_b + outside["air_temp_C_weather"])) + np.log(rh_clip / 100.0)
outside["dew_point_C"] = (_b * gamma) / (_a - gamma)
# Saturation (es) and actual (ea) vapour pressure; their difference is the
# vapour pressure deficit. Presumably in kPa, as the column name suggests.
# NOTE(review): this expression uses 237.3 while the dew-point formula above
# uses 237.7 -- confirm the mismatch is intentional.
es = 0.6108 * np.exp((17.27 * outside["air_temp_C_weather"]) / (outside["air_temp_C_weather"] + 237.3))
ea = es * rh_clip / 100.0
outside["vpd_kpa"] = es - ea

# Load targets (internal air temp + ETo) from rh_et0.csv
rh_eto = pd.read_csv(DATA_DIR / "rh_et0.csv")
# Strip stray whitespace before parsing; timestamps are day-first and any
# value that cannot be parsed becomes NaT (such rows are kept here).
rh_eto["Date & Time (Summer)"] = rh_eto["Date & Time (Summer)"].astype(str).str.strip()
rh_eto[TIMESTAMP_COL] = pd.to_datetime(rh_eto["Date & Time (Summer)"], dayfirst=True, errors="coerce")
# Normalise header variants to fixed names; the air_temp_C entry is a no-op
# kept for explicitness.
rh_eto = rh_eto.rename(columns={"Eto (mm)": "Eto_mm", "RH%": "Rh_internal", "internal_RH": "Rh_internal", "air_temp_C": "air_temp_C"})
# Non-numeric readings become NaN.
for col in ["Rh_internal", "Eto_mm", "air_temp_C"]:
    rh_eto[col] = pd.to_numeric(rh_eto[col], errors="coerce")
# Keep only the timestamp and the three measurement columns, in time order.
rh_eto = rh_eto[[TIMESTAMP_COL, "Rh_internal", "Eto_mm", "air_temp_C"]].sort_values(TIMESTAMP_COL)


  wx = wx.replace("-", np.nan)


In [68]:
# Outside-only predictor columns (all are columns of the `outside` frame):
# raw station readings, derived moisture features, first differences,
# calendar encodings and interaction terms. Used as the default feature set
# by build_lagged.
outside_vars = [
    "air_temp_C_weather", "relative_humidity_pct", "global_radiation_Wm2",
    "diffuse_radiation_Wm2", "direct_radiation_Wm2", "station_pressure_hpa",
    "wind_speed_ms", "gust_speed_ms", "dew_point_C", "vpd_kpa",
    "air_temp_C_weather_diff", "relative_humidity_pct_diff", "global_radiation_Wm2_diff", "wind_speed_ms_diff",
    "hour", "dayofyear", "weekofyear", "dayofyear_sin", "dayofyear_cos", "hour_sin", "hour_cos",
    "temp_rad_interaction", "humidity_wind_interaction",
]

def merge_target_with_weather(
    target_df: pd.DataFrame,
    target_col: str,
    exclude_start: str | pd.Timestamp = "2025-09-17",
    exclude_end: str | pd.Timestamp = "2025-09-20",
) -> pd.DataFrame:
    """Attach the nearest outside-weather row to every target observation.

    Args:
        target_df: Frame holding ``TIMESTAMP_COL`` and ``target_col``.
        target_col: Name of the target column in ``target_df``.
        exclude_start: Start (inclusive) of a period whose rows are discarded.
        exclude_end: End (exclusive) of that period. The defaults reproduce
            the window that was previously hard-coded (presumably a stretch
            of unreliable measurements -- confirm with the data owner).

    Returns:
        A frame with one row per usable target observation (numeric target,
        valid timestamp, outside the excluded window) and the ``outside``
        feature columns, with a fresh 0..n-1 index.
    """
    left = target_df.copy()
    left[TIMESTAMP_COL] = pd.to_datetime(left[TIMESTAMP_COL], errors="coerce")
    # merge_asof raises on null keys, and upstream parsing may have produced
    # NaT for malformed timestamps, so those rows are dropped before merging.
    left = left.dropna(subset=[TIMESTAMP_COL]).sort_values(TIMESTAMP_COL)

    merged = pd.merge_asof(
        left,
        outside.sort_values("datetime"),
        left_on=TIMESTAMP_COL,
        right_on="datetime",
        direction="nearest",
        tolerance=pd.Timedelta("15min"),
    )
    merged = merged.drop(columns=["datetime"], errors="ignore")

    window_start = pd.Timestamp(exclude_start)
    window_end = pd.Timestamp(exclude_end)
    in_window = (merged[TIMESTAMP_COL] >= window_start) & (merged[TIMESTAMP_COL] < window_end)
    # Explicit copy: the assignments below must not write into a view of the
    # unfiltered frame.
    merged = merged.loc[~in_window].copy()

    # Rows without a weather match within the tolerance inherit the previous
    # weather row; leading gaps are back-filled from the first later one.
    merged[outside_vars] = merged[outside_vars].ffill().bfill()
    merged[target_col] = pd.to_numeric(merged[target_col], errors="coerce")
    return merged.dropna(subset=[target_col]).reset_index(drop=True)

def build_lagged(
    df: pd.DataFrame,
    target_col: str,
    include_target_lags: bool = False,
    feature_vars: List[str] | None = None,
) -> Tuple[pd.DataFrame, pd.Series, pd.Series]:
    """Build lagged and rolling-mean features from a time-ordered frame.

    The sampling rate is inferred from the median spacing of ``TIMESTAMP_COL``
    and used to convert ``HOURS_LAGS`` / ``HOURS_ROLL`` into row steps; steps
    above ``MAX_LAG_STEPS`` are discarded. Lags are row shifts, so ``df`` is
    expected to be sorted by time already.

    Args:
        df: Frame containing ``TIMESTAMP_COL``, ``target_col`` and every
            feature variable.
        target_col: Name of the target column.
        include_target_lags: When true, the target itself is lagged/rolled
            like any other variable.
        feature_vars: Variables to expand; defaults to ``outside_vars``.

    Returns:
        ``(X, y, ts)``: feature matrix, target series and timestamps, row
        aligned, with rows containing any NaN removed.
    """
    vars_all = list(feature_vars or outside_vars)
    if include_target_lags:
        vars_all.append(target_col)
    base = df.copy()

    # Infer rows-per-hour; fall back to 6 (10-minute data) whenever the
    # spacing is unknown. A zero median spacing (duplicate timestamps) used to
    # raise ZeroDivisionError and now takes the same fallback.
    steps_per_hour = 6
    spacing = base[TIMESTAMP_COL].sort_values().diff().dropna()
    if not spacing.empty:
        freq_min = spacing.median().total_seconds() / 60.0
        if pd.notnull(freq_min) and freq_min > 0:
            steps_per_hour = max(1, int(round(60.0 / freq_min)))

    def _capped_steps(hours: List[int]) -> List[int]:
        # Hours -> unique row counts, dropping anything above the cap.
        steps = {max(1, int(round(h * steps_per_hour))) for h in hours}
        return sorted(s for s in steps if s <= MAX_LAG_STEPS)

    lag_steps = _capped_steps(HOURS_LAGS)
    roll_windows = _capped_steps(HOURS_ROLL)
    # Calendar variables are known at prediction time, so they are used as-is.
    no_lag_vars = {"hour_sin", "hour_cos", "hour", "dayofyear", "weekofyear"}

    feat_frames: List[pd.Series] = []
    for var in vars_all:
        if var in no_lag_vars:
            feat_frames.append(base[var].rename(f"{var}_base"))
            continue
        for lag in lag_steps:
            feat_frames.append(base[var].shift(lag).rename(f"{var}_lag_{lag}"))
        for win in roll_windows:
            # shift(1) keeps the current row out of its own rolling mean.
            feat_frames.append(base[var].rolling(win).mean().shift(1).rename(f"{var}_roll_{win}"))

    lagged = pd.concat([base[[TIMESTAMP_COL, target_col]]] + feat_frames, axis=1)
    lagged = lagged.dropna().reset_index(drop=True)
    # Select features by the names generated above rather than by substring
    # matching, so a target/timestamp column can never slip into X.
    feature_cols = [frame.name for frame in feat_frames]
    X = lagged[feature_cols]
    y = lagged[target_col]
    ts = lagged[TIMESTAMP_COL]
    return X, y, ts


In [69]:

# Per-target result containers, keyed by target column name and filled by the
# training loop below.
metrics = {}            # train/test/validation scores and row counts
models = {}             # fitted final model
predicted_series = {}   # predictions over all feature rows (train + test)
diagnostics = {}        # feature/target correlations and feature importances
search_results = {}     # info dict returned by _train_with_search

def _new_regressor(params):
    """Create an XGBRegressor from tunable ``params`` plus the settings shared by every fit."""
    return XGBRegressor(
        **params,
        objective="reg:squarederror",
        tree_method="hist",
        n_jobs=-1,
        random_state=SEED,
        eval_metric="rmse",
    )


def _train_with_search(X_train, y_train, target: str, val_frac: float = 0.05):
    """Pick hyper-parameters on a trailing validation slice, then refit on all rows.

    Args:
        X_train: Time-ordered training features.
        y_train: Training target aligned with ``X_train``.
        target: Target name; ``"air_temp_C"`` gets its own base parameters and
            two extra candidates.
        val_frac: Fraction of the *last* training rows held out for scoring
            the candidates.

    Returns:
        ``(model, info)`` where ``info`` has the keys ``best_params``,
        ``val_r2``, ``val_mae`` and ``used_search``. With fewer than 40 rows
        no search is run and the base parameters are fitted directly. The
        validation scores are NaN whenever no candidate was selected.
    """
    is_air_temp = target == "air_temp_C"
    base_params = dict(
        n_estimators=1200,
        max_depth=3,
        min_child_weight=50 if is_air_temp else 20,
        learning_rate=0.03 if is_air_temp else 0.02,
        subsample=0.7,
        colsample_bytree=0.9,
        reg_alpha=0.6,
        reg_lambda=6.0 if is_air_temp else 3.0,
    )
    param_grid = [
        dict(n_estimators=800, learning_rate=0.06, max_depth=2, min_child_weight=35, subsample=0.9, colsample_bytree=0.95, reg_alpha=0.2, reg_lambda=3.0),
        dict(n_estimators=950, learning_rate=0.05, max_depth=3, min_child_weight=45, subsample=0.9, colsample_bytree=0.9, reg_alpha=0.4, reg_lambda=4.5),
        dict(n_estimators=1100, learning_rate=0.04, max_depth=3, min_child_weight=55, subsample=0.8, colsample_bytree=0.85, reg_alpha=0.6, reg_lambda=4.5),
        dict(n_estimators=650, learning_rate=0.08, max_depth=2, min_child_weight=25, subsample=0.95, colsample_bytree=0.95, reg_alpha=0.1, reg_lambda=2.5),
        dict(n_estimators=750, learning_rate=0.07, max_depth=3, min_child_weight=50, subsample=0.9, colsample_bytree=0.9, reg_alpha=0.5, reg_lambda=5.0),
        dict(n_estimators=1300, learning_rate=0.03, max_depth=3, min_child_weight=60, subsample=0.8, colsample_bytree=0.85, reg_alpha=0.8, reg_lambda=5.5),
        dict(n_estimators=900, learning_rate=0.05, max_depth=4, min_child_weight=60, subsample=0.85, colsample_bytree=0.9, reg_alpha=0.7, reg_lambda=6.0),
    ]

    # If too few rows, skip search and fit once
    if len(X_train) < 40:
        model = _new_regressor(base_params)
        model.fit(X_train, y_train, verbose=False)
        return model, {"best_params": base_params, "val_r2": np.nan, "val_mae": np.nan, "used_search": False}

    # The validation slice is the chronological tail of the training data.
    split_val = int(len(X_train) * (1 - val_frac))
    if split_val <= 0 or split_val >= len(X_train):
        split_val = max(1, len(X_train) - 1)
    X_tr, X_val = X_train.iloc[:split_val], X_train.iloc[split_val:]
    y_tr, y_val = y_train.iloc[:split_val], y_train.iloc[split_val:]

    grid = list(param_grid)
    if is_air_temp:
        grid.append(dict(n_estimators=1000, learning_rate=0.05, max_depth=3, min_child_weight=50, subsample=0.85, colsample_bytree=0.9, reg_alpha=0.6, reg_lambda=5.5))
        grid.append(dict(n_estimators=800, learning_rate=0.06, max_depth=2, min_child_weight=60, subsample=0.9, colsample_bytree=0.95, reg_alpha=0.4, reg_lambda=6.0))

    # base_params is only the fallback; it is not itself scored in the search.
    best_params, best_r2, best_mae = base_params, -np.inf, np.inf
    for params in grid:
        candidate = _new_regressor(params)
        candidate.fit(X_tr, y_tr, verbose=False)
        pred_val = candidate.predict(X_val)
        val_r2 = r2_score(y_val, pred_val)
        if val_r2 > best_r2:
            best_params = params
            best_r2 = val_r2
            best_mae = mean_absolute_error(y_val, pred_val)

    if not np.isfinite(best_r2):
        # No candidate produced a comparable score (e.g. R2 undefined on a
        # one-row validation slice): report NaN instead of the +/-inf sentinels.
        best_r2, best_mae = np.nan, np.nan

    final_model = _new_regressor(best_params)
    final_model.fit(X_train, y_train, verbose=False)
    return final_model, {"best_params": best_params, "val_r2": best_r2, "val_mae": best_mae, "used_search": True}

# One two-column frame (timestamp, "target") per modelled quantity.
target_defs = {
    "air_temp_C": rh_eto[[TIMESTAMP_COL, "air_temp_C"]].rename(columns={"air_temp_C": "target"}),
    "Eto_mm": rh_eto[[TIMESTAMP_COL, "Eto_mm"]].rename(columns={"Eto_mm": "target"}),
}


def _record_untrained(name, train_rows=0, test_rows=0):
    """Store NaN metrics and empty diagnostics for a target that could not be trained."""
    metrics[name] = {
        "train_r2": np.nan, "train_mae": np.nan,
        "test_r2": np.nan, "test_mae": np.nan,
        "train_rows": train_rows, "test_rows": test_rows,
        "val_r2": np.nan, "val_mae": np.nan,
    }
    diagnostics[name] = {
        "correlations": pd.Series(dtype=float),
        "feature_importances": pd.Series(dtype=float),
    }


for target_col, df_target in target_defs.items():
    merged = merge_target_with_weather(df_target, target_col="target").rename(columns={"target": target_col})
    feature_vars = outside_vars.copy()
    if merged.empty:
        _record_untrained(target_col)
        continue
    X, y, ts = build_lagged(merged[[TIMESTAMP_COL, target_col] + feature_vars], target_col, include_target_lags=False, feature_vars=feature_vars)
    if X.empty:
        _record_untrained(target_col)
        continue

    # Time-based split: everything strictly before the cutoff timestamp is
    # training data, so rows sharing the cutoff timestamp all land in test.
    split_idx = int(len(ts) * (1 - TEST_FRAC))
    cutoff_time = ts.iloc[split_idx]
    train_mask = ts < cutoff_time
    test_mask = ~train_mask
    X_train, X_test = X.loc[train_mask], X.loc[test_mask]
    y_train, y_test = y.loc[train_mask], y.loc[test_mask]
    if X_train.empty or X_test.empty:
        # Too few rows for a usable split (the cutoff fell on the first row);
        # fitting on an empty frame would fail, so record the target as untrained.
        _record_untrained(target_col, train_rows=len(X_train), test_rows=len(X_test))
        continue

    model, search_info = _train_with_search(X_train, y_train, target_col, val_frac=0.05)
    search_results[target_col] = search_info

    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    metrics[target_col] = {
        "train_r2": r2_score(y_train, pred_train),
        "train_mae": mean_absolute_error(y_train, pred_train),
        "test_r2": r2_score(y_test, pred_test),
        "test_mae": mean_absolute_error(y_test, pred_test),
        "train_rows": len(X_train),
        "test_rows": len(X_test),
        "val_r2": search_info.get("val_r2", np.nan),
        "val_mae": search_info.get("val_mae", np.nan),
    }
    diagnostics[target_col] = {
        # Correlation of each feature with the target on the training rows,
        # ordered by absolute value.
        "correlations": pd.concat([
            y_train.reset_index(drop=True),
            X_train.reset_index(drop=True),
        ], axis=1).corr()[target_col].drop(target_col).sort_values(
            key=lambda s: s.abs(), ascending=False
        ),
        "feature_importances": pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False),
    }
    models[target_col] = model
    # Predictions over every feature row (train and test together).
    predicted_series[target_col] = pd.DataFrame({
        TIMESTAMP_COL: ts.reset_index(drop=True),
        f"pred_{target_col}": model.predict(X.reset_index(drop=True)),
    })

def fmt(val):
    """Format a score with four decimals; null-like values are shown as "nan"."""
    if pd.notnull(val):
        return f"{val:.4f}"
    return "nan"


# One summary line per target.
print("Metrics (per target, train/test/val):")
for tgt, row in metrics.items():
    train_part = f"train R2={fmt(row.get('train_r2', np.nan))} MAE={fmt(row.get('train_mae', np.nan))} | "
    test_part = f"test R2={fmt(row.get('test_r2', np.nan))} MAE={fmt(row.get('test_mae', np.nan))} | "
    val_part = f"val R2={fmt(row.get('val_r2', np.nan))} MAE={fmt(row.get('val_mae', np.nan))} | "
    rows_part = f"train_rows={row.get('train_rows', 0)}, test_rows={row.get('test_rows', 0)}"
    print(f"- {tgt}: " + train_part + test_part + val_part + rows_part)

# Ten features most correlated with each target.
print("Top correlations with target (train set, abs sorted):")
for tgt, diag in diagnostics.items():
    ranked_corr = diag.get("correlations", pd.Series(dtype=float))
    if not ranked_corr.empty:
        print(f"- {tgt}:")
        print(ranked_corr.head(10).to_string())
    else:
        print(f"- {tgt}: no correlation data")

# Fifteen features the fitted model relies on most.
print("Top model feature importances:")
for tgt, diag in diagnostics.items():
    ranked_imp = diag.get("feature_importances", pd.Series(dtype=float))
    if not ranked_imp.empty:
        print(f"- {tgt}:")
        print(ranked_imp.head(15).to_string())
    else:
        print(f"- {tgt}: no importance data")

# Export test rows with predictions
# For each trained target the merge, feature build and time split are
# recomputed with the same calls and TEST_FRAC used for training, then the
# held-out rows are written next to the model's predictions.
for tgt, fname in [
    ("air_temp_C", "test_rows_air_temp_C_predictions.csv"),
    ("Eto_mm", "test_rows_eto_mm_predictions.csv"),
]:
    # Skip targets that are unknown or for which no model was fitted.
    if tgt not in target_defs or tgt not in models:
        continue
    merged = merge_target_with_weather(target_defs[tgt], target_col="target").rename(columns={"target": tgt})
    feature_vars = outside_vars.copy()
    X_tmp, y_tmp, ts_tmp = build_lagged(merged[[TIMESTAMP_COL, tgt] + feature_vars], tgt, include_target_lags=False, feature_vars=feature_vars)
    if X_tmp.empty:
        continue
    # Test rows are those at or after the cutoff timestamp.
    split_idx = int(len(ts_tmp) * (1 - TEST_FRAC))
    cutoff_time = ts_tmp.iloc[split_idx]
    test_mask = ts_tmp >= cutoff_time
    X_test_tmp = X_tmp.loc[test_mask].reset_index(drop=True)
    y_test_tmp = y_tmp.loc[test_mask].reset_index(drop=True)
    pred_tmp = models[tgt].predict(X_test_tmp)
    ts_test = ts_tmp.loc[test_mask].reset_index(drop=True)
    # Columns: timestamp, observed value, predicted value.
    export_df = pd.DataFrame({
        TIMESTAMP_COL: ts_test,
        tgt: y_test_tmp,
        f"pred_{tgt}": pred_tmp,
    })
    export_path = DATA_DIR / fname
    export_df.to_csv(export_path, index=False)
    print(f"Exported {tgt} test rows with predictions to {export_path}")


Metrics (per target, train/test):
- air_temp_C: train R2=0.9977 MAE=0.1766 | test R2=0.9729 MAE=0.5925 | train_rows=3011, test_rows=753
- Eto_mm: train R2=0.9845 MAE=0.0140 | test R2=0.9486 MAE=0.0195 | train_rows=4262, test_rows=1066
Top correlations with target (train set, abs sorted):
- air_temp_C:
air_temp_C_weather_lag_1        0.887985
temp_rad_interaction_lag_1      0.882814
global_radiation_Wm2_lag_1      0.874866
temp_rad_interaction_lag_2      0.872739
air_temp_C_weather_lag_2        0.869311
global_radiation_Wm2_lag_2      0.866829
temp_rad_interaction_lag_3      0.860396
temp_rad_interaction_roll_6     0.859891
global_radiation_Wm2_roll_6     0.857320
diffuse_radiation_Wm2_roll_6    0.856514
- Eto_mm:
global_radiation_Wm2_lag_1     0.869796
temp_rad_interaction_lag_1     0.864932
global_radiation_Wm2_lag_2     0.851423
temp_rad_interaction_lag_2     0.844836
global_radiation_Wm2_lag_3     0.829700
global_radiation_Wm2_roll_6    0.824138
temp_rad_interaction_lag_3     0.8213

In [70]:
# Rolling blocked training/eval (train small slice, test next slice repeatedly)
from math import ceil

def rolling_block_eval(target: str, train_frac: float = 0.15, test_frac: float = 0.05):
    """Evaluate ``target`` on consecutive, non-overlapping train/test blocks.

    The feature rows are cut into back-to-back blocks; within each block a
    fresh model is fitted on the first ``train_frac`` share of all rows and
    scored on the following ``test_frac`` share. Per-block and pooled R2/MAE
    are printed.

    Returns:
        DataFrame with the timestamp, observed value and prediction of every
        test row, in block order.

    Raises:
        ValueError: for an unknown target, when no feature rows exist, or when
            one train+test block does not fit inside the data.
    """
    if target not in target_defs:
        raise ValueError(f"Unknown target {target}")
    frame = merge_target_with_weather(target_defs[target], target_col="target").rename(columns={"target": target})
    X, y, ts = build_lagged(frame[[TIMESTAMP_COL, target] + outside_vars], target, include_target_lags=False, feature_vars=outside_vars)
    n = len(X)
    if n == 0:
        raise ValueError("No data")
    train_size = max(1, int(n * train_frac))
    test_size = max(1, int(n * test_frac))
    span = train_size + test_size
    if span >= n:
        raise ValueError("Fractions too large")

    # Fixed configuration used for every block.
    hyper = dict(
        n_estimators=800,
        max_depth=3,
        min_child_weight=40,
        learning_rate=0.02,
        subsample=0.65,
        colsample_bytree=0.65,
        reg_alpha=1.0,
        reg_lambda=4.0,
        objective="reg:squarederror",
        tree_method="hist",
        n_jobs=-1,
        random_state=SEED,
        eval_metric="rmse",
    )

    block_rows = []
    pred_chunks, actual_chunks, ts_chunks = [], [], []
    # Each block starts where the previous block's test slice ended.
    for block_id, start in enumerate(range(0, n - span + 1, span)):
        fit_rows = slice(start, start + train_size)
        eval_rows = slice(start + train_size, start + span)
        regressor = XGBRegressor(**hyper)
        regressor.fit(X.iloc[fit_rows], y.iloc[fit_rows], verbose=False)
        y_eval = y.iloc[eval_rows]
        ts_eval = ts.iloc[eval_rows]
        pred = regressor.predict(X.iloc[eval_rows])
        pred_chunks.append(pred)
        actual_chunks.append(y_eval.to_numpy())
        ts_chunks.append(ts_eval)
        block_rows.append({
            "block": block_id,
            "start": ts_eval.iloc[0],
            "end": ts_eval.iloc[-1],
            "r2": r2_score(y_eval, pred),
            "mae": mean_absolute_error(y_eval, pred),
            "rows": len(pred),
        })

    # Pool every test slice for the overall score.
    preds_all = np.concatenate(pred_chunks)
    actual_all = np.concatenate(actual_chunks)
    ts_all = pd.concat(ts_chunks).reset_index(drop=True)
    overall_r2 = r2_score(actual_all, preds_all)
    overall_mae = mean_absolute_error(actual_all, preds_all)
    print(f"Rolling blocks for {target}: R2={overall_r2:.4f} MAE={overall_mae:.4f} over {len(actual_all)} rows")
    print(pd.DataFrame(block_rows).to_string(index=False))
    return pd.DataFrame({TIMESTAMP_COL: ts_all, target: actual_all, f"pred_{target}_rolling": preds_all})

# Run the blocked evaluation for both targets: 15 % of the rows train each
# block and the following 5 % test it.
rolling_air = rolling_block_eval("air_temp_C", train_frac=0.15, test_frac=0.05)
rolling_eto = rolling_block_eval("Eto_mm", train_frac=0.15, test_frac=0.05)


Rolling blocks for air_temp_C: R2=0.9464 MAE=0.7472 over 940 rows
 block               start                 end       r2      mae  rows
     0 2025-08-15 16:00:00 2025-08-16 23:10:00 0.896770 1.195896   188
     1 2025-08-25 04:30:00 2025-08-26 11:40:00 0.971219 0.440177   188
     2 2025-08-30 09:50:00 2025-09-04 17:10:00 0.986193 0.466945   188
     3 2025-09-08 15:20:00 2025-09-11 17:50:00 0.936264 0.787412   188
     4 2025-09-15 16:00:00 2025-09-16 23:10:00 0.905369 0.845331   188
Rolling blocks for Eto_mm: R2=0.9400 MAE=0.0273 over 1330 rows
 block               start                 end       r2      mae  rows
     0 2025-08-16 13:10:00 2025-08-18 09:20:00 0.915948 0.035025   266
     1 2025-08-23 22:40:00 2025-08-25 18:50:00 0.964661 0.024061   266
     2 2025-08-31 08:10:00 2025-09-02 04:20:00 0.962090 0.023215   266
     3 2025-09-07 17:40:00 2025-09-09 13:50:00 0.952157 0.022438   266
     4 2025-09-15 03:10:00 2025-09-16 23:20:00 0.874397 0.031881   266


In [None]:
# Predict over full weather period and export ETo + internal air temp
# The weather frame is re-keyed to TIMESTAMP_COL so build_lagged can consume it.
full_weather = outside.rename(columns={"datetime": TIMESTAMP_COL}).copy()
export_targets = ["air_temp_C", "Eto_mm"]
export_frames = []

for tgt in export_targets:
    if tgt not in models:
        print(f"Skipping {tgt}: no trained model")
        continue
    weather_for_pred = full_weather.copy()
    # build_lagged selects the target column, so a constant placeholder is
    # added; its values are discarded (the returned y is ignored below).
    weather_for_pred[tgt] = 0.0
    # NOTE(review): build_lagged derives its lag/rolling step counts from the
    # timestamp spacing of the frame it is given. The generated feature names
    # match the trained model only if this weather frame has the same spacing
    # as the training data -- confirm.
    X_full, _, ts_full = build_lagged(
        weather_for_pred[[TIMESTAMP_COL, tgt] + outside_vars],
        tgt,
        include_target_lags=False,
        feature_vars=outside_vars,
    )
    if X_full.empty:
        print(f"Skipping {tgt}: no feature rows")
        continue
    preds = models[tgt].predict(X_full)
    export_frames.append(pd.DataFrame({
        TIMESTAMP_COL: ts_full.reset_index(drop=True),
        f"pred_{tgt}": preds,
    }))

if export_frames:
    # Inner-join the per-target predictions on timestamp: only timestamps that
    # have a prediction for every exported target are written.
    export_df = export_frames[0]
    for df_extra in export_frames[1:]:
        export_df = pd.merge(export_df, df_extra, on=TIMESTAMP_COL, how="inner")
    export_path = DATA_DIR / "full_weather_period_air_temp_ETo_predictions.csv"
    export_df.to_csv(export_path, index=False)
    print(f"Exported full weather-period predictions to {export_path}")
else:
    print("No predictions exported for full weather period")


In [None]:

# Rolling forecast across full season (train on past window, predict next block, advance)
def rolling_forecast_full(target: str, train_frac: float = 0.15, test_frac: float = 0.05):
    """Walk a fixed-size training window through the data and forecast the next block.

    At every step a fresh model is fitted on ``train_frac`` of the rows and
    predicts the ``test_frac`` of rows right after them; the window then moves
    forward by one test block, so previously predicted rows become training
    data. The stitched forecasts are printed, scored and written to
    ``rolling_forecast_<target>.csv`` in ``DATA_DIR``.

    Returns:
        DataFrame with timestamp, observed value and forecast for every
        predicted row.

    Raises:
        ValueError: for an unknown target, when no feature rows exist, or when
            one train+test window does not fit inside the data.
    """
    if target not in target_defs:
        raise ValueError(f"Unknown target {target}")
    frame = merge_target_with_weather(target_defs[target], target_col="target").rename(columns={"target": target})
    X, y, ts = build_lagged(frame[[TIMESTAMP_COL, target] + outside_vars], target, include_target_lags=False, feature_vars=outside_vars)
    n = len(X)
    if n == 0:
        raise ValueError("No data")
    train_size = max(1, int(n * train_frac))
    test_size = max(1, int(n * test_frac))
    window = train_size + test_size
    if window >= n:
        raise ValueError("Fractions too large for rolling forecast")

    # Fixed configuration used for every window position.
    hyper = dict(
        n_estimators=800,
        max_depth=3,
        min_child_weight=40,
        learning_rate=0.02,
        subsample=0.65,
        colsample_bytree=0.65,
        reg_alpha=1.0,
        reg_lambda=4.0,
        objective="reg:squarederror",
        tree_method="hist",
        n_jobs=-1,
        random_state=SEED,
        eval_metric="rmse",
    )

    pred_chunks, actual_chunks, ts_chunks = [], [], []
    # Advance by one test block per step while a full window still fits.
    for block_id, start in enumerate(range(0, n - window + 1, test_size)):
        fit_rows = slice(start, start + train_size)
        eval_rows = slice(start + train_size, start + window)
        regressor = XGBRegressor(**hyper)
        regressor.fit(X.iloc[fit_rows], y.iloc[fit_rows], verbose=False)
        y_eval = y.iloc[eval_rows]
        ts_eval = ts.iloc[eval_rows]
        pred = regressor.predict(X.iloc[eval_rows])
        pred_chunks.append(pred)
        actual_chunks.append(y_eval.to_numpy())
        ts_chunks.append(ts_eval)
        print(f"Block {block_id}: {ts_eval.iloc[0]} -> {ts_eval.iloc[-1]} R2={r2_score(y_eval, pred):.4f} MAE={mean_absolute_error(y_eval, pred):.4f} rows={len(pred)}")

    preds_all = np.concatenate(pred_chunks)
    actual_all = np.concatenate(actual_chunks)
    ts_all = pd.concat(ts_chunks).reset_index(drop=True)
    df_out = pd.DataFrame({
        TIMESTAMP_COL: ts_all,
        f"{target}_actual": actual_all,
        f"pred_{target}_rolling_forecast": preds_all,
    })
    overall_r2 = r2_score(actual_all, preds_all)
    overall_mae = mean_absolute_error(actual_all, preds_all)
    print(f"Rolling forecast {target}: R2={overall_r2:.4f} MAE={overall_mae:.4f} over {len(actual_all)} rows")
    export_path = DATA_DIR / f"rolling_forecast_{target}.csv"
    df_out.to_csv(export_path, index=False)
    print(f"Saved stitched rolling forecast to {export_path}")
    return df_out

# Produce and save the stitched rolling forecast for both targets
# (15 % training window, 5 % forecast block).
rolling_forecast_air = rolling_forecast_full("air_temp_C", train_frac=0.15, test_frac=0.05)
rolling_forecast_eto = rolling_forecast_full("Eto_mm", train_frac=0.15, test_frac=0.05)
