# RH/Temp/ETo Prediction (split targets)

Weather-only features: time sin/cos, lags 1/3/6/18/24/48, rolls 3/6/12/24/48, `rh_x_temp`.
- `air_temp_C` from `air_temp_c.csv`
- `Rh_internal` and `Eto_mm` from `rh_et0.csv` (the RH column is now named `internal_RH` in the file; the older `RH%` header is still accepted)
- RH/ETo training restricted to data up to and including 2025-09-08 (the `cutoff` in `load_targets_rh_et0`) to retain a higher R²
- Chronological 80/20 split with forward-chaining CV per target
- ETo predictions clamped to >= 0

In [None]:
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

# Directory holding the input CSVs and receiving the exported predictions:
# the script's own folder when run as a file, otherwise the current working
# directory (e.g. inside a notebook kernel, where __file__ is undefined).
if "__file__" in globals():
    DATA_DIR = Path(__file__).resolve().parent
else:
    DATA_DIR = Path().resolve()

# Hyper-parameter candidates evaluated by the forward-chaining CV.
# Each tuple is (max_depth, learning_rate, min_samples_leaf); max_bins is
# held at 255 for every candidate.
param_grid = [
    {
        "max_depth": depth,
        "learning_rate": rate,
        "min_samples_leaf": leaf,
        "max_bins": 255,
    }
    for depth, rate, leaf in (
        (6, 0.05, 20),
        (8, 0.05, 20),
        (10, 0.03, 30),
        (12, 0.03, 20),
        (None, 0.05, 30),
    )
]

In [None]:
def load_weather() -> pd.DataFrame:
    """Load the Bet Dagan radiation and weather CSVs and join them on time.

    Both files are read from ``DATA_DIR``. Their leading columns are renamed
    *by position* to the fixed English names listed below, the ``datetime``
    column is parsed day-first, ``"-"`` placeholders become NaN, and every
    column except the station/time text columns is coerced to numeric
    (unparseable values become NaN).

    Returns:
        The inner join of the two tables on ``datetime``, sorted by
        ``datetime``.
    """
    radiation_names = [
        "station_radiation",
        "datetime",
        "diffuse_radiation_Wm2",
        "global_radiation_Wm2",
        "direct_radiation_Wm2",
    ]
    weather_names = [
        "station_weather",
        "datetime",
        "station_pressure_hpa",
        "relative_humidity_pct",
        "air_temp_C_weather",
        "air_temp_max_C_weather",
        "air_temp_min_C_weather",
        "ground_temp_C",
        "wet_temp_C",
        "wind_dir_deg",
        "gust_dir_deg",
        "wind_speed_ms",
        "wind_speed_1m_max_ms",
        "wind_speed_10m_max_ms",
        "wind_speed_10m_max_end_time",
        "gust_speed_ms",
        "wind_dir_std_deg",
    ]

    def _load(filename: str, names: List[str], keep_as_is: List[str]) -> pd.DataFrame:
        # Read one station export, rename its first len(names) columns by
        # position, then clean the values.
        frame = pd.read_csv(DATA_DIR / filename)
        header = frame.columns.tolist()
        frame = frame.rename(
            columns={header[pos]: name for pos, name in enumerate(names)}
        )
        frame["datetime"] = pd.to_datetime(frame["datetime"], dayfirst=True)
        frame = frame.replace("-", np.nan)
        for col in frame.columns:
            if col not in keep_as_is:
                frame[col] = pd.to_numeric(frame[col], errors="coerce")
        return frame

    radiation = _load(
        "bet_dagan_radiation.csv",
        radiation_names,
        keep_as_is=["station_radiation", "datetime"],
    )
    station = _load(
        "bet_dagan_weather.csv",
        weather_names,
        keep_as_is=["station_weather", "datetime", "wind_speed_10m_max_end_time"],
    )

    joined = pd.merge(radiation, station, on="datetime", how="inner")
    return joined.sort_values("datetime")


def load_targets_rh_et0() -> pd.DataFrame:
    """Load the RH and ETo target series from ``rh_et0.csv``.

    The ``"Date & Time (Summer)"`` column is parsed day-first into
    ``timestamp``. The ETo column is renamed to ``Eto_mm`` and the RH column
    (either the ``RH%`` or the ``internal_RH`` header) to ``Rh_internal``.
    Rows later than the hard-coded cutoff are discarded.

    Returns:
        A frame with columns ``timestamp``, ``Rh_internal`` and ``Eto_mm``,
        sorted by ``timestamp``.
    """
    column_aliases = {
        "Eto (mm)": "Eto_mm",
        "RH%": "Rh_internal",
        "internal_RH": "Rh_internal",
    }
    last_allowed = pd.Timestamp("2025-09-08 23:59:59")

    raw = pd.read_csv(DATA_DIR / "rh_et0.csv")
    raw["timestamp"] = pd.to_datetime(raw["Date & Time (Summer)"], dayfirst=True)
    raw = raw.rename(columns=column_aliases)

    in_range = raw["timestamp"] <= last_allowed
    selected = raw.loc[in_range, ["timestamp", "Rh_internal", "Eto_mm"]]
    return selected.sort_values("timestamp")


def load_targets_air() -> pd.DataFrame:
    """Load the air-temperature target series from ``air_temp_c.csv``.

    The file's ``datetime`` and ``time`` text columns are joined with a space
    and parsed (month-first for ambiguous dates) into ``timestamp``.

    Returns:
        A frame with columns ``timestamp`` and ``air_temp_C``, sorted by
        ``timestamp``.
    """
    raw = pd.read_csv(DATA_DIR / "air_temp_c.csv")
    stamp_text = raw["datetime"] + " " + raw["time"]
    raw["timestamp"] = pd.to_datetime(stamp_text, dayfirst=False)
    selected = raw[["timestamp", "air_temp_C"]]
    return selected.sort_values("timestamp")

In [None]:
# Raw weather-station columns fed to every model (names as assigned in
# load_weather).
base_feature_cols = [
    "diffuse_radiation_Wm2",
    "global_radiation_Wm2",
    "direct_radiation_Wm2",
    "station_pressure_hpa",
    "relative_humidity_pct",
    "air_temp_C_weather",
    "air_temp_max_C_weather",
    "air_temp_min_C_weather",
    "ground_temp_C",
    "wet_temp_C",
    "wind_dir_deg",
    "gust_dir_deg",
    "wind_speed_ms",
    "wind_speed_1m_max_ms",
    "wind_speed_10m_max_ms",
    "gust_speed_ms",
    "wind_dir_std_deg",
]


def engineer_features(df_all: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    """Build the model feature matrix from a merged target/weather frame.

    Features, in column order: the raw ``base_feature_cols``; ``hour`` and
    ``dayofyear`` taken from ``df_all["timestamp"]`` plus their sin/cos
    encodings; the ``rh_x_temp`` interaction; and, for each radiation,
    temperature and humidity column, row-shifted lags and rolling means.
    Lags and windows count *rows*, not elapsed time.

    Args:
        df_all: Frame containing ``timestamp`` and every column in
            ``base_feature_cols``.

    Returns:
        ``(features, names)`` where ``names`` lists the columns that are not
        entirely NaN and ``features`` holds exactly those columns with the
        remaining NaNs replaced by the column median.
    """
    history_cols = (
        "diffuse_radiation_Wm2",
        "global_radiation_Wm2",
        "direct_radiation_Wm2",
        "air_temp_C_weather",
        "relative_humidity_pct",
    )
    lag_steps = (1, 3, 6, 18, 24, 48)
    roll_windows = (3, 6, 12, 24, 48)

    feats = df_all[base_feature_cols].copy()
    stamp = df_all["timestamp"]

    # Calendar position, raw and as cyclic sin/cos pairs.
    feats["hour"] = stamp.dt.hour
    feats["dayofyear"] = stamp.dt.dayofyear
    hour_angle = 2 * np.pi * feats["hour"] / 24
    feats["hour_sin"] = np.sin(hour_angle)
    feats["hour_cos"] = np.cos(hour_angle)
    doy_angle = 2 * np.pi * feats["dayofyear"] / 365
    feats["doy_sin"] = np.sin(doy_angle)
    feats["doy_cos"] = np.cos(doy_angle)

    feats["rh_x_temp"] = feats["relative_humidity_pct"] * feats["air_temp_C_weather"]

    for name in history_cols:
        series = df_all[name]
        for step in lag_steps:
            feats[f"{name}_lag{step}"] = series.shift(step)
        for width in roll_windows:
            # shift(1) so the mean covers only rows strictly before this one.
            trailing_mean = series.rolling(window=width, min_periods=1).mean()
            feats[f"{name}_roll{width}_mean"] = trailing_mean.shift(1)

    feats = feats.apply(pd.to_numeric, errors="coerce")

    # Drop columns with no data at all, then median-impute what is left.
    usable = [name for name in feats.columns if not feats[name].isna().all()]

    def _fill_with_median(column: pd.Series) -> pd.Series:
        return column.fillna(column.median())

    feats = feats[usable].apply(_fill_with_median)
    return feats, usable

In [None]:
def prepare_target(
    weather: pd.DataFrame, targets: pd.DataFrame, target_col: str
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, List[str]]:
    """Align one target series with the weather table and build its features.

    Each target row is matched to the nearest weather row within 30 minutes
    (``merge_asof``); unmatched rows keep NaN weather values. Features are
    engineered on the full aligned frame first, and only afterwards are rows
    whose target is missing or non-numeric removed, so lag/rolling features
    are computed over all target rows.

    Args:
        weather: Weather table with a ``datetime`` column.
        targets: Target table with a ``timestamp`` column and ``target_col``.
        target_col: Name of the column to predict.

    Returns:
        ``(features, y, aligned_rows, feature_names)``; the first three share
        the same 0..n-1 index.
    """
    targets_sorted = targets.sort_values("timestamp")
    weather_sorted = weather.sort_values("datetime")
    aligned = pd.merge_asof(
        targets_sorted,
        weather_sorted,
        left_on="timestamp",
        right_on="datetime",
        direction="nearest",
        tolerance=pd.Timedelta("30min"),
    )
    aligned = aligned.sort_values("timestamp").reset_index(drop=True)

    features, feature_names = engineer_features(aligned)
    target = pd.to_numeric(aligned[target_col], errors="coerce")
    has_target = target.notna()

    def _keep_labelled(obj):
        # Restrict to rows with a usable target and renumber from zero.
        return obj.loc[has_target].reset_index(drop=True)

    return (
        _keep_labelled(features),
        _keep_labelled(target),
        _keep_labelled(aligned),
        feature_names,
    )

In [None]:
def train_with_cv(
    df_features: pd.DataFrame, y: pd.Series, non_empty_features: List[str]
) -> Tuple[Any, Dict[str, Any], float]:
    """Pick the best ``param_grid`` entry by forward-chaining CV and refit.

    Every candidate in ``param_grid`` is scored with a 3-split
    ``TimeSeriesSplit`` (training folds always precede their validation
    fold). The candidate with the lowest mean validation RMSE is refit on
    all rows.

    Args:
        df_features: Feature frame, rows in chronological order.
        y: Target values aligned row-for-row with ``df_features``.
        non_empty_features: Columns of ``df_features`` to train on.

    Returns:
        ``(final_model, best_params, best_score)`` where ``final_model`` is a
        ``HistGradientBoostingRegressor`` fit on the full data (as a plain
        array, so without feature names) and ``best_score`` is the winning
        mean CV RMSE.
    """

    def _build_model(params: Dict[str, Any]):
        # Single place that maps a grid entry to an estimator, so the CV
        # models and the final model cannot drift apart.
        return HistGradientBoostingRegressor(
            max_depth=params.get("max_depth"),
            learning_rate=params.get("learning_rate", 0.1),
            min_samples_leaf=params.get("min_samples_leaf", 20),
            max_bins=params.get("max_bins", 255),
            random_state=0,
        )

    X = df_features[non_empty_features].values
    # The splits depend only on the number of rows; compute them once
    # instead of once per candidate.
    folds = list(TimeSeriesSplit(n_splits=3).split(X))

    results = []
    for params in param_grid:
        fold_rmses = []
        for tr_idx, val_idx in folds:
            model = _build_model(params)
            model.fit(X[tr_idx], y.iloc[tr_idx])
            preds = model.predict(X[val_idx])
            # mean_squared_error(..., squared=False) was deprecated in
            # scikit-learn 1.4 and removed in 1.6; taking the square root
            # explicitly works on every version.
            fold_rmses.append(float(np.sqrt(mean_squared_error(y.iloc[val_idx], preds))))
        results.append((params, float(np.mean(fold_rmses))))

    # min() returns the first minimum, matching a stable sort's first entry.
    best_params, best_score = min(results, key=lambda item: item[1])

    final_model = _build_model(best_params)
    final_model.fit(X, y)
    return final_model, best_params, best_score

In [None]:
# Load weather
# Merged radiation + weather station table, sorted by "datetime".
weather = load_weather()

# Air temp target
# prepare_target returns (features, target series, aligned rows, feature names);
# rows whose target value is missing are dropped from all three frames.
df_air_tgt = load_targets_air()
air_feat, air_y, air_all, air_non_empty = prepare_target(weather, df_air_tgt, "air_temp_C")

# RH/ETo targets
# Both targets come from the same file but are prepared separately, so each
# keeps only the rows where its own target is present.
df_rh_tgt = load_targets_rh_et0()
rh_feat, rh_y, rh_all, rh_non_empty = prepare_target(weather, df_rh_tgt, "Rh_internal")
eto_feat, eto_y, eto_all, eto_non_empty = prepare_target(weather, df_rh_tgt, "Eto_mm")

In [None]:
# Train models
# One independent model per target. train_with_cv returns
# (model refit on all rows, winning param_grid entry, its mean CV RMSE).
air_model, air_params, air_cv = train_with_cv(air_feat, air_y, air_non_empty)
rh_model, rh_params, rh_cv = train_with_cv(rh_feat, rh_y, rh_non_empty)
eto_model, eto_params, eto_cv = train_with_cv(eto_feat, eto_y, eto_non_empty)

# Each line prints: target name, chosen hyper-parameters, mean CV RMSE.
print("Best params:")
print("air_temp_C", air_params, air_cv)
print("Rh_internal", rh_params, rh_cv)
print("Eto_mm", eto_params, eto_cv)

In [None]:
# Chronological 80/20 splits for metrics
def split_for_metrics(df_feat, y, non_empty):
    """Split features and target chronologically into the first 80% / last 20%.

    Rows are assumed to already be in time order; no shuffling is done. The
    cut position is ``int(len(df_feat) * 0.8)``.

    Args:
        df_feat: Feature frame.
        y: Target series aligned positionally with ``df_feat``.
        non_empty: Feature columns to keep in the returned frames.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    cut = int(len(df_feat) * 0.8)
    earlier, later = df_feat.iloc[:cut], df_feat.iloc[cut:]
    return earlier[non_empty], later[non_empty], y.iloc[:cut], y.iloc[cut:]

air_Xtr, air_Xte, air_ytr, air_yte = split_for_metrics(air_feat, air_y, air_non_empty)
rh_Xtr, rh_Xte, rh_ytr, rh_yte = split_for_metrics(rh_feat, rh_y, rh_non_empty)
eto_Xtr, eto_Xte, eto_ytr, eto_yte = split_for_metrics(eto_feat, eto_y, eto_non_empty)

# NOTE(review): fit() refits the same estimator objects in place, so from here
# on air_model / rh_model / eto_model are trained on the first 80% only and the
# all-rows fit done inside train_with_cv is discarded. Later cells that predict
# with these models therefore use the 80% fit. The hyper-parameters were also
# selected with CV over all rows, including this test slice, so the metrics
# below are somewhat optimistic. Confirm both are intended.
air_model.fit(air_Xtr, air_ytr)
rh_model.fit(rh_Xtr, rh_ytr)
eto_model.fit(eto_Xtr, eto_ytr)

air_pred = air_model.predict(air_Xte)
rh_pred = rh_model.predict(rh_Xte)
eto_pred = eto_model.predict(eto_Xte)
# Evapotranspiration cannot be negative; clamp before scoring.
eto_pred = np.maximum(eto_pred, 0)

print("Test metrics (chronological split):")
for name, true, pred in [
    ("air_temp_C", air_yte, air_pred),
    ("Rh_internal", rh_yte, rh_pred),
    ("Eto_mm", eto_yte, eto_pred),
]:
    mae = mean_absolute_error(true, pred)
    mse = mean_squared_error(true, pred)
    # mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4
    # and removed in 1.6; derive RMSE from the MSE so this runs on any version.
    rmse = float(np.sqrt(mse))
    r2 = r2_score(true, pred)
    print(f"- {name}")
    print(f"  MAE:  {mae:.3f}")
    print(f"  MSE:  {mse:.3f}")
    print(f"  RMSE: {rmse:.3f}")
    print(f"  R2:   {r2:.4f}")

In [None]:
# Inspect RH training rows
# Same int(len * 0.8) cut as split_for_metrics; rh_all has one row per row of
# rh_feat, so these are the rows in the RH training portion of that split.
split_idx = int(len(rh_all) * 0.8)
rh_train_rows = rh_all.iloc[:split_idx][["timestamp", "Rh_internal"]].copy()
print(f"RH training rows: {len(rh_train_rows)}")
# First and last timestamp covered by the training portion.
print(f"Range: {rh_train_rows['timestamp'].min()} -> {rh_train_rows['timestamp'].max()}")
print("Head:")
print(rh_train_rows.head())
print("Tail:")
print(rh_train_rows.tail())


In [None]:
# Export test predictions (per target + combined)
import numpy as np
import pandas as pd

# Test rows are the last 20% of each aligned frame, i.e. the same rows the
# chronological-split predictions were made for.
air_cut = int(len(air_all) * 0.8)
rh_cut = int(len(rh_all) * 0.8)
eto_cut = int(len(eto_all) * 0.8)

# --- air temperature ---
test_air_path = DATA_DIR / "test_predictions_air.csv"
test_air = air_all.iloc[air_cut:].copy()
test_air["pred_air_temp_C"] = air_pred
test_air[["timestamp", "air_temp_C", "pred_air_temp_C"]].to_csv(test_air_path, index=False)

# --- relative humidity ---
test_rh_path = DATA_DIR / "test_predictions_rh.csv"
test_rh = rh_all.iloc[rh_cut:].copy()
test_rh["pred_Rh_internal"] = rh_pred
test_rh[["timestamp", "Rh_internal", "pred_Rh_internal"]].to_csv(test_rh_path, index=False)

# --- reference evapotranspiration (predictions kept non-negative) ---
test_eto_path = DATA_DIR / "test_predictions_eto.csv"
test_eto = eto_all.iloc[eto_cut:].copy()
test_eto["pred_Eto_mm"] = np.maximum(eto_pred, 0)
test_eto[["timestamp", "Eto_mm", "pred_Eto_mm"]].to_csv(test_eto_path, index=False)

# Combined, aligned to air timestamps
# RH/ETo values are looked up by timestamp; air timestamps with no RH/ETo test
# row get NaN in those columns.
air_ts = test_air["timestamp"]
rh_by_ts = test_rh.set_index("timestamp")
eto_by_ts = test_eto.set_index("timestamp")

test_output = pd.DataFrame({"timestamp": air_ts})
test_output["air_temp_C"] = test_air["air_temp_C"].values
test_output["pred_air_temp_C"] = test_air["pred_air_temp_C"].values
for column, source in (
    ("Rh_internal", rh_by_ts),
    ("pred_Rh_internal", rh_by_ts),
    ("Eto_mm", eto_by_ts),
    ("pred_Eto_mm", eto_by_ts),
):
    test_output[column] = source[column].reindex(air_ts).values

test_path = DATA_DIR / "test_predictions_rh_temp_et0_split.csv"
test_output.to_csv(test_path, index=False)
print(f"Wrote test predictions to {test_air_path}, {test_rh_path}, {test_eto_path}, {test_path}")
test_output.head()


In [None]:
# Predict full weather range and export
# Builds the same feature set as engineer_features, but directly on the weather
# table (keyed by "datetime") so every weather timestamp gets a prediction.
weather_full = load_weather().sort_values("datetime").reset_index(drop=True)
wf_raw = weather_full[base_feature_cols].copy()
wf_raw["hour"] = weather_full["datetime"].dt.hour
wf_raw["dayofyear"] = weather_full["datetime"].dt.dayofyear
wf_raw["hour_sin"] = np.sin(2 * np.pi * wf_raw["hour"] / 24)
wf_raw["hour_cos"] = np.cos(2 * np.pi * wf_raw["hour"] / 24)
wf_raw["doy_sin"] = np.sin(2 * np.pi * wf_raw["dayofyear"] / 365)
wf_raw["doy_cos"] = np.cos(2 * np.pi * wf_raw["dayofyear"] / 365)
wf_raw["rh_x_temp"] = wf_raw["relative_humidity_pct"] * wf_raw["air_temp_C_weather"]
lag_cols = [
    "diffuse_radiation_Wm2",
    "global_radiation_Wm2",
    "direct_radiation_Wm2",
    "air_temp_C_weather",
    "relative_humidity_pct",
]
for col in lag_cols:
    history = weather_full[col]
    for lag in (1, 3, 6, 18, 24, 48):
        wf_raw[f"{col}_lag{lag}"] = history.shift(lag)
    for window in (3, 6, 12, 24, 48):
        # shift(1) keeps the rolling mean strictly in the past of each row.
        wf_raw[f"{col}_roll{window}_mean"] = (
            history.rolling(window=window, min_periods=1).mean().shift(1)
        )
wf_raw = wf_raw.apply(pd.to_numeric, errors="coerce")


def _design_matrix(raw_features, train_features, feature_cols):
    """Return raw_features restricted to one model's columns and imputed
    with the medians of that model's own training features."""
    medians = train_features[feature_cols].median()
    return raw_features.reindex(columns=feature_cols).fillna(medians)


# Each target has its own feature list and was trained on its own median-filled
# data. Previously the RH and ETo inputs were sliced out of a frame already
# reduced to the air-temperature columns and filled with air-temperature
# medians, which raises KeyError if the feature lists differ and otherwise
# imputes with the wrong statistics.
air_medians = air_feat[air_non_empty].median()
wf_features = _design_matrix(wf_raw, air_feat, air_non_empty)
wf_features_rh = _design_matrix(wf_raw, rh_feat, rh_non_empty)
wf_features_eto = _design_matrix(wf_raw, eto_feat, eto_non_empty)

season_preds = pd.DataFrame({
    "timestamp": weather_full["datetime"].reset_index(drop=True),
    "pred_air_temp_C": air_model.predict(wf_features),
    "pred_Rh_internal": rh_model.predict(wf_features_rh),
    # Evapotranspiration cannot be negative.
    "pred_Eto_mm": np.maximum(eto_model.predict(wf_features_eto), 0),
})
season_path = DATA_DIR / "full_weather_predictions_rh_temp_et0_split.csv"
season_preds.to_csv(season_path, index=False)
print(f"Wrote full weather-range predictions to {season_path}")
season_preds.head()