# Greenhouse time-series model

Walk-forward model that uses external Bet Dagan weather and radiation data as exogenous features to predict internal greenhouse variables (air temp, RH, ET0, internal radiation). Each iteration trains on 1440 rows (10 days at 10 min), tests on 144 rows (1 day), then skips 144 rows (1 day) before the next training window.

In [18]:

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import clone
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-v0_8-whitegrid")
DATA_DIR = Path('.')
TRAIN_WINDOW = 1440  # 10 days of 10-minute samples
TEST_WINDOW = 144    # 1 day of 10-minute samples
GAP = 0            # no skip between train and test blocks
MAX_TRAIN_ROWS = TRAIN_WINDOW  # sliding window: 10 days train for each 1-day test
from xgboost import XGBRegressor






In [19]:
# Load raw files
weather_path = DATA_DIR / "bet_dagan_weather.csv"
radiation_path = DATA_DIR / "bet_dagan_radiation.csv"
micro_path = DATA_DIR / "micro_climate_rh_t_et0.xlsx"

weather = pd.read_csv(weather_path)
radiation = pd.read_csv(radiation_path)
micro = pd.read_excel(micro_path).rename(columns={"internal_rh_%": "internal_rh_pct"})

# Replace Hebrew headers with ASCII names based on column order for reproducibility
weather.columns = [
    "station",
    "timestamp",
    "station_pressure_hpa",
    "rel_humidity_ext",
    "temp_c_ext",
    "temp_max_c_ext",
    "temp_min_c_ext",
    "temp_ground_c_ext",
    "temp_wet_c_ext",
    "wind_dir_deg",
    "gust_dir_deg",
    "wind_speed_ms",
    "wind_speed_max_1m_ms",
    "wind_speed_max_10m_ms",
    "wind_speed_max_10m_time",
    "gust_speed_ms",
    "wind_dir_std_deg",
]

radiation.columns = [
    "rad_station",
    "timestamp",
    "diffuse_rad_wm2",
    "global_rad_wm2",
    "direct_rad_wm2",
]

# Drop station id columns; we only need the measurements
weather = weather.drop(columns=["station"])
radiation = radiation.drop(columns=["rad_station"])

# Convert numeric columns
for df in (weather, radiation, micro):
    for col in df.columns:
        if col not in ("timestamp", "timestamp_dayfirst"):
            df[col] = pd.to_numeric(df[col], errors="coerce")

# Robust parse for micro timestamps (handles mixed formats)
micro_ts_raw = micro["timestamp_dayfirst"].astype(str).str.replace("‏", "", regex=False).str.strip()
micro_ts = micro_ts_raw.apply(lambda x: pd.to_datetime(x, dayfirst=True, errors="coerce"))
unparsed = micro_ts.isna().sum()
if unparsed:
    print(f"Warning: {unparsed} micro rows had unparsable timestamps; dropping them")
micro = micro.assign(timestamp=micro_ts).dropna(subset=["timestamp"])

# Ensure micro is sorted
micro = micro.sort_values("timestamp").reset_index(drop=True)

# Parse weather/radiation timestamps
weather["timestamp"] = pd.to_datetime(weather["timestamp"], dayfirst=True, format="mixed")
radiation["timestamp"] = pd.to_datetime(radiation["timestamp"], dayfirst=True, format="mixed")

weather = weather.drop(columns=[c for c in weather.columns if weather[c].isna().all()])
radiation = radiation.drop(columns=[c for c in radiation.columns if radiation[c].isna().all()])

# Moisture physics features
def calculate_saturation_vapor_pressure(temp_c):
    return 0.61078 * np.exp((17.27 * temp_c) / (temp_c + 237.3))

def calculate_vpd(temp_c, rh_pct):
    es = calculate_saturation_vapor_pressure(temp_c)
    ea = es * (rh_pct / 100.0)
    return es - ea

def calculate_absolute_humidity(temp_c, rh_pct):
    return (6.112 * np.exp((17.67 * temp_c) / (temp_c + 243.5)) * rh_pct * 2.1674) / (273.15 + temp_c)

weather["vpd_ext"] = calculate_vpd(weather["temp_c_ext"], weather["rel_humidity_ext"])
weather["abs_humidity_ext"] = calculate_absolute_humidity(weather["temp_c_ext"], weather["rel_humidity_ext"])
weather["dew_point_ext"] = weather["temp_c_ext"] - ((100 - weather["rel_humidity_ext"]) / 5)

weather.head(), radiation.head(), micro.head()


(            timestamp  station_pressure_hpa  rel_humidity_ext  temp_c_ext  \
 0 2025-05-29 00:00:00                1009.0                70        21.5   
 1 2025-05-29 00:10:00                1008.9                70        21.5   
 2 2025-05-29 00:20:00                1008.8                70        21.5   
 3 2025-05-29 00:30:00                1008.8                71        21.4   
 4 2025-05-29 00:40:00                1008.8                71        21.3   
 
    temp_max_c_ext  temp_min_c_ext  temp_ground_c_ext  wind_dir_deg  \
 0            21.5            21.4               20.8           240   
 1            21.5            21.5               20.8           245   
 2            21.5            21.4               20.5           247   
 3            21.4            21.3               20.4           242   
 4            21.4            21.3               20.3           245   
 
    gust_dir_deg  wind_speed_ms  wind_speed_max_1m_ms  wind_speed_max_10m_ms  \
 0           238      

In [20]:
# Align all sources on timestamp; keep micro as spine
target_cols = [c for c in ["ET0", "internal_air_temp_c", "internal_radiation"] if c in micro.columns]
merged = micro.merge(weather, on="timestamp", how="left").merge(radiation, on="timestamp", how="left")

missing_rate = merged.isna().mean() * 100
display(missing_rate.sort_values(ascending=False))

# Reindex to continuous 10-min grid; fill only exogenous columns
full_range = pd.date_range(micro["timestamp"].min(), micro["timestamp"].max(), freq="10min")
reindexed = merged.set_index("timestamp").reindex(full_range)
reindexed.index.name = "timestamp"
reindexed = reindexed.reset_index()

exogenous_cols = [c for c in reindexed.columns if c not in target_cols + ["timestamp", "timestamp_dayfirst"]]
reindexed = reindexed.set_index("timestamp")
reindexed[exogenous_cols] = reindexed[exogenous_cols].interpolate(method="time").bfill().ffill()
reindexed = reindexed.reset_index()

# Drop rows only if targets are missing
data = reindexed.dropna(subset=target_cols).sort_values("timestamp").reset_index(drop=True)
print(f"Rows after aligning, reindexing, and filling exogenous gaps: {len(data):,}")
print(f"Time span: {data['timestamp'].iloc[0]} -> {data['timestamp'].iloc[-1]}")

# Optional manual outlier time windows
OUTLIER = [
    (pd.Timestamp("2025-08-28 08:00"), pd.Timestamp("2025-08-28 11:00")),
    (pd.Timestamp("2025-09-11 00:00"), pd.Timestamp("2025-09-11 10:00")),
    (pd.Timestamp("2025-09-12 05:00"), pd.Timestamp("2025-09-12 06:30")),
    (pd.Timestamp("2025-09-18 00:00"), pd.Timestamp("2025-09-18 08:20")),
]
if OUTLIER:
    before = len(data)
    for start, end in OUTLIER:
        data = data[(data["timestamp"] < start) | (data["timestamp"] > end)]
    removed = before - len(data)
    print(f"Removed {removed} rows based on OUTLIER")
    data = data.reset_index(drop=True)

# Calendar features
data["hour_of_day"] = data["timestamp"].dt.hour
data["day_of_year"] = data["timestamp"].dt.dayofyear

# Rolling medians to smooth noisy externals (6-step = 60 minutes)
smooth_cols = ["global_rad_wm2", "diffuse_rad_wm2", "direct_rad_wm2", "temp_c_ext", "rel_humidity_ext"]
for c in smooth_cols:
    if c in data.columns:
        data[f"{c}_med6"] = data[c].rolling(window=6, min_periods=1, center=True).median()

# External VPD + dew point
if {"temp_c_ext", "rel_humidity_ext"}.issubset(data.columns):
    data["vpd_ext_calc"] = calculate_vpd(data["temp_c_ext"], data["rel_humidity_ext"])
    data["dew_point_ext_calc"] = data["temp_c_ext"] - ((100 - data["rel_humidity_ext"]) / 5)

# Simple gradient proxy (fine to keep)
if {"dew_point_ext_calc", "temp_c_ext"}.issubset(data.columns):
    data["dewpoint_gradient"] = data["dew_point_ext_calc"] - data["temp_c_ext"]

# VPD rollups
if "vpd_ext_calc" in data.columns:
    data["vpd_ext_rolling_mean_30min"] = data["vpd_ext_calc"].rolling(window=3, min_periods=1).mean()
    data["vpd_ext_min_1h"] = data["vpd_ext_calc"].rolling(window=6, min_periods=1).min()

# Radiation integrals/peaks
if "global_rad_wm2" in data.columns:
    data["rad_integral_1h"] = data["global_rad_wm2"].rolling(window=6, min_periods=1).sum()
    data["rad_peak_intensity"] = data["global_rad_wm2"].rolling(window=6, min_periods=1).max()

# Extra sunrise dynamics
if "global_rad_wm2" in data.columns:
    data["rad_slope_10min"] = data["global_rad_wm2"].diff()
    data["rad_slope_30min"] = data["global_rad_wm2"].diff(3)
    data["rad_integral_3h"] = data["global_rad_wm2"].rolling(window=18, min_periods=1).sum()
    data["rad_peak_30min"] = data["global_rad_wm2"].rolling(window=3, min_periods=1).max()

    # Sunrise/sunset flags and timers to capture rapid RH shifts
    rad_day_threshold = 40
    data["is_day"] = (data["global_rad_wm2"] > rad_day_threshold).astype(int)

# VPD dynamics + memory
if "vpd_ext_calc" in data.columns:
    data["vpd_slope_30min"] = data["vpd_ext_calc"].diff(3)
    data["vpd_integral_1h"] = data["vpd_ext_calc"].rolling(window=6, min_periods=1).sum()
    data["vpd_integral_1h_lag_1h"] = data["vpd_integral_1h"].shift(6)



# Additional feature engineering
if {"rad_integral_1h", "temp_c_ext"}.issubset(data.columns):
    data["rad_efficiency"] = data["rad_integral_1h"] / (data["temp_c_ext"] + 273.15)

if "temp_c_ext" in data.columns:
    data["temp_ext_1h_mean"] = data["temp_c_ext"].rolling(window=6, min_periods=1).mean()
    data["temp_ext_3h_mean"] = data["temp_c_ext"].rolling(window=18, min_periods=1).mean()
    data["temp_ext_6h_mean"] = data["temp_c_ext"].rolling(window=36, min_periods=1).mean()

wind_col = "wind_speed_ms" if "wind_speed_ms" in data.columns else ("wind_ms_ext" if "wind_ms_ext" in data.columns else None)
if wind_col:
    data["wind_ext_1h_mean"] = data[wind_col].rolling(window=6, min_periods=1).mean()
    data["wind_energy"] = data[wind_col] ** 2

if "rel_humidity_ext" in data.columns:
    data["rh_ext_30min_mean"] = data["rel_humidity_ext"].rolling(window=3, min_periods=1).mean()
    data["rh_ext_2h_min"] = data["rel_humidity_ext"].rolling(window=12, min_periods=1).min()
    data["rh_ext_6h_mean"] = data["rel_humidity_ext"].rolling(window=36, min_periods=1).mean()

if "vpd_ext_calc" in data.columns:
    data["vpd_ext_lag_1h"] = data["vpd_ext_calc"].shift(6)
    data["vpd_ext_lag_3h"] = data["vpd_ext_calc"].shift(18)
    data["vpd_drop_flag"] = (data["vpd_ext_calc"].diff() < 0).astype(int)

if {"global_rad_wm2", "vpd_ext_calc"}.issubset(data.columns):
    data["rad_vpd_interaction"] = data["global_rad_wm2"] * data["vpd_ext_calc"]

if {"global_rad_wm2", "rel_humidity_ext"}.issubset(data.columns):
    data["rad_rh_interaction"] = data["global_rad_wm2"] * data["rel_humidity_ext"]

if "is_day" in data.columns:
    data["time_since_sunrise"] = (
        data.groupby((data["is_day"] != data["is_day"].shift()).cumsum()).cumcount()
    )

if {"temp_c_ext", "global_rad_wm2"}.issubset(data.columns):
    data["ventilation_proxy"] = (
        (data["temp_c_ext"] > 24) &
        (data["global_rad_wm2"] > 200)
    ).astype(int)

if {"rel_humidity_ext", "vpd_ext_calc"}.issubset(data.columns):
    data["humidification_proxy"] = (
        (data["rel_humidity_ext"] < 50) &
        (data["vpd_ext_calc"] > 1.2)
    ).astype(int)


data.head()

# Select model features for training
non_feature_cols = target_cols + ['timestamp', 'timestamp_dayfirst']
feature_cols = [c for c in data.columns if c not in non_feature_cols]
print(f'Feature count: {len(feature_cols)}')
print(feature_cols)


diffuse_rad_wm2          0.929145
ET0                      0.000000
internal_air_temp_c      0.000000
internal_rh_pct          0.000000
internal_radiation       0.000000
timestamp                0.000000
station_pressure_hpa     0.000000
rel_humidity_ext         0.000000
timestamp_dayfirst       0.000000
temp_c_ext               0.000000
temp_max_c_ext           0.000000
temp_ground_c_ext        0.000000
temp_min_c_ext           0.000000
gust_dir_deg             0.000000
wind_speed_ms            0.000000
wind_speed_max_1m_ms     0.000000
wind_dir_deg             0.000000
wind_speed_max_10m_ms    0.000000
gust_speed_ms            0.000000
vpd_ext                  0.000000
wind_dir_std_deg         0.000000
abs_humidity_ext         0.000000
dew_point_ext            0.000000
global_rad_wm2           0.000000
direct_rad_wm2           0.000000
dtype: float64

Rows after aligning, reindexing, and filling exogenous gaps: 16,682
Time span: 2025-05-29 01:00:00 -> 2025-09-21 23:00:00
Removed 141 rows based on OUTLIER
Feature count: 59
['internal_rh_pct', 'station_pressure_hpa', 'rel_humidity_ext', 'temp_c_ext', 'temp_max_c_ext', 'temp_min_c_ext', 'temp_ground_c_ext', 'wind_dir_deg', 'gust_dir_deg', 'wind_speed_ms', 'wind_speed_max_1m_ms', 'wind_speed_max_10m_ms', 'gust_speed_ms', 'wind_dir_std_deg', 'vpd_ext', 'abs_humidity_ext', 'dew_point_ext', 'diffuse_rad_wm2', 'global_rad_wm2', 'direct_rad_wm2', 'hour_of_day', 'day_of_year', 'global_rad_wm2_med6', 'diffuse_rad_wm2_med6', 'direct_rad_wm2_med6', 'temp_c_ext_med6', 'rel_humidity_ext_med6', 'vpd_ext_calc', 'dew_point_ext_calc', 'dewpoint_gradient', 'vpd_ext_rolling_mean_30min', 'vpd_ext_min_1h', 'rad_integral_1h', 'rad_peak_intensity', 'rad_slope_10min', 'rad_slope_30min', 'rad_integral_3h', 'rad_peak_30min', 'is_day', 'vpd_slope_30min', 'vpd_integral_1h', 'vpd_integral_1h_lag_1h', 'rad_efficie

In [21]:
from sklearn.base import clone

rh_estimator = XGBRegressor(
    n_estimators=1200,
    max_depth=7,
    learning_rate=0.025,
    min_child_weight=8,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=2.0,
    reg_alpha=0.3,
    gamma=0.0,
    random_state=42,
    n_jobs=-1,
    objective="reg:squarederror",
    tree_method="hist",
)

def walk_forward(df, feature_cols, target_cols, model,
                 train_window=TRAIN_WINDOW, test_window=TEST_WINDOW,
                 gap=GAP, max_train_rows=MAX_TRAIN_ROWS):
    metrics = []
    preds = []
    train_end = train_window
    test_end = train_end + test_window
    run = 0

    while test_end <= len(df):
        train_start_idx = max(0, train_end - max_train_rows) if max_train_rows else 0
        train = df.iloc[train_start_idx:train_end]
        test = df.iloc[train_end:test_end]

        metric = {
            "run": run,
            "train_start": train["timestamp"].iloc[0],
            "train_end": train["timestamp"].iloc[-1],
            "test_start": test["timestamp"].iloc[0],
            "test_end": test["timestamp"].iloc[-1],
            "train_rows": len(train),
            "test_rows": len(test),
        }

        preds_run = {}

        for t_col in target_cols:
            model_t = clone(model)
            model_t.fit(train[feature_cols], train[t_col])

            pred_test = model_t.predict(test[feature_cols])
            pred_test = np.clip(pred_test, a_min=0, a_max=None)
            preds_run[t_col] = pred_test
            metric[f"mae_{t_col}"] = mean_absolute_error(test[t_col], pred_test)
            denom = max(np.abs(test[t_col]).mean(), 1e-6)
            metric[f"mae_pct_{t_col}"] = (metric[f"mae_{t_col}"] / denom) * 100
            mse_test = mean_squared_error(test[t_col], pred_test)
            metric[f"rmse_{t_col}"] = np.sqrt(mse_test)
            metric[f"r2_{t_col}"] = r2_score(test[t_col], pred_test)

        metrics.append(metric)

        preds.append(
            pd.concat(
                [
                    test[["timestamp"] + target_cols].reset_index(drop=True),
                    pd.DataFrame({f"pred_{t}": preds_run[t] for t in target_cols}),
                ],
                axis=1,
            ).assign(run=run)
        )

        print(
            f"Run {run}: train {metric['train_rows']} rows ({metric['train_start']} -> {metric['train_end']}), "
            f"test {metric['test_rows']} rows ({metric['test_start']} -> {metric['test_end']})"
        )
        for t_col in target_cols:
            msg = (
                f"  {t_col}: MAE={metric[f'mae_{t_col}']:.4f}, "
                f"RMSE={metric[f'rmse_{t_col}']:.4f}, R2={metric[f'r2_{t_col}']:.4f}"
            )
            print(msg)

        run += 1
        train_end += test_window
        test_end = train_end + test_window

    if not metrics:
        raise ValueError("Not enough data for a single walk-forward run; check window sizes.")

    return pd.DataFrame(metrics), pd.concat(preds, ignore_index=True)


In [22]:
metrics_all, preds_all = walk_forward(
    data,
    feature_cols,
    target_cols,
    rh_estimator,
    max_train_rows=MAX_TRAIN_ROWS,
)

# Persist and display metrics
metrics_all.to_csv("metrics_multitarget.csv", index=False)
preds_all.to_csv("preds_multitarget.csv", index=False)

display(metrics_all)


# Collect predicted target columns for later RH modeling
pred_target_cols = [f"pred_{t}" for t in target_cols if f"pred_{t}" in preds_all.columns]
preds_targets_only = preds_all[["timestamp"] + pred_target_cols].copy()
preds_targets_only.to_csv("predicted_targets_for_rh.csv", index=False)
print("Saved target predictions to predicted_targets_for_rh.csv")


Run 0: train 1440 rows (2025-05-29 01:00:00 -> 2025-06-08 00:50:00), test 144 rows (2025-06-08 01:00:00 -> 2025-06-09 00:50:00)
  ET0: MAE=0.0186, RMSE=0.0380, R2=0.9655
  internal_air_temp_c: MAE=0.4690, RMSE=0.6315, R2=0.9744
  internal_radiation: MAE=27.0037, RMSE=61.8792, R2=0.9378
Run 1: train 1440 rows (2025-05-30 01:00:00 -> 2025-06-09 00:50:00), test 144 rows (2025-06-09 01:00:00 -> 2025-06-10 00:50:00)
  ET0: MAE=0.0148, RMSE=0.0229, R2=0.9876
  internal_air_temp_c: MAE=0.3143, RMSE=0.3849, R2=0.9902
  internal_radiation: MAE=23.5295, RMSE=38.2090, R2=0.9736
Run 2: train 1440 rows (2025-05-31 01:00:00 -> 2025-06-10 00:50:00), test 144 rows (2025-06-10 01:00:00 -> 2025-06-11 00:50:00)
  ET0: MAE=0.0156, RMSE=0.0264, R2=0.9841
  internal_air_temp_c: MAE=0.3908, RMSE=0.5021, R2=0.9850
  internal_radiation: MAE=27.8927, RMSE=48.8101, R2=0.9588
Run 3: train 1440 rows (2025-06-01 01:00:00 -> 2025-06-11 00:50:00), test 144 rows (2025-06-11 01:00:00 -> 2025-06-12 00:50:00)
  ET0: MAE=

Unnamed: 0,run,train_start,train_end,test_start,test_end,train_rows,test_rows,mae_ET0,mae_pct_ET0,rmse_ET0,r2_ET0,mae_internal_air_temp_c,mae_pct_internal_air_temp_c,rmse_internal_air_temp_c,r2_internal_air_temp_c,mae_internal_radiation,mae_pct_internal_radiation,rmse_internal_radiation,r2_internal_radiation
0,0,2025-05-29 01:00:00,2025-06-08 00:50:00,2025-06-08 01:00:00,2025-06-09 00:50:00,1440,144,0.018635,9.342600,0.038015,0.965452,0.468964,1.703240,0.631506,0.974395,27.003712,11.545837,61.879206,0.937842
1,1,2025-05-30 01:00:00,2025-06-09 00:50:00,2025-06-09 01:00:00,2025-06-10 00:50:00,1440,144,0.014838,7.709655,0.022904,0.987646,0.314256,1.142650,0.384918,0.990195,23.529493,10.796431,38.208971,0.973596
2,2,2025-05-31 01:00:00,2025-06-10 00:50:00,2025-06-10 01:00:00,2025-06-11 00:50:00,1440,144,0.015567,7.991463,0.026364,0.984132,0.390790,1.409703,0.502106,0.985008,27.892667,12.823004,48.810099,0.958832
3,3,2025-06-01 01:00:00,2025-06-11 00:50:00,2025-06-11 01:00:00,2025-06-12 00:50:00,1440,144,0.025422,13.767718,0.043902,0.954961,0.334638,1.206296,0.428728,0.990217,25.206272,11.912409,55.475533,0.950336
4,4,2025-06-02 01:00:00,2025-06-12 00:50:00,2025-06-12 01:00:00,2025-06-13 00:50:00,1440,144,0.023752,13.427032,0.038684,0.964570,0.356782,1.298688,0.500870,0.986276,36.197231,17.993133,59.530769,0.939164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,99,2025-09-05 06:00:00,2025-09-15 17:40:00,2025-09-15 17:50:00,2025-09-16 17:40:00,1440,144,0.031306,33.395068,0.064869,0.809852,0.403970,1.496789,0.544060,0.975790,38.800348,32.131158,82.076782,0.814060
100,100,2025-09-06 06:00:00,2025-09-16 17:40:00,2025-09-16 17:50:00,2025-09-17 17:40:00,1440,144,0.019458,19.964695,0.037480,0.943413,0.422249,1.549778,0.531024,0.977557,26.212030,20.930522,51.860905,0.933225
101,101,2025-09-07 06:00:00,2025-09-17 17:40:00,2025-09-17 17:50:00,2025-09-19 02:10:00,1440,144,0.030052,31.125152,0.068125,0.831656,1.165267,4.280649,1.380812,0.825266,38.426608,32.383317,84.906986,0.824682
102,102,2025-09-08 06:00:00,2025-09-19 02:10:00,2025-09-19 02:20:00,2025-09-20 02:10:00,1440,144,0.017892,19.287762,0.036315,0.945394,0.678083,2.481681,0.963499,0.927615,22.611382,19.434127,45.103766,0.946484


Saved target predictions to predicted_targets_for_rh.csv


In [26]:
# Per-target mean summary (keeps MAE, MAE%, RMSE, R2)
for t in target_cols:
    cols = [c for c in metrics_all.columns if any(k in c for k in [f"mae_{t}", f"mae_pct_{t}", f"rmse_{t}", f"r2_{t}"])]
    if cols:
        print(f"\nOverall for {t}:")
        display(metrics_all[cols].mean().to_frame("mean").T)




Overall for ET0:


Unnamed: 0,mae_ET0,mae_pct_ET0,rmse_ET0,r2_ET0
mean,0.019704,13.000702,0.03565,0.962096



Overall for internal_air_temp_c:


Unnamed: 0,mae_internal_air_temp_c,mae_pct_internal_air_temp_c,rmse_internal_air_temp_c,r2_internal_air_temp_c
mean,0.448716,1.56662,0.58665,0.970745



Overall for internal_radiation:


Unnamed: 0,mae_internal_radiation,mae_pct_internal_radiation,rmse_internal_radiation,r2_internal_radiation
mean,24.940352,13.298079,47.232915,0.956136


In [23]:
# Build dataset for internal RH using out-of-fold target predictions
rh_target = "internal_rh_pct"
pred_target_cols = [f"pred_{t}" for t in target_cols if f"pred_{t}" in preds_targets_only.columns]

# Merge predictions back to original data; drop rows without predictions or RH target
data_for_rh = data.merge(preds_targets_only, on="timestamp", how="inner")
data_for_rh = data_for_rh.dropna(subset=[rh_target] + pred_target_cols)

# Use existing exogenous features plus predicted targets (not the actual target columns)
exclude_cols = target_cols + [rh_target, "timestamp", "timestamp_dayfirst"]
base_rh_features = [c for c in data_for_rh.columns if c not in exclude_cols and not c.startswith("pred_")]
rh_feature_cols = base_rh_features + pred_target_cols
print(f"RH feature count: {len(rh_feature_cols)}")

metrics_rh, preds_rh = walk_forward(
    data_for_rh,
    rh_feature_cols,
    [rh_target],
    rh_estimator,
    max_train_rows=MAX_TRAIN_ROWS,
)
metrics_rh.to_csv("metrics_internal_rh.csv", index=False)
preds_rh.to_csv("preds_internal_rh.csv", index=False)

print("Saved RH metrics to metrics_internal_rh.csv and predictions to preds_internal_rh.csv")
display(metrics_rh.head())


RH feature count: 61
Run 0: train 1440 rows (2025-06-08 01:00:00 -> 2025-06-18 01:00:00), test 144 rows (2025-06-18 01:10:00 -> 2025-06-19 01:00:00)
  internal_rh_pct: MAE=1.9652, RMSE=2.6313, R2=0.9301
Run 1: train 1440 rows (2025-06-09 01:00:00 -> 2025-06-19 01:00:00), test 144 rows (2025-06-19 01:10:00 -> 2025-06-20 01:00:00)
  internal_rh_pct: MAE=2.7221, RMSE=3.7271, R2=0.7872
Run 2: train 1440 rows (2025-06-10 01:00:00 -> 2025-06-20 01:00:00), test 144 rows (2025-06-20 01:10:00 -> 2025-06-21 01:00:00)
  internal_rh_pct: MAE=2.8528, RMSE=5.0251, R2=0.7291
Run 3: train 1440 rows (2025-06-11 01:00:00 -> 2025-06-21 01:00:00), test 144 rows (2025-06-21 01:10:00 -> 2025-06-22 01:00:00)
  internal_rh_pct: MAE=2.3721, RMSE=3.8143, R2=0.8425
Run 4: train 1440 rows (2025-06-12 01:00:00 -> 2025-06-22 01:00:00), test 144 rows (2025-06-22 01:10:00 -> 2025-06-23 01:00:00)
  internal_rh_pct: MAE=1.7919, RMSE=2.3500, R2=0.9347
Run 5: train 1440 rows (2025-06-13 01:00:00 -> 2025-06-23 01:00:00), 

Unnamed: 0,run,train_start,train_end,test_start,test_end,train_rows,test_rows,mae_internal_rh_pct,mae_pct_internal_rh_pct,rmse_internal_rh_pct,r2_internal_rh_pct
0,0,2025-06-08 01:00:00,2025-06-18 01:00:00,2025-06-18 01:10:00,2025-06-19 01:00:00,1440,144,1.965179,2.71494,2.631341,0.930057
1,1,2025-06-09 01:00:00,2025-06-19 01:00:00,2025-06-19 01:10:00,2025-06-20 01:00:00,1440,144,2.722078,3.795934,3.727098,0.787219
2,2,2025-06-10 01:00:00,2025-06-20 01:00:00,2025-06-20 01:10:00,2025-06-21 01:00:00,1440,144,2.852785,3.854552,5.025092,0.729079
3,3,2025-06-11 01:00:00,2025-06-21 01:00:00,2025-06-21 01:10:00,2025-06-22 01:00:00,1440,144,2.372096,3.198473,3.814282,0.842541
4,4,2025-06-12 01:00:00,2025-06-22 01:00:00,2025-06-22 01:10:00,2025-06-23 01:00:00,1440,144,1.791897,2.395981,2.350035,0.934654


In [None]:
for t in rh_target:
    cols = [c for c in metrics_rh .columns if any(k in c for k in [f"mae_{t}", f"mae_pct_{t}", f"rmse_{t}", f"r2_{t}"])]
    if cols:
        print(f"\nOverall for {t}:")
        display(metrics_rh [cols].mean().to_frame("mean").T)



Overall for i:


Unnamed: 0,mae_internal_rh_pct,mae_pct_internal_rh_pct,rmse_internal_rh_pct,r2_internal_rh_pct
mean,2.912529,4.097978,3.784704,0.813283



Overall for p:


Unnamed: 0,mae_pct_internal_rh_pct
mean,4.097978
