# Greenhouse Air Sensor Prediction (HGB, weather-only)

Weather-only features (time sin/cos, lags/rolls, rh_x_temp). HistGradientBoostingRegressor per target with time-aware validation.

## Split strategy
- Chronological 80% train/validation, newest 20% test to avoid leakage.
- Forward-chaining TimeSeriesSplit inside training for hyperparameter selection.

In [1]:
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

DATA_DIR = Path(__file__).resolve().parent if "__file__" in globals() else Path().resolve()

In [2]:
def load_weather() -> pd.DataFrame:
    rad = pd.read_csv(DATA_DIR / "bet_dagan_radiation.csv")
    wx = pd.read_csv(DATA_DIR / "bet_dagan_weather.csv")

    rad = rad.rename(
        columns={
            "תחנה": "station_radiation",
            "תאריך ושעה (שעון קיץ)": "datetime",
            "קרינה מפוזרת (וואט/מ\"ר)": "diffuse_radiation_Wm2",
            "קרינה גלובלית (וואט/מ\"ר)": "global_radiation_Wm2",
            "קרינה ישירה (וואט/מ\"ר)": "direct_radiation_Wm2",
        }
    )
    wx = wx.rename(
        columns={
            "תחנה": "station_weather",
            "תאריך ושעה (שעון קיץ)": "datetime",
            "לחץ בגובה התחנה (הקטופסקל)": "station_pressure_hpa",
            "לחות יחסית (%)": "relative_humidity_pct",
            "טמפרטורה (C°)": "air_temp_C_weather",
            "טמפרטורת מקסימום (C°)": "air_temp_max_C_weather",
            "טמפרטורת מינימום (C°)": "air_temp_min_C_weather",
            "טמפרטורה ליד הקרקע (C°)": "ground_temp_C",
            "טמפרטורה לחה (C°)": "wet_temp_C",
            "כיוון הרוח (מעלות)": "wind_dir_deg",
            "כיוון המשב העליון (מעלות)": "gust_dir_deg",
            "מהירות רוח (מטר לשניה)": "wind_speed_ms",
            "מהירות רוח דקתית מקסימלית (מטר לשניה)": "wind_speed_1m_max_ms",
            "מהירות רוח 10 דקתית מקסימלית (מטר לשניה)": "wind_speed_10m_max_ms",
            "זמן סיום מהירות רוח 10 דקתית מקסימלית  (hhmm)": "wind_speed_10m_max_end_time",
            "מהירות המשב העליון (מטר לשניה)": "gust_speed_ms",
            "סטיית התקן של כיוון הרוח (מעלות)": "wind_dir_std_deg",
        }
    )

    rad["datetime"] = pd.to_datetime(rad["datetime"], dayfirst=True)
    wx["datetime"] = pd.to_datetime(wx["datetime"], dayfirst=True)
    rad = rad.replace("-", np.nan)
    wx = wx.replace("-", np.nan)

    def _coerce_numeric(df: pd.DataFrame, ignore: List[str]) -> pd.DataFrame:
        for col in df.columns:
            if col in ignore:
                continue
            df[col] = pd.to_numeric(df[col], errors="coerce")
        return df

    rad = _coerce_numeric(rad, ignore=["station_radiation", "datetime"])
    wx = _coerce_numeric(wx, ignore=["station_weather", "datetime", "wind_speed_10m_max_end_time"])
    merged = pd.merge(rad, wx, on="datetime", how="inner")
    return merged.sort_values("datetime")


def load_targets() -> pd.DataFrame:
    df = pd.read_csv(DATA_DIR / "Data Final OG.csv")
    df["timestamp"] = pd.to_datetime(df["datetime"] + " " + df["time"], dayfirst=False)
    return df.sort_values("timestamp")


def build_dataset() -> pd.DataFrame:
    weather = load_weather()
    targets = load_targets()

    merged = pd.merge_asof(
        targets,
        weather,
        left_on="timestamp",
        right_on="datetime",
        direction="nearest",
        tolerance=pd.Timedelta("30min"),
    )
    merged = merged.dropna(subset=["diffuse_radiation_Wm2", "global_radiation_Wm2", "direct_radiation_Wm2"])
    return merged


base_feature_cols = [
    "diffuse_radiation_Wm2",
    "global_radiation_Wm2",
    "direct_radiation_Wm2",
    "station_pressure_hpa",
    "relative_humidity_pct",
    "air_temp_C_weather",
    "air_temp_max_C_weather",
    "air_temp_min_C_weather",
    "ground_temp_C",
    "wet_temp_C",
    "wind_dir_deg",
    "gust_dir_deg",
    "wind_speed_ms",
    "wind_speed_1m_max_ms",
    "wind_speed_10m_max_ms",
    "gust_speed_ms",
    "wind_dir_std_deg",
]

target_cols = [
    "air_temp_C",
    "solar_radiation_Wm2",
    "drywet1_temp_C",
    "drywet2_temp_C",
]

In [3]:
# Build merged dataset and engineer features (weather-only)
df_all = build_dataset().sort_values("timestamp").reset_index(drop=True)

df_features = df_all[base_feature_cols].copy()

# Time features
timestamps = df_all["timestamp"]
df_features["hour"] = timestamps.dt.hour
df_features["dayofyear"] = timestamps.dt.dayofyear
df_features["hour_sin"] = np.sin(2 * np.pi * df_features["hour"] / 24)
df_features["hour_cos"] = np.cos(2 * np.pi * df_features["hour"] / 24)
df_features["doy_sin"] = np.sin(2 * np.pi * df_features["dayofyear"] / 365)
df_features["doy_cos"] = np.cos(2 * np.pi * df_features["dayofyear"] / 365)

# Interaction feature
df_features["rh_x_temp"] = df_features["relative_humidity_pct"] * df_features["air_temp_C_weather"]

# Lagged and rolling weather/radiation features (10-min resolution assumed)
lag_cols = [
    "diffuse_radiation_Wm2",
    "global_radiation_Wm2",
    "direct_radiation_Wm2",
    "air_temp_C_weather",
    "relative_humidity_pct",
    "ground_temp_C",
    "wet_temp_C",
]

for col in lag_cols:
    df_features[f"{col}_lag1"] = df_all[col].shift(1)
    df_features[f"{col}_lag3"] = df_all[col].shift(3)
    df_features[f"{col}_lag6"] = df_all[col].shift(6)
    df_features[f"{col}_lag18"] = df_all[col].shift(18)
    df_features[f"{col}_lag24"] = df_all[col].shift(24)
    df_features[f"{col}_roll3_mean"] = df_all[col].rolling(window=3, min_periods=1).mean().shift(1)
    df_features[f"{col}_roll6_mean"] = df_all[col].rolling(window=6, min_periods=1).mean().shift(1)
    df_features[f"{col}_roll12_mean"] = df_all[col].rolling(window=12, min_periods=1).mean().shift(1)
    df_features[f"{col}_roll24_mean"] = df_all[col].rolling(window=24, min_periods=1).mean().shift(1)

# Coerce features to numeric and drop entirely-missing columns
df_features = df_features.apply(pd.to_numeric, errors="coerce")
non_empty_features = [c for c in df_features.columns if not df_features[c].isna().all()]
df_features = df_features[non_empty_features].apply(lambda col: col.fillna(col.median()))

# Targets
df_targets = df_all[target_cols].apply(pd.to_numeric, errors="coerce").dropna()
aligned = df_features.join(df_targets, how="inner")
aligned["timestamp"] = df_all.loc[aligned.index, "timestamp"].values
aligned_rows = len(aligned)
aligned_rows

  wx = wx.replace("-", np.nan)


4082

In [4]:
# Chronological train/test split (80/20)
split_idx = int(len(aligned) * 0.8)
train_df = aligned.iloc[:split_idx]
test_df = aligned.iloc[split_idx:]

train_rows = len(train_df)
test_rows = len(test_df)

print(f"Rows aligned: {aligned_rows}")
print(f"Train rows: {train_rows}")
print(f"Test rows:  {test_rows}")

Rows aligned: 4082
Train rows: 3265
Test rows:  817


In [5]:
# Time-series CV for HistGradientBoosting (per target, weather-only features)
tscv = TimeSeriesSplit(n_splits=3)
param_grid = [
    {"max_depth": 6, "learning_rate": 0.05, "min_samples_leaf": 20, "max_bins": 255},
    {"max_depth": 8, "learning_rate": 0.05, "min_samples_leaf": 20, "max_bins": 255},
    {"max_depth": 10, "learning_rate": 0.03, "min_samples_leaf": 30, "max_bins": 255},
    {"max_depth": 12, "learning_rate": 0.03, "min_samples_leaf": 20, "max_bins": 255},
    {"max_depth": None, "learning_rate": 0.05, "min_samples_leaf": 30, "max_bins": 255}
]

best_params: Dict[str, Dict[str, Any]] = {}

for target in target_cols:
    X = train_df[non_empty_features].values
    y = train_df[target].values
    results = []
    for params in param_grid:
        rmses = []
        for tr_idx, val_idx in tscv.split(X):
            X_tr, X_val = X[tr_idx], X[val_idx]
            y_tr, y_val = y[tr_idx], y[val_idx]
            model = HistGradientBoostingRegressor(
                max_depth=params.get("max_depth"),
                learning_rate=params.get("learning_rate", 0.1),
                min_samples_leaf=params.get("min_samples_leaf", 20),
                max_bins=params.get("max_bins", 255),
                random_state=0,
            )
            model.fit(X_tr, y_tr)
            preds = model.predict(X_val)
            rmse = mean_squared_error(y_val, preds, squared=False)
            rmses.append(rmse)
        results.append((params, float(np.mean(rmses))))
    best_params[target], best_score = sorted(results, key=lambda x: x[1])[0]
    print(f"Target {target}: best params {best_params[target]} (CV RMSE={best_score:.3f})")



Target air_temp_C: best params {'max_depth': 8, 'learning_rate': 0.05, 'min_samples_leaf': 20, 'max_bins': 255} (CV RMSE=1.007)




Target solar_radiation_Wm2: best params {'max_depth': 6, 'learning_rate': 0.05, 'min_samples_leaf': 20, 'max_bins': 255} (CV RMSE=68.369)




Target drywet1_temp_C: best params {'max_depth': 8, 'learning_rate': 0.05, 'min_samples_leaf': 20, 'max_bins': 255} (CV RMSE=2.944)




Target drywet2_temp_C: best params {'max_depth': 6, 'learning_rate': 0.05, 'min_samples_leaf': 20, 'max_bins': 255} (CV RMSE=0.967)




In [6]:
# Train final HGB models and evaluate on held-out test set
models: Dict[str, Any] = {}
for target in target_cols:
    params = best_params[target]
    model = HistGradientBoostingRegressor(
        max_depth=params.get("max_depth"),
        learning_rate=params.get("learning_rate", 0.1),
        min_samples_leaf=params.get("min_samples_leaf", 20),
        max_bins=params.get("max_bins", 255),
        random_state=0,
    )
    model.fit(train_df[non_empty_features], train_df[target])
    models[target] = model

preds_combined = pd.DataFrame()
for target in target_cols:
    preds_combined[target] = models[target].predict(test_df[non_empty_features])

print("Test metrics (regression):")
for target in target_cols:
    true_vals = test_df[target]
    pred_vals = preds_combined[target]
    mae = mean_absolute_error(true_vals, pred_vals)
    mse = mean_squared_error(true_vals, pred_vals)
    rmse = mean_squared_error(true_vals, pred_vals, squared=False)
    r2 = r2_score(true_vals, pred_vals)
    print(f"- {target}")
    print(f"  MAE:  {mae:.3f}")
    print(f"  MSE:  {mse:.3f}")
    print(f"  RMSE: {rmse:.3f}")
    print(f"  R2:   {r2:.4f}")

print("Note: 'accuracy' is not defined for regression; R2 is reported instead.")

Test metrics (regression):
- air_temp_C
  MAE:  0.502
  MSE:  0.585
  RMSE: 0.765
  R2:   0.9724
- solar_radiation_Wm2
  MAE:  39.331
  MSE:  6358.351
  RMSE: 79.739
  R2:   0.8545
- drywet1_temp_C
  MAE:  1.339
  MSE:  4.083
  RMSE: 2.021
  R2:   0.7230
- drywet2_temp_C
  MAE:  0.531
  MSE:  0.480
  RMSE: 0.693
  R2:   0.9693
Note: 'accuracy' is not defined for regression; R2 is reported instead.




In [7]:
# Correlation between features and targets (aligned data)
corr = aligned[non_empty_features + target_cols].corr()
corr_targets = corr[target_cols]
print("Feature/target correlation (Pearson):")
corr_targets.sort_values(by=target_cols, key=lambda c: abs(c), ascending=False).head(15)

Feature/target correlation (Pearson):


Unnamed: 0,air_temp_C,solar_radiation_Wm2,drywet1_temp_C,drywet2_temp_C
air_temp_C,1.0,0.833349,0.770834,0.971864
drywet2_temp_C,0.971864,0.774219,0.801548,1.0
ground_temp_C,0.93598,0.762762,0.767099,0.931048
ground_temp_C_lag1,0.92566,0.741718,0.756176,0.919077
ground_temp_C_roll3_mean,0.914932,0.721224,0.745032,0.907013
air_temp_max_C_weather,0.904243,0.613531,0.719766,0.911299
air_temp_C_weather,0.901113,0.607494,0.715572,0.907821
ground_temp_C_lag3,0.897976,0.695743,0.72885,0.88887
air_temp_min_C_weather,0.896136,0.599035,0.709242,0.902157
ground_temp_C_roll6_mean,0.893337,0.685048,0.724613,0.884349


In [8]:
# Permutation importance per target on held-out test set
from sklearn.inspection import permutation_importance

importance_results = {}
for target in target_cols:
    model = models[target]
    result = permutation_importance(
        model,
        test_df[non_empty_features],
        test_df[target],
        n_repeats=10,
        random_state=0,
        n_jobs=-1,
    )
    importances = pd.Series(result.importances_mean, index=non_empty_features)
    top_importances = importances.sort_values(ascending=False).head(15)
    importance_results[target] = top_importances
    print(f"Top features for {target}:")
    print(top_importances)
    print("---")

importance_results

Top features for air_temp_C:
ground_temp_C                       0.244485
hour_cos                            0.096769
ground_temp_C_roll3_mean            0.036592
global_radiation_Wm2                0.009753
air_temp_max_C_weather              0.009366
hour                                0.007015
air_temp_C_weather_lag24            0.005003
ground_temp_C_lag1                  0.004728
hour_sin                            0.003747
air_temp_C_weather                  0.001934
air_temp_C_weather_lag18            0.001766
air_temp_min_C_weather              0.001727
direct_radiation_Wm2_roll24_mean    0.001580
global_radiation_Wm2_roll3_mean     0.001334
station_pressure_hpa                0.001222
dtype: float64
---
Top features for solar_radiation_Wm2:
hour_cos                             1.017636
hour                                 0.036411
global_radiation_Wm2                 0.021468
air_temp_C_weather_lag24             0.010308
global_radiation_Wm2_roll24_mean     0.010077
ground_te

{'air_temp_C': ground_temp_C                       0.244485
 hour_cos                            0.096769
 ground_temp_C_roll3_mean            0.036592
 global_radiation_Wm2                0.009753
 air_temp_max_C_weather              0.009366
 hour                                0.007015
 air_temp_C_weather_lag24            0.005003
 ground_temp_C_lag1                  0.004728
 hour_sin                            0.003747
 air_temp_C_weather                  0.001934
 air_temp_C_weather_lag18            0.001766
 air_temp_min_C_weather              0.001727
 direct_radiation_Wm2_roll24_mean    0.001580
 global_radiation_Wm2_roll3_mean     0.001334
 station_pressure_hpa                0.001222
 dtype: float64,
 'solar_radiation_Wm2': hour_cos                             1.017636
 hour                                 0.036411
 global_radiation_Wm2                 0.021468
 air_temp_C_weather_lag24             0.010308
 global_radiation_Wm2_roll24_mean     0.010077
 ground_temp_C       

In [9]:
# Export test-set predictions with actuals
test_output = test_df[["timestamp"] + target_cols].copy()
for target in target_cols:
    test_output[f"pred_{target}"] = preds_combined[target].values

test_path = DATA_DIR / "test_predictions.csv"
test_output.to_csv(test_path, index=False)
print(f"Wrote test predictions to {test_path}")
test_output.head()

Wrote test predictions to C:\Users\edene\OneDrive\שולחן העבודה\final_data_file\test_predictions.csv


Unnamed: 0,timestamp,air_temp_C,solar_radiation_Wm2,drywet1_temp_C,drywet2_temp_C,pred_air_temp_C,pred_solar_radiation_Wm2,pred_drywet1_temp_C,pred_drywet2_temp_C
3489,2025-09-14 02:10:00,25.47,0.01,20.76,21.34,25.916986,1.723678,21.229247,21.72104
3490,2025-09-14 02:20:00,25.37,0.02,20.58,21.17,25.852381,1.723678,21.214955,21.644068
3491,2025-09-14 02:30:00,25.28,0.01,20.51,21.11,25.690148,1.723678,21.010211,21.530037
3492,2025-09-14 02:40:00,25.2,0.02,20.59,21.13,25.511477,1.723678,20.873285,21.278505
3493,2025-09-14 02:50:00,25.14,0.003,20.62,21.1,25.457043,1.723678,20.866744,21.522203


In [10]:
# Optional: preview predictions on all rows (weather-only features)
full_preds = pd.DataFrame()
for target in target_cols:
    full_preds[target] = models[target].predict(df_features[non_empty_features])
full_preds.insert(0, "timestamp", df_all["timestamp"].reset_index(drop=True))
full_preds.head()

Unnamed: 0,timestamp,air_temp_C,solar_radiation_Wm2,drywet1_temp_C,drywet2_temp_C
0,2025-08-10 12:30:00,42.471275,633.538617,28.796438,36.335879
1,2025-08-10 12:40:00,44.685849,752.561784,29.724917,37.244251
2,2025-08-10 12:50:00,44.692912,765.683238,29.724917,37.244251
3,2025-08-10 13:00:00,45.486258,794.552745,29.754506,37.85679
4,2025-08-10 13:10:00,45.483133,791.597327,29.661941,37.972264


In [11]:
# Predict sensors for the full Bet Dagan weather/radiation range and export
weather_full = load_weather().sort_values("datetime").reset_index(drop=True)
wf_features = weather_full[base_feature_cols].copy()

# Time features from weather timestamps
wf_features["hour"] = weather_full["datetime"].dt.hour
wf_features["dayofyear"] = weather_full["datetime"].dt.dayofyear
wf_features["hour_sin"] = np.sin(2 * np.pi * wf_features["hour"] / 24)
wf_features["hour_cos"] = np.cos(2 * np.pi * wf_features["hour"] / 24)
wf_features["doy_sin"] = np.sin(2 * np.pi * wf_features["dayofyear"] / 365)
wf_features["doy_cos"] = np.cos(2 * np.pi * wf_features["dayofyear"] / 365)

# Interaction
wf_features["rh_x_temp"] = wf_features["relative_humidity_pct"] * wf_features["air_temp_C_weather"]

lag_cols = [
    "diffuse_radiation_Wm2",
    "global_radiation_Wm2",
    "direct_radiation_Wm2",
    "air_temp_C_weather",
    "relative_humidity_pct",
    "ground_temp_C",
    "wet_temp_C",
]
for col in lag_cols:
    wf_features[f"{col}_lag1"] = weather_full[col].shift(1)
    wf_features[f"{col}_lag3"] = weather_full[col].shift(3)
    wf_features[f"{col}_lag6"] = weather_full[col].shift(6)
    wf_features[f"{col}_lag18"] = weather_full[col].shift(18)
    wf_features[f"{col}_lag24"] = weather_full[col].shift(24)
    wf_features[f"{col}_roll3_mean"] = weather_full[col].rolling(window=3, min_periods=1).mean().shift(1)
    wf_features[f"{col}_roll6_mean"] = weather_full[col].rolling(window=6, min_periods=1).mean().shift(1)
    wf_features[f"{col}_roll12_mean"] = weather_full[col].rolling(window=12, min_periods=1).mean().shift(1)
    wf_features[f"{col}_roll24_mean"] = weather_full[col].rolling(window=24, min_periods=1).mean().shift(1)

# Align to training feature set and impute using training medians
wf_features = wf_features.apply(pd.to_numeric, errors="coerce")
wf_features = wf_features.reindex(columns=non_empty_features)
train_medians = train_df[non_empty_features].median()
wf_features = wf_features.fillna(train_medians)

all_preds = pd.DataFrame()
for target in target_cols:
    all_preds[target] = models[target].predict(wf_features)
all_preds.insert(0, "timestamp", weather_full["datetime"].reset_index(drop=True))

full_path = DATA_DIR / "full_weather_range_predictions.csv"
all_preds.to_csv(full_path, index=False)
print(f"Wrote full-range weather predictions to {full_path}")
all_preds.head()

  wx = wx.replace("-", np.nan)


Wrote full-range weather predictions to C:\Users\edene\OneDrive\שולחן העבודה\final_data_file\full_weather_range_predictions.csv


Unnamed: 0,timestamp,air_temp_C,solar_radiation_Wm2,drywet1_temp_C,drywet2_temp_C
0,2025-05-29 00:00:00,29.390614,3.279148,22.991831,22.694422
1,2025-05-29 00:10:00,25.534784,3.279148,21.601336,21.93935
2,2025-05-29 00:20:00,25.437118,3.279148,21.601336,21.93935
3,2025-05-29 00:30:00,25.431254,4.167241,21.510558,21.62341
4,2025-05-29 00:40:00,25.149566,3.279148,21.413484,21.228929
