In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# CHAMAU
# Load the Chamau 2014-2024 lag-feature table and the daily table, then split
# each of them by the "Parcel" column into parcels A and B.

chamau_lag = pd.read_csv("../datasets/Chamau_2014-2024_clean_newlag.csv")
chamau_daily = pd.read_csv("../datasets/Chamau_Daily_2014-2024_newlag.csv")

# .copy() detaches each parcel subset from its parent frame, so adding or
# modifying columns later cannot trigger SettingWithCopyWarning.
chamau_A = chamau_lag[chamau_lag["Parcel"] == "A"].copy()
chamau_B = chamau_lag[chamau_lag["Parcel"] == "B"].copy()

# Same treatment for the daily table (previously these were un-copied slices,
# inconsistent with the two subsets above).
chamau_daily_A = chamau_daily[chamau_daily["Parcel"] == "A"].copy()
chamau_daily_B = chamau_daily[chamau_daily["Parcel"] == "B"].copy()

In [3]:
# AESCHI
# Load the Aeschi 2019-20 tables: the lag-feature table and the daily table
# (roles inferred from the file names only).

aeschi_lag = pd.read_csv("../datasets/Aeschi_2019-20_clean_newlag.csv")
aeschi_daily = pd.read_csv("../datasets/Aeschi_Daily_2019-20_newlag.csv")

In [4]:
# OENSINGEN
# Two separate periods are loaded (2018-19 and 2021-23); each gets its own
# lag-feature table and daily table, suffixed _1 and _2 respectively.

oensingen_lag_1 = pd.read_csv("../datasets/Oensingen_2018-19_clean_newlag.csv")
oensingen_daily_1 = pd.read_csv("../datasets/Oensingen_Daily_2018-19_clean_newlag.csv")

oensingen_lag_2 = pd.read_csv("../datasets/Oensingen_2021-23_clean_newlag.csv")
oensingen_daily_2 = pd.read_csv("../datasets/Oensingen_Daily_2021-23_clean_newlag.csv")

In [5]:
# TANIKON
# Load the Tanikon 2023-25 lag-feature table and the daily table.
# NOTE(review): the daily file name below has no "_newlag" suffix, unlike the
# other daily files read in this notebook — confirm this is the intended file.

tanikon_lag = pd.read_csv("../datasets/Tanikon_2023-25_clean_newlag.csv")
tanikon_daily = pd.read_csv("../datasets/Tanikon_Daily_2023-25_clean.csv")

In [6]:
# FOREL
# Load the Forel 2024-25 tables: the lag-feature table and the daily table
# (roles inferred from the file names only).

forel_lag = pd.read_csv("../datasets/Forel_2024-25_clean_newlag.csv")
forel_daily = pd.read_csv("../datasets/Forel_Daily_2024-25_clean_newlag.csv")

In [7]:

# Display name -> lag-feature DataFrame for every site loaded above.
# "Chamau" is the full table; "Chamau A" / "Chamau B" are its per-parcel subsets,
# so the Chamau rows appear in more than one entry.
datasets = {
    "Chamau": chamau_lag,
    "Chamau A": chamau_A,
    "Chamau B": chamau_B,
    "Aeschi": aeschi_lag,
    "Oensingen 1": oensingen_lag_1,
    "Oensingen 2": oensingen_lag_2,
    "Tanikon": tanikon_lag,
    "Forel" : forel_lag
}

In [10]:
# for name, df in datasets.items():
#     print(f"\n{name} — {len(df.columns)} columns:")
#     print(df.columns.tolist())

In [6]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from scipy.stats import pearsonr

def train_rf_timeseries_simple(
    df, predictors, target,
    date_col="Date", test_ratio=0.15,
    pca=False, pca_components=0.95,
    random_state=42
):
    """
    Fast baseline RandomForest (no hyperparameter search).
    Uses a chronological Train/Test split only (no validation split).

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain `target` and `date_col`.
    predictors : list of str
        Candidate feature columns; names missing from `df` are silently ignored.
    target : str
        Name of the column to predict.
    date_col : str, default "Date"
        Column used to sort the rows before splitting.
    test_ratio : float, default 0.15
        Fraction of the (NaN-free, sorted) rows placed at the END of the
        series and used as the test set.
    pca : bool, default False
        If True, a PCA step is fitted on the training features before the forest.
    pca_components : default 0.95
        Passed to `PCA(n_components=...)`.
    random_state : int, default 42
        Seed forwarded to the random forest (and to PCA when enabled).

    Returns
    -------
    dict
        Keys: "r2", "pearson_r", "best_params", "n_train", "n_test", "pca",
        "n_components", "y_test", "y_pred", "test_dates", "model".

    Raises
    ------
    ValueError
        If none of `predictors` exist in `df`, or if the split leaves fewer
        than two test rows or no training rows.
    """

    # --- Keep only available predictors and drop missing ---
    available_predictors = [p for p in predictors if p in df.columns]
    if not available_predictors:
        raise ValueError("None of the requested predictors are columns of df.")

    df = df.dropna(subset=available_predictors + [target]).sort_values(date_col)

    # --- Train / Test split (chronological) ---
    n = len(df)
    n_test = int(n * test_ratio)

    # Fail early with a clear message instead of deep inside sklearn/scipy:
    # Pearson's r needs at least two test points, and fitting needs training rows.
    if n_test < 2 or n_test >= n:
        raise ValueError(
            f"Cannot split {n} usable rows with test_ratio={test_ratio}: "
            f"got {n_test} test rows and {n - n_test} train rows."
        )

    train = df.iloc[:n - n_test]
    test  = df.iloc[n - n_test:]

    X_train = train[available_predictors].to_numpy()
    y_train = train[target].to_numpy()
    X_test  = test[available_predictors].to_numpy()
    y_test  = test[target].to_numpy()

    # --- Model (fixed hyperparameters) ---
    rf = RandomForestRegressor(
        n_estimators=460,
        max_depth=20,
        min_samples_split=3,
        min_samples_leaf=8,
        max_features=0.35,
        random_state=random_state,  # was hard-coded to 42, ignoring the argument
        n_jobs=-1
    )

    steps = []
    if pca:
        steps.append(
            ("pca", PCA(n_components=pca_components, random_state=random_state))
        )
    steps.append(("rf", rf))

    model = Pipeline(steps)

    # --- Fit on Train only ---
    model.fit(X_train, y_train)

    # --- Predict + Score on Test ---
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    r, _ = pearsonr(y_test, y_pred)

    # Ensure dates remain datetime (avoid the 1970 issue)
    test_dates = pd.to_datetime(test[date_col])

    return {
        "r2": r2,
        "pearson_r": r,
        # Fixed parameters of the forest (no search is performed).
        "best_params": rf.get_params(),
        "n_train": len(train),
        "n_test": len(test),
        "pca": pca,
        "n_components": (
            model.named_steps["pca"].n_components_ if pca else None
        ),
        "y_test": y_test,
        "y_pred": y_pred,
        "test_dates": test_dates,
        "model": model,
    }



In [7]:
# Family name -> list of keywords. The predictor builders in this notebook keep
# a column when ANY keyword of the family is a substring of the column name.
# NOTE(review): with any-of matching, "roll_mean_only" and "roll_sum_only" both
# select every column containing "roll" (plus any column containing "mean" /
# "sum") — confirm this is intended rather than "roll" AND "mean"/"sum".
temporal_structure_families = {
    "lags_only": ["_lag"],
    "roll_mean_only": ["roll", "mean"],
    "roll_sum_only": ["roll", "sum"],
    "lags_plus_roll": ["_lag", "roll"],
    "full_temporal": ["_lag", "roll", "DaysSince_", "expHL"],  
    # includes: lags, roll windows, days-since, fertilizer decay
}

# Window name -> lag numbers. A column whose name contains "_lag" is kept only
# when it matches "lag<number>" for one of the listed numbers
# (presumably days, judging by the "d" in the key names — TODO confirm).
lag_window_options = {
    "1d": [1],
    "3d": [3],
    "5d": [5],
    "7d": [7],
    "1_3_5": [1,3,5],
    "1_3_5_7": [1,3,5,7],
    "3_5_7": [3,5,7],
}

# Decay mode -> fertilizer-decay column names added to the predictor set when
# they exist in the DataFrame ("expHL<k>d" presumably means an exponential
# decay with a k-day half-life — TODO confirm).
decay_modes = {
    "none": [],
    "HL3": ["Fertilizer_N_kg_ha_daily_expHL3d"],
    "HL7": ["Fertilizer_N_kg_ha_daily_expHL7d"],
    "HL14": ["Fertilizer_N_kg_ha_daily_expHL14d"],
    "all": [
        "Fertilizer_N_kg_ha_daily_expHL3d",
        "Fertilizer_N_kg_ha_daily_expHL7d",
        "Fertilizer_N_kg_ha_daily_expHL14d",
    ],
}

# NOTE(review): not referenced by any of the cells visible in this notebook
# section — confirm whether it is still needed.
instantaneous_modes = {
    "instantaneous_only": "no_temporal",
    "instantaneous_plus_temporal": "all",
}


In [8]:
def build_temporal_predictors(
    df, all_predictors,
    temporal_families,
    lag_days,
    decay_vars,
    days_since_vars,
    include_instantaneous=True
):
    """Assemble the predictor list for one temporal configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Only `df.columns` is consulted; names absent from it are dropped.
    all_predictors : iterable of str
        Candidate column names.
    temporal_families : iterable of str
        Keywords; a candidate is considered when ANY keyword is a substring
        of its name.
    lag_days : iterable
        Lag numbers to keep. A candidate containing "_lag" is selected only if
        its name contains "lag<d>" for one of these values, with `<d>` matched
        as a whole number (lag1 does not match lag10).
    decay_vars : iterable of str
        Fertilizer-decay columns to add when present in `df`.
    days_since_vars : iterable of str
        Days-since columns to add when present in `df`.
    include_instantaneous : bool, default True
        Also add every candidate whose name contains none of "_lag", "roll",
        "DaysSince", "expHL".

    Returns
    -------
    list of str
        Sorted, de-duplicated column names that exist in `df`.
    """
    import re  # local import keeps this cell self-contained

    # One pattern per requested lag; (?!\d) stops "lag1" from matching "lag10".
    # A plain substring test would leak longer lags into short lag windows.
    lag_patterns = [
        re.compile(rf"lag{re.escape(str(d))}(?!\d)") for d in lag_days
    ]

    def _lag_requested(name):
        return any(pat.search(name) for pat in lag_patterns)

    selected = []

    # --- temporal families (lags, rolls, decay) ---
    for p in all_predictors:
        if any(keyword in p for keyword in temporal_families):
            # Lag columns are additionally filtered by the lag window.
            if "_lag" in p:
                if _lag_requested(p):
                    selected.append(p)
            else:
                selected.append(p)

    # --- fertilizer decay ---
    selected.extend(dvar for dvar in decay_vars if dvar in df.columns)

    # --- days-since ---
    selected.extend(col for col in days_since_vars if col in df.columns)

    # --- instantaneous variables ---
    if include_instantaneous:
        temporal_markers = ("_lag", "roll", "DaysSince", "expHL")
        selected.extend(
            c for c in all_predictors
            if not any(marker in c for marker in temporal_markers)
        )

    # --- keep vars that are present in df ---
    return sorted({c for c in selected if c in df.columns})


In [9]:
def is_temporal_family(fam_kw):
    """Report whether `fam_kw` contains at least one temporal marker.

    Each of "_lag", "roll", "DaysSince_" and "expHL" is tested with the `in`
    operator against `fam_kw`; the first hit yields True, no hit yields False.
    """
    for marker in ("_lag", "roll", "DaysSince_", "expHL"):
        if marker in fam_kw:
            return True
    return False


In [None]:
# No instantaneous variables (temporal predictors only)

# ============================================================
#   CUBE SEARCH: families × lag windows × decay modes
#   (5 × 7 × 5 = 175 model configurations)
# ============================================================

# Run configuration: which table is searched and which column is predicted.
dataset_name = "Chamau"
target = "N2O_Flux_ln"
df = chamau_lag.copy()  # private copy; the loaded frame stays untouched

# Candidate predictors: every column that is neither an identifier/target
# column nor carries a ".1" / ".2" / ".3" suffix.
all_predictors = [
    c for c in df.columns
    if c not in {"Timestamp", "Date", "N2O_Flux", "N2O_Flux_ln", "Parcel"}
    and not c.endswith((".1", ".2", ".3"))
]

# Size of the full grid: families × lag windows × decay modes.
total_runs = (
    len(temporal_structure_families)
    * len(lag_window_options)
    * len(decay_modes)
)

print(f"Total theoretical combinations: "
      f"{len(temporal_structure_families)} × {len(lag_window_options)} × {len(decay_modes)} "
      f"= {total_runs}")

# Accumulators filled by the grid-search loop further down in this cell.
run_idx = 0
results = []


# ============================================================
# Helper: Build predictors based on family × lag window × decay
# ============================================================

def build_predictors(df, all_predictors, fam_keywords, lag_days, decay_vars):
    """Build the temporal-only predictor list for one grid configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Only `df.columns` is consulted (to validate `decay_vars`).
    all_predictors : iterable of str
        Candidate column names.
    fam_keywords : iterable of str
        A candidate is considered when ANY keyword is a substring of its name.
    lag_days : iterable
        Lag numbers to keep. Candidates containing "_lag" are selected only if
        their name contains "lag<d>" for one of these values, with `<d>`
        matched as a whole number (lag1 does not match lag10).
    decay_vars : iterable of str
        Decay columns appended when they exist in `df`.

    Returns
    -------
    list of str
        Sorted, de-duplicated predictor names.
    """
    import re  # local import keeps this cell self-contained

    # (?!\d) prevents "lag1" from also matching "lag10", "lag14", ...;
    # a plain substring test would leak longer lags into short lag windows.
    lag_patterns = [
        re.compile(rf"lag{re.escape(str(d))}(?!\d)") for d in lag_days
    ]

    selected = []

    # 1) Temporal families ("lags_only", "roll_mean_only", etc.)
    for col in all_predictors:
        if not any(kw in col for kw in fam_keywords):
            continue

        # Lag columns are additionally restricted to the requested lag days.
        if "_lag" in col:
            if any(pat.search(col) for pat in lag_patterns):
                selected.append(col)
        else:
            selected.append(col)

    # 2) Add decay vars that actually exist in the frame
    selected.extend(dv for dv in decay_vars if dv in df.columns)

    return sorted(set(selected))


# ============================================================
# MAIN GRID SEARCH
# ============================================================

# Iterate over every family × lag window × decay mode combination, train one
# fixed-hyperparameter forest per combination and record its test scores.
for fam_name, fam_kw in temporal_structure_families.items():
    for lag_name, lag_days in lag_window_options.items():
        for decay_name, decay_vars in decay_modes.items():

            run_idx += 1
            print(f"[{run_idx}/{total_runs}] fam={fam_name} | lag={lag_name} | decay={decay_name}")

            predictors = build_predictors(
                df=df,
                all_predictors=all_predictors,
                fam_keywords=fam_kw,
                lag_days=lag_days,
                decay_vars=decay_vars
            )

            # skip weak predictor sets (fewer than 10 columns)
            if len(predictors) < 10:
                continue

            # skip configurations that leave fewer than 200 complete rows
            df_clean = df.dropna(subset=predictors + [target])
            if len(df_clean) < 200:
                continue

            res = train_rf_timeseries_simple(
                df_clean, predictors, target,
                date_col="Timestamp"
            )

            results.append({
                "dataset": dataset_name,
                "struct_family": fam_name,
                "lag_window": lag_name,
                "decay": decay_name,
                "r2": res["r2"],
                "r": res["pearson_r"],
                "n_pred": len(predictors),
            })


print("\nFinished grid search!")

# ============================================================
# RESULTS SUMMARY
# ============================================================

results_df = pd.DataFrame(results)

# If every configuration was skipped, the frame has no "r2" column and
# sort_values("r2") would raise KeyError — report that case instead.
if results_df.empty:
    print("No configuration passed the predictor-count / row-count filters.")
else:
    display(results_df.sort_values("r2", ascending=False).head(15))


Total theoretical combinations: 5 × 7 × 5 = 175
[1/175] fam=lags_only | lag=1d | decay=none


In [None]:
# With instantaneous variables (always added to the temporal predictors)

# ============================================================
#   CUBE SEARCH: families × lag windows × decay modes
#   (5 × 7 × 5 = 175 model combinations)
# ============================================================

# Run configuration: which table is searched and which column is predicted.
dataset_name = "Chamau"
target = "N2O_Flux_ln"
df = chamau_lag.copy()  # private copy; the loaded frame stays untouched

# ------------------------------------------------------------
# Candidate predictors: every column that is neither an
# identifier/target column nor carries a ".1"/".2"/".3" suffix
# ------------------------------------------------------------
all_predictors = [
    c for c in df.columns
    if c not in {"Timestamp", "Date", "N2O_Flux", "N2O_Flux_ln", "Parcel"}
    and not c.endswith((".1", ".2", ".3"))
]

# Size of the full grid: families × lag windows × decay modes.
total_runs = len(temporal_structure_families) * len(lag_window_options) * len(decay_modes)
print(f"Total theoretical combinations: {total_runs}")

# Accumulators filled by the grid-search loop further down in this cell.
run_idx = 0
results = []


# ============================================================
# Helper: Build predictors for one model configuration
# ============================================================

def build_predictors(df, all_predictors, fam_keywords, lag_days, decay_vars):
    """Build the predictor list (temporal + base variables) for one configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Only `df.columns` is consulted (to validate `decay_vars`).
    all_predictors : iterable of str
        Candidate column names.
    fam_keywords : iterable of str
        A candidate is considered temporal for this run when ANY keyword is a
        substring of its name.
    lag_days : iterable
        Lag numbers to keep. Candidates containing "_lag" are selected only if
        their name contains "lag<d>" for one of these values, with `<d>`
        matched as a whole number (lag1 does not match lag10).
    decay_vars : iterable of str
        Decay columns appended when they exist in `df`.

    Returns
    -------
    list of str
        Sorted, de-duplicated predictor names. Every candidate whose name
        contains none of "_lag", "roll", "expHL" is always included.
    """
    import re  # local import keeps this cell self-contained

    # (?!\d) prevents "lag1" from also matching "lag10", "lag14", ...;
    # a plain substring test would leak longer lags into short lag windows.
    lag_patterns = [
        re.compile(rf"lag{re.escape(str(d))}(?!\d)") for d in lag_days
    ]

    selected = []

    # 1) TEMPORAL FEATURES (controlled by cube search)
    for col in all_predictors:

        # Skip variables that match none of this family's keywords
        if not any(kw in col for kw in fam_keywords):
            continue

        if "_lag" in col:
            # Only keep lags for the chosen windows (e.g. lag1, lag3, lag5)
            if any(pat.search(col) for pat in lag_patterns):
                selected.append(col)
        else:
            # Rolls, DaysSince_, etc.
            selected.append(col)

    # 2) FERTILIZER DECAY VARIABLES
    selected.extend(dv for dv in decay_vars if dv in df.columns)

    # 3) INSTANTANEOUS BASE VARIABLES (ALWAYS INCLUDED)
    # NOTE(review): "DaysSince" columns are not excluded here, so they are part
    # of every configuration regardless of the family — confirm this is intended.
    non_base_markers = ("_lag", "roll", "expHL")
    selected.extend(
        c for c in all_predictors
        if not any(marker in c for marker in non_base_markers)
    )

    return sorted(set(selected))


# ============================================================
# MAIN GRID SEARCH LOOP
# ============================================================

# Iterate over every family × lag window × decay mode combination, train one
# fixed-hyperparameter forest per combination and record its test scores.
for fam_name, fam_kw in temporal_structure_families.items():
    for lag_name, lag_days in lag_window_options.items():
        for decay_name, decay_vars in decay_modes.items():

            run_idx += 1
            print(f"[{run_idx}/{total_runs}] fam={fam_name} | lag={lag_name} | decay={decay_name}")

            # ---- Build predictor set for this model ----
            predictors = build_predictors(
                df=df,
                all_predictors=all_predictors,
                fam_keywords=fam_kw,
                lag_days=lag_days,
                decay_vars=decay_vars
            )

            # Skip tiny predictor sets (fewer than 10 columns)
            if len(predictors) < 10:
                continue

            # Drop NA rows for this model's predictors; skip if fewer than
            # 200 complete rows remain
            df_clean = df.dropna(subset=predictors + [target])
            if len(df_clean) < 200:
                continue

            # ---- Train RF model ----
            res = train_rf_timeseries_simple(
                df_clean, predictors, target,
                date_col="Timestamp"
            )

            # ---- Store results ----
            results.append({
                "dataset": dataset_name,
                "struct_family": fam_name,
                "lag_window": lag_name,
                "decay": decay_name,
                "r2": res["r2"],
                "r": res["pearson_r"],
                "n_pred": len(predictors),
            })

print("\nFinished grid search!")


# ============================================================
# RESULTS SUMMARY
# ============================================================

results_df = pd.DataFrame(results)

# If every configuration was skipped, the frame has no "r2" column and
# sort_values("r2") would raise KeyError — report that case instead.
if results_df.empty:
    print("No configuration passed the predictor-count / row-count filters.")
else:
    display(results_df.sort_values("r2", ascending=False).head(15))
