In [1]:
import os
import re
import sys
import subprocess
from typing import Tuple, List, Dict

def ensure_openpyxl():
    """Make sure ``openpyxl`` can be imported, installing it with pip if it is missing.

    Only a genuine ``ImportError`` (which includes ``ModuleNotFoundError``) triggers
    the installation. Any other exception raised while importing the package is
    propagated unchanged, because reinstalling would not address it and would hide
    the real failure.

    Raises:
        subprocess.CalledProcessError: if the ``pip install`` command exits non-zero.
        ImportError: if the package still cannot be imported after installation.
    """
    try:
        import openpyxl  # noqa: F401
    except ImportError:
        # Install into the interpreter that is running this kernel, not whatever
        # "pip" happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "openpyxl"])
        import importlib

        # Drop cached directory listings so the freshly installed package is visible.
        importlib.invalidate_caches()
        import openpyxl  # noqa: F401
ensure_openpyxl()

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, balanced_accuracy_score,
    mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
)

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K

In [17]:
# Workbook to analyse. The default is the original author's local file; set the
# KFOLD_EXCEL_PATH environment variable to point the notebook at a copy elsewhere
# without editing this cell.
EXCEL_PATH = os.environ.get(
    "KFOLD_EXCEL_PATH",
    "C:/Users/chinw/Downloads/K-FOLD_Chinesetallow_AceticAcid.xlsx",
)
SHEET_NAME = "Sheet1"
# First Fertility cut-off tried when turning window-end Fertility into binary labels.
FERTILITY_DEFAULT_THRESHOLD = 470.0
# Number of consecutive readings in each LSTM input window.
TIMESTEPS = 12
# Upper bound on the number of cross-validation folds (fewer are used if data is scarce).
REQUESTED_FOLDS = 5
# Quantiles tried, in this order, when the fixed threshold and the median both fail
# to produce two classes.
FALLBACK_QUANTILES = [0.5, 0.6, 0.4, 0.7, 0.3]  # median first, then neighbors
# Training budget per fold (early stopping may end training sooner).
EPOCHS = 50
BATCH_SIZE = 16
# Seed for NumPy, TensorFlow and the fold shuffling.
SEED = 47

In [19]:
def load_and_tidy(path: str, sheet: str) -> pd.DataFrame:
    """Read the wide sensor sheet and return a long table of Moisture/Fertility readings.

    The parser works on the frame returned by ``pd.read_excel`` (default header
    handling) and relies on this layout:

    * the first column holds row labels; blank label cells inherit the label above;
    * the first row of the frame holds the dates for the remaining columns;
    * every later row is either a plant row (label contains "sapium", any case) or a
      metric row (LIGHT, MOISTURE, TEMPERATURE or FERTILITY) that belongs to the most
      recent plant row. Metric labels are matched ignoring case and surrounding
      whitespace, consistent with the case-insensitive plant match.

    A plant is kept only if both its MOISTURE and FERTILITY rows were found. LIGHT and
    TEMPERATURE are parsed but not included in the result.

    Args:
        path: Location of the Excel workbook.
        sheet: Name of the worksheet to read.

    Returns:
        DataFrame with columns ``Plant``, ``Date``, ``Moisture`` and ``Fertility``.
        Rows missing Moisture or Fertility are dropped and the rest are sorted by
        ``Date``; rows sharing a date keep the order of their plants in the sheet.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if no plant has both a MOISTURE and a FERTILITY row.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Excel not found: {path}")
    raw = pd.read_excel(path, sheet_name=sheet, engine="openpyxl")

    labels = raw.iloc[:, 0].ffill()
    date_row = pd.to_datetime(raw.iloc[0, 1:].values, errors="coerce")
    # Work on a copy so relabelling the axes cannot touch (or warn about) `raw`.
    values = raw.iloc[1:, 1:].copy()
    values.index = labels.iloc[1:]
    values.columns = date_row

    plant_re = re.compile(r"sapium", re.IGNORECASE)
    metrics = ("LIGHT", "MOISTURE", "TEMPERATURE", "FERTILITY")

    def new_block(plant):
        # One accumulator per plant: its name plus a slot for every metric row.
        block = {"Plant": plant}
        block.update({metric: None for metric in metrics})
        return block

    def is_complete(block):
        # Only plants with both series needed downstream are kept.
        return bool(block["Plant"]) and block["MOISTURE"] is not None and block["FERTILITY"] is not None

    blocks = []
    cur = new_block(None)
    for label, row in values.iterrows():
        if not isinstance(label, str):
            continue  # NaN / non-text labels are neither plant nor metric rows
        if plant_re.search(label):
            # A new plant row closes the previous plant's block.
            if is_complete(cur):
                blocks.append(cur)
            cur = new_block(label)
            continue
        metric = label.strip().upper()
        if metric in metrics:
            cur[metric] = pd.to_numeric(row, errors="coerce")
    if is_complete(cur):
        blocks.append(cur)

    if not blocks:
        raise ValueError("No plant blocks with MOISTURE and FERTILITY found.")

    frames = [
        pd.DataFrame({
            "Plant": b["Plant"],
            "Date": b["MOISTURE"].index,
            "Moisture": b["MOISTURE"].values,
            "Fertility": b["FERTILITY"].values,
        })
        for b in blocks
    ]
    ts = pd.concat(frames, ignore_index=True)
    ts = ts.dropna(subset=["Moisture", "Fertility"])
    # Stable sort: readings that share a date stay in sheet order instead of an
    # algorithm-dependent one, so the downstream series is reproducible.
    ts = ts.sort_values("Date", kind="mergesort").reset_index(drop=True)
    return ts

In [21]:
def compute_variation_scaled(df: pd.DataFrame) -> pd.Series:
    """Return the min-max scaled absolute gap between Moisture and Fertility.

    Args:
        df: Frame with numeric ``Moisture`` and ``Fertility`` columns.

    Returns:
        Series named ``VariationScaled`` that shares ``df``'s index. The scaler is
        fitted on all rows passed in.
    """
    moisture = df["Moisture"].values
    fertility = df["Fertility"].values
    gap = np.abs((moisture - fertility).astype(float))
    # The scaler works on 2-D (samples, features) input, so use a single column.
    as_column = gap.reshape(-1, 1)
    scaled_column = MinMaxScaler().fit_transform(as_column)
    return pd.Series(scaled_column.ravel(), index=df.index, name="VariationScaled")

def build_windows(series_1d: np.ndarray, labels_1d: np.ndarray, timesteps: int) -> Tuple[np.ndarray, np.ndarray]:
    """Cut a 1-D series into overlapping windows, each paired with the label at its last step.

    Args:
        series_1d: Input values, one per time step.
        labels_1d: Per-step targets, indexed like ``series_1d``.
        timesteps: Number of consecutive steps in each window.

    Returns:
        ``(X, y)`` where ``X`` is float32 with shape ``(n_windows, timesteps, 1)`` and
        ``y[i]`` is the label at the final step of window ``i``.

    Raises:
        ValueError: if no window could be built.
    """
    # Every index that can serve as the last step of a full-length window.
    window_ends = range(timesteps - 1, len(series_1d))
    windows = [
        series_1d[end - timesteps + 1: end + 1].reshape(timesteps, 1)
        for end in window_ends
    ]
    targets = [labels_1d[end] for end in window_ends]

    X = np.asarray(windows, dtype=np.float32)
    y = np.asarray(targets)
    if X.size == 0:
        raise ValueError("No windows built; reduce TIMESTEPS.")
    return X, y

In [23]:
def class_weights_from_labels(y: np.ndarray) -> Dict[int, float]:
    """Compute inverse-frequency weights for binary labels 0 and 1.

    Each weight is ``n_samples / (2 * class_count)``. A class with no samples is
    treated as having one, which avoids dividing by zero.
    """
    per_class = np.bincount(y.astype(int), minlength=2)
    n_samples = per_class.sum()
    weights = {}
    for cls in (0, 1):
        denominator = 2 * max(per_class[cls], 1)
        weights[cls] = n_samples / denominator
    return weights

In [25]:
def build_classifier_model(timesteps: int, features: int = 1) -> Sequential:
    """Build and compile the binary LSTM classifier.

    Architecture: a 50-unit LSTM, 30% dropout, then a single sigmoid output unit.
    Compiled with a default-configured Adam optimizer and binary cross-entropy loss,
    tracking accuracy.

    Args:
        timesteps: Length of each input window.
        features: Number of values per time step.
    """
    layer_stack = [
        Input(shape=(timesteps, features)),
        LSTM(50),
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(layer_stack)
    classifier.compile(
        optimizer=Adam(),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [27]:
def build_regressor_model(timesteps: int, features: int = 1) -> Sequential:
    """Build and compile the LSTM regressor.

    Architecture: a 50-unit LSTM, 30% dropout, then a single linear output unit.
    Compiled with a default-configured Adam optimizer and mean-squared-error loss,
    also reporting MSE as a metric.

    Args:
        timesteps: Length of each input window.
        features: Number of values per time step.
    """
    layer_stack = [
        Input(shape=(timesteps, features)),
        LSTM(50),
        Dropout(0.3),
        Dense(1, activation='linear'),
    ]
    regressor = Sequential(layer_stack)
    regressor.compile(
        optimizer=Adam(),
        loss='mse',
        metrics=['mse'],
    )
    return regressor

def choose_threshold(y_endpoints: np.ndarray) -> Tuple[float, np.ndarray, str]:
    """Try default 470, then median, then quantiles until both classes exist."""
    trials = [("fixed_470", FERTILITY_DEFAULT_THRESHOLD)]
    med = float(np.median(y_endpoints))
    trials.append(("median", med))
    for q in FALLBACK_QUANTILES:
        trials.append((f"q{q}", float(np.quantile(y_endpoints, q))))

    for label, thr in trials:
        y_bin = (y_endpoints > thr).astype(int)
        if len(np.unique(y_bin)) == 2:
            return thr, y_bin, label
    return np.nan, (y_endpoints > trials[0][1]).astype(int), "none"

In [29]:
def _mean_over_folds(results: List[Dict], names: List[str]) -> Dict[str, float]:
    """Average each named metric over the per-fold result dicts."""
    return {name: float(np.mean([fold_scores[name] for fold_scores in results])) for name in names}


def _classification_cv(X_all: np.ndarray, y_all: np.ndarray) -> Dict[str, float]:
    """Run stratified k-fold evaluation of the LSTM classifier and return mean metrics.

    NOTE(review): folds are shuffled over windows whose neighbours overlap by
    TIMESTEPS - 1 readings, and the fold used for early stopping (with
    restore_best_weights) is the same fold the metrics are computed on. Both make the
    reported scores optimistic; consider a time-ordered split with a separate
    early-stopping subset.

    Raises:
        ValueError: if the smaller class has fewer than two windows.
    """
    class_counts = np.bincount(y_all, minlength=2)
    max_k = int(class_counts.min())
    if max_k < 2:
        raise ValueError("Even after thresholding, min class count < 2; cannot stratify.")
    # Never ask for more folds than the minority class can populate.
    k = max(2, min(REQUESTED_FOLDS, max_k))
    print(f"Classification mode with StratifiedKFold k={k}, class_counts={class_counts.tolist()}")

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=SEED)
    results: List[Dict] = []

    for fold, (tr, va) in enumerate(skf.split(X_all, y_all), start=1):
        X_tr, y_tr = X_all[tr], y_all[tr]
        X_va, y_va = X_all[va], y_all[va]

        cw = class_weights_from_labels(y_tr)
        model = build_classifier_model(TIMESTEPS, 1)
        es = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
        model.fit(X_tr, y_tr, validation_data=(X_va, y_va),
                  epochs=EPOCHS, batch_size=BATCH_SIZE,
                  class_weight=cw, callbacks=[es], verbose=0)

        y_prob = model.predict(X_va, verbose=0).ravel()
        y_hat = (y_prob >= 0.5).astype(int)

        acc = accuracy_score(y_va, y_hat)
        bacc = balanced_accuracy_score(y_va, y_hat)
        prec = precision_score(y_va, y_hat, zero_division=0)
        rec = recall_score(y_va, y_hat, zero_division=0)
        f1 = f1_score(y_va, y_hat, zero_division=0)
        ap = average_precision_score(y_va, y_prob)
        auc = roc_auc_score(y_va, y_prob)

        print(f"Fold {fold}: n_train={len(y_tr)} n_val={len(y_va)} | "
              f"Acc={acc:.3f} BAcc={bacc:.3f} Prec={prec:.3f} Rec={rec:.3f} "
              f"F1={f1:.3f} AP={ap:.3f} AUC={auc:.3f}")

        results.append(dict(acc=acc, bacc=bacc, prec=prec, rec=rec, f1=f1, ap=ap, auc=auc))
        # Release the fold's graph/state before building the next model.
        K.clear_session()

    return _mean_over_folds(results, ["acc", "bacc", "prec", "rec", "f1", "ap", "auc"])


def _regression_cv(X_all: np.ndarray, fert_end: np.ndarray) -> Dict[str, float]:
    """Run k-fold evaluation of the LSTM regressor on window-end Fertility; return mean metrics."""
    # Aim for roughly five windows per validation fold, with at least two folds.
    k = min(REQUESTED_FOLDS, max(2, len(fert_end) // 5))
    k = max(k, 2)
    kf = KFold(n_splits=k, shuffle=True, random_state=SEED)
    results_r: List[Dict] = []

    for fold, (tr, va) in enumerate(kf.split(X_all), start=1):
        X_tr, X_va = X_all[tr], X_all[va]
        y_tr, y_va = fert_end[tr], fert_end[va]

        model = build_regressor_model(TIMESTEPS, 1)
        es = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
        model.fit(X_tr, y_tr, validation_data=(X_va, y_va),
                  epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es], verbose=0)

        y_pred = model.predict(X_va, verbose=0).ravel()
        # Take the root explicitly: the `squared=False` argument of mean_squared_error
        # is deprecated and absent from newer scikit-learn releases.
        rmse = float(np.sqrt(mean_squared_error(y_va, y_pred)))
        mae = mean_absolute_error(y_va, y_pred)
        r2 = r2_score(y_va, y_pred)

        print(f"Fold {fold}: n_train={len(y_tr)} n_val={len(y_va)} | RMSE={rmse:.3f} MAE={mae:.3f} R2={r2:.3f}")
        results_r.append(dict(rmse=rmse, mae=mae, r2=r2))
        K.clear_session()

    return _mean_over_folds(results_r, ["rmse", "mae", "r2"])


def main():
    """Load the workbook, build windows, and cross-validate an LSTM on them.

    Window-end Fertility is binarised with an automatically chosen threshold. If that
    yields two classes, a classifier is evaluated with stratified k-fold; otherwise
    the run falls back to regressing Fertility with plain k-fold. Per-fold and mean
    metrics are printed; nothing is returned.
    """
    # NOTE(review): tensorflow is already imported at the top of the file when this
    # runs, so this setting may come too late to quiet native logging - confirm.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    # Load all data
    ts = load_and_tidy(EXCEL_PATH, SHEET_NAME)

    # Variation -> [0,1]. The scaling is fitted on every row, before any fold split.
    var_scaled = compute_variation_scaled(ts).values
    fert = ts["Fertility"].values.astype(float)

    # Build windows across the whole series; fert_end holds the window-end Fertility values.
    X_all, fert_end = build_windows(var_scaled, fert, TIMESTEPS)

    # Try to form two classes from Fertility with an auto threshold
    thr, y_all, thr_tag = choose_threshold(fert_end)
    print(f"Windows: X={X_all.shape}, threshold_choice={thr_tag}, threshold_value={thr}")

    if len(np.unique(y_all)) == 2:
        means = _classification_cv(X_all, y_all)
        print("\nMean classification metrics:", means)
        print(f"(Threshold used: {thr} from {thr_tag})")
    else:
        print("No viable threshold produced two classes. Falling back to regression (predict Fertility).")
        mean_r = _regression_cv(X_all, fert_end)
        print("\nMean regression metrics:", mean_r)

if __name__ == "__main__":
    main()

Windows: X=(34, 12, 1), threshold_choice=median, threshold_value=232.9125
Classification mode with StratifiedKFold k=5, class_counts=[17, 17]
Fold 1: n_train=27 n_val=7 | Acc=0.714 BAcc=0.708 Prec=0.667 Rec=0.667 F1=0.667 AP=0.806 AUC=0.833

Fold 2: n_train=27 n_val=7 | Acc=0.571 BAcc=0.500 Prec=0.000 Rec=0.000 F1=0.000 AP=0.411 AUC=0.333
Fold 3: n_train=27 n_val=7 | Acc=0.714 BAcc=0.667 Prec=0.667 Rec=1.000 F1=0.800 AP=0.525 AUC=0.333
Fold 4: n_train=27 n_val=7 | Acc=1.000 BAcc=1.000 Prec=1.000 Rec=1.000 F1=1.000 AP=1.000 AUC=1.000
Fold 5: n_train=28 n_val=6 | Acc=0.333 BAcc=0.333 Prec=0.333 Rec=0.333 F1=0.333 AP=0.478 AUC=0.333

Mean classification metrics: {'acc': 0.6666666666666667, 'bacc': 0.6416666666666667, 'prec': 0.5333333333333333, 'rec': 0.6, 'f1': 0.56, 'ap': 0.643888888888889, 'auc': 0.5666666666666667}
(Threshold used: 232.9125 from median)
