# In [None]:
# ===================== FINAL curve→meta (pyts-style) wrap-up =====================
# MiniRocket + shape stats, grouped CV; categorical + numeric(3-bin); time-window importance
# Assumes: `dfm` is in memory with columns:
#   - y_0..y_59 (normalized flux at 60 time points)
#   - article_id (or at least curve_id)
#   - metadata columns: material, oil_type, droplet_um, p_bar, cross_flow_ms, porosity, pore_nm, jw_lmh, oil_ppm, salt_gl, temp_c
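# Outputs: ./pyts_final/ and pyts_final.zip with:
#   - curve_to_meta_results.csv
#   - time_window_importance.csv (if any targets)
#   - <target>_confusion.png (one confusion matrix per evaluated target)
#   - one example bar plot of time-window importance (time_importance_<target>.png)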

import numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from scipy.sparse import hstack, csr_matrix
import shutil, os, warnings

# Seed passed as random_state to the estimators in this cell.
RNG = 42
# All artefacts (CSVs, PNGs) are written below this directory.
OUT = Path("pyts_final")
OUT.mkdir(exist_ok=True, parents=True)

# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# which would turn a missing `dfm` into a confusing failure further down.
if 'dfm' not in globals():
    raise NameError("Need `dfm` loaded (merged meta + y_0..y_59).")

# --------- shape stats on curves ----------
def shape_feats(A: np.ndarray) -> pd.DataFrame:
    """
    Simple shape descriptors from curves A of shape (n_samples, L).

    The time axis is mapped to L evenly spaced points on [0, 1] and split into
    three contiguous segments ("thirds").

    Returns a DataFrame with one row per curve and these columns, in this
    order (callers convert it with .to_numpy(), so the order matters):
      - auc:       trapezoidal area under the whole curve
      - slope_k:   least-squares slope within third k (k = 1, 2, 3)
      - mean_k:    mean value within third k
      - curv_mid:  slope_2 minus the average of slope_1 and slope_3
      - last:      final value of the curve
      - std:       standard deviation over the whole curve
    Columns are emitted as auc, slope_1, mean_1, slope_2, mean_2, slope_3,
    mean_3, curv_mid, last, std.
    """
    A = np.asarray(A, dtype=float)
    n, L = A.shape
    t = np.linspace(0, 1, L)
    thirds = np.array_split(np.arange(L), 3)

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # fall back to the old name on NumPy versions that predate the rename.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz

    out = {}
    out["auc"] = integrate(A, x=t, axis=1)
    for k, idxs in enumerate(thirds, 1):
        seg = A[:, idxs]
        # centred time axis -> closed-form least-squares slope
        x = t[idxs] - t[idxs].mean()
        # the 1e-12 keeps a single-point segment (x == 0) from dividing by zero
        denom = (x**2).sum() + 1e-12
        out[f"slope_{k}"] = ((seg - seg.mean(axis=1, keepdims=True)) @ x) / denom
        out[f"mean_{k}"] = seg.mean(axis=1)
    # "curvature": how the middle-third slope differs from the outer thirds
    out["curv_mid"] = out["slope_2"] - 0.5*(out["slope_1"] + out["slope_3"])
    out["last"] = A[:, -1]
    out["std"]  = A.std(axis=1)
    return pd.DataFrame(out)

# --------- build curves (Yshape, Ydiff) from y_0..y_59 ----------
def build_curves(df: pd.DataFrame, L: int = 60):
    """
    Build the two curve channels from the y_<int> columns of df.

    Only columns named exactly "y_<integer>" are used (e.g. "y_0", "y_17");
    other columns that merely start with "y_" (such as "y_label") are ignored.
    Columns are ordered by their integer index and truncated to the first L.
    A warning is issued when fewer than L such columns exist.

    Returns (ycols, Yshape, Ydiff):
      - ycols:  the selected column names, in time order
      - Yshape: running minimum along time (non-increasing curve) divided by
                its first value (floored at 1e-9)
      - Ydiff:  first differences of the running-minimum curve (before the
                division), padded with a trailing zero column so it has the
                same width as Yshape

    Raises ValueError when df has no y_<integer> column at all.
    """
    indexed = []
    for col in df.columns:
        name = str(col)
        suffix = name[2:]
        # isdecimal() guarantees int(suffix) cannot fail on names like "y_pred"
        if name.startswith("y_") and suffix.isdecimal():
            indexed.append((int(suffix), col))
    indexed.sort(key=lambda pair: pair[0])
    ycols = [col for _, col in indexed][:L]

    if not ycols:
        raise ValueError("No y_<int> columns found; cannot build curves.")
    if len(ycols) < L:
        warnings.warn(f"Only {len(ycols)} y_* columns found, expected {L}.")
    Yraw = df[ycols].to_numpy(float)

    # physics-aware smoothing + shape normalization
    Ymono  = np.minimum.accumulate(Yraw, axis=1)              # enforce monotone decay
    y0     = np.maximum(Ymono[:, [0]], 1e-9)                  # avoid division by zero
    Yshape = Ymono / y0                                       # normalize by initial flux
    Ydiff  = np.c_[np.diff(Ymono, axis=1),
                   np.zeros((len(Ymono), 1))]                 # pad to same width as Yshape

    return ycols, Yshape, Ydiff

# --------- OOF MiniRocket features ----------
def oof_minirocket(Xpanel: np.ndarray, groups: np.ndarray) -> np.ndarray:
    """
    Out-of-fold MiniRocket features (then SVD + scaling).

    Xpanel shape: (n_samples, n_channels, L)
    groups:       one group label per sample; samples sharing a label are
                  always kept in the same fold (GroupKFold).

    For every fold a fresh MiniRocketMultivariate is fitted on the training
    rows and only the held-out rows are transformed with it, so each row's
    features come from a transformer that was not fitted on that row.
    The stacked features are then compressed with TruncatedSVD and scaled
    (without centering); both of these are fitted on all rows at once.

    Returns Xstd: (n_samples, n_components)
    Raises ValueError when there are fewer than 2 distinct groups.
    """
    n_groups = len(np.unique(groups))
    if n_groups < 2:
        raise ValueError(
            f"oof_minirocket needs at least 2 distinct groups for grouped CV, got {n_groups}."
        )
    # GroupKFold rejects n_splits larger than the number of groups, so cap the
    # usual 5 folds at the group count instead of forcing a minimum of 3.
    gkf = GroupKFold(n_splits=min(5, n_groups))

    oof = None
    for tr, te in gkf.split(Xpanel, groups=groups):
        mr = MiniRocketMultivariate(random_state=RNG)
        mr.fit(Xpanel[tr])
        Xte = np.asarray(mr.transform(Xpanel[te]), dtype=float)
        if oof is None:
            oof = np.zeros((Xpanel.shape[0], Xte.shape[1]), dtype=float)
        # Write held-out rows only: also writing the training rows would
        # overwrite out-of-fold features from earlier folds with in-fold ones.
        oof[te] = Xte

    # compress with SVD + scale (no centering)
    n_comp = max(2, min(300, oof.shape[1]-1))
    svd = TruncatedSVD(n_components=n_comp, random_state=RNG)
    Xsvd = svd.fit_transform(oof)
    Xstd = StandardScaler(with_mean=False).fit_transform(Xsvd)
    return Xstd

# --------- grouped CV classifier on stacked features ----------
def run_categorical(df_sub, y, label_name, groups, Yshape, Ydiff, Xmr_oof):
    """
    Grouped CV for a categorical target on stacked features:
      - shape_feats(Yshape) + the precomputed MiniRocket features Xmr_oof
      - LogisticRegression with class_weight='balanced'

    Parameters
      df_sub     : only its length is used (number of samples)
      y          : integer-encoded class label per sample
      label_name : used in the plot title, printed report and PNG file name
      groups     : group label per sample for GroupKFold
      Yshape     : (n_samples, L) curves fed to shape_feats
      Ydiff      : accepted for signature compatibility; not used here
      Xmr_oof    : (n_samples, n_features) MiniRocket-derived features

    Side effects: saves OUT/<label_name>_confusion.png, shows the figure and
    prints the scores plus a classification report.

    Returns (F1_macro, accuracy), each averaged over the folds.
    Raises ValueError when there are fewer than 2 distinct groups.
    """
    y = np.asarray(y)
    n_groups = len(np.unique(groups))
    if n_groups < 2:
        raise ValueError(
            f"[{label_name}] grouped CV needs at least 2 distinct groups, got {n_groups}."
        )
    # GroupKFold rejects n_splits larger than the number of groups, so cap the
    # usual 5 folds at the group count instead of forcing a minimum of 3.
    gkf = GroupKFold(n_splits=min(5, n_groups))

    oof_pred = np.full(len(df_sub), -1, dtype=int)
    f1s, accs = [], []

    for tr, te in gkf.split(Yshape, groups=groups):
        # shape features
        shp_tr = shape_feats(Yshape[tr]).to_numpy(float)
        shp_te = shape_feats(Yshape[te]).to_numpy(float)

        # scalers for shape + MiniRocket parts, fitted on the training fold only
        ss_shp = StandardScaler(with_mean=True).fit(shp_tr)
        ss_mr  = StandardScaler(with_mean=False).fit(Xmr_oof[tr])

        Xtr = hstack([
            csr_matrix(ss_shp.transform(shp_tr)),
            csr_matrix(ss_mr.transform(Xmr_oof[tr]))
        ])
        Xte = hstack([
            csr_matrix(ss_shp.transform(shp_te)),
            csr_matrix(ss_mr.transform(Xmr_oof[te]))
        ])

        # `multi_class` is intentionally not passed: 'auto' was the default and
        # the parameter is deprecated in recent scikit-learn releases.
        clf = LogisticRegression(
            max_iter=4000,
            class_weight='balanced',
            solver='lbfgs',
            random_state=RNG
        )
        clf.fit(Xtr, y[tr])
        pred = clf.predict(Xte)
        oof_pred[te] = pred
        f1s.append(f1_score(y[te], pred, average="macro"))
        accs.append(accuracy_score(y[te], pred))

    F1m, ACC = float(np.mean(f1s)), float(np.mean(accs))

    # Confusion matrix over the pooled held-out predictions; `labels` pins the
    # matrix rows/columns to the same classes used for the tick labels.
    labs = np.unique(y)
    cm = confusion_matrix(y, oof_pred, labels=labs)
    plt.figure(figsize=(4.5, 4))
    im = plt.imshow(cm, cmap="Blues")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    plt.title(f"{label_name} (stacked) – Confusion")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    xt = [f"pred {i}" for i in labs]
    yt = [f"true {i}" for i in labs]
    plt.xticks(range(len(labs)), xt, rotation=45, ha='right')
    plt.yticks(range(len(labs)), yt)
    plt.tight_layout()
    plt.savefig(OUT / f"{label_name}_confusion.png", dpi=200, bbox_inches="tight")
    plt.show()
    # this function is called once per target; release the figure each time
    plt.close()

    print(f"[{label_name}]  F1_macro={F1m:.3f} | Acc={ACC:.3f}")
    print(classification_report(y, oof_pred))

    return F1m, ACC

# --------- time-window permutation importance ----------
def time_window_importance(df_sub, target_series, scorer, Yshape, n_bins=6):
    """
    Score how much each time window of Yshape matters to `scorer`.

    The time axis is cut into n_bins contiguous windows. For each window the
    rows are shuffled inside that window only (fixed-seed generator, so the
    result is repeatable), the scorer is re-evaluated, and the non-negative
    loss versus the unshuffled baseline is recorded.

    scorer(Yshape_alt) must return one scalar where higher is better
    (e.g. a grouped-CV F1_macro).
    df_sub and target_series are accepted but not used here.

    Returns: array of length n_bins holding each window's share of the total
    loss (all zeros when no window causes a loss).
    """
    shuffler = np.random.default_rng(0)
    windows = np.array_split(np.arange(Yshape.shape[1]), n_bins)
    baseline = scorer(Yshape)   # reference score on the untouched curves

    def _loss_when_shuffled(window):
        # one row permutation per window, applied to that window's columns only
        order = shuffler.permutation(len(Yshape))
        shuffled = Yshape.copy()
        shuffled[:, window] = Yshape[order, :][:, window]
        return max(0.0, baseline - scorer(shuffled))

    losses = np.asarray([_loss_when_shuffled(w) for w in windows])
    # the epsilon keeps the all-zero case from dividing by zero
    return losses / (losses.sum() + 1e-12)

# ---------- build curves & groups ----------
# Curve channels derived from the y_* columns of dfm.
ycols, Yshape, Ydiff = build_curves(dfm)

# CV groups: integer codes of article_id when that column exists, else curve_id.
groups = (
    dfm["article_id" if "article_id" in dfm.columns else "curve_id"]
    .astype("category")
    .cat.codes
    .to_numpy()
)

# Two-channel panel (Yshape, Ydiff) stacked on axis 1 -> MiniRocket features.
Xpanel = np.stack([Yshape, Ydiff], axis=1)
Xmr_oof = oof_minirocket(Xpanel, groups)

# ---------- targets ----------
results = []        # one dict per evaluated target -> curve_to_meta_results.csv
timeimp_rows = []   # one Series per evaluated target -> time_window_importance.csv

N_TIME_BINS = 6     # number of time windows (S1..S6) for the importance analysis


def _make_shape_scorer(y_enc, grp):
    """Return scorer(Yshape_alt) -> mean grouped-CV macro-F1 of a shape-features-only classifier."""
    def scorer(Yshape_alt):
        # GroupKFold rejects n_splits larger than the number of groups.
        gkf = GroupKFold(n_splits=min(5, len(np.unique(grp))))
        fold_f1 = []
        for tr, te in gkf.split(Yshape_alt, groups=grp):
            shp_tr = shape_feats(Yshape_alt[tr]).to_numpy(float)
            shp_te = shape_feats(Yshape_alt[te]).to_numpy(float)
            ss = StandardScaler(with_mean=True).fit(shp_tr)
            # `multi_class` is not passed: 'auto' was the default and the
            # parameter is deprecated in recent scikit-learn releases.
            clf = LogisticRegression(
                max_iter=3000,
                class_weight='balanced',
                solver='lbfgs',
                random_state=RNG
            )
            clf.fit(ss.transform(shp_tr), y_enc[tr])
            pred = clf.predict(ss.transform(shp_te))
            fold_f1.append(f1_score(y_enc[te], pred, average="macro"))
        return float(np.mean(fold_f1))
    return scorer


def _evaluate_target(mask, y_enc, label_name, result_target, result_type):
    """Run the stacked classifier and the time-window importance on the rows selected by mask."""
    mask = np.asarray(mask, dtype=bool)
    grp = groups[mask]
    if len(np.unique(grp)) < 2 or len(np.unique(y_enc)) < 2:
        warnings.warn(
            f"Skipping {label_name}: need at least 2 groups and 2 classes for grouped CV."
        )
        return
    sub = dfm.loc[mask]
    Ysh, Ydf, Xmr = Yshape[mask], Ydiff[mask], Xmr_oof[mask]

    F1m, ACC = run_categorical(sub, y_enc, label_name, grp, Ysh, Ydf, Xmr)
    results.append({"target": result_target, "type": result_type, "F1m": F1m, "Acc": ACC})

    # time-window importance (shape-only classifier inside the scorer)
    imp = time_window_importance(
        sub, y_enc, _make_shape_scorer(y_enc, grp), Ysh, n_bins=N_TIME_BINS
    )
    timeimp_rows.append(
        pd.Series(imp, index=[f"S{i+1}" for i in range(N_TIME_BINS)], name=label_name)
    )


def _evaluate_numeric_3bin(col, result_target, result_type, min_rows=25):
    """Bin a numeric column into global terciles and evaluate it as a 3-class target."""
    ynum = pd.to_numeric(dfm[col], errors="coerce")
    mask = ynum.notna().to_numpy()
    if mask.sum() < min_rows:
        return
    try:
        codes = pd.qcut(ynum[mask], q=3, labels=False, duplicates="drop").to_numpy()
    except Exception as exc:
        # best effort: a column that cannot be binned is skipped, but not silently
        warnings.warn(f"Skipping {col}: quantile binning failed ({exc}).")
        return
    _evaluate_target(mask, codes, f"{col}_3bin", result_target, result_type)


# 1) oil_type / material (categorical)
for col in ["oil_type", "material"]:
    if col not in dfm.columns:
        continue
    mask = dfm[col].notna().to_numpy()
    if mask.sum() < 20:
        continue
    # Encode only the labelled rows: category codes map missing values to -1,
    # which would otherwise be trained on as if it were a real class.
    y = dfm.loc[mask, col].astype("category").cat.codes.to_numpy()
    _evaluate_target(mask, y, col, col, "categorical")

# 2) droplet size (3-bin) if available
if "droplet_um" in dfm.columns:
    _evaluate_numeric_3bin("droplet_um", "droplet_um_3bin", "categorical")

# 3) numeric → 3-bin (coarse classification)
NUM_CANDS = ["p_bar","cross_flow_ms","porosity","pore_nm","jw_lmh",
             "oil_ppm","salt_gl","temp_c","droplet_um"]

for col in NUM_CANDS:
    # droplet_um is already evaluated in step 2; running it again would only
    # duplicate its results row, its importance row and overwrite its PNG.
    if col == "droplet_um" or col not in dfm.columns:
        continue
    _evaluate_numeric_3bin(col, col, "numeric→3bin")

# ----- save tables -----
# Passing `columns` fixes the column order and still writes a header row
# when no target could be evaluated (results == []).
res_df = pd.DataFrame(results, columns=["target", "type", "F1m", "Acc"])
res_df.to_csv(OUT / "curve_to_meta_results.csv", index=False)

if timeimp_rows:
    imp_df = pd.DataFrame(timeimp_rows)
    # A target that was evaluated twice yields a repeated row label, and
    # .loc on a repeated label returns a DataFrame instead of one row.
    imp_df = imp_df[~imp_df.index.duplicated(keep="first")]
    imp_df.to_csv(OUT / "time_window_importance.csv")
    # quick bar for one exemplar (oil_type or droplet)
    pick = [n for n in imp_df.index if "oil_type" in n or "droplet" in n]
    if pick:
        row = imp_df.loc[pick[0]]
        plt.figure(figsize=(5.5,3))
        row.plot(kind="bar")
        plt.title(f"Time-window importance: {pick[0]} (S1..S6)")
        plt.ylabel("fraction")
        plt.tight_layout()
        plt.savefig(OUT / f"time_importance_{pick[0]}.png", dpi=200, bbox_inches="tight")
        plt.show()
        plt.close()

# ----- zip everything -----
# Archive the output directory as pyts_final.zip in the working directory.
zip_path = shutil.make_archive("pyts_final", "zip", base_dir=str(OUT))

# Collect what was written, in the order it is reported.
_saved = [(OUT / "curve_to_meta_results.csv").resolve()]
if timeimp_rows:
    _saved.append((OUT / "time_window_importance.csv").resolve())
_saved.append(os.path.abspath(zip_path))

print("\nSaved:")
for _path in _saved:
    print(" -", _path)

print("\nResults:")
if res_df.empty:
    print("(no targets available)")
else:
    print(res_df)
