In [8]:
# ============================================================
# STRICT stability analysis:
# Only reads EXACT (TIME_STEPS x ISOLEVELS) files you set.
# Ranks each candidate third feature z by the kNN local variance of Sd
# in [stretch, curvature, z]-space within each group (timestep, isolevel,
# or both, as selected by STABILITY_AXIS).
# Then aggregates the per-group ranks to measure ranking stability.
# ============================================================

import numpy as np
import pandas as pd
from pathlib import Path
from flamekit.io_fronts import Case, folder, front_path

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# -------------------------
# USER SETTINGS
# -------------------------
# Base data directory and the case selectors handed to the loader.
BASE_DIR = Path("../data/isocontours")
PHI = 0.40
LAT_SIZE = "100"
POST = True

# Axis along which ranking stability is assessed:
#   "timestep" -> one ranking per timestep (recommended)
#   "isolevel" -> one ranking per isolevel
#   "both"     -> one ranking per (timestep, isolevel) pair
STABILITY_AXIS = "timestep"

# Strict file selection: ONLY these (timestep, isolevel) combinations are read.
ISOLEVELS = [3, 4.0, 4.5]
TIME_STEPS = [140, 150, 160, 170, 180, 190, 200, 210]

# Column roles used by the ranking.
TARGET_COL = "DW_FDS"  # Sd-like target
STRETCH_COL = "stretch_rate"
CURV_COL = "curvature"

# Neighbour count for the kNN variance and per-group row limits.
KNN_K = 50
MIN_ROWS_PER_GROUP = 5000
SUBSAMPLE_PER_GROUP = None  # e.g. 20000 if you need speed

# Rank cut-offs K for the "fraction of groups where z is in the top K" columns.
TOPK_LIST = [10]

# Columns requested from every CSV.
SCALARS = [
    # species and heat release
    "H2",
    "O2",
    "H2O",
    "H",
    "O",
    "OH",
    "HO2",
    "H2O2",
    "HRR",
    # source terms
    "omega_H2O",
    "omega_OH",
    # base features and target
    "curvature",
    "stretch_rate",
    "DW_FDS",
    # kinematics
    "abs_flame_prop_vel_normal",
    "flow_velocity_normal",
    "flow_velocity_tangential",
    "strain_rate",
    # thermo / transport
    "density_ratio_sigma",
    "gradT",
    "total_heat_conduction",
    "H2_diffusion_total",
    "O2_diffusion_total",
    "H_diffusion_total",
    "vorticity",
]

# Any candidate z whose name contains one of these substrings is skipped
# (the target column itself is excluded separately).
EXCLUDE_SUBSTRINGS = ["FDS"]


# -------------------------
# STRICT LOADER: only reads requested files
# -------------------------
def _parse_front_stem(stem: str) -> tuple[int, float]:
    """Extract ``(timestep, isolevel)`` from a flame-front file stem.

    Handles stems with or without the ``post_`` tag, e.g.
    ``extracted_flame_front_post_210_iso_4.5`` and
    ``extracted_flame_front_210_iso_4.5``.

    Raises:
        ValueError: if the stem has no ``_iso_`` marker, or the timestep or
            isolevel field is not numeric.
    """
    head, marker, iso_text = stem.rpartition("_iso_")
    if not marker:
        raise ValueError(f"Cannot parse timestep/isolevel from file name: {stem!r}")
    # The timestep is the last underscore-separated token before "_iso_",
    # whether or not a "post_" tag precedes it.
    return int(head.rsplit("_", 1)[-1]), float(iso_text)


def load_isocontour_data_strict(
    base_dir: Path,
    phi: float,
    lat_size: str,
    post: bool,
    isolevels: list[float],
    timesteps: list[int],
    scalars: list[str],
) -> pd.DataFrame:
    """Read exactly the requested (timestep x isolevel) front CSVs into one frame.

    For every requested pair the path returned by ``front_path`` is tried
    first; if it does not exist, the case folder is globbed for
    ``extracted_flame_front_[post_]<t>_iso_<iso>*.csv`` and only a hit whose
    file name parses to the requested isolevel is accepted.

    Args:
        base_dir, phi, lat_size, post: forwarded to ``Case``.
        isolevels: isolevels to read.
        timesteps: timesteps to read.
        scalars: columns to keep; names absent from the data are dropped.

    Returns:
        The concatenated rows restricted to the available ``scalars`` plus
        the tag columns ``__timestep__``, ``__isolevel__`` (both parsed from
        the file name) and ``__source_file__``.

    Raises:
        FileNotFoundError: if the case folder or any requested file is missing.
        ValueError: if no file was requested, or a file name cannot be parsed.
    """
    assert isolevels is not None and timesteps is not None, "Set ISOLEVELS and TIME_STEPS (not None)."

    base_case = Case(base_dir=base_dir, phi=phi, lat_size=lat_size, time_step=0, post=post)
    phi_dir = folder(base_case)
    if not phi_dir.exists():
        raise FileNotFoundError(f"Directory not found: {phi_dir.resolve()}")

    tag = "post_" if post else ""
    files_to_read = []
    for t in timesteps:
        for iso in isolevels:
            case = Case(base_dir=base_dir, phi=phi, lat_size=lat_size, time_step=t, post=post)
            fp = front_path(case, float(iso))
            found = fp if fp.exists() else None

            if found is None:
                # Fallback for alternative number formatting (e.g. "3" vs
                # "3.0").  The glob "iso_3*" also matches "iso_3.5", so keep
                # only a hit whose parsed isolevel equals the requested one.
                pattern = f"extracted_flame_front_{tag}{t}_iso_{iso}*.csv"
                for hit in sorted(phi_dir.glob(pattern)):
                    try:
                        _, hit_iso = _parse_front_stem(hit.stem)
                    except ValueError:
                        continue
                    if abs(hit_iso - float(iso)) < 1e-9:
                        found = hit
                        break

            if found is None:
                raise FileNotFoundError(f"Missing file for timestep={t}, iso={iso} in {phi_dir}")

            files_to_read.append(found)

    if not files_to_read:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError("No files requested: ISOLEVELS and TIME_STEPS must be non-empty.")

    print("Will read these files (STRICT):")
    for f in files_to_read:
        print("  ", f.name)

    frames = []
    for fp in files_to_read:
        df = pd.read_csv(fp)
        # Parsing must not rely on the "_post_" tag: it is absent when
        # post=False, which used to raise IndexError here.
        file_t, file_iso = _parse_front_stem(fp.stem)
        df["__timestep__"] = file_t
        df["__isolevel__"] = file_iso
        df["__source_file__"] = fp.name
        frames.append(df)

    out = pd.concat(frames, ignore_index=True)

    # Keep only requested scalar columns that exist + group tags
    keep = [c for c in scalars if c in out.columns] + ["__timestep__", "__isolevel__", "__source_file__"]
    return out[keep]


# -------------------------
# Metric: kNN local variance of Sd in 3D space
# -------------------------
def knn_nonuniqueness(d: pd.DataFrame, s_col: str, kappa_col: str, z_col: str, y_col: str, k: int):
    """Score how far ``y_col`` is from single-valued in (s, kappa, z) space.

    The three feature columns are standardised, the ``k`` nearest neighbours
    of every row are looked up in that space, and the sample variance
    (``ddof=1``) of ``y_col`` over each neighbourhood is computed.

    Returns:
        ``(mean, median)`` of the per-row neighbourhood variances, or
        ``(nan, nan)`` when ``d`` has ``k + 2`` rows or fewer.
    """
    features = d[[s_col, kappa_col, z_col]].to_numpy(float)
    target = d[y_col].to_numpy(float)

    # Enough rows to form meaningful neighbourhoods?
    if len(d) > k + 2:
        scaled = StandardScaler().fit_transform(features)
        finder = NearestNeighbors(n_neighbors=k + 1)
        finder.fit(scaled)
        neighbours = finder.kneighbors(scaled, return_distance=False)
        # The first column is dropped as the query point itself.
        neighbours = neighbours[:, 1:]
        spread = target[neighbours].var(axis=1, ddof=1)
        return float(spread.mean()), float(np.median(spread))

    return np.nan, np.nan


# -------------------------
# Rank z candidates within one group
# -------------------------
def rank_candidates_in_group(dfg: pd.DataFrame) -> pd.DataFrame:
    """Rank every candidate third feature z inside one group.

    A candidate is any numeric column that is not a base/target/tag column
    and whose name contains none of ``EXCLUDE_SUBSTRINGS``.  Each candidate
    is scored with ``knn_nonuniqueness`` on the rows where stretch,
    curvature, target and the candidate are all non-NaN.

    Returns:
        One row per scored candidate (``z_feature``, ``n``, ``mean_knn_var``,
        ``median_knn_var``, ``rank``), best (lowest variance) first; an empty
        frame when the group is too small or nothing could be scored.

    Raises:
        KeyError: if the stretch, curvature or target column is missing.
    """
    required = (STRETCH_COL, CURV_COL, TARGET_COL)
    for col in required:
        if col not in dfg.columns:
            raise KeyError(f"Missing required column: {col}")

    min_rows = max(MIN_ROWS_PER_GROUP, KNN_K + 5)
    if len(dfg) < min_rows:
        return pd.DataFrame()

    # Optional speed-up: fixed-seed subsample of large groups.
    if SUBSAMPLE_PER_GROUP is not None and len(dfg) > SUBSAMPLE_PER_GROUP:
        dfg = dfg.sample(SUBSAMPLE_PER_GROUP, random_state=0).reset_index(drop=True)

    skipped = {*required, "__timestep__", "__isolevel__", "__source_file__"}
    z_features = [
        col
        for col in dfg.columns
        if col not in skipped
        and not any(token in col for token in EXCLUDE_SUBSTRINGS)
        and pd.api.types.is_numeric_dtype(dfg[col])
    ]

    scored = []
    for z in z_features:
        usable = dfg.dropna(subset=[*required, z])
        if len(usable) < min_rows:
            continue
        mean_var, med_var = knn_nonuniqueness(usable, STRETCH_COL, CURV_COL, z, TARGET_COL, k=KNN_K)
        if not np.isfinite(mean_var):
            continue
        scored.append(
            {"z_feature": z, "n": len(usable), "mean_knn_var": mean_var, "median_knn_var": med_var}
        )

    table = pd.DataFrame(scored)
    if table.empty:
        return table

    # Lowest mean variance first; the median breaks ties.
    table = table.sort_values(["mean_knn_var", "median_knn_var"], ascending=[True, True])
    table = table.reset_index(drop=True)
    table["rank"] = np.arange(1, len(table) + 1)
    return table


# -------------------------
# Stability aggregation
# -------------------------
def stability_summary(per_group: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """Aggregate per-group rankings into one stability row per z-feature.

    Args:
        per_group: stacked per-group rankings with at least ``z_feature``,
            ``rank``, ``mean_knn_var`` and ``group_col``.
        group_col: column identifying the group each row came from.

    Returns:
        One row per ``z_feature`` with rank statistics, variance statistics
        and, for every K in ``TOPK_LIST``, ``frac_top{K}`` = share of all
        groups in which the feature ranked within the top K.  Sorted by mean
        rank, then ``frac_top`` of the first K (descending), then mean variance.
    """
    ranks = per_group.copy()

    by_feature = ranks.groupby("z_feature")
    summary = by_feature.agg(
        n_groups=("rank", "count"),
        mean_rank=("rank", "mean"),
        median_rank=("rank", "median"),
        std_rank=("rank", "std"),
        mean_mean_knn_var=("mean_knn_var", "mean"),
        median_mean_knn_var=("mean_knn_var", "median"),
    ).reset_index()

    total_groups = ranks[group_col].nunique()
    for K in TOPK_LIST:
        within_k = ranks.loc[ranks["rank"] <= K]
        hits = within_k.groupby("z_feature")[group_col].nunique()
        share = (hits / total_groups).to_dict()
        # Features that never reach the top K are absent from `share`.
        summary[f"frac_top{K}"] = summary["z_feature"].map(share).fillna(0.0)

    lead_k = TOPK_LIST[0]
    ordering = [
        ("mean_rank", True),
        (f"frac_top{lead_k}", False),
        ("mean_mean_knn_var", True),
    ]
    summary = summary.sort_values(
        [name for name, _ in ordering],
        ascending=[asc for _, asc in ordering],
    )
    return summary.reset_index(drop=True)


# ============================================================
# RUN
# ============================================================
# Load exactly the requested (timestep x isolevel) files into one frame.
df = load_isocontour_data_strict(BASE_DIR, PHI, LAT_SIZE, POST, ISOLEVELS, TIME_STEPS, SCALARS)

# Translate the stability axis into the tag column(s) to group on.
group_keys = {
    "timestep": ["__timestep__"],
    "isolevel": ["__isolevel__"],
    "both": ["__timestep__", "__isolevel__"],
}.get(STABILITY_AXIS)
if group_keys is None:
    raise ValueError("STABILITY_AXIS must be one of: 'timestep', 'isolevel', 'both'")

# Rank the candidate z-features separately inside every group.
per_group_tables = []
for key, dfg in df.groupby(group_keys):
    ranks = rank_candidates_in_group(dfg)
    if ranks.empty:
        continue

    # The group key may be a scalar or a tuple; handle both the same way.
    key_values = key if isinstance(key, tuple) else (key,)
    for column, value in zip(group_keys, key_values):
        ranks[column] = value

    per_group_tables.append(ranks)

if len(per_group_tables) == 0:
    raise RuntimeError("No groups produced rankings (too few rows per group, or too many NaNs).")

per_group = pd.concat(per_group_tables, ignore_index=True)

# One string id per group: the key itself, or the key parts joined by "_".
key_text = per_group[group_keys].astype(str)
if len(group_keys) == 1:
    per_group["group_id"] = key_text[group_keys[0]]
else:
    per_group["group_id"] = key_text.agg("_".join, axis=1)

per_group.to_csv("stability_per_group_rankings.csv", index=False)
print("Saved: stability_per_group_rankings.csv")

# Collapse the per-group rankings into one stability row per feature.
summary = stability_summary(per_group, group_col="group_id")
summary.to_csv("stability_summary.csv", index=False)
print("Saved: stability_summary.csv")

print("\nTop 20 most stable z-features:")
top_rows = summary.head(20)
print(top_rows.to_string(index=False))


Will read these files (STRICT):
   extracted_flame_front_post_140_iso_3.0.csv
   extracted_flame_front_post_140_iso_4.0.csv
   extracted_flame_front_post_140_iso_4.5.csv
   extracted_flame_front_post_150_iso_3.0.csv
   extracted_flame_front_post_150_iso_4.0.csv
   extracted_flame_front_post_150_iso_4.5.csv
   extracted_flame_front_post_160_iso_3.0.csv
   extracted_flame_front_post_160_iso_4.0.csv
   extracted_flame_front_post_160_iso_4.5.csv
   extracted_flame_front_post_170_iso_3.0.csv
   extracted_flame_front_post_170_iso_4.0.csv
   extracted_flame_front_post_170_iso_4.5.csv
   extracted_flame_front_post_180_iso_3.0.csv
   extracted_flame_front_post_180_iso_4.0.csv
   extracted_flame_front_post_180_iso_4.5.csv
   extracted_flame_front_post_190_iso_3.0.csv
   extracted_flame_front_post_190_iso_4.0.csv
   extracted_flame_front_post_190_iso_4.5.csv
   extracted_flame_front_post_200_iso_3.0.csv
   extracted_flame_front_post_200_iso_4.0.csv
   extracted_flame_front_post_200_iso_4.5.csv
  

In [21]:
# ============================================================
# 3-FEATURE SELECTION + STABILITY (NO assumed fixed features)
#
# Goal: find (f1, f2, f3) such that Sd (TARGET_COL) is as close
# as possible to single-valued in [f1,f2,f3] space.
#
# Score(triple) = mean_kNN_var( Sd | neighbors in 3D feature space )
#
# Strict loader: reads ONLY the (TIME_STEPS x ISOLEVELS) files.
# ============================================================

import numpy as np
import pandas as pd
from pathlib import Path
from flamekit.io_fronts import Case, folder, front_path
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# -------------------------
# USER SETTINGS
# -------------------------
# Base data directory and the case selectors handed to the loader.
BASE_DIR = Path("../data/isocontours")
PHI = 0.40
LAT_SIZE = "100"
POST = True

# Axis along which stability is assessed: "timestep" | "isolevel" | "both"
STABILITY_AXIS = "isolevel"

# Strict file selection: ONLY these (timestep, isolevel) combinations are read.
ISOLEVELS = [3.0, 3.5, 3.8, 4.0, 4.5]
TIME_STEPS = [211]
TARGET_COL = "DW_FDS"  # Sd-like target

# Number of neighbours used for the local-variance score.
KNN_K = 50

# Row threshold per group and optional subsampling for speed.
MIN_ROWS_PER_GROUP = 0
SUBSAMPLE_PER_GROUP = None  # e.g. 20000 if too slow; None uses all

# Candidate handling.
EXCLUDE_SUBSTRINGS = ["FDS"]  # features whose name contains any of these are skipped
M_PREFILTER = 15  # candidates kept per group before the triple search
RANK_TOP_TRIPLES = 30  # top-N triples stored per group (for stability stats)
TOPK_LIST = [1, 3, 5, 10]  # stability metric: frac of groups where a triple is in the top K

# Columns requested from every CSV.
SCALARS = [
    "curvature",
    "DW_FDS",
    # velocities
    "flow_velocity_normal",
    "flow_velocity_tangential",
    # strain
    "strain_rate",
    "tangential_strain_rate",
    "normal_strain_rate",
    "vorticity",
]

# ============================================================
# STRICT LOADER (reads only requested combos)
# ============================================================
def _parse_front_stem(stem: str) -> tuple[int, float]:
    """Extract ``(timestep, isolevel)`` from a flame-front file stem.

    Handles stems with or without the ``post_`` tag, e.g.
    ``extracted_flame_front_post_210_iso_4.5`` and
    ``extracted_flame_front_210_iso_4.5``.

    Raises:
        ValueError: if the stem has no ``_iso_`` marker, or the timestep or
            isolevel field is not numeric.
    """
    head, marker, iso_text = stem.rpartition("_iso_")
    if not marker:
        raise ValueError(f"Cannot parse timestep/isolevel from file name: {stem!r}")
    # The timestep is the last underscore-separated token before "_iso_",
    # whether or not a "post_" tag precedes it.
    return int(head.rsplit("_", 1)[-1]), float(iso_text)


def load_isocontour_data_strict(
    base_dir: Path,
    phi: float,
    lat_size: str,
    post: bool,
    isolevels: list[float],
    timesteps: list[int],
    scalars: list[str],
) -> pd.DataFrame:
    """Read exactly the requested (timestep x isolevel) front CSVs into one frame.

    For every requested pair several common number formats of the isolevel
    are tried (as given, then 1, 2 and 3 decimals); if none exists, the case
    folder is globbed and only a hit whose file name parses to the requested
    isolevel is accepted.  File names carry the ``post_`` tag only when
    ``post`` is true.

    Args:
        base_dir, phi, lat_size, post: forwarded to ``Case``.
        isolevels: isolevels to read.
        timesteps: timesteps to read.
        scalars: columns to keep; names absent from the data are dropped.

    Returns:
        The concatenated rows restricted to the available ``scalars`` plus
        the tag columns ``__timestep__``, ``__isolevel__`` (both parsed from
        the file name) and ``__source_file__``.

    Raises:
        FileNotFoundError: if the case folder or any requested file is missing.
        ValueError: if no file was requested, or a file name cannot be parsed.
    """
    assert isolevels is not None and timesteps is not None, "Set ISOLEVELS and TIME_STEPS (not None)."

    base_case = Case(base_dir=base_dir, phi=phi, lat_size=lat_size, time_step=0, post=post)
    phi_dir = folder(base_case)
    if not phi_dir.exists():
        raise FileNotFoundError(f"Directory not found: {phi_dir.resolve()}")

    # Honour the `post` flag instead of hard-coding the "post_" tag.
    tag = "post_" if post else ""
    prefix = f"extracted_flame_front_{tag}"

    files_to_read = []
    for t in timesteps:
        for iso in isolevels:
            # Try common formatting variants
            candidates = [
                phi_dir / f"{prefix}{t}_iso_{iso}.csv",
                phi_dir / f"{prefix}{t}_iso_{iso:.1f}.csv",
                phi_dir / f"{prefix}{t}_iso_{iso:.2f}.csv",
                phi_dir / f"{prefix}{t}_iso_{iso:.3f}.csv",
            ]
            found = next((p for p in candidates if p.exists()), None)
            if found is None:
                # The glob "iso_3*" also matches "iso_3.5", so keep only a
                # hit whose parsed isolevel equals the requested one.
                for hit in sorted(phi_dir.glob(f"{prefix}{t}_iso_{iso}*.csv")):
                    try:
                        _, hit_iso = _parse_front_stem(hit.stem)
                    except ValueError:
                        continue
                    if abs(hit_iso - float(iso)) < 1e-9:
                        found = hit
                        break
            if found is None:
                raise FileNotFoundError(f"Missing file for timestep={t}, iso={iso} in {phi_dir}")
            files_to_read.append(found)

    if not files_to_read:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError("No files requested: ISOLEVELS and TIME_STEPS must be non-empty.")

    print("Will read these files (STRICT):")
    for f in files_to_read:
        print("  ", f.name)

    frames = []
    for fp in files_to_read:
        df = pd.read_csv(fp)
        # Parsing must not rely on the "_post_" tag: it is absent when
        # post=False, which used to raise IndexError here.
        t, iso = _parse_front_stem(fp.stem)
        df["__timestep__"] = t
        df["__isolevel__"] = iso
        df["__source_file__"] = fp.name
        frames.append(df)

    out = pd.concat(frames, ignore_index=True)
    # Keep only requested scalar columns that exist + group tags
    keep = [c for c in scalars if c in out.columns] + ["__timestep__", "__isolevel__", "__source_file__"]
    return out[keep]

# ============================================================
# kNN local-variance score (dimension-agnostic)
# ============================================================
def knn_local_variance_score(d: pd.DataFrame, feat_cols: list[str], y_col: str, k: int):
    """
    Local-variance score of y over kNN neighbourhoods in feature space.

    The columns in ``feat_cols`` (any number of them) are standardised, the
    ``k`` nearest neighbours of every row are found, and the sample variance
    (``ddof=1``) of ``y_col`` over each neighbourhood is taken.
    Lower => closer to a single-valued mapping y = f(features).

    Returns (mean, median) of those variances, or (nan, nan) when ``d`` has
    ``k + 2`` rows or fewer.
    """
    features = d[feat_cols].to_numpy(float)
    target = d[y_col].to_numpy(float)

    # Enough rows to form meaningful neighbourhoods?
    if len(d) > k + 2:
        scaled = StandardScaler().fit_transform(features)

        finder = NearestNeighbors(n_neighbors=k + 1)
        finder.fit(scaled)
        neighbours = finder.kneighbors(scaled, return_distance=False)
        # The first column is dropped as the query point itself.
        neighbours = neighbours[:, 1:]
        spread = target[neighbours].var(axis=1, ddof=1)
        return float(spread.mean()), float(np.median(spread))

    return np.nan, np.nan

# ============================================================
# Candidate extraction
# ============================================================
def get_candidates(df: pd.DataFrame) -> list[str]:
    """Return the columns of ``df`` that may serve as features.

    A column qualifies when it is not the target or one of the tag columns,
    its name contains none of ``EXCLUDE_SUBSTRINGS``, and its dtype is
    numeric.  Column order is preserved.
    """
    reserved = {TARGET_COL, "__timestep__", "__isolevel__", "__source_file__"}
    return [
        col
        for col in df.columns
        if col not in reserved
        and not any(token in col for token in EXCLUDE_SUBSTRINGS)
        and pd.api.types.is_numeric_dtype(df[col])
    ]

# ============================================================
# Rank triples in one group (prefilter then full triple search)
# ============================================================
def rank_triples_in_group(dfg: pd.DataFrame) -> pd.DataFrame:
    """Rank feature triples inside one group by kNN local variance of the target.

    Stage 1 scores every candidate feature on its own (1D kNN variance) and
    keeps the best ``M_PREFILTER``; stage 2 scores every 3-combination of the
    kept features.  Each score uses only the rows where the features involved
    are non-NaN, so ``n`` can differ between triples.

    Returns:
        One row per scored triple (``f1``, ``f2``, ``f3``, ``n``,
        ``mean_knn_var``, ``median_knn_var``, ``rank``), best first; an empty
        frame when the group is too small or fewer than three features remain.

    Raises:
        KeyError: if ``TARGET_COL`` is missing from ``dfg``.
    """
    if TARGET_COL not in dfg.columns:
        raise KeyError(f"Missing required column: {TARGET_COL}")

    # Row threshold shared by every size check below.
    min_rows = max(MIN_ROWS_PER_GROUP, KNN_K + 5)

    # basic group size / subsample
    if len(dfg) < min_rows:
        return pd.DataFrame()

    if SUBSAMPLE_PER_GROUP is not None and len(dfg) > SUBSAMPLE_PER_GROUP:
        dfg = dfg.sample(SUBSAMPLE_PER_GROUP, random_state=0).reset_index(drop=True)

    candidates = get_candidates(dfg)
    if len(candidates) < 3:
        return pd.DataFrame()

    # Drop rows missing the target once; every later dropna only needs to
    # look at the feature columns.
    dfg = dfg.dropna(subset=[TARGET_COL])
    if len(dfg) < min_rows:
        return pd.DataFrame()

    # ---- Stage 1: Prefilter using 1D kNN variance (fast)
    oneD_rows = []
    for f in candidates:
        d = dfg.dropna(subset=[f])
        if len(d) < min_rows:
            continue
        mean_var, med_var = knn_local_variance_score(d, [f], TARGET_COL, KNN_K)
        if np.isfinite(mean_var):
            oneD_rows.append((f, mean_var, med_var, len(d)))

    if not oneD_rows:
        return pd.DataFrame()

    oneD = pd.DataFrame(oneD_rows, columns=["feature", "mean_var_1d", "median_var_1d", "n"])
    oneD = oneD.sort_values(["mean_var_1d", "median_var_1d"], ascending=[True, True])

    pre = oneD["feature"].head(M_PREFILTER).tolist()
    if len(pre) < 3:
        return pd.DataFrame()

    # ---- Stage 2: Full triple search over prefiltered set
    rows = []
    for f1, f2, f3 in combinations(pre, 3):
        d = dfg.dropna(subset=[f1, f2, f3])
        if len(d) < min_rows:
            continue
        mean_var, med_var = knn_local_variance_score(d, [f1, f2, f3], TARGET_COL, KNN_K)
        if np.isfinite(mean_var):
            rows.append({
                "f1": f1, "f2": f2, "f3": f3,
                "n": len(d),
                "mean_knn_var": mean_var,
                "median_knn_var": med_var,
            })

    r = pd.DataFrame(rows)
    if r.empty:
        return r

    # Lowest mean variance first; the median breaks ties.
    r = r.sort_values(["mean_knn_var", "median_knn_var"], ascending=[True, True]).reset_index(drop=True)
    r["rank"] = np.arange(1, len(r) + 1)
    return r

# ============================================================
# Stability aggregation over groups
# ============================================================
def stability_summary_triples(per_group: pd.DataFrame, group_col: str) -> pd.DataFrame:
    """Aggregate per-group triple rankings into one stability row per triple.

    Args:
        per_group: stacked per-group rankings with at least ``f1``, ``f2``,
            ``f3``, ``rank``, ``mean_knn_var`` and ``group_col``.
        group_col: column identifying the group each row came from.

    Returns:
        One row per order-independent triple with rank statistics, variance
        statistics, ``frac_top{K}`` for every K in ``TOPK_LIST`` (share of
        all groups in which the triple ranked within the top K) and the
        triple expanded into ``t1``, ``t2``, ``t3``.

    Note:
        Statistics only cover the rows present in ``per_group``.  If the
        caller stored only the top-N triples per group, ``mean_rank`` is
        computed over the ``n_groups`` groups where the triple appears.
    """
    g = per_group.copy()
    # Canonical (sorted) triple so the same feature set in a different
    # f1/f2/f3 order is aggregated as one entry.  Built with zip rather than
    # a row-wise DataFrame.apply, which is slower.
    g["triple"] = pd.Series(
        [tuple(sorted(t)) for t in zip(g["f1"], g["f2"], g["f3"])],
        index=g.index,
        dtype=object,
    )

    summary = g.groupby("triple").agg(
        n_groups=("rank", "count"),
        mean_rank=("rank", "mean"),
        median_rank=("rank", "median"),
        std_rank=("rank", "std"),
        mean_mean_knn_var=("mean_knn_var", "mean"),
        median_mean_knn_var=("mean_knn_var", "median"),
    ).reset_index()

    nG = g[group_col].nunique()
    for K in TOPK_LIST:
        topk = g[g["rank"] <= K].groupby("triple")[group_col].nunique()
        summary[f"frac_top{K}"] = summary["triple"].map((topk / nG).to_dict()).fillna(0.0)

    # Sort: stable best = low mean rank, high top-K fraction, then low mean var.
    # The tie-break uses frac_top1 when available; otherwise the first K in
    # TOPK_LIST, so the sort no longer fails when 1 is not in TOPK_LIST.
    sort_cols = ["mean_rank"]
    sort_ascending = [True]
    if TOPK_LIST:
        tie_k = 1 if 1 in TOPK_LIST else TOPK_LIST[0]
        sort_cols.append(f"frac_top{tie_k}")
        sort_ascending.append(False)
    sort_cols.append("mean_mean_knn_var")
    sort_ascending.append(True)
    summary = summary.sort_values(sort_cols, ascending=sort_ascending).reset_index(drop=True)

    # Expand triple into columns for readability
    for pos, col in enumerate(("t1", "t2", "t3")):
        summary[col] = [triple[pos] for triple in summary["triple"]]
    return summary

def feature_frequency(best_triple_per_group: pd.DataFrame) -> pd.DataFrame:
    """Count how often each feature occurs in the supplied best triples.

    Args:
        best_triple_per_group: one row per group with the columns ``f1``,
            ``f2``, ``f3`` and ``group_id``.

    Returns:
        Columns ``feature``, ``count_in_best_triples`` and ``frac_groups``
        (count divided by the number of distinct ``group_id`` values).
    """
    triples = zip(
        best_triple_per_group["f1"],
        best_triple_per_group["f2"],
        best_triple_per_group["f3"],
    )
    # Flatten row by row: f1, f2, f3 of the first group, then the next, ...
    names = [name for triple in triples for name in triple]

    counts = pd.Series(names).value_counts()
    table = counts.rename_axis("feature").reset_index(name="count_in_best_triples")

    n_groups = best_triple_per_group["group_id"].nunique()
    table["frac_groups"] = table["count_in_best_triples"] / n_groups
    return table

# ============================================================
# RUN
# ============================================================
# Load exactly the requested (timestep x isolevel) files into one frame.
df = load_isocontour_data_strict(BASE_DIR, PHI, LAT_SIZE, POST, ISOLEVELS, TIME_STEPS, SCALARS)

# Translate the stability axis into the tag column(s) to group on.
group_keys = {
    "timestep": ["__timestep__"],
    "isolevel": ["__isolevel__"],
    "both": ["__timestep__", "__isolevel__"],
}.get(STABILITY_AXIS)
if group_keys is None:
    raise ValueError("STABILITY_AXIS must be one of: 'timestep', 'isolevel', 'both'")

# Rank the feature triples separately inside every group.
per_group_tables = []
for key, dfg in df.groupby(group_keys):
    ranked = rank_triples_in_group(dfg)
    if ranked.empty:
        continue

    # The group key may be a scalar or a tuple; handle both the same way.
    key_values = key if isinstance(key, tuple) else (key,)
    for column, value in zip(group_keys, key_values):
        ranked[column] = value
    group_id = "_".join(str(v) for v in key_values)
    ranked["group_id"] = group_id

    # Only the top-N triples of each group enter the stability statistics.
    ranked = ranked.head(RANK_TOP_TRIPLES)
    per_group_tables.append(ranked)

if len(per_group_tables) == 0:
    raise RuntimeError("No groups produced rankings. Reduce MIN_ROWS_PER_GROUP or check NaNs / filters.")

per_group = pd.concat(per_group_tables, ignore_index=True)

# Per-group triple rankings
per_group.to_csv("stability_per_group_triple_rankings.csv", index=False)
print("Saved: stability_per_group_triple_rankings.csv")

# Best (lowest-rank) triple of every group
ordered = per_group.sort_values(["group_id", "rank"])
best_per_group = ordered.groupby("group_id", as_index=False).first()
best_per_group.to_csv("stability_best_triple_per_group.csv", index=False)
print("Saved: stability_best_triple_per_group.csv")

# Stability statistics per triple
summary = stability_summary_triples(per_group, group_col="group_id")
summary.to_csv("stability_triple_summary.csv", index=False)
print("Saved: stability_triple_summary.csv")

# How often each feature occurs in the best triples
freq = feature_frequency(best_per_group)
freq.to_csv("stability_feature_frequency_in_best_triples.csv", index=False)
print("Saved: stability_feature_frequency_in_best_triples.csv")

# Console report
best_cols = ["group_id","f1","f2","f3","mean_knn_var","median_knn_var","n"]
print("\nBest triple per group:")
print(best_per_group[best_cols].to_string(index=False))

summary_cols = ["t1","t2","t3","n_groups","mean_rank","frac_top1","frac_top3","frac_top5","mean_mean_knn_var"]
print("\nTop 20 most stable triples:")
print(summary.head(20)[summary_cols].to_string(index=False))

print("\nMost frequent features in BEST triples:")
print(freq.head(20).to_string(index=False))


Will read these files (STRICT):
   extracted_flame_front_post_211_iso_3.0.csv
   extracted_flame_front_post_211_iso_3.5.csv
   extracted_flame_front_post_211_iso_3.8.csv
   extracted_flame_front_post_211_iso_4.0.csv
   extracted_flame_front_post_211_iso_4.5.csv
Saved: stability_per_group_triple_rankings.csv
Saved: stability_best_triple_per_group.csv
Saved: stability_triple_summary.csv
Saved: stability_feature_frequency_in_best_triples.csv

Best triple per group:
group_id                       f1                   f2                   f3  mean_knn_var  median_knn_var    n
     3.0                curvature          strain_rate flow_velocity_normal      0.021552        0.003118 3331
     3.5                curvature          strain_rate flow_velocity_normal      0.026315        0.003946 3553
     3.8                curvature flow_velocity_normal          strain_rate      0.041639        0.005204 3781
     4.0     flow_velocity_normal            curvature          strain_rate      0.041520