In [4]:
# ============================================================
# Symbolic regression for Sd (DW_FDS) using NEW I/O
#   - Loads extracted flame-front CSV(s) via flamekit.io_fronts
#   - Uses ONLY curvature + stretch_rate as inputs
#   - Fits PySRRegressor to predict DW_FDS
#
# Notes:
#   • This notebook-level workflow no longer needs mpi4py/SEMDataset
#     because the CSVs are already extracted.
#   • If you still want MPI (e.g., to run different ISOLEVELS per rank),
#     you can wrap the loop over ISOLEVELS by rank.
# ============================================================

from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd

from flamekit.io_fronts import Case, load_fronts

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from pysr import PySRRegressor

## Input Parameters
# Every constant below is read by name in the later sections of this script.

# Root directory handed to Case(base_dir=...).
BASE_DIR = Path("../isocontours")

# Case selectors, forwarded unchanged to Case(phi=..., lat_size=...,
# time_step=..., post=...).
# NOTE(review): the physical meaning of each field is defined by
# flamekit.io_fronts.Case, not here.
PHI = 0.40
LAT_SIZE = "200"
TIME_STEP = 230
POST = True

# Isolevels requested from load_fronts(); the frames for all listed levels are
# concatenated into a single table before fitting.
ISOLEVELS = [4.4, 4.5, 4.6, 4.7]

# Target + features
# Column names that must exist in the loaded CSVs (checked before fitting).
TARGET_VAR = "DW_FDS"
FEATURES = ["curvature", "stretch_rate"]

# When True, each row gets a "c_iso" column recording the isolevel it came
# from. It is bookkeeping only: "c_iso" is not in FEATURES, so it is not a
# model input.
ADD_ISO_COLUMN = True

# Data cleaning / sampling
# DROP_NANS: treat +/-inf as NaN and drop rows with a missing feature/target.
DROP_NANS = True
# MAX_ROWS: random subsample to at most this many rows (seeded by RANDOM_STATE).
MAX_ROWS = None  # e.g. 300000 for faster runs; None uses all
# Seed shared by the row subsampling, the train/test split and PySR.
RANDOM_STATE = 0

# Scaling: often helpful for symbolic regression stability.
# The scalers are fitted on the training split only.
STANDARDIZE_X = True  # z-score X features
STANDARDIZE_Y = False  # usually keep y in physical units; set True if needed

# Fraction of rows held out for testing (train_test_split test_size).
TEST_SIZE = 0.25

# -------------------------
# Read the extracted flame fronts
# -------------------------
case = Case(
    base_dir=BASE_DIR, phi=PHI, lat_size=LAT_SIZE, time_step=TIME_STEP, post=POST
)

# load_fronts yields a mapping {isolevel: DataFrame}
fronts = load_fronts(case, ISOLEVELS)

# Stack every selected isolevel into one table. Each piece is a fresh frame, so
# the DataFrames held in `fronts` are left untouched; when requested, the piece
# is tagged with the isolevel it belongs to.
df_list = [
    dfi.assign(c_iso=float(iso)) if ADD_ISO_COLUMN else dfi.copy()
    for iso, dfi in fronts.items()
]

df = pd.concat(df_list, ignore_index=True)

# -------------------------
# Sanity checks and row filtering
# -------------------------
# Columns the fit cannot proceed without.
required_cols = [*FEATURES, TARGET_VAR]

missing_cols = [name for name in required_cols if name not in df.columns]
if missing_cols:
    raise ValueError(f"Missing required columns in loaded CSVs: {missing_cols}")

if DROP_NANS:
    # Treat +/-inf as missing, then discard rows lacking a feature or the target.
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=required_cols)

# Optional seeded down-sampling to keep the search fast.
if MAX_ROWS is not None and len(df) > MAX_ROWS:
    df = df.sample(n=MAX_ROWS, random_state=RANDOM_STATE)
    df = df.reset_index(drop=True)

# Row count is reported after cleaning/sampling.
print("Loaded rows:", len(df))
print("Columns:", list(df.columns))

# -------------------------
# Build the design matrix and target, split, and scale
# -------------------------
y = df[TARGET_VAR].to_numpy(dtype=np.float64)
X_df = df[FEATURES].copy()

# Hold out a seeded test subset.
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# A scaler stays None when its switch is off, so later code can test for it.
x_scaler = StandardScaler() if STANDARDIZE_X else None
y_scaler = StandardScaler() if STANDARDIZE_Y else None

if x_scaler is not None:
    # Statistics come from the training rows only; the test rows reuse them.
    train_scaled = x_scaler.fit_transform(X_train)
    test_scaled = x_scaler.transform(X_test)
    X_train, X_test = (
        pd.DataFrame(block, columns=FEATURES)
        for block in (train_scaled, test_scaled)
    )

if y_scaler is not None:
    # StandardScaler works on 2-D input: lift to a column, then flatten back.
    y_train_col = y_train.reshape(-1, 1)
    y_test_col = y_test.reshape(-1, 1)
    y_train = y_scaler.fit_transform(y_train_col).ravel()
    y_test = y_scaler.transform(y_test_col).ravel()

print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_test  shape:", X_test.shape, "y_test  shape:", y_test.shape)

## PySR symbolic regression (DW_FDS ~ f(curvature, stretch_rate))

# -------------------------
# PySR model
# -------------------------
# Start with a small operator set; widen it once a baseline exists.
# No loss is specified here, so PySR's default is used. A squared-error loss
# such as "loss(x, y) = (x-y)^2" is NOT robust to outliers; if outliers
# dominate, supply an absolute-error style loss instead
# (NOTE(review): the loss keyword name depends on the PySR version -- verify).
# NOTE(review): PySR documents that `random_state` alone does not make the
# search reproducible; deterministic mode with serial execution is also
# required -- confirm against the installed version if repeatability matters.
pysr_options = dict(
    # Search budget
    niterations=80,
    populations=30,
    population_size=40,
    # Batched evaluation settings
    batching=True,
    batch_size=1024,
    # Operators the search may combine
    binary_operators=["+", "-", "*", "/"],
    unary_operators=["sin", "cos", "exp"],
    # Complexity cap on candidate expressions
    maxsize=15,
    # Criterion for picking the final equation; PySR also documents
    # "accuracy" and "score" as alternatives.
    model_selection="best",
    # Dimensional-consistency constraints could be added here; skipped for now.
    random_state=RANDOM_STATE,
    verbosity=1,
)

model = PySRRegressor(**pysr_options)
model.fit(X_train, y_train)

print(model)

## Evaluate on test set
from sklearn.metrics import r2_score, mean_squared_error


def _r2_and_rmse(truth, guess):
    """Return (R^2, RMSE) of *guess* against *truth* as plain Python floats."""
    mse = mean_squared_error(truth, guess)
    return float(r2_score(truth, guess)), float(np.sqrt(mse))


y_pred = model.predict(X_test)

# Metrics in whatever units y was fitted in (standardized if STANDARDIZE_Y).
r2, rmse = _r2_and_rmse(y_test, y_pred)

print(f"Test R^2:  {r2:.4f}")
print(f"Test RMSE: {rmse:.6e}")

# With a standardized target, also report the metrics after mapping both the
# predictions and the reference values back through the y scaler.
if STANDARDIZE_Y and y_scaler is not None:
    y_pred_phys, y_test_phys = (
        y_scaler.inverse_transform(values.reshape(-1, 1)).ravel()
        for values in (y_pred, y_test)
    )
    r2_phys, rmse_phys = _r2_and_rmse(y_test_phys, y_pred_phys)
    print(f"[De-standardized] Test R^2:  {r2_phys:.4f}")
    print(f"[De-standardized] Test RMSE: {rmse_phys:.6e}")

## Best equation (SymPy form)
expr = model.sympy()
print(expr)

## If X and/or y were standardized: express equation in ORIGINAL units
# The model was fitted on z-scored data, so its expression is written in terms
# of scaled quantities:
#   x_scaled = (x - mean_x) / std_x      (when STANDARDIZE_X)
#   y_scaled = (y - mean_y) / std_y      (when STANDARDIZE_Y)
# To read it in physical variables, every feature symbol is replaced by its
# z-score formula, and the result is mapped back through the y scaler:
#   y = y_scaled * std_y + mean_y

import sympy as sp


def _to_physical_units(expr_fit, feature_names, x_scaler=None, y_scaler=None):
    """Rewrite a fitted SymPy expression in terms of unscaled features/target.

    Parameters
    ----------
    expr_fit : sympy expression
        Expression returned by ``model.sympy()``. Assumes its symbols carry the
        names of the training DataFrame columns.
    feature_names : sequence of str
        Feature names, in the column order the x scaler was fitted with.
    x_scaler, y_scaler : fitted StandardScaler or None
        Scalers applied to the features / target before fitting. ``None``
        means that side was not standardized and is left as is.

    Returns
    -------
    sympy expression
        Simplified expression mapping physical features to the physical target.
    """
    expr_out = expr_fit

    if x_scaler is not None:
        # One substitution per feature, so any number of features is handled.
        subs_map = {
            sp.Symbol(name): (sp.Symbol(name) - float(mean)) / float(scale)
            for name, mean, scale in zip(
                feature_names, x_scaler.mean_, x_scaler.scale_
            )
        }
        # Each replacement contains the very symbol it replaces, so substitute
        # simultaneously to rule out any re-substitution.
        expr_out = expr_out.subs(subs_map, simultaneous=True)

    if y_scaler is not None:
        # The fitted expression predicts standardized y; undo that scaling.
        expr_out = expr_out * float(y_scaler.scale_[0]) + float(y_scaler.mean_[0])

    return sp.simplify(expr_out)


# Convert only if some scaling was actually applied before the fit.
if x_scaler is not None or y_scaler is not None:
    expr_scaled = expr
    expr_phys = _to_physical_units(expr_scaled, FEATURES, x_scaler, y_scaler)

    print("\nExpression in original units:")
    print(expr_phys)


Loaded rows: 77124
Columns: ['x', 'y', 'z', 'v', 'T', 'H2', 'O2', 'H2O', 'H', 'O', 'OH', 'HO2', 'H2O2', 'N2', 'HRR', 'curvature', 'stretch_rate', 'DW_FDS', 'abs_flame_prop_vel_normal', 'flow_velocity_normal', 'flow_velocity_tangential', 'strain_rate', 'tangential_strain_rate', 'normal_strain_rate', 'density_ratio_sigma', 'gradT', 'gradT_normal', 'total_heat_conduction', 'heat_conduction_normal', 'heat_conduction_tangential', 'FDS_src_term', 'FDS_diffusion_term', 'FDS_diff_velocity_term', 'H2_diffusion_total', 'O2_diffusion_total', 'H_diffusion_total', 'vorticity', 'p', 'phi_loc', 'omega_H2', 'omega_O2', 'omega_H2O', 'omega_H', 'omega_O', 'omega_OH', 'omega_HO2', 'omega_H2O2', 'omega_N2', 'u', 'c_iso']
X_train shape: (57843, 2) y_train shape: (57843,)
X_test  shape: (19281, 2) y_test  shape: (19281,)


[ Info: Started!
[ Info: Final population:
[ Info: Results saved to:



Expressions evaluated per second: 1.270e+04
Progress: 920 / 2400 total iterations (38.333%)
════════════════════════════════════════════════════════════════════════════════════════════════════
───────────────────────────────────────────────────────────────────────────────────────────────────
Complexity  Loss       Score      Equation
1           4.286e-01  0.000e+00  y = 0.44238
3           4.148e-01  1.632e-02  y = cos(cos(curvature))
4           3.903e-01  6.091e-02  y = sin(exp(cos(stretch_rate)))
5           3.670e-01  6.146e-02  y = cos(cos(curvature) - -0.26028)
6           3.525e-01  4.026e-02  y = sin(exp(cos(stretch_rate / -0.27253)))
7           3.147e-01  1.134e-01  y = cos(-0.26331 - cos(stretch_rate / 0.088341))
9           2.949e-01  3.255e-02  y = (stretch_rate / ((0.019169 / curvature) + curvature)) ...
                                      * 3.4356
10          2.317e-01  2.412e-01  y = 4.7991 * sin(stretch_rate / ((0.022924 / curvature) + ...
                         