In [7]:
"""
Mechanistic vs ML vs Hybrid WTP modelling using SCADA data.

- PART A: Mechanistic conventional WTP model with WaterTAP (DMF)
- PART B: Pure data-driven conventional WTP model (Random Forest)
- PART C: Hybrid model: ML learns residuals on top of mechanistic DMF for TSS_out
"""

import pandas as pd
import numpy as np

from idaes.core import FlowsheetBlock
from watertap.core.solvers import get_solver
from watertap.core.wt_database import Database
from watertap.core.zero_order_properties import WaterParameterBlock
from watertap.unit_models.zero_order import DualMediaFiltrationZO
from pyomo.environ import ConcreteModel

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [17]:
# ============================================================================
# PART A – MECHANISTIC MODEL (WaterTAP)
# ============================================================================

def build_dmf_flowsheet():
    """
    Build a simple DualMediaFiltration zero-order mechanistic model.

    This is a purely physical model: no SCADA, no ML.
    """
    m = ConcreteModel()
    m.db = Database()

    # Flowsheet & property package
    m.fs = FlowsheetBlock(dynamic=False)
    m.fs.params = WaterParameterBlock(
        solute_list=["nonvolatile_toc", "toc", "tss"]
    )

    # Unit model: dual media filter
    m.fs.unit = DualMediaFiltrationZO(
        property_package=m.fs.params,
        database=m.db,
    )

    # Load default parameters
    m.fs.unit.load_parameters_from_database()

    return m


def set_dmf_inlet_from_row(m, row):
    """
    Map a SCADA row to the DMF inlet conditions.

    Assumes SCADA CSV has:
        - flow_mass_H2O_kg_s
        - flow_mass_TSS_in_kg_s
        - flow_mass_TOC_in_kg_s
        - flow_mass_NVTOC_in_kg_s
    """
    m.fs.unit.inlet.flow_mass_comp[0, "H2O"].fix(float(row["flow_mass_H2O_kg_s"]))
    m.fs.unit.inlet.flow_mass_comp[0, "tss"].fix(float(row["flow_mass_TSS_in_kg_s"]))
    m.fs.unit.inlet.flow_mass_comp[0, "toc"].fix(float(row["flow_mass_TOC_in_kg_s"]))
    m.fs.unit.inlet.flow_mass_comp[0, "nonvolatile_toc"].fix(
        float(row["flow_mass_NVTOC_in_kg_s"])
    )


def simulate_mechanistic_for_dataset(m, df, max_rows=None):
    """
    Run the mechanistic DMF model for a subset of SCADA rows.

    Returns a DataFrame with a single column:
        'mech_TSS_out_kg_s'

    If the solver (e.g. Pynumero) is not available, returns None and
    the rest of the pipeline can still run (pure ML).
    """
    solver = None
    try:
        solver = get_solver()
    except RuntimeError as err:
        print("\n[Mechanistic] Could not obtain WaterTAP solver:")
        print(f"  {err}")
        print("Skipping mechanistic simulation. Pure ML will still work.\n")
        return None

    # Optional: limit the number of rows to speed up
    if max_rows is not None:
        df_use = df.iloc[:max_rows].copy()
    else:
        df_use = df.copy()

    mech_tss_out = []

    for idx, row in df_use.iterrows():
        try:
            set_dmf_inlet_from_row(m, row)
            solver.solve(m)
            # Extract mechanistic treated TSS mass flow
            tss_out = (
                m.fs.unit.properties_treated[0]
                .flow_mass_comp["tss"]
                .value
            )
        except Exception as e:
            print(f"[Mechanistic] Row {idx} failed to solve: {e}")
            tss_out = np.nan

        mech_tss_out.append((idx, tss_out))

    mech_df = pd.DataFrame(
        {
            "index": [i for i, _ in mech_tss_out],
            "mech_TSS_out_kg_s": [v for _, v in mech_tss_out],
        }
    ).set_index("index")

    return mech_df


# ============================================================================
# PART B – PURE ML MODEL (Random Forest on SCADA)
# ============================================================================

def load_scada_data(csv_path: str) -> pd.DataFrame:
    """
    Load SCADA-style training data from CSV.
    """
    df = pd.read_csv('wtp_watertap_training_dataset.csv')

    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"])

    df = df.dropna()  # simple cleanup for now
    return df


def get_feature_and_target_cols():
    """
    Define which columns are used as inputs (X) and outputs (y).

    Features (inputs): process conditions & raw water quality.
    Targets (outputs): plant performance indicators.
    """
    feature_cols = [
        "flow_mass_H2O_kg_s",
        "flow_mass_TSS_in_kg_s",
        "flow_mass_TOC_in_kg_s",
        "flow_mass_NVTOC_in_kg_s",
        "raw_temperature_C",
        "initial_turbidity_ntu",
        "coagulant_dose_mgL_as_Al",
        "polymer_dose_mgL",
        "rapid_mix_G_per_s",
        "rapid_mix_time_s",
        "floc_G_per_s",
        "floc_detention_time_min",
        "chlorine_dose_mgL",
        "contact_time_min",
    ]

    target_cols = [
        "flow_mass_TSS_out_kg_s",
        "filtered_turbidity_NTU",
        "free_chlorine_residual_mgL",
    ]

    return feature_cols, target_cols


def train_pure_ml_model(df: pd.DataFrame):
    """
    Train a pure data-driven WTP model (Random Forest) on SCADA.

    This model DOES NOT know about WaterTAP or physics.
    """
    feature_cols, target_cols = get_feature_and_target_cols()

    X = df[feature_cols].values
    y = df[target_cols].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf = RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
    )
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_test)
    
    # Older sklearn versions don't support "squared" kwarg, so do it manually
    mse_global = mean_squared_error(y_test, y_pred)  # MSE averaged over outputs
    rmse_global = np.sqrt(mse_global)
    
    # RMSE per target (same as before)
    rmse_per_target = np.sqrt(((y_pred - y_test) ** 2).mean(axis=0))


    print("\n=== PURE ML MODEL (Random Forest) – evaluation ===")
    print(f"Global RMSE (combined outputs): {rmse_global:.4f}")
    for col, val in zip(target_cols, rmse_per_target):
        print(f"  RMSE[{col}]: {val:.6f}")

    return rf, feature_cols, target_cols


# ============================================================================
# PART C – HYBRID MODEL (Mechanistic + ML residual for TSS)
# ============================================================================

def train_residual_ml_for_tss(df: pd.DataFrame, mech_df: pd.DataFrame):
    """
    Hybrid scheme:
        residual_TSS = SCADA_TSS_out - mechanistic_TSS_out

    Train an ML model to predict residual_TSS from SCADA features.

    At prediction time:
        TSS_out_hybrid = TSS_out_mechanistic + residualML_pred

    NOTE: This hybridisation only applies to TSS_out, because the DMF
          model naturally provides a solids removal prediction.
    """
    if mech_df is None:
        print("\n[Hybrid] Mechanistic results not available. Skipping residual model.")
        return None, None

    feature_cols, target_cols = get_feature_and_target_cols()

    # Align SCADA and mechanistic outputs by row index
    df_merged = df.join(mech_df, how="inner")
    df_merged = df_merged.dropna(subset=["mech_TSS_out_kg_s", "flow_mass_TSS_out_kg_s"])

    if df_merged.empty:
        print("\n[Hybrid] No overlapping rows for SCADA & mechanistic outputs.")
        return None, None

    # Compute residual
    df_merged["residual_TSS_out_kg_s"] = (
        df_merged["flow_mass_TSS_out_kg_s"]
        - df_merged["mech_TSS_out_kg_s"]
    )

    X = df_merged[feature_cols].values
    y = df_merged["residual_TSS_out_kg_s"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rf_resid = RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
    )
    rf_resid.fit(X_train, y_train)

    y_pred = rf_resid.predict(X_test)
    mse_resid = mean_squared_error(y_test, y_pred)
    rmse_resid = np.sqrt(mse_resid)


    print("\n=== HYBRID MODEL (Residual RF on top of DMF) – evaluation ===")
    print(f"RMSE[residual_TSS_out_kg_s]: {rmse_resid:.6f}")

    # Compare pure ML vs hybrid on the same subset (optional quick check)
    # Reconstruct TSS predictions: mech + residual
    # (Here we use the test set rows only)
    # For a more rigorous comparison you could restructure this.
    return rf_resid, feature_cols


# ============================================================================
# MAIN – orchestrate everything and show how they relate
# ============================================================================

def main():
    # 1) Load SCADA data
    csv_path = "wtp_watertap_training_dataset.csv"  # adjust as needed
    df = load_scada_data(csv_path)
    print(f"\nLoaded SCADA dataset with shape: {df.shape}")

    # 2) Build mechanistic DMF model
    mech_model = build_dmf_flowsheet()

    # 3) Run mechanistic model for a subset of SCADA rows (e.g., first 200)
    mech_df = simulate_mechanistic_for_dataset(mech_model, df, max_rows=200)
    if mech_df is not None:
        print(f"[Mechanistic] Computed DMF outputs for {len(mech_df)} rows.")

    # 4) Train pure ML model (does NOT use mechanistic results)
    rf_pure, feature_cols, target_cols = train_pure_ml_model(df)

    # 5) Train hybrid residual model for TSS_out (mechanistic → ML)
    rf_resid, feature_cols_resid = train_residual_ml_for_tss(df, mech_df)

    # 6) Example: show predictions of each approach for a single row
    example_idx = df.index[-1]
    example_row = df.loc[example_idx]

    X_example = example_row[feature_cols].values.reshape(1, -1)
    pure_pred = rf_pure.predict(X_example)[0]  # [TSS_out, turbidity, chlorine]

    print("\n=== EXAMPLE PREDICTION (last SCADA row) ===")
    print(f"Row index: {example_idx}")

    print("\nObserved (SCADA):")
    for col in target_cols:
        print(f"  {col}: {example_row[col]}")

    print("\nPure ML prediction:")
    for col, val in zip(target_cols, pure_pred):
        print(f"  {col}: {val}")

    # Mechanistic and hybrid predictions for TSS only (if available)
    if mech_df is not None and example_idx in mech_df.index:
        mech_tss = mech_df.loc[example_idx, "mech_TSS_out_kg_s"]
        print(f"\nMechanistic TSS_out (DMF only): {mech_tss}")

        if rf_resid is not None:
            resid_pred = rf_resid.predict(X_example)[0]
            hybrid_tss = mech_tss + resid_pred
            print(f"Hybrid TSS_out (DMF + residual ML): {hybrid_tss}")

In [19]:
if __name__ == "__main__":
    main()


Loaded SCADA dataset with shape: (720, 29)
[Mechanistic] Computed DMF outputs for 200 rows.

=== PURE ML MODEL (Random Forest) – evaluation ===
Global RMSE (combined outputs): 0.0496
  RMSE[flow_mass_TSS_out_kg_s]: 0.000040
  RMSE[filtered_turbidity_NTU]: 0.021625
  RMSE[free_chlorine_residual_mgL]: 0.083232

=== HYBRID MODEL (Residual RF on top of DMF) – evaluation ===
RMSE[residual_TSS_out_kg_s]: 0.000036

=== EXAMPLE PREDICTION (last SCADA row) ===
Row index: 719

Observed (SCADA):
  flow_mass_TSS_out_kg_s: 0.0002190021266849
  filtered_turbidity_NTU: 0.0979893549872773
  free_chlorine_residual_mgL: 0.7350885247556416

Pure ML prediction:
  flow_mass_TSS_out_kg_s: 0.00020332642276313197
  filtered_turbidity_NTU: 0.08827934921079757
  free_chlorine_residual_mgL: 0.7583723184705063


In [21]:
import numpy as np
import matplotlib.pyplot as plt

# ---------------------------------------------------------
# 1. Build evaluation dataset with SCADA + mechanistic data
# ---------------------------------------------------------

if mech_df is None:
    raise RuntimeError(
        "Mechanistic results (mech_df) are not available. "
        "Run the mechanistic simulation part first."
    )

# Join SCADA (df) and mechanistic outputs (mech_df) on the index
df_eval = df.join(mech_df, how="inner").copy()

# We need all target columns and mechanistic TSS_out for plotting
needed_cols = [
    "flow_mass_TSS_out_kg_s",
    "filtered_turbidity_NTU",
    "free_chlorine_residual_mgL",
    "mech_TSS_out_kg_s",
]
df_eval = df_eval.dropna(subset=needed_cols)

print(f"Evaluation dataset size: {len(df_eval)} rows")

# Optional: limit number of points for clean plots
max_points = 200
df_eval = df_eval.iloc[:max_points].copy()

# ---------------------------------------------------------
# 2. Prepare features, true values, and RF predictions
# ---------------------------------------------------------

X_eval = df_eval[feature_cols].values
y_true = df_eval[target_cols].values           # shape (n_samples, 3)
y_rf   = rf_pure.predict(X_eval)               # shape (n_samples, 3)

idx = np.arange(len(df_eval))                  # simple x-axis

# ---------------------------------------------------------
# 3. Plot for each parameter:
#    Actual vs WaterTAP vs RF
#    (WaterTAP only available for TSS_out)
# ---------------------------------------------------------

for i, name in enumerate(target_cols):
    plt.figure(figsize=(10, 5))

    # Actual SCADA values
    plt.plot(idx, y_true[:, i], label="Actual (SCADA)", linewidth=2)

    # WaterTAP mechanistic prediction (only for TSS_out)
    if name == "flow_mass_TSS_out_kg_s":
        plt.plot(
            idx,
            df_eval["mech_TSS_out_kg_s"].values,
            label="WaterTAP (mechanistic)",
            linestyle="--",
        )

    # RF prediction
    plt.plot(
        idx,
        y_rf[:, i],
        label="RF prediction",
        linestyle="-.",
    )

    plt.title(name)
    plt.xlabel("Sample index")
    plt.ylabel(name)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


NameError: name 'mech_df' is not defined