In [2]:
# Create a fully self-contained notebook that loads FishBase from Hugging Face (cboettig/fishbase),
# builds features, trains models, and benchmarks vs Pauly/Hoenig/Then.
#
# The notebook uses huggingface_hub (preferred) or DuckDB HTTPFS as a fallback.
# It includes robust column aliasing to handle small schema differences across versions.
#
import nbformat as nbf
from pathlib import Path

# Start from an empty nbformat-v4 notebook; `cells` collects the cell nodes in display order.
nb = nbf.v4.new_notebook()
cells = []

# Title cell (markdown): names the dataset, the prediction targets and the benchmark estimators,
# and outlines the four pipeline stages implemented by the code cells that follow.
cells.append(nbf.v4.new_markdown_cell("""
# FishBase × AI+Biology (HuggingFace Edition)
**Dataset:** [`cboettig/fishbase`](https://huggingface.co/datasets/cboettig/fishbase)  •  **Focus:** Growth (K, L∞) & Natural Mortality (M) prediction  •  **Benchmarks:** Pauly (1980), Hoenig/Hewitt (2005), Then et al. (2015)

This notebook provides a **reproducible pipeline**:
1) Load FishBase tables (`species`, `ecology`, `popgrowth`, `popchar`) from **Hugging Face** (Parquet)  
2) Build a **feature store** with biologically-meaningful variables  
3) Train ML models (**GLM / XGBoost / CatBoost / Tabular DL-ready**)  
4) Benchmark vs classic estimators **Pauly/Hoenig/Then** with **GroupKFold by Family**

> ⚠️ Tip: The dataset repo is **versioned** (e.g., `v24.07`). Pin a specific folder or a commit `revision` for perfect reproducibility.
"""))

# Setup & Config
# Generated cell 0: imports, optional-dependency sentinels and global configuration.
# - xgboost / catboost are optional: on import failure the names `xgb` / `CatBoostRegressor`
#   are bound to None so later cells can test for availability.
# - REPO_ID / FB_VER / PARQUET_BASE locate the parquet files inside the Hugging Face repo.
# - DATA_DIR is created on execution and receives the feature store and result CSVs.
cells.append(nbf.v4.new_markdown_cell("## 0) Setup & Config"))
cells.append(nbf.v4.new_code_cell("""
# If needed, uncomment to install dependencies (run-time environment dependent)
# %pip install -q pandas numpy scikit-learn xgboost catboost duckdb huggingface_hub pyarrow

import os, math, json, itertools, typing as T, warnings
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Optional (will skip gracefully if unavailable)
try:
    import xgboost as xgb
except Exception:
    xgb = None
try:
    from catboost import CatBoostRegressor
except Exception:
    CatBoostRegressor = None

# Hugging Face config
REPO_ID  = "cboettig/fishbase"
FB_VER   = "v24.07"  # pin a version folder within the repo (e.g., v24.07)
PARQUET_BASE = f"data/fb/{FB_VER}/parquet"

DATA_DIR = Path("data_fb_ai_bio")
DATA_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
"""))

# Data Loading
# Generated cell 1: fetch the four FishBase parquet tables. The Hugging Face Hub client is
# tried first and DuckDB's httpfs reader second; a RuntimeError is raised only if both fail.
cells.append(nbf.v4.new_markdown_cell("## 1) Load FishBase tables from Hugging Face"))
cells.append(nbf.v4.new_code_cell("""
from pathlib import Path

def load_parquet_hf(path: str) -> T.Optional[pd.DataFrame]:
    \"\"\"Download one parquet file from the Hugging Face dataset repo and read it.

    Returns None (after printing the reason) when huggingface_hub is missing or the
    download/read fails, so the caller can try the next loader.\"\"\"
    try:
        from huggingface_hub import hf_hub_download
        # REPO_ID names a *dataset* repo; without repo_type the Hub looks for a model repo
        # of that name, the download fails, and this loader would never succeed.
        fp = hf_hub_download(repo_id=REPO_ID, filename=path, repo_type="dataset")
        return pd.read_parquet(fp)
    except Exception as e:
        print("[HF Hub] Fallback or error:", e)
        return None

def load_parquet_httpfs(path: str) -> T.Optional[pd.DataFrame]:
    \"\"\"Read one parquet file over HTTP with DuckDB's httpfs extension.

    Returns None (after printing the reason) when duckdb is missing or the read fails.\"\"\"
    try:
        import duckdb
        duckdb.sql("INSTALL httpfs; LOAD httpfs;")
        url = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/{path}?download=true"
        return duckdb.sql(f\"\"\"SELECT * FROM read_parquet('{url}')\"\"\").df()
    except Exception as e:
        print("[DuckDB HTTPFS] Fallback or error:", e)
        return None

def load_fb_table(name: str) -> pd.DataFrame:
    \"\"\"Load FishBase table `name` from PARQUET_BASE, trying the Hub first and DuckDB second.

    Raises RuntimeError when neither loader returns a frame.\"\"\"
    rel = f\"{PARQUET_BASE}/{name}.parquet\"
    df = load_parquet_hf(rel)
    if df is None:
        df = load_parquet_httpfs(rel)
    if df is None:
        raise RuntimeError(f"Cannot load {name} via HF or HTTPFS. Please check connectivity or install deps.")
    return df

species  = load_fb_table("species")
ecology  = load_fb_table("ecology")
popgrowth= load_fb_table("popgrowth")
popchar  = load_fb_table("popchar")

species.head(), ecology.head(), popgrowth.head(), popchar.head()
"""))

# Schema & Aliases
# Generated cell 2: column-name resolution helpers.
# - first_col(df, candidates, required) returns the first candidate that is a column of df
#   (exact, case-sensitive match), None if none match, or raises KeyError when required=True.
# - ALIASES maps each canonical key to the list of column spellings that are accepted for it.
# - alias(df, key) is first_col applied to ALIASES[key].
# - orig_cols records the raw column names of the four loaded tables for inspection.
cells.append(nbf.v4.new_markdown_cell("## 2) Schema checks & column aliases"))
cells.append(nbf.v4.new_code_cell("""
def first_col(df, candidates: T.List[str], required=False):
    for c in candidates:
        if c in df.columns:
            return c
    if required:
        raise KeyError(f"None of {candidates} found in columns: {list(df.columns)[:20]}...")
    return None

# Aliases to handle minor schema differences across FishBase snapshots
ALIASES = {
    "SpecCode": ["SpecCode", "SpecCode_x", "SpecCode_y"],
    "Family": ["Family"],
    "Order": ["Order"],
    "Class": ["Class"],
    "BodyShapeI": ["BodyShapeI", "BodyShape"],
    "DemersPelag": ["DemersPelag", "DemersPelagics"],
    "AnaCat": ["AnaCat"],
    "EnvTemp": ["EnvTemp"],
    "DepthRangeShallow": ["DepthRangeShallow", "DepthShallow"],
    "DepthRangeDeep": ["DepthRangeDeep", "DepthDeep"],
    "Fresh": ["Fresh"],
    "Brack": ["Brack"],
    "Saltwater": ["Saltwater"],
    "Length": ["Length"],
    "LTypeMaxM": ["LTypeMaxM"],
    "Weight": ["Weight"],
    "LongevityWild": ["LongevityWild", "Longevity"],

    # Ecology
    "FoodTroph": ["FoodTroph"],
    "DietTroph": ["DietTroph"],
    "FoodSeTroph": ["FoodSeTroph"],
    "DietSeTroph": ["DietSeTroph"],

    # Popgrowth
    "Loo": ["Loo", "Linf", "Linf_cm", "Linf_cm_", "Linfinity"],
    "K": ["K"],
    "to": ["to", "t0"],
    "M": ["M"],
    "tm": ["tm", "tm50"],
    "Lm": ["Lm", "Lm50"],

    # Popchar
    "tmax": ["tmax","Tmax","LongevityWild"],  # fallback if tmax absent
    "Lmax": ["Lmax"],
    "Wmax": ["Wmax"],
}

def alias(df: pd.DataFrame, key: str, required=False):
    return first_col(df, ALIASES[key], required=required)

# Keep a copy of original column names for reference
orig_cols = {
    "species": list(species.columns),
    "ecology": list(ecology.columns),
    "popgrowth": list(popgrowth.columns),
    "popchar": list(popchar.columns),
}
orig_cols
"""))

# Feature Engineering
# Generated cell 3: join the four tables into one feature store, derive the temperature proxy,
# the log-scale targets and tmax_any, and persist the result to DATA_DIR/feature_store.parquet.
cells.append(nbf.v4.new_markdown_cell("## 3) Feature engineering & targets"))
cells.append(nbf.v4.new_code_cell("""
def map_envtemp_to_T(envtemp: str) -> float:
    \"\"\"Map an EnvTemp label to the temperature proxy used by the Pauly baseline.

    Missing or unrecognised labels yield NaN.\"\"\"
    m = {"Tropical":27.0, "Subtropical":20.0, "Temperate":12.0, "Polar":2.0}
    if pd.isna(envtemp):
        return np.nan
    return m.get(str(envtemp).strip().title(), np.nan)

def coerce_numeric(df: pd.DataFrame, cols: T.List[str]):
    \"\"\"Convert the listed columns (those present) to numeric in place; bad values become NaN.\"\"\"
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def collapse_per_species(df: pd.DataFrame, how: str) -> pd.DataFrame:
    \"\"\"Reduce a lookup table to at most one row per SpecCode using aggregation `how`.

    A table with several rows per SpecCode would multiply rows in a left join; collapsing
    first keeps the join one-to-one. Tables that are already unique come back unchanged in
    content. Frames without a SpecCode column are returned untouched.\"\"\"
    if "SpecCode" not in df.columns:
        return df
    df = df.dropna(subset=["SpecCode"])
    value_cols = [c for c in df.columns if c != "SpecCode"]
    if not value_cols:
        return df.drop_duplicates(subset=["SpecCode"])
    return df.groupby("SpecCode", as_index=False)[value_cols].agg(how)

def build_feature_store(species, ecology, popgrowth, popchar) -> pd.DataFrame:
    \"\"\"Build, save and return the unified feature table.

    Rows follow species x popgrowth records; ecology (mean) and popchar (max) are collapsed
    to one row per SpecCode before joining. Raises KeyError when `species` has no SpecCode
    column. Side effect: writes DATA_DIR/feature_store.parquet.\"\"\"
    alias(species, "SpecCode", required=True)  # fail early with a readable message

    sp_keys = ["SpecCode","Family","Order","Class","BodyShapeI","DemersPelag","AnaCat","EnvTemp",
               "DepthRangeShallow","DepthRangeDeep","Fresh","Brack","Saltwater","Length","LTypeMaxM",
               "Weight","LongevityWild"]
    ec_keys = ["SpecCode","FoodTroph","FoodSeTroph","DietTroph","DietSeTroph"]
    pg_keys = ["SpecCode","Loo","K","to","M","tm","Lm"]
    pc_keys = ["SpecCode","tmax","Lmax","Wmax"]

    def select(table: pd.DataFrame, keys: T.List[str]) -> pd.DataFrame:
        # Keep only the aliased columns that exist, renamed to their canonical key.
        mapping = {}
        for k in keys:
            c = alias(table, k)
            if c and c not in mapping:
                mapping[c] = k
        if not mapping:
            return pd.DataFrame(columns=["SpecCode"])
        out = table[list(mapping)].rename(columns=mapping).copy()
        if "SpecCode" in out.columns:
            # Join keys must share a dtype across tables.
            out["SpecCode"] = pd.to_numeric(out["SpecCode"], errors="coerce")
        return out

    sp1 = coerce_numeric(select(species, sp_keys),
                         ["DepthRangeShallow","DepthRangeDeep","Length","Weight","LongevityWild",
                          "Fresh","Brack","Saltwater"])
    ec1 = coerce_numeric(select(ecology, ec_keys), ["FoodTroph","FoodSeTroph","DietTroph","DietSeTroph"])
    pg1 = coerce_numeric(select(popgrowth, pg_keys), ["Loo","K","to","M","tm","Lm"])
    pc1 = coerce_numeric(select(popchar, pc_keys), ["tmax","Lmax","Wmax"])

    # ecology / popchar may hold several rows per species; joining them next to popgrowth
    # as-is would duplicate growth records (cartesian product per SpecCode).
    ec1 = collapse_per_species(ec1, "mean")
    pc1 = collapse_per_species(pc1, "max")

    # Merge tables on SpecCode; a table without the key is skipped instead of raising KeyError.
    df = sp1
    for name, other in [("ecology", ec1), ("popgrowth", pg1), ("popchar", pc1)]:
        if "SpecCode" in other.columns:
            df = df.merge(other, on="SpecCode", how="left")
        else:
            print(f"[WARN] 'SpecCode' missing in {name} -> skip merge.")

    # Derived features
    df["T_proxy"] = df["EnvTemp"].map(map_envtemp_to_T) if "EnvTemp" in df.columns else np.nan

    # Targets (log-scale). Non-positive values have no logarithm: map them to NaN so that
    # downstream dropna() removes them instead of training on -inf.
    for t in ["K","Loo","M"]:
        if t in df.columns:
            vals = df[t].astype(float)
            df[f"log_{t}"] = np.log(vals.where(vals > 0))

    # tmax_any: prefer tmax, fallback LongevityWild
    if "tmax" in df.columns:
        df["tmax_any"] = df["tmax"].where(~df["tmax"].isna(), df.get("LongevityWild", np.nan))
    else:
        df["tmax_any"] = df.get("LongevityWild", np.nan)

    # Categoricals
    for c in ["Family","Order","Class","BodyShapeI","DemersPelag","AnaCat","EnvTemp","LTypeMaxM"]:
        if c in df.columns:
            df[c] = df[c].astype("category")

    # Save feature store
    out_path = DATA_DIR / "feature_store.parquet"
    df.to_parquet(out_path, index=False)
    print("Feature store saved:", out_path, "rows:", len(df))
    return df

feature_store = build_feature_store(species, ecology, popgrowth, popchar)
feature_store.head()
"""))

# Baselines
# Generated cell 4: closed-form natural-mortality estimators used as benchmarks.
# Every function returns NaN when an input is missing or non-positive.
cells.append(nbf.v4.new_markdown_cell("## 4) Baseline estimators (Pauly, Hoenig/Hewitt, Then)"))
cells.append(nbf.v4.new_code_cell("""
def pauly_M(K, Linf_cm, T_celsius):
    \"\"\"Pauly-style estimate of M from K, L-infinity (cm) and temperature.

    Returns NaN for missing or non-positive inputs; math.log10 would otherwise raise
    ValueError on them and abort a row-wise DataFrame.apply.\"\"\"
    if any(pd.isna(x) or x <= 0 for x in [K, Linf_cm, T_celsius]):
        return np.nan
    val = -0.0066 - 0.279*math.log10(Linf_cm) + 0.6543*math.log10(K) + 0.4634*math.log10(T_celsius)
    return 10 ** val

def hoenig_M_from_tmax(tmax):
    \"\"\"M = 4.22 / tmax; NaN when tmax is missing or non-positive.\"\"\"
    if pd.isna(tmax) or tmax <= 0:
        return np.nan
    return 4.22 / tmax  # Hewitt & Hoenig (2005) rule-of-thumb

def then_M_from_tmax(tmax):
    \"\"\"M = 4.899 * tmax^-0.916; NaN when tmax is missing or non-positive.\"\"\"
    if pd.isna(tmax) or tmax <= 0:
        return np.nan
    return 4.899 * (tmax ** -0.916)

def then_M_from_growth(K, Linf_cm):
    \"\"\"M = 1.521 * K^0.72 * Linf^-0.33; NaN when an input is missing or non-positive.\"\"\"
    if any(pd.isna(x) or x<=0 for x in [K, Linf_cm]):
        return np.nan
    return 1.521 * (K ** 0.72) * (Linf_cm ** -0.33)
"""))

# Splits & Metrics
# Generated cell 5: family-grouped cross-validation splits and log-scale error metrics.
cells.append(nbf.v4.new_markdown_cell("## 5) Splits & metrics"))
cells.append(nbf.v4.new_code_cell("""
def group_splits(df: pd.DataFrame, group_col="Family", n_splits=5):
    \"\"\"Yield (fold, train_df, test_df) from GroupKFold so no group spans train and test.

    Rows whose group value is missing are dropped before splitting.\"\"\"
    df_ = df.dropna(subset=[group_col]).copy()
    gkf = GroupKFold(n_splits=n_splits)
    groups = df_[group_col].astype(str).values
    for fold, (tr, te) in enumerate(gkf.split(df_, groups=groups)):
        yield fold, df_.iloc[tr].copy(), df_.iloc[te].copy()

def metrics_log(y_true_log, y_pred_log):
    \"\"\"Return rmse_log, mae_log, r2 (log scale) and mape (original scale).

    Pairs with a non-finite truth or prediction are ignored; if none remain every metric
    is NaN. scikit-learn's metric functions reject NaN input, so an unfiltered all-NaN
    placeholder prediction would otherwise raise ValueError.\"\"\"
    y_true_log = np.asarray(y_true_log, dtype=float)
    y_pred_log = np.asarray(y_pred_log, dtype=float)
    ok = np.isfinite(y_true_log) & np.isfinite(y_pred_log)
    if not ok.any():
        return dict(rmse_log=np.nan, mae_log=np.nan, r2=np.nan, mape=np.nan)
    y_true_log, y_pred_log = y_true_log[ok], y_pred_log[ok]

    rmse = math.sqrt(mean_squared_error(y_true_log, y_pred_log))
    mae = mean_absolute_error(y_true_log, y_pred_log)
    r2  = r2_score(y_true_log, y_pred_log)
    # MAPE on original scale
    y_true = np.exp(y_true_log)
    y_pred = np.exp(y_pred_log)
    mape = float(np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), 1e-8, None))))
    return dict(rmse_log=rmse, mae_log=mae, r2=r2, mape=mape)
"""))

# Models
# Generated cell 6: model fit/predict helpers. XGBoost and CatBoost helpers return None when
# the corresponding optional library could not be imported in the setup cell.
cells.append(nbf.v4.new_markdown_cell("## 6) Models (GLM / XGBoost / CatBoost)"))
cells.append(nbf.v4.new_code_cell("""
from sklearn.impute import SimpleImputer

def fit_glm_numeric(X_train, y_train_log, num_cols):
    \"\"\"Fit impute(median) -> standardise -> LinearRegression on the numeric columns.

    The imputer is required: the feature store is built with left joins, so numeric
    features contain NaN, and LinearRegression rejects NaN input.\"\"\"
    num_pipe = Pipeline([("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())])
    pre = ColumnTransformer([("num", num_pipe, num_cols)], remainder="drop")
    model = Pipeline([("prep", pre), ("lin", LinearRegression())])
    model.fit(X_train[num_cols], y_train_log)
    return model

def fit_xgb(train_df, y_train_log, cat_cols, num_cols):
    \"\"\"Train an XGBoost booster on one-hot encoded features.

    Returns (booster, training_columns), or None when xgboost is unavailable.\"\"\"
    if xgb is None:
        return None
    X = pd.get_dummies(train_df[cat_cols+num_cols], drop_first=False)
    dtrain = xgb.DMatrix(X, label=y_train_log)
    params = dict(objective="reg:squarederror", eval_metric="rmse",
                  eta=0.03, max_depth=8, subsample=0.8, colsample_bytree=0.8, seed=RANDOM_SEED)
    bst = xgb.train(params, dtrain, num_boost_round=400, verbose_eval=False)
    return bst, X.columns.tolist()

def predict_xgb(model_tuple, X_df):
    \"\"\"Predict with a (booster, training_columns) tuple returned by fit_xgb.\"\"\"
    bst, cols = model_tuple
    # Align to the training layout: dummies unseen at fit time are dropped, missing ones are 0.
    X = pd.get_dummies(X_df, drop_first=False).reindex(columns=cols, fill_value=0)
    dtest = xgb.DMatrix(X)
    return bst.predict(dtest)

def fit_catboost(train_df, y_train_log, cat_cols, num_cols):
    \"\"\"Train a CatBoostRegressor with cat_cols declared as categorical features.

    Returns the fitted model, or None when catboost is unavailable.\"\"\"
    if CatBoostRegressor is None:
        return None
    # NOTE(review): CatBoost is understood to reject NaN in categorical features -- confirm
    # that cat_cols are fully populated or fill them before calling.
    X = train_df[cat_cols+num_cols].copy()
    cat_idx = [X.columns.get_loc(c) for c in cat_cols]
    model = CatBoostRegressor(
        loss_function="RMSE", depth=8, learning_rate=0.05, l2_leaf_reg=6, iterations=400,
        verbose=False, random_seed=RANDOM_SEED
    )
    model.fit(X, y_train_log, cat_features=cat_idx)
    return model
"""))

# Experiment B
# Generated cell 7: family-grouped 5-fold CV for log_M under two feature scenarios, with
# delta-RMSE against each classic estimator; results go to DATA_DIR/benchmark_results_M.csv.
cells.append(nbf.v4.new_markdown_cell("## 7) Experiment **Task B** — Predict `log_M`"))
cells.append(nbf.v4.new_code_cell("""
df = pd.read_parquet(DATA_DIR/"feature_store.parquet")

cat_cols = [c for c in ["Family","Order","Class","BodyShapeI","DemersPelag","AnaCat","EnvTemp","LTypeMaxM"] if c in df.columns]
num_base = [c for c in ["DepthRangeShallow","DepthRangeDeep","Length","Weight","FoodTroph","DietTroph"] if c in df.columns]

results = []
task = "B"
target = "log_M"

scenarios = {
    "B_full": {"extra_num": ["K","Loo","T_proxy"], "require": ["log_M"]},
    "B_lite": {"extra_num": [], "require": ["log_M"]}
}

BASELINE_NAMES = ("pauly", "hoenig", "then_tmax", "then_growth")

def compute_baseline_logs(test):
    \"\"\"Return four float arrays: log-scale M from Pauly, Hoenig, Then(tmax), Then(growth).

    Rows for which an estimator has no valid input are NaN.\"\"\"
    p = test.apply(lambda r: pauly_M(r.get("K"), r.get("Loo"), r.get("T_proxy")), axis=1)
    h = test.apply(lambda r: hoenig_M_from_tmax(r.get("tmax_any")), axis=1)
    t1= test.apply(lambda r: then_M_from_tmax(r.get("tmax_any")), axis=1)
    t2= test.apply(lambda r: then_M_from_growth(r.get("K"), r.get("Loo")), axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        return tuple(np.log(np.asarray(s, dtype=float)) for s in (p, h, t1, t2))

def rmse_on(y_true, y_pred, mask):
    \"\"\"RMSE over the rows selected by `mask`; NaN when the mask selects nothing.\"\"\"
    if not mask.any():
        return np.nan
    return math.sqrt(mean_squared_error(y_true[mask], y_pred[mask]))

def delta_rmse(y_true, model_pred, base_pred):
    \"\"\"Model RMSE minus baseline RMSE, both computed on the SAME rows.

    Only rows where truth, model and baseline are all finite are used, so the difference
    is not distorted by the baseline covering just a subset of the test fold.\"\"\"
    mask = np.isfinite(y_true) & np.isfinite(model_pred) & np.isfinite(base_pred)
    return rmse_on(y_true, model_pred, mask) - rmse_on(y_true, base_pred, mask)

for scen, cfg in scenarios.items():
    cols = num_base + [c for c in cfg["extra_num"] if c in df.columns]
    dsub = df.dropna(subset=cfg["require"]).copy()
    dsub = dsub[np.isfinite(dsub[target])]  # dropna keeps +/-inf; exclude those too

    for fold, train, test in group_splits(dsub, group_col="Family", n_splits=5):
        y_tr = train[target].values
        y_te = test[target].values

        # Only models that are actually available get scored; an all-NaN placeholder
        # prediction would be rejected by the scikit-learn metric functions.
        preds = {}

        # GLM numeric
        glm = fit_glm_numeric(train, y_tr, num_cols=cols)
        preds["GLM"] = glm.predict(test[cols])

        # XGB
        if xgb is not None:
            xgb_model = fit_xgb(train, y_tr, cat_cols, cols)
            preds["XGBoost"] = predict_xgb(xgb_model, test[cat_cols+cols])

        # CatBoost
        if CatBoostRegressor is not None:
            cb = fit_catboost(train, y_tr, cat_cols, cols)
            preds["CatBoost"] = cb.predict(test[cat_cols+cols])

        baselines = dict(zip(BASELINE_NAMES, compute_baseline_logs(test)))

        for name, pred in preds.items():
            pred = np.asarray(pred, dtype=float)
            m = metrics_log(y_te, pred)
            # ΔRMSE vs baselines (negative = model beats the baseline)
            deltas = {f"delta_rmse_vs_{b}": delta_rmse(y_te, pred, blog) for b, blog in baselines.items()}
            results.append(dict(task=task, scenario=scen, target=target, fold=fold, model=name, **m,
                                **deltas, n_test=int(len(y_te))))

resB = pd.DataFrame(results).sort_values(["scenario","model","fold"])
resB_path = DATA_DIR / "benchmark_results_M.csv"
resB.to_csv(resB_path, index=False)
resB.head()
"""))

# Experiment A
# Generated cell 8: family-grouped 5-fold CV for the growth targets log_K and log_Loo;
# results go to DATA_DIR/benchmark_results_growth.csv.
cells.append(nbf.v4.new_markdown_cell("## 8) Experiment **Task A** — Predict `log_K` & `log_Loo`"))
cells.append(nbf.v4.new_code_cell("""
df = pd.read_parquet(DATA_DIR/"feature_store.parquet")
cat_cols = [c for c in ["Family","Order","Class","BodyShapeI","DemersPelag","AnaCat","EnvTemp","LTypeMaxM"] if c in df.columns]
num_cols = [c for c in ["DepthRangeShallow","DepthRangeDeep","Length","Weight","FoodTroph","DietTroph","T_proxy"] if c in df.columns]

targets = [t for t in ["log_K","log_Loo"] if t in df.columns]
rows = []
for target in targets:
    dsub = df.dropna(subset=[target]).copy()
    dsub = dsub[np.isfinite(dsub[target])]  # dropna keeps +/-inf; exclude those too
    for fold, train, test in group_splits(dsub, group_col="Family", n_splits=5):
        y_tr = train[target].values
        y_te = test[target].values

        # Only models that are actually available get scored; an all-NaN placeholder
        # prediction would be rejected by the scikit-learn metric functions.
        preds = {}

        glm = fit_glm_numeric(train, y_tr, num_cols=num_cols)
        preds["GLM"] = glm.predict(test[num_cols])

        if xgb is not None:
            xgb_model = fit_xgb(train, y_tr, cat_cols, num_cols)
            preds["XGBoost"] = predict_xgb(xgb_model, test[cat_cols+num_cols])

        if CatBoostRegressor is not None:
            cb = fit_catboost(train, y_tr, cat_cols, num_cols)
            preds["CatBoost"] = cb.predict(test[cat_cols+num_cols])

        for name, pred in preds.items():
            m = metrics_log(y_te, pred)
            rows.append(dict(task="A", target=target, fold=fold, model=name, **m, n_test=int(len(y_te))))

resA = pd.DataFrame(rows).sort_values(["target","model","fold"])
resA_path = DATA_DIR / "benchmark_results_growth.csv"
resA.to_csv(resA_path, index=False)
resA.head()
"""))

# Reporting
# Generated cell 9: re-reads both result CSVs from DATA_DIR, averages the per-fold metrics
# per group (summing n_test), displays the first rows of each summary and draws one bar
# chart of mean RMSE(log) by model for scenario "B_full".
# summarize(res, keys) groups by `keys`; both calls below pass `keys` explicitly.
cells.append(nbf.v4.new_markdown_cell("## 9) Reporting — aggregated tables & quick plots"))
cells.append(nbf.v4.new_code_cell("""
import matplotlib.pyplot as plt

def summarize(res, keys=["scenario","target","model"]):
    grp = res.groupby(keys).agg(
        rmse_log=("rmse_log","mean"),
        mae_log=("mae_log","mean"),
        r2=("r2","mean"),
        mape=("mape","mean"),
        n=("n_test","sum")
    ).reset_index()
    return grp

# Summaries
resB = pd.read_csv(DATA_DIR/"benchmark_results_M.csv")
resA = pd.read_csv(DATA_DIR/"benchmark_results_growth.csv")

sumB = summarize(resB, keys=["scenario","model"])
sumA = summarize(resA, keys=["target","model"])

display(sumB.head(10))
display(sumA.head(10))

# Example plot: RMSE by model (Task B, scenario B_full)
sub = resB[resB["scenario"]=="B_full"].groupby("model")["rmse_log"].mean().reset_index()
plt.figure()
plt.bar(sub["model"], sub["rmse_log"])
plt.title("Task B (B_full): mean RMSE(log) by model")
plt.ylabel("RMSE(log)")
plt.xlabel("Model")
plt.xticks(rotation=0)
plt.show()
"""))

# Attach the collected cells to the notebook node and serialise it as UTF-8 .ipynb JSON.
nb_path = "FishBase_AI_Biology_Pipeline_HF.ipynb"
nb["cells"] = cells
with open(nb_path, mode="w", encoding="utf-8") as f:
    nbf.write(nb, f)

print("Notebook generated:", nb_path)


Notebook generated: FishBase_AI_Biology_Pipeline_HF.ipynb


In [3]:
# Create a patched notebook that is more robust to missing/renamed SpecCode in some tables.
# Key changes vs previous version:
# - alias() now does case-insensitive matching.
# - Safe merge: only merge a table if it contains 'SpecCode' after aliasing/renaming; otherwise skip with a warning.
# - Fallback: if popchar cannot be merged, set df['tmax']=NaN and rely on LongevityWild for tmax_any.
import nbformat as nbf
from pathlib import Path

# Start a second, independent nbformat-v4 notebook; `cells` accumulates the cell nodes built below.
nb = nbf.v4.new_notebook()
cells = []

# Title cell (markdown): states what the patched variant changes relative to the first notebook.
cells.append(nbf.v4.new_markdown_cell("""
# FishBase × AI+Biology (HuggingFace Edition) — **Patched**
**Fixes:** Robust handling when `popchar` (or other tables) lacks `SpecCode` due to schema differences, avoiding `KeyError: 'SpecCode'` during merge.
"""))

# Generated cell 0 (patched notebook): imports, optional-dependency sentinels and configuration.
# On import failure `xgb` / `CatBoostRegressor` are bound to None. REPO_ID / FB_VER /
# PARQUET_BASE locate the parquet files; DATA_DIR is created on execution.
cells.append(nbf.v4.new_markdown_cell("## 0) Setup & Config"))
cells.append(nbf.v4.new_code_cell("""
# %pip install -q pandas numpy scikit-learn xgboost catboost duckdb huggingface_hub pyarrow
import os, math, json, itertools, typing as T, warnings
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

try:
    import xgboost as xgb
except Exception:
    xgb = None
try:
    from catboost import CatBoostRegressor
except Exception:
    CatBoostRegressor = None

REPO_ID  = "cboettig/fishbase"
FB_VER   = "v24.07"
PARQUET_BASE = f"data/fb/{FB_VER}/parquet"

DATA_DIR = Path("data_fb_ai_bio")
DATA_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
"""))

# Generated cell 1 (patched notebook): fetch the four FishBase parquet tables. The Hub client
# is tried first and DuckDB's httpfs reader second; RuntimeError is raised only if both fail.
cells.append(nbf.v4.new_markdown_cell("## 1) Load FishBase tables"))
cells.append(nbf.v4.new_code_cell("""
def load_parquet_hf(path: str) -> T.Optional[pd.DataFrame]:
    \"\"\"Download one parquet file from the Hugging Face dataset repo and read it.

    Returns None (after printing the reason) when huggingface_hub is missing or the
    download/read fails, so the caller can try the next loader.\"\"\"
    try:
        from huggingface_hub import hf_hub_download
        # REPO_ID names a *dataset* repo; without repo_type the Hub looks for a model repo
        # of that name, the download fails, and this loader would never succeed.
        fp = hf_hub_download(repo_id=REPO_ID, filename=path, repo_type="dataset")
        return pd.read_parquet(fp)
    except Exception as e:
        print("[HF Hub] Fallback or error:", e)
        return None

def load_parquet_httpfs(path: str) -> T.Optional[pd.DataFrame]:
    \"\"\"Read one parquet file over HTTP with DuckDB's httpfs extension.

    Returns None (after printing the reason) when duckdb is missing or the read fails.\"\"\"
    try:
        import duckdb
        duckdb.sql("INSTALL httpfs; LOAD httpfs;")
        url = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main/{path}?download=true"
        return duckdb.sql(f"SELECT * FROM read_parquet('{url}')").df()
    except Exception as e:
        print("[DuckDB HTTPFS] Fallback or error:", e)
        return None

def load_fb_table(name: str) -> pd.DataFrame:
    \"\"\"Load FishBase table `name` from PARQUET_BASE, trying the Hub first and DuckDB second.

    Raises RuntimeError when neither loader returns a frame.\"\"\"
    rel = f"{PARQUET_BASE}/{name}.parquet"
    df = load_parquet_hf(rel)
    if df is None:
        df = load_parquet_httpfs(rel)
    if df is None:
        raise RuntimeError(f"Cannot load {name}. Check internet/dependencies.")
    return df

species  = load_fb_table("species")
ecology  = load_fb_table("ecology")
popgrowth= load_fb_table("popgrowth")
popchar  = load_fb_table("popchar")

{tbl: list(df.columns)[:12] for tbl, df in dict(species=species, ecology=ecology, popgrowth=popgrowth, popchar=popchar).items()}
"""))

# Generated cell 2 (patched notebook): column-name resolution helpers.
# - first_col tries an exact match over the candidates first, then a case-insensitive match;
#   it returns None when nothing matches, or raises KeyError when required=True.
# - ALIASES maps each canonical key to its accepted spellings; alias(df, key) applies
#   first_col to ALIASES[key].
# - std_rename returns a copy of df in which each found alias is renamed to its canonical key.
# - coerce_numeric converts the listed columns (those present) with pd.to_numeric, turning
#   unparseable values into NaN; it assigns into the frame it is given and returns it.
cells.append(nbf.v4.new_markdown_cell("## 2) Schema aliases (case-insensitive) & helpers"))
cells.append(nbf.v4.new_code_cell("""
def first_col(df, candidates: T.List[str], required=False):
    # exact first
    for c in candidates:
        if c in df.columns: return c
    # case-insensitive
    lcmap = {c.lower(): c for c in df.columns}
    for c in candidates:
        if c.lower() in lcmap: return lcmap[c.lower()]
    if required:
        raise KeyError(f"None of {candidates} found in columns (sample): {list(df.columns)[:20]}")
    return None

ALIASES = {
    "SpecCode": ["SpecCode", "SpecCode_x", "SpecCode_y", "speccode", "SpeciesCode", "speciescode"],
    "Family": ["Family"], "Order": ["Order"], "Class": ["Class"],
    "BodyShapeI": ["BodyShapeI","BodyShape"],
    "DemersPelag": ["DemersPelag","DemersPelagics"],
    "AnaCat": ["AnaCat"], "EnvTemp": ["EnvTemp"],
    "DepthRangeShallow": ["DepthRangeShallow","DepthShallow"],
    "DepthRangeDeep": ["DepthRangeDeep","DepthDeep"],
    "Fresh": ["Fresh"], "Brack": ["Brack"], "Saltwater": ["Saltwater"],
    "Length": ["Length"], "LTypeMaxM": ["LTypeMaxM"], "Weight": ["Weight"],
    "LongevityWild": ["LongevityWild","Longevity"],
    "FoodTroph": ["FoodTroph"], "DietTroph": ["DietTroph"],
    "FoodSeTroph": ["FoodSeTroph"], "DietSeTroph": ["DietSeTroph"],
    "Loo": ["Loo","Linf","Linf_cm","L_infinity","Linfinity"],
    "K": ["K"], "to": ["to","t0"], "M": ["M"], "tm": ["tm","tm50"], "Lm": ["Lm","Lm50"],
    "tmax": ["tmax","Tmax","LongevityWild"], "Lmax": ["Lmax"], "Wmax": ["Wmax"],
}

def alias(df: pd.DataFrame, key: str, required=False):
    return first_col(df, ALIASES[key], required=required)

def std_rename(df: pd.DataFrame, key_list: T.List[str]) -> pd.DataFrame:
    out = df.copy()
    for key in key_list:
        c = alias(out, key, required=False)
        if c and c != key:
            out = out.rename(columns={c: key})
    return out

def coerce_numeric(df: pd.DataFrame, cols: T.List[str]) -> pd.DataFrame:
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df
"""))

# Generated cell 3 (patched notebook): join the four tables into one feature store with
# guarded merges, derive T_proxy, the log targets and tmax_any, and persist the result.
cells.append(nbf.v4.new_markdown_cell("## 3) Build feature store (with **safe merges**)"))
cells.append(nbf.v4.new_code_cell("""
def map_envtemp_to_T(envtemp: str) -> float:
    \"\"\"Map an EnvTemp label to the temperature proxy used by the Pauly baseline (NaN if unknown).\"\"\"
    m = {"Tropical":27.0, "Subtropical":20.0, "Temperate":12.0, "Polar":2.0}
    if pd.isna(envtemp): return np.nan
    return m.get(str(envtemp).strip().title(), np.nan)

def select_std(table: pd.DataFrame, keys: T.List[str]) -> pd.DataFrame:
    \"\"\"Return the aliased columns of `table` that exist, renamed to their canonical keys.

    SpecCode, when present, is coerced to numeric so join keys share a dtype across tables.
    A table with none of the keys yields an empty frame holding only a SpecCode column.\"\"\"
    found = [c for c in (alias(table, k) for k in keys) if c]
    if not found:
        return pd.DataFrame(columns=["SpecCode"])
    out = std_rename(table[found], keys)
    if "SpecCode" in out.columns:
        out["SpecCode"] = pd.to_numeric(out["SpecCode"], errors="coerce")
    return out

def collapse_per_species(df: pd.DataFrame, how: str) -> pd.DataFrame:
    \"\"\"Reduce a lookup table to at most one row per SpecCode using aggregation `how`.

    A table with several rows per SpecCode would multiply rows in a left join; collapsing
    first keeps the join one-to-one. Frames without SpecCode are returned untouched.\"\"\"
    if "SpecCode" not in df.columns:
        return df
    df = df.dropna(subset=["SpecCode"])
    value_cols = [c for c in df.columns if c != "SpecCode"]
    if not value_cols:
        return df.drop_duplicates(subset=["SpecCode"])
    return df.groupby("SpecCode", as_index=False)[value_cols].agg(how)

def build_feature_store(species, ecology, popgrowth, popchar) -> pd.DataFrame:
    \"\"\"Build, save and return the unified feature table.

    Rows follow species x popgrowth records; ecology (mean) and popchar (max) are collapsed
    to one row per SpecCode before joining. Raises KeyError when `species` has no SpecCode
    column. Side effect: writes DATA_DIR/feature_store.parquet.\"\"\"
    alias(species, "SpecCode", required=True)  # every merge below keys on it

    # Select & standardize columns
    sp_keys = ["SpecCode","Family","Order","Class","BodyShapeI","DemersPelag","AnaCat","EnvTemp",
               "DepthRangeShallow","DepthRangeDeep","Fresh","Brack","Saltwater","Length","LTypeMaxM",
               "Weight","LongevityWild"]
    ec_keys = ["SpecCode","FoodTroph","FoodSeTroph","DietTroph","DietSeTroph"]
    pg_keys = ["SpecCode","Loo","K","to","M","tm","Lm"]
    pc_keys = ["SpecCode","tmax","Lmax","Wmax"]

    # Coerce numerics
    sp = coerce_numeric(select_std(species, sp_keys),
                        ["DepthRangeShallow","DepthRangeDeep","Length","Weight","LongevityWild","Fresh","Brack","Saltwater"])
    ec = coerce_numeric(select_std(ecology, ec_keys), ["FoodTroph","FoodSeTroph","DietTroph","DietSeTroph"])
    pg = coerce_numeric(select_std(popgrowth, pg_keys), ["Loo","K","to","M","tm","Lm"])
    pc = coerce_numeric(select_std(popchar, pc_keys), ["tmax","Lmax","Wmax"])

    # ecology / popchar may hold several rows per species; joining them next to popgrowth
    # as-is would duplicate growth records (cartesian product per SpecCode).
    ec = collapse_per_species(ec, "mean")
    pc = collapse_per_species(pc, "max")

    # Merge with safeguards
    df = sp.copy()
    if "SpecCode" in ec.columns:
        df = df.merge(ec, on="SpecCode", how="left")
    else:
        print("[WARN] 'SpecCode' missing in ecology -> skip merge.")
    if "SpecCode" in pg.columns:
        df = df.merge(pg, on="SpecCode", how="left")
    else:
        print("[WARN] 'SpecCode' missing in popgrowth -> skip merge.")
    if "SpecCode" in pc.columns:
        df = df.merge(pc, on="SpecCode", how="left")
    else:
        print("[WARN] 'SpecCode' missing in popchar -> skip merge; using LongevityWild as fallback for tmax_any.")
        df["tmax"] = np.nan

    # Derived
    df["T_proxy"] = df["EnvTemp"].map(map_envtemp_to_T) if "EnvTemp" in df.columns else np.nan
    # Non-positive values have no logarithm: map them to NaN so that downstream dropna()
    # removes them instead of training on -inf.
    for t in ["K","Loo","M"]:
        if t in df.columns:
            vals = df[t].astype(float)
            df[f"log_{t}"] = np.log(vals.where(vals > 0))
    df["tmax_any"] = df["tmax"] if "tmax" in df.columns else np.nan
    if "LongevityWild" in df.columns:
        df["tmax_any"] = df["tmax_any"].fillna(df["LongevityWild"])

    for c in ["Family","Order","Class","BodyShapeI","DemersPelag","AnaCat","EnvTemp","LTypeMaxM"]:
        if c in df.columns:
            df[c] = df[c].astype("category")

    out_path = DATA_DIR / "feature_store.parquet"
    df.to_parquet(out_path, index=False)
    print("Feature store saved:", out_path, "rows:", len(df))
    return df

feature_store = build_feature_store(species, ecology, popgrowth, popchar)
feature_store.head()
"""))

# Baselines & Experiments copied (shortened to essential to keep file size modest)
# Generated cell 4 (patched notebook): baseline estimators, split/metric/model helpers and a
# single-fold smoke run of the log_M task that prints the metrics of every available model.
cells.append(nbf.v4.new_markdown_cell("## 4) Baselines & Experiments (same as previous notebook)"))
cells.append(nbf.v4.new_code_cell("""
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math, numpy as np, pandas as pd

def pauly_M(K, Linf_cm, T_celsius):
    \"\"\"Pauly-style estimate of M; NaN when an input is missing or non-positive.\"\"\"
    if any(pd.isna(x) or x <= 0 for x in [K, Linf_cm, T_celsius]): return np.nan
    val = -0.0066 - 0.279*np.log10(Linf_cm) + 0.6543*np.log10(K) + 0.4634*np.log10(T_celsius)
    return 10 ** val

def hoenig_M_from_tmax(tmax):
    \"\"\"M = 4.22 / tmax; NaN when tmax is missing or non-positive.\"\"\"
    if pd.isna(tmax) or tmax <= 0: return np.nan
    return 4.22 / tmax

def then_M_from_tmax(tmax):
    \"\"\"M = 4.899 * tmax^-0.916; NaN when tmax is missing or non-positive.\"\"\"
    if pd.isna(tmax) or tmax <= 0: return np.nan
    return 4.899 * (tmax ** -0.916)

def then_M_from_growth(K, Linf_cm):
    \"\"\"M = 1.521 * K^0.72 * Linf^-0.33; NaN when an input is missing or non-positive.\"\"\"
    if any(pd.isna(x) or x<=0 for x in [K, Linf_cm]): return np.nan
    return 1.521 * (K ** 0.72) * (Linf_cm ** -0.33)

def group_splits(df: pd.DataFrame, group_col="Family", n_splits=5):
    \"\"\"Yield (fold, train_df, test_df) from GroupKFold; rows with a missing group are dropped.\"\"\"
    df_ = df.dropna(subset=[group_col]).copy()
    gkf = GroupKFold(n_splits=n_splits)
    groups = df_[group_col].astype(str).values
    for fold, (tr, te) in enumerate(gkf.split(df_, groups=groups)):
        yield fold, df_.iloc[tr].copy(), df_.iloc[te].copy()

def metrics_log(y_true_log, y_pred_log):
    \"\"\"Return rmse_log, mae_log, r2 (log scale) and mape (original scale).

    Pairs with a non-finite truth or prediction are ignored; if none remain every metric
    is NaN, because scikit-learn's metric functions reject NaN input.\"\"\"
    y_true_log = np.asarray(y_true_log, dtype=float)
    y_pred_log = np.asarray(y_pred_log, dtype=float)
    ok = np.isfinite(y_true_log) & np.isfinite(y_pred_log)
    if not ok.any():
        return dict(rmse_log=np.nan, mae_log=np.nan, r2=np.nan, mape=np.nan)
    y_true_log, y_pred_log = y_true_log[ok], y_pred_log[ok]
    rmse = math.sqrt(mean_squared_error(y_true_log, y_pred_log))
    mae  = mean_absolute_error(y_true_log, y_pred_log)
    r2   = r2_score(y_true_log, y_pred_log)
    y_true = np.exp(y_true_log); y_pred = np.exp(y_pred_log)
    mape = float(np.mean(np.abs((y_true - y_pred)/np.clip(np.abs(y_true),1e-8,None))))
    return dict(rmse_log=rmse, mae_log=mae, r2=r2, mape=mape)

def fit_glm_numeric(X_train, y_train_log, num_cols):
    \"\"\"Fit impute(median) -> standardise -> LinearRegression on the numeric columns.

    The imputer is required: the feature store is built with left joins, so numeric
    features contain NaN, and LinearRegression rejects NaN input.\"\"\"
    num_pipe = Pipeline([("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())])
    pre = ColumnTransformer([("num", num_pipe, num_cols)], remainder="drop")
    model = Pipeline([("prep", pre), ("lin", LinearRegression())])
    model.fit(X_train[num_cols], y_train_log); return model

def fit_xgb(train_df, y_train_log, cat_cols, num_cols):
    \"\"\"Train an XGBoost booster on one-hot features; None when xgboost is unavailable.\"\"\"
    if xgb is None:  # sentinel set by the setup cell when the import failed
        return None
    X = pd.get_dummies(train_df[cat_cols+num_cols], drop_first=False)
    dtr = xgb.DMatrix(X, label=y_train_log)
    params = dict(objective="reg:squarederror", eval_metric="rmse", eta=0.03, max_depth=8,
                  subsample=0.8, colsample_bytree=0.8, seed=RANDOM_SEED)
    bst = xgb.train(params, dtr, num_boost_round=400, verbose_eval=False)
    return bst, X.columns.tolist()

def predict_xgb(model_tuple, X_df):
    \"\"\"Predict with a (booster, training_columns) tuple returned by fit_xgb.\"\"\"
    bst, cols = model_tuple
    # Align to the training layout: unseen dummies are dropped, missing ones are 0.
    X = pd.get_dummies(X_df, drop_first=False).reindex(columns=cols, fill_value=0)
    dte = xgb.DMatrix(X)
    return bst.predict(dte)

def fit_catboost(train_df, y_train_log, cat_cols, num_cols):
    \"\"\"Train a CatBoostRegressor with cat_cols as categorical; None when catboost is unavailable.\"\"\"
    if CatBoostRegressor is None:  # sentinel set by the setup cell when the import failed
        return None
    X = train_df[cat_cols+num_cols].copy()
    cat_idx = [X.columns.get_loc(c) for c in cat_cols]
    model = CatBoostRegressor(loss_function="RMSE", depth=8, learning_rate=0.05, l2_leaf_reg=6,
                              iterations=400, verbose=False, random_seed=RANDOM_SEED)
    model.fit(X, y_train_log, cat_features=cat_idx); return model

# Run a quick M-task fold to validate pipeline (can expand to full CV)
df = pd.read_parquet(DATA_DIR/"feature_store.parquet")
cat_cols = [c for c in ["Family","Order","Class","BodyShapeI","DemersPelag","AnaCat","EnvTemp","LTypeMaxM"] if c in df.columns]
num_base = [c for c in ["DepthRangeShallow","DepthRangeDeep","Length","Weight","FoodTroph","DietTroph"] if c in df.columns]
cols = num_base + [c for c in ["K","Loo","T_proxy"] if c in df.columns]

dsub = df.dropna(subset=["log_M"]).copy()
dsub = dsub[np.isfinite(dsub["log_M"])]  # dropna keeps +/-inf; exclude those too
fold, train, test = next(group_splits(dsub, group_col="Family", n_splits=5))
y_tr = train["log_M"].values; y_te = test["log_M"].values

glm = fit_glm_numeric(train, y_tr, num_cols=cols)
yhat_glm = glm.predict(test[cols])

# Optional models are best-effort, but the reason for a skip is reported instead of hidden.
yhat_xgb = np.full_like(y_te, np.nan)
try:
    xgb_model = fit_xgb(train, y_tr, cat_cols, cols)
    if xgb_model is not None:
        yhat_xgb = predict_xgb(xgb_model, test[cat_cols+cols])
except Exception as e:
    print("[XGBoost] skipped:", e)

yhat_cb = np.full_like(y_te, np.nan)
try:
    cb = fit_catboost(train, y_tr, cat_cols, cols)
    if cb is not None:
        yhat_cb = cb.predict(test[cat_cols+cols])
except Exception as e:
    print("[CatBoost] skipped:", e)

print("GLM metrics:", metrics_log(y_te, yhat_glm))
if not np.isnan(yhat_xgb).all():
    print("XGB metrics:", metrics_log(y_te, yhat_xgb))
if not np.isnan(yhat_cb).all():
    print("CatBoost metrics:", metrics_log(y_te, yhat_cb))
"""))

# Attach the collected cells before writing: without this assignment the notebook node
# still has its initial empty cell list and the saved .ipynb contains no cells at all.
nb["cells"] = cells
nb_path = "FishBase_AI_Biology_Pipeline_HF_patched.ipynb"
with open(nb_path, "w", encoding="utf-8") as f:
    nbf.write(nb, f)

print("Patched notebook saved:", nb_path)



Patched notebook saved: FishBase_AI_Biology_Pipeline_HF_patched.ipynb
