In [1]:
import os, time, json, base64, requests
import pandas as pd
import numpy as np
import tensorflow as tf
from IPython.display import Markdown, display

2025-10-15 23:07:01.935288: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-15 23:07:02.201941: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-15 23:07:02.224723: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-12.4/lib64
2025-10-15 23:07:02.224743: I tensorflow/stream_executor/cuda/cudart_stub.cc:29]

In [32]:
NUMERIC = ["Age", "RestingBP", "Cholesterol", "FastingBS", "MaxHR", "Oldpeak"]
CATEGORICAL = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]
LABEL = "HeartDisease"

DATA_CSV = "data/heart.csv"  # sesuaikan jika beda
MODEL_NAME = os.getenv("MODEL_NAME", "heart")
SERVE_HOST = os.getenv(
    "SERVE_HOST", "https://mlops-dicoding-deploy-production.up.railway.app"
)
PRED_URL = f"{SERVE_HOST}/v1/models/{MODEL_NAME}:predict"
STATUS_URL = f"{SERVE_HOST}/v1/models/{MODEL_NAME}"
TIMEOUT_S = 10
THRESHOLD = 0.5

In [3]:
def wait_model_ready(max_wait=30):
    """Tunggu TF Serving model jadi AVAILABLE."""
    t0 = time.time()
    last = None
    while time.time() - t0 < max_wait:
        try:
            r = requests.get(STATUS_URL, timeout=TIMEOUT_S)
            if r.ok:
                last = r.json()
                # Format standar: {"model_version_status": [{"state": "AVAILABLE", ...}, ...]}
                mvs = last.get("model_version_status") or last.get(
                    "model_version_statuses"
                )
                if isinstance(mvs, list) and any(
                    s.get("state") == "AVAILABLE" for s in mvs if isinstance(s, dict)
                ):
                    print(f"✅ Model {MODEL_NAME} AVAILABLE at {SERVE_HOST}")
                    return True
        except Exception as e:
            last = {"error": str(e)}
        time.sleep(1.5)
    print("⚠️ Model belum AVAILABLE:", last)
    return False


_ = wait_model_ready(max_wait=20)

✅ Model heart AVAILABLE at http://localhost:8501


In [4]:
df_all = pd.read_csv(DATA_CSV).copy()

# hitung persentil untuk fitur numerik (dipakai untuk penjelasan singkat)
percentile_maps = {}
for col in NUMERIC:
    vals = df_all[col].dropna().values
    if len(vals) == 0:
        percentile_maps[col] = lambda x: np.nan
        continue

    def pct_func(x):
        return float((vals <= x).mean() * 100.0)

    percentile_maps[col] = np.vectorize(pct_func)

# base rate keseluruhan dan per kategori
overall_rate = df_all[LABEL].mean() if LABEL in df_all.columns else np.nan
cat_rates = {}
for col in CATEGORICAL:
    tmp = (
        df_all[[col, LABEL]].dropna()
        if LABEL in df_all.columns
        else df_all[[col]].assign(**{LABEL: np.nan}).dropna()
    )
    try:
        cat_rates[col] = tmp.groupby(col)[LABEL].mean().to_dict()
    except Exception:
        cat_rates[col] = {}

print(
    f"Overall disease rate: {overall_rate:.3f}"
    if not np.isnan(overall_rate)
    else "Overall disease rate: N/A"
)
for col in CATEGORICAL:
    print(
        f"{col} base rates:",
        {
            k: (None if pd.isna(v) else round(float(v), 3))
            for k, v in cat_rates[col].items()
        },
    )

Overall disease rate: 0.553
Sex base rates: {'F': 0.259, 'M': 0.632}
ChestPainType base rates: {'ASY': 0.79, 'ATA': 0.139, 'NAP': 0.355, 'TA': 0.435}
RestingECG base rates: {'LVH': 0.564, 'Normal': 0.516, 'ST': 0.657}
ExerciseAngina base rates: {'N': 0.351, 'Y': 0.852}
ST_Slope base rates: {'Down': 0.778, 'Flat': 0.828, 'Up': 0.197}


In [5]:
INT_COLS = ["Age", "RestingBP", "Cholesterol", "FastingBS", "MaxHR"]
FLOAT_COLS = ["Oldpeak"]


def row_to_tfexample(row: pd.Series) -> tf.train.Example:
    """Bangun tf.Example sesuai raw schema: kategori=bytes, int=int64, float=float32/64."""
    feat = {}

    # Kategorikal -> bytes
    for k in CATEGORICAL:
        v = "" if (k not in row or pd.isna(row[k])) else str(row[k])
        feat[k] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[v.encode()]))

    # Integer numerik -> int64
    for k in INT_COLS:
        v = 0 if (k not in row or pd.isna(row[k])) else int(row[k])
        feat[k] = tf.train.Feature(int64_list=tf.train.Int64List(value=[v]))

    # Float -> float_list
    for k in FLOAT_COLS:
        v = 0.0 if (k not in row or pd.isna(row[k])) else float(row[k])
        feat[k] = tf.train.Feature(float_list=tf.train.FloatList(value=[v]))

    return tf.train.Example(features=tf.train.Features(feature=feat))


def make_predict_payload(df: pd.DataFrame) -> dict:
    instances = []
    for i in range(len(df)):
        ex = row_to_tfexample(df.iloc[i])
        instances.append(
            {
                "examples": {
                    "b64": base64.b64encode(ex.SerializeToString()).decode("utf-8")
                }
            }
        )
    return {"instances": instances}

In [6]:
def predict_examples(
    df: pd.DataFrame, url=PRED_URL, timeout=TIMEOUT_S, retries=2
) -> np.ndarray:
    payload = make_predict_payload(df)
    last_err = None
    for attempt in range(retries + 1):
        try:
            resp = requests.post(url, json=payload, timeout=timeout)
            if resp.status_code == 200:
                js = resp.json()
                preds = js.get("predictions")
                if preds is None:
                    raise RuntimeError(
                        f"Tidak ada field 'predictions' pada response: {js}"
                    )
                # bentuk output bisa [[p],[p],...] atau [p,...]
                out = [p[0] if isinstance(p, list) else float(p) for p in preds]
                return np.array(out, dtype=float)
            else:
                try:
                    detail = resp.json()
                except Exception:
                    detail = resp.text
                raise RuntimeError(f"HTTP {resp.status_code}: {detail}")
        except Exception as e:
            last_err = e
            time.sleep(1.0)
    raise SystemExit(f"Request gagal setelah retry: {last_err}")

In [35]:
samples = df_all.sample(5, random_state=42).reset_index(drop=True)
preds = predict_examples(samples)
labels = (preds >= THRESHOLD).astype(int)

rows = []
for i, row in samples.iterrows():
    detail = {
        "probability": round(float(preds[i]), 4),
        "pred_label": int(labels[i]),
        "true_label": (
            int(row[LABEL]) if LABEL in row and not pd.isna(row[LABEL]) else None
        ),
    }
    # fitur numerik + persentil
    for col in NUMERIC:
        val = row.get(col, np.nan)
        try:
            pct = float(percentile_maps[col](val))
        except Exception:
            pct = np.nan
        detail[col] = val
        detail[f"{col}_pct"] = None if np.isnan(pct) else int(round(pct))
    # fitur kategorikal + base rate kategori
    for col in CATEGORICAL:
        cat = str(row.get(col, ""))
        detail[col] = cat
        base = cat_rates.get(col, {}).get(cat, None)
        detail[f"{col}_rate"] = (
            None if base is None or pd.isna(base) else round(float(base), 3)
        )
    rows.append(detail)

results_df = pd.DataFrame(rows)

In [36]:
order = (
    ["probability", "pred_label", "true_label"]
    + sum(([c, f"{c}_pct"] for c in NUMERIC), [])
    + sum(([c, f"{c}_rate"] for c in CATEGORICAL), [])
)
results_df = results_df[order]

display(results_df)

def explain_row(r):
    bits = []
    bits.append(f"Prob={r['probability']:.3f} → Pred {r['pred_label']} (thr={THRESHOLD})")
    if pd.notna(r.get("true_label")):
        bits.append(f"• True={int(r['true_label'])}")
    strong = []
    for col in ["Age", "RestingBP", "Cholesterol", "MaxHR", "Oldpeak"]:
        pct = r.get(f"{col}_pct")
        if pct is not None:
            if pct >= 80:
                strong.append(f"{col} tinggi (~{pct}p)")
            elif pct <= 20:
                strong.append(f"{col} rendah (~{pct}p)")
    for col in ["ChestPainType", "ExerciseAngina", "ST_Slope", "Sex", "RestingECG"]:
        rate = r.get(f"{col}_rate")
        val = r.get(col)
        if rate is not None:
            strong.append(f"{col}={val} (rate={rate:.2f})")
    if strong:
        bits.append("• Konteks: " + ", ".join(strong))
    return "  ".join(bits)

display(Markdown(f"**Overall disease rate (dataset): {overall_rate:.2f}**" if not np.isnan(overall_rate) else "**Overall disease rate (dataset): N/A**"))
for idx, r in results_df.iterrows():
    display(Markdown(f"- **Sample {idx+1}** — {explain_row(r)}"))

Unnamed: 0,probability,pred_label,true_label,Age,Age_pct,RestingBP,RestingBP_pct,Cholesterol,Cholesterol_pct,FastingBS,...,Sex,Sex_rate,ChestPainType,ChestPainType_rate,RestingECG,RestingECG_rate,ExerciseAngina,ExerciseAngina_rate,ST_Slope,ST_Slope_rate
0,0.04,0,0,63,100,140,100,195,100,0,...,F,0.259,ATA,0.139,Normal,0.516,N,0.351,Up,0.197
1,0.8084,1,1,53,100,145,100,518,100,0,...,M,0.632,NAP,0.355,Normal,0.516,N,0.351,Flat,0.828
2,0.9897,1,1,65,100,160,100,0,42,1,...,M,0.632,ASY,0.79,ST,0.657,N,0.351,Flat,0.828
3,0.958,1,1,56,100,130,100,0,42,0,...,M,0.632,ASY,0.79,LVH,0.564,Y,0.852,Flat,0.828
4,0.0554,0,0,54,100,108,100,309,100,0,...,M,0.632,ATA,0.139,Normal,0.516,N,0.351,Up,0.197


**Overall disease rate (dataset): 0.55**

- **Sample 1** — Prob=0.040 → Pred 0 (thr=0.5)  • True=0  • Konteks: Age tinggi (~100p), RestingBP tinggi (~100p), Cholesterol tinggi (~100p), MaxHR tinggi (~100p), ChestPainType=ATA (rate=0.14), ExerciseAngina=N (rate=0.35), ST_Slope=Up (rate=0.20), Sex=F (rate=0.26), RestingECG=Normal (rate=0.52)

- **Sample 2** — Prob=0.808 → Pred 1 (thr=0.5)  • True=1  • Konteks: Age tinggi (~100p), RestingBP tinggi (~100p), Cholesterol tinggi (~100p), MaxHR tinggi (~100p), ChestPainType=NAP (rate=0.35), ExerciseAngina=N (rate=0.35), ST_Slope=Flat (rate=0.83), Sex=M (rate=0.63), RestingECG=Normal (rate=0.52)

- **Sample 3** — Prob=0.990 → Pred 1 (thr=0.5)  • True=1  • Konteks: Age tinggi (~100p), RestingBP tinggi (~100p), MaxHR tinggi (~100p), ChestPainType=ASY (rate=0.79), ExerciseAngina=N (rate=0.35), ST_Slope=Flat (rate=0.83), Sex=M (rate=0.63), RestingECG=ST (rate=0.66)

- **Sample 4** — Prob=0.958 → Pred 1 (thr=0.5)  • True=1  • Konteks: Age tinggi (~100p), RestingBP tinggi (~100p), MaxHR tinggi (~100p), ChestPainType=ASY (rate=0.79), ExerciseAngina=Y (rate=0.85), ST_Slope=Flat (rate=0.83), Sex=M (rate=0.63), RestingECG=LVH (rate=0.56)

- **Sample 5** — Prob=0.055 → Pred 0 (thr=0.5)  • True=0  • Konteks: Age tinggi (~100p), RestingBP tinggi (~100p), Cholesterol tinggi (~100p), MaxHR tinggi (~100p), ChestPainType=ATA (rate=0.14), ExerciseAngina=N (rate=0.35), ST_Slope=Up (rate=0.20), Sex=M (rate=0.63), RestingECG=Normal (rate=0.52)

In [37]:
os.makedirs("artifacts", exist_ok=True)
results_df.to_csv("artifacts/prediction_details.csv", index=False)
with open("artifacts/prediction_details.md", "w") as f:
    f.write(
        f"Overall disease rate (dataset): {overall_rate:.2f}\n\n"
        if not np.isnan(overall_rate)
        else "Overall disease rate (dataset): N/A\n\n"
    )
    for idx, r in results_df.iterrows():
        f.write(f"- Sample {idx+1} — {explain_row(r)}\n")   
print(
    "✅ Tersimpan: artifacts/prediction_details.csv & artifacts/prediction_details.md"
)

✅ Tersimpan: artifacts/prediction_details.csv & artifacts/prediction_details.md
