core5_04_counterfactual_stress_tests.ipynb

목적:
- 예측이 거의 완벽해도(toggle 남음)
- 예측이 극도로 나빠도(개입 폭주)
→ 구조적으로 안정성이 보장되지 않음을 반례로 증명

Counterfactuals:
CF1: noise_scale → ~0 (1e-6, 예측이 거의 완벽한 경우)
CF2: noise_scale → 매우 큼 (0.5, 예측이 극도로 나쁜 경우)

주의:
- cutoff 없음
- 성능 비교 없음
- '왜 실패하는지'만 기록

In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

# Input artifacts read by this notebook.
TRACE_PATH = Path("../artifact/core2/prediction_trace.csv")
CANDIDATE_PATH = Path("../artifact/core2/mutation_candidate_set.csv")

trace_base = pd.read_csv(TRACE_PATH)
cand_base = pd.read_csv(CANDIDATE_PATH)

print("trace shape:", trace_base.shape)
print("candidate shape:", cand_base.shape)

trace_base.head()

# The stress test is pinned to these three antibodies.
TARGET_ANTIBODIES = [
    "GDPa1-001",
    "GDPa1-045",
    "GDPa1-183",
]

trace_3 = trace_base[trace_base["antibody_key"].isin(TARGET_ANTIBODIES)]
trace_3 = trace_3.sort_values(["antibody_key", "step"]).reset_index(drop=True)

# Initial state per antibody = its lowest-step row in the sorted trace.
# Antibodies that do not appear in the trace are left out of the mapping.
initial_states = {}
for ab in TARGET_ANTIBODIES:
    g = trace_3[trace_3["antibody_key"] == ab]
    if g.empty:
        continue
    r0 = g.iloc[0]
    initial_states[ab] = {
        "sequence_current": r0["sequence_current"],
        "pred_score": float(r0["pred_score"]),
    }

initial_states  # three fixed antibodies + their extracted initial states

# Counterfactual settings: only the prediction noise scale is varied.
COUNTERFACTUALS = {
    "CF1_noise_almost_zero": {
        "noise_scale": 1e-6,
    },
    "CF2_noise_extreme": {
        "noise_scale": 0.5,
    },
}

# The structure is held fixed across counterfactuals.
COOLDOWN = 2
LOOKBACK = 3
RISK_MODE = "mean"
CANDIDATE_SAMPLE_N = 5

# Simulation horizon = last step seen for the target antibodies.
# `max()` is NaN when no target rows exist, and int(NaN) raises, so the
# fallback of 20 is keyed on NaN. The previous `"step" in columns` guard could
# never trigger because the sort above already requires that column.
_last_step = trace_3["step"].max()
MAX_STEPS = int(_last_step) if pd.notna(_last_step) else 20

trace shape: (60, 8)
candidate shape: (180, 10)


In [2]:
def simulate_pred_score(prev_score, noise_scale, rng):
    """Return ``prev_score`` perturbed by one uniform noise draw.

    The perturbation is drawn with ``rng.uniform(-noise_scale, noise_scale)``,
    so each call advances ``rng`` by exactly one draw.
    """
    perturbation = rng.uniform(-noise_scale, noise_scale)
    return prev_score + perturbation


def decision_rule(delta, cooldown_left):
    """Choose between "MUTATE" and "HOLD" for one step.

    "HOLD" is returned while ``cooldown_left`` is positive. Otherwise the
    result is "MUTATE" only when ``delta`` is strictly positive.
    """
    should_mutate = (not cooldown_left > 0) and delta > 0
    return "MUTATE" if should_mutate else "HOLD"


def build_candidate_view(cand_df, antibody_key, step, sample_n, rng):
    """Return the candidate rows visible for one antibody at one step.

    Parameters:
        cand_df: candidate table with "antibody_key" and "step" columns.
        antibody_key: antibody whose candidates are wanted.
        step: simulation step whose candidates are wanted.
        sample_n: maximum number of rows to return.
        rng: ``numpy.random.Generator`` used for sub-sampling.

    Returns:
        All matching rows when there are at most ``sample_n`` of them (``rng``
        is not consumed in that case); otherwise ``sample_n`` distinct rows
        drawn without replacement.
    """
    is_match = (
        (cand_df["antibody_key"] == antibody_key)
        & (cand_df["step"] == step)
    )
    sub = cand_df[is_match]
    if len(sub) <= sample_n:
        return sub
    # Sample row positions rather than index labels: label-based `.loc`
    # returns every row sharing a label, so a non-unique index could yield
    # more than `sample_n` rows.
    positions = rng.choice(len(sub), size=sample_n, replace=False)
    return sub.iloc[positions]

In [3]:
def run_counterfactual(
    antibody_key,
    initial_state,
    noise_scale,
    cand_df,
    seed
):
    """Simulate one counterfactual run for a single antibody.

    Starting from ``initial_state["pred_score"]``, each step from 1 to
    ``MAX_STEPS`` draws a noisy prediction, compares it with a reference taken
    from the prediction history, asks ``decision_rule`` for a decision, and,
    on "MUTATE", applies one candidate picked from ``build_candidate_view``.

    Parameters:
        antibody_key: antibody to simulate.
        initial_state: mapping with "sequence_current" and "pred_score".
        noise_scale: half-width of the uniform prediction noise.
        cand_df: candidate table passed through to ``build_candidate_view``.
        seed: seeds the run's generator; ``seed + step`` seeds the pick of
            the applied candidate.

    Returns:
        A DataFrame with one row per step.
    """
    rng = np.random.default_rng(seed)

    # NOTE(review): the sequence is tracked but never written to the output.
    seq = initial_state["sequence_current"]
    history = [initial_state["pred_score"]]
    hold_steps_left = 0
    applied_total = 0
    last_decision = None

    step_rows = []

    for step in range(1, MAX_STEPS + 1):
        pred = simulate_pred_score(history[-1], noise_scale, rng)

        # Reference = the score LOOKBACK entries back in the history (which
        # does not yet contain `pred`), or the initial score early on.
        ref = history[-LOOKBACK] if len(history) >= LOOKBACK else history[0]
        delta = pred - ref
        decision = decision_rule(delta, hold_steps_left)

        applied_id = None
        view = None
        if decision == "MUTATE":
            view = build_candidate_view(
                cand_df, antibody_key, step,
                CANDIDATE_SAMPLE_N, rng
            )
        # A "MUTATE" decision with no available candidate stays "MUTATE" in
        # the output but applies nothing and starts no cooldown.
        if view is not None and len(view) > 0:
            picked = view.sample(1, random_state=seed + step).iloc[0]
            seq = picked["sequence_after"]
            applied_id = picked["mutation_id"]
            applied_total += 1
            hold_steps_left = COOLDOWN

        # NOTE(review): the cooldown is decremented in the same step it is
        # set, so a mutation forces COOLDOWN - 1 following HOLD steps.
        # Confirm this is the intended cooldown length.
        hold_steps_left -= 1 if hold_steps_left > 0 else 0

        step_rows.append({
            "antibody_key": antibody_key,
            "step": step,
            "pred_score": pred,
            "pred_score_delta": delta,
            "decision": decision,
            "mutation_id_applied": applied_id,
            "intervention_count_cum": applied_total,
            "prev_decision": last_decision
        })

        last_decision = decision
        history.append(pred)

    return pd.DataFrame(step_rows)

In [4]:
def compute_toggle_rate(decisions):
    """Measure how often consecutive decisions differ.

    Parameters:
        decisions: pandas Series of decision labels, in step order.

    Returns:
        ``(toggle_rate, toggle_count)``: ``toggle_count`` is the number of
        positions whose value differs from the previous one, and
        ``toggle_rate`` is that count divided by ``len(decisions)``. An empty
        Series yields ``(0.0, 0)`` instead of raising.
    """
    total = len(decisions)
    if total == 0:
        return 0.0, 0
    changed = decisions != decisions.shift(1)
    # The first element has no predecessor, so it never counts as a toggle.
    toggle_count = int(changed.iloc[1:].sum())
    return toggle_count / total, toggle_count


def compute_burst_mean(decisions):
    """Summarise runs of consecutive "MUTATE" decisions.

    Returns ``(mean_run_length, max_run_length)`` over all maximal runs of
    "MUTATE" in ``decisions``, or ``(0.0, 0)`` when there is no such run.
    """
    run_lengths = []
    streak = 0
    for decision in decisions:
        if decision == "MUTATE":
            streak += 1
            continue
        # Any other decision closes the run in progress, if there is one.
        if streak > 0:
            run_lengths.append(streak)
            streak = 0
    # A run that reaches the end of the sequence still counts.
    if streak > 0:
        run_lengths.append(streak)

    if run_lengths:
        return float(np.mean(run_lengths)), int(max(run_lengths))
    return 0.0, 0

In [5]:
# Run every counterfactual on every target antibody and collect the
# instability metrics for each run.
# NOTE(review): every run uses seed=42. The uniform noise draws scale linearly
# with noise_scale and decision_rule only looks at the sign of delta, so the
# decision sequence is expected to be the same for every noise scale.
# Confirm this is the intended experiment.
records = []

for cf_name, cf_conf in COUNTERFACTUALS.items():
    for ab in TARGET_ANTIBODIES:
        # Antibodies with no extracted initial state are skipped.
        if ab in initial_states:
            sim = run_counterfactual(
                antibody_key=ab,
                initial_state=initial_states[ab],
                noise_scale=cf_conf["noise_scale"],
                cand_df=cand_base,
                seed=42,
            )

            decisions = sim["decision"]
            toggle_rate, toggle_count = compute_toggle_rate(decisions)
            burst_mean, burst_max = compute_burst_mean(decisions)
            interventions = sim["intervention_count_cum"].max()

            records.append(dict(
                counterfactual=cf_name,
                antibody_key=ab,
                noise_scale=cf_conf["noise_scale"],
                toggle_rate=toggle_rate,
                toggle_count=toggle_count,
                burst_mean=burst_mean,
                burst_max=burst_max,
                interventions=interventions,
            ))

counterfactual_df = pd.DataFrame(records)
counterfactual_df  # one row per (counterfactual, antibody) run

Unnamed: 0,counterfactual,antibody_key,noise_scale,toggle_rate,toggle_count,burst_mean,burst_max,interventions
0,CF1_noise_almost_zero,GDPa1-001,1e-06,0.7,14,1.0,1,8
1,CF1_noise_almost_zero,GDPa1-045,1e-06,0.7,14,1.0,1,8
2,CF1_noise_almost_zero,GDPa1-183,1e-06,0.7,14,1.0,1,8
3,CF2_noise_extreme,GDPa1-001,0.5,0.7,14,1.0,1,8
4,CF2_noise_extreme,GDPa1-045,0.5,0.7,14,1.0,1,8
5,CF2_noise_extreme,GDPa1-183,0.5,0.7,14,1.0,1,8


In [6]:
# Write the per-run metrics to CSV, creating the output directory if needed.
OUTPUT_DIR = Path("../artifact/core5")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

out_path = OUTPUT_DIR / "counterfactual_results.csv"
counterfactual_df.to_csv(out_path, index=False)

out_path

# Average each instability metric over the antibodies, per counterfactual.
_by_counterfactual = counterfactual_df.groupby("counterfactual")
_metric_means = _by_counterfactual.agg(
    toggle_rate_mean=("toggle_rate", "mean"),
    burst_mean_mean=("burst_mean", "mean"),
    interventions_mean=("interventions", "mean"),
)
summary = _metric_means.reset_index()

summary

Unnamed: 0,counterfactual,toggle_rate_mean,burst_mean_mean,interventions_mean
0,CF1_noise_almost_zero,0.7,1.0,8.0
1,CF2_noise_extreme,0.7,1.0,8.0
