# Core 8 — Fallback Spec & Judge (core8_06)

Core 8의 목표는 설계 성능이 아니라 **설계 거부/권한 박탈 능력의 설계**다.

Fallback은 예외처리 루트가 아니라 **사전 설계된 차선 운영 모드(operational mode)** 이다.

본 노트북의 산출물:
1) Fallback 트리거를 **정량 수식/규칙**으로 고정
2) 동일 입력(CSV) → 동일 판정(enter/hold) 재현
3) “즉시 발동 금지(MIN_STEPS)”를 **로그로 증명**
4) Core 10의 “회귀 조건”으로 그대로 이식 가능한 형태로 Export (JSON/CSV/DB 옵션)

## 상태 변수 (CSV에 존재)
- step
- blocked_rate_window
- veto_streak
- action_toggle_rate
- SoMS_cumsum_window
- refusal_triggered
- refusal_reason_code

## 파생 변수 (본 노트북에서 정의)
- cooldown_ok = (step >= MIN_STEPS)
- block_s, veto_s, tog_s, soms_s : 0~1 정규화 점수
- fallback_score : 가중합 점수 (0~1)
- fallback_stage : HOLD / MONITOR / PARTIAL_SEAL / REFUSAL / FALLBACK_ENTER
- fallback_reason_code : 판정 사유 코드
- fallback_rule_id : 룰 버전 추적용 (예: core8_06_v1)

## 정책 문장
- “Fallback is accumulation-driven, not reaction-driven.”
- “Fallback is entered only after sustained governance pressure.”

## 즉시 발동 금지(필수 로그)
- step < MIN_STEPS 인 구간에서는 어떤 경우에도 fallback_entered = False
- 대신 fallback_evaluated = True + fallback_reason_code = REASON_MIN_STEPS_NOT_REACHED 를 남긴다.

## Thresholds (core8_06_v1)
- MIN_STEPS = 10
- BLOCK_RATE_TH = 0.35
- VETO_STREAK_TH = 6
- TOGGLE_RATE_TH = 0.45
- SOMS_CUMSUM_TH = 25.0
- PARTIAL_SEAL_TH = 0.20
- FALLBACK_SCORE_TH = 0.85

## Weights (score-based)
- w_block = 0.25
- w_veto = 0.25
- w_toggle = 0.25
- w_soms = 0.25

In [13]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

RULE_VERSION = "core8_06_v1"

MIN_STEPS = 10

BLOCK_RATE_TH   = 0.35
VETO_STREAK_TH  = 6
TOGGLE_RATE_TH  = 0.45
SOMS_CUMSUM_TH  = 25.0
PARTIAL_SEAL_TH = 0.20

W_BLOCK  = 0.25
W_VETO   = 0.25
W_TOGGLE = 0.25
W_SOMS   = 0.25

FALLBACK_SCORE_TH = 0.85

PARAMS = {
    "rule_id": RULE_VERSION,
    "min_steps": MIN_STEPS,
    "thresholds": {
        "block_rate": BLOCK_RATE_TH,
        "veto_streak": VETO_STREAK_TH,
        "toggle_rate": TOGGLE_RATE_TH,
        "soms_cumsum": SOMS_CUMSUM_TH,
        "partial_seal": PARTIAL_SEAL_TH,
        "fallback_score": FALLBACK_SCORE_TH,
    },
    "weights": {
        "block": W_BLOCK,
        "veto": W_VETO,
        "toggle": W_TOGGLE,
        "soms": W_SOMS,
    }
}

PARAMS

{'rule_id': 'core8_06_v1',
 'min_steps': 10,
 'thresholds': {'block_rate': 0.35,
  'veto_streak': 6,
  'toggle_rate': 0.45,
  'soms_cumsum': 25.0,
  'partial_seal': 0.2,
  'fallback_score': 0.85},
 'weights': {'block': 0.25, 'veto': 0.25, 'toggle': 0.25, 'soms': 0.25}}

In [14]:
CSV_PATH = Path("../../core/artifact/core8/core8_03_refusal_state_trace_counterfactual.csv")
assert CSV_PATH.exists(), "CSV not found"

expected_cols = [
    "run_id","case_id","antibody_id","step",
    "blocked_rate_window","veto_streak","action_toggle_rate",
    "SoMS_cumsum_window","refusal_triggered","refusal_reason_code"
]

df = pd.read_csv(CSV_PATH)

missing = [c for c in expected_cols if c not in df.columns]
assert not missing, f"Missing columns: {missing}"

work = df[expected_cols].copy()

work["step"] = pd.to_numeric(work["step"], errors="coerce").astype("Int64")
work["blocked_rate_window"] = pd.to_numeric(work["blocked_rate_window"], errors="coerce")
work["veto_streak"] = pd.to_numeric(work["veto_streak"], errors="coerce").astype("Int64")
work["action_toggle_rate"] = pd.to_numeric(work["action_toggle_rate"], errors="coerce")
work["SoMS_cumsum_window"] = pd.to_numeric(work["SoMS_cumsum_window"], errors="coerce")

work["refusal_triggered"] = (
    work["refusal_triggered"].astype(str).str.lower()
    .map({"true":True,"false":False,"1":True,"0":False})
    .fillna(False).astype(bool)
)

work.head()

Unnamed: 0,run_id,case_id,antibody_id,step,blocked_rate_window,veto_streak,action_toggle_rate,SoMS_cumsum_window,refusal_triggered,refusal_reason_code
0,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,0,0.0,0,0.0,0.0,False,REASON_MIN_STEPS_NOT_REACHED
1,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,1,0.0,0,0.0,0.0,False,REASON_MIN_STEPS_NOT_REACHED
2,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,2,0.0,0,0.0,0.0,False,REASON_MIN_STEPS_NOT_REACHED
3,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,3,0.0,0,0.0,0.0,False,REASON_MIN_STEPS_NOT_REACHED
4,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,4,0.0,0,0.0,0.6,False,REASON_MIN_STEPS_NOT_REACHED


In [15]:
def clip01(x):
    return np.nan if pd.isna(x) else float(np.clip(x, 0, 1))

work["cooldown_ok"] = work["step"].fillna(-1).astype(int) >= MIN_STEPS

work["block_s"] = (work["blocked_rate_window"] / BLOCK_RATE_TH).map(clip01)
work["veto_s"]  = (work["veto_streak"].astype(float) / VETO_STREAK_TH).map(clip01)
work["tog_s"]   = (work["action_toggle_rate"] / TOGGLE_RATE_TH).map(clip01)
work["soms_s"]  = (work["SoMS_cumsum_window"] / SOMS_CUMSUM_TH).map(clip01)

work["fallback_score"] = (
    W_BLOCK*work["block_s"].fillna(0) +
    W_VETO*work["veto_s"].fillna(0) +
    W_TOGGLE*work["tog_s"].fillna(0) +
    W_SOMS*work["soms_s"].fillna(0)
)

work[["step","cooldown_ok","fallback_score"]].head(15)

Unnamed: 0,step,cooldown_ok,fallback_score
0,0,False,0.0
1,1,False,0.0
2,2,False,0.0
3,3,False,0.0
4,4,False,0.006
5,5,False,0.0115
6,6,False,0.023
7,7,False,0.034
8,8,False,0.0445
9,9,False,0.0545


In [16]:
def judge_fallback(r):
    step = int(r["step"]) if pd.notna(r["step"]) else -1

    # HOLD: 즉시 발동 금지
    if step < MIN_STEPS:
        return {
            "fallback_evaluated": True,
            "fallback_entered": False,
            "fallback_stage": "HOLD",
            "fallback_reason_code": "REASON_MIN_STEPS_NOT_REACHED",
            "fallback_rule_id": RULE_VERSION,
            "fallback_score": r["fallback_score"],
        }

    # REFUSAL (이미 Core8에서 발생한 중단)
    if r["refusal_triggered"]:
        return {
            "fallback_evaluated": True,
            "fallback_entered": True,
            "fallback_stage": "REFUSAL",
            "fallback_reason_code": r["refusal_reason_code"],
            "fallback_rule_id": RULE_VERSION,
            "fallback_score": r["fallback_score"],
        }

    # A: Rule-based
    a_enter = (
        r["SoMS_cumsum_window"] >= SOMS_CUMSUM_TH and
        (
            r["action_toggle_rate"] >= TOGGLE_RATE_TH or
            r["blocked_rate_window"] >= BLOCK_RATE_TH or
            r["veto_streak"] >= VETO_STREAK_TH
        )
    )

    # B: Score-based
    b_enter = r["fallback_score"] >= FALLBACK_SCORE_TH

    if a_enter or b_enter:
        return {
            "fallback_evaluated": True,
            "fallback_entered": True,
            "fallback_stage": "FALLBACK_ENTER",
            "fallback_reason_code": "REASON_ACCUMULATED_GOVERNANCE_PRESSURE",
            "fallback_rule_id": RULE_VERSION,
            "fallback_score": r["fallback_score"],
        }

    # PARTIAL SEAL
    if r["blocked_rate_window"] >= PARTIAL_SEAL_TH:
        return {
            "fallback_evaluated": True,
            "fallback_entered": False,
            "fallback_stage": "PARTIAL_SEAL",
            "fallback_reason_code": "REASON_PARTIAL_SEAL_THRESHOLD",
            "fallback_rule_id": RULE_VERSION,
            "fallback_score": r["fallback_score"],
        }

    return {
        "fallback_evaluated": True,
        "fallback_entered": False,
        "fallback_stage": "MONITOR",
        "fallback_reason_code": "REASON_WITHIN_GOVERNANCE_BOUNDS",
        "fallback_rule_id": RULE_VERSION,
        "fallback_score": r["fallback_score"],
    }

judge_df = work.apply(judge_fallback, axis=1, result_type="expand")
out = pd.concat([work, judge_df], axis=1)

out.head()

Unnamed: 0,run_id,case_id,antibody_id,step,blocked_rate_window,veto_streak,action_toggle_rate,SoMS_cumsum_window,refusal_triggered,refusal_reason_code,...,veto_s,tog_s,soms_s,fallback_score,fallback_evaluated,fallback_entered,fallback_stage,fallback_reason_code,fallback_rule_id,fallback_score.1
0,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,0,0.0,0,0.0,0.0,False,REASON_MIN_STEPS_NOT_REACHED,...,0.0,0.0,0.0,0.0,True,False,HOLD,REASON_MIN_STEPS_NOT_REACHED,core8_06_v1,0.0
1,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,1,0.0,0,0.0,0.0,False,REASON_MIN_STEPS_NOT_REACHED,...,0.0,0.0,0.0,0.0,True,False,HOLD,REASON_MIN_STEPS_NOT_REACHED,core8_06_v1,0.0
2,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,2,0.0,0,0.0,0.0,False,REASON_MIN_STEPS_NOT_REACHED,...,0.0,0.0,0.0,0.0,True,False,HOLD,REASON_MIN_STEPS_NOT_REACHED,core8_06_v1,0.0
3,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,3,0.0,0,0.0,0.0,False,REASON_MIN_STEPS_NOT_REACHED,...,0.0,0.0,0.0,0.0,True,False,HOLD,REASON_MIN_STEPS_NOT_REACHED,core8_06_v1,0.0
4,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,4,0.0,0,0.0,0.6,False,REASON_MIN_STEPS_NOT_REACHED,...,0.0,0.0,0.024,0.006,True,False,HOLD,REASON_MIN_STEPS_NOT_REACHED,core8_06_v1,0.006


In [20]:
def summarize_group(g):
    return pd.Series({
        "has_hold": (g["fallback_stage"] == "HOLD").any(),
        "has_partial_seal": (g["fallback_stage"] == "PARTIAL_SEAL").any(),
        "has_refusal": (g["fallback_stage"] == "REFUSAL").any(),
        "has_fallback": (g["fallback_stage"] == "FALLBACK_ENTER").any(),
        "max_score": g["fallback_score"].max(),
        "max_soms": g["SoMS_cumsum_window"].max(),
    })

summary = (
    out
    .groupby(["run_id", "case_id"], group_keys=False)
    .apply(summarize_group)
    .reset_index()
)

summary

  .apply(summarize_group)


Unnamed: 0,run_id,case_id,has_hold,has_partial_seal,has_refusal,has_fallback,max_score,max_soms
0,core7_04_1767776352,A_ALWAYS_ALLOW,True,False,False,False,fallback_score 0.2415 fallback_score 0.2...,24.15
1,core7_04_1767776352,B_GOVERNED,True,True,False,False,fallback_score 0.521271 fallback_score 0...,24.2


In [29]:
EXPORT_DIR = Path("../../core/artifact/core8")
EXPORT_DIR.mkdir(exist_ok=True)

(EXPORT_DIR / "core8_06_fallback_params.json").write_text(
    json.dumps(PARAMS, indent=2), encoding="utf-8"
)

out.to_csv(EXPORT_DIR / "core8_06_fallback_decisions.csv", index=False)
summary.to_csv(EXPORT_DIR / "core8_06_fallback_summary.csv", index=False)

print("Exported:")
print("- core8_06_fallback_params.json")
print("- core8_06_fallback_decisions.csv")
print("- core8_06_fallback_summary.csv")

Exported:
- core8_06_fallback_params.json
- core8_06_fallback_decisions.csv
- core8_06_fallback_summary.csv


In [26]:
# Explain 전용: 중복 fallback_score 제거
explain_df = out.loc[:, ~out.columns.duplicated()].copy()

# sanity check
assert "fallback_score" in explain_df.columns

In [27]:
# 판정에는 영향 없음 — 설명 생성 전용 (완전 안정)

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
)

prompt = ChatPromptTemplate.from_messages([
    ("system", "You explain governance decisions. Do not change them."),
    ("user", "stage={stage}, reason={reason}, score={score}\n한 문장 한국어 설명.")
])

def explain_row(r):
    try:
        score = float(r["fallback_score"])  # ✅ 이제 scalar

        prompt_value = prompt.format_prompt(
            stage=r["fallback_stage"],
            reason=r["fallback_reason_code"],
            score=round(score, 3),
        )

        text = prompt_value.to_string()
        resp = llm.invoke(text)
        return resp.content

    except Exception as e:
        return f"[EXPLAIN_FAILED] {type(e).__name__}: {e}"

sample = explain_df.tail(10).copy()
sample["explain"] = sample.apply(explain_row, axis=1)

sample[
    ["case_id", "step", "fallback_stage",
     "fallback_reason_code", "fallback_score", "explain"]
]

Unnamed: 0,case_id,step,fallback_stage,fallback_reason_code,fallback_score,explain
170,B_GOVERNED,20,MONITOR,REASON_WITHIN_GOVERNANCE_BOUNDS,0.258885,"모니터 단계에서, 거버넌스 범위 내의 이유로 점수가 0.259로 평가되었습니다."
171,B_GOVERNED,21,PARTIAL_SEAL,REASON_PARTIAL_SEAL_THRESHOLD,0.441709,"이 결정은 부분 봉인 상태로, 봉인 기준 점수인 0.442로 인해 부분 봉인이 적용..."
172,B_GOVERNED,22,PARTIAL_SEAL,REASON_PARTIAL_SEAL_THRESHOLD,0.469771,"부분 봉인 단계로, 봉인 기준 점수인 0.47로 인해 부분 봉인이 결정되었습니다."
173,B_GOVERNED,23,PARTIAL_SEAL,REASON_PARTIAL_SEAL_THRESHOLD,0.477771,"이 결정은 부분 봉인 상태로, 봉인 기준 점수인 0.478이 미달하여 부분 봉인이 ..."
174,B_GOVERNED,24,PARTIAL_SEAL,REASON_PARTIAL_SEAL_THRESHOLD,0.492271,"이 결정은 부분 봉인 상태로, 봉인 기준 점수인 0.492로 인해 부분적으로만 봉인..."
175,B_GOVERNED,25,PARTIAL_SEAL,REASON_PARTIAL_SEAL_THRESHOLD,0.506771,"이 결정은 부분 봉인 상태로, 봉인 기준 점수인 0.507을 초과하지 못했기 때문에..."
176,B_GOVERNED,26,PARTIAL_SEAL,REASON_PARTIAL_SEAL_THRESHOLD,0.521271,"이 결정은 부분 봉인 상태로, 봉인 기준 점수인 0.521이 설정된 임계값을 초과하..."
177,B_GOVERNED,27,PARTIAL_SEAL,REASON_PARTIAL_SEAL_THRESHOLD,0.474042,"이 결정은 부분 봉인 상태로, 봉인 기준 점수인 0.474로 인해 부분적으로만 봉인..."
178,B_GOVERNED,28,MONITOR,REASON_WITHIN_GOVERNANCE_BOUNDS,0.355385,"모니터 단계에서의 결정은 거버넌스 범위 내에서 이루어졌으며, 점수는 0.355입니다."
179,B_GOVERNED,29,MONITOR,REASON_WITHIN_GOVERNANCE_BOUNDS,0.369885,"모니터 단계에서, 거버넌스 범위 내의 이유로 점수가 0.37로 평가되었습니다."


In [32]:
import pandas as pd
from sqlalchemy import create_engine, text
from datetime import datetime

ENGINE_URI = "mysql+pymysql://cube_user:cube_user_hi@localhost:3306/Developability"

engine = create_engine(
    ENGINE_URI,
    pool_pre_ping=True,
    future=True
)


with engine.connect() as conn:
    conn.execute(text("SELECT 1"))


In [33]:
# out에 중복 컬럼이 있으면 to_sql이 깨짐 (fallback_score가 대표 사례)
out = out.loc[:, ~out.columns.duplicated()].copy()

# sanity check
dup_cols = out.columns[out.columns.duplicated()].tolist()
print("duplicated cols:", dup_cols)  # [] 여야 정상

cols = [
    "rule_id","run_id","case_id","antibody_id","step",
    "fallback_stage","fallback_reason_code","fallback_score"
]

db_df = out.copy()
db_df["rule_id"] = RULE_VERSION

# 타입 정리 (MySQL 넣을 때 안전)
db_df["step"] = pd.to_numeric(db_df["step"], errors="coerce").astype("Int64")
db_df["fallback_score"] = pd.to_numeric(db_df["fallback_score"], errors="coerce")

db_df = db_df[cols].copy()

# 마지막 확인: 컬럼 중복 없어야 함
assert db_df.columns.duplicated().sum() == 0, "db_df has duplicated columns"
db_df.head()

duplicated cols: []


Unnamed: 0,rule_id,run_id,case_id,antibody_id,step,fallback_stage,fallback_reason_code,fallback_score
0,core8_06_v1,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,0,HOLD,REASON_MIN_STEPS_NOT_REACHED,0.0
1,core8_06_v1,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,1,HOLD,REASON_MIN_STEPS_NOT_REACHED,0.0
2,core8_06_v1,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,2,HOLD,REASON_MIN_STEPS_NOT_REACHED,0.0
3,core8_06_v1,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,3,HOLD,REASON_MIN_STEPS_NOT_REACHED,0.0
4,core8_06_v1,core7_04_1767776352,A_ALWAYS_ALLOW,antibody_A,4,HOLD,REASON_MIN_STEPS_NOT_REACHED,0.006


In [34]:
create_sql = """
CREATE TABLE IF NOT EXISTS fallback_decision_log (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    rule_id VARCHAR(32) NOT NULL,
    run_id VARCHAR(64),
    case_id VARCHAR(64),
    antibody_id VARCHAR(64),
    step INT,
    fallback_stage VARCHAR(32),
    fallback_reason_code VARCHAR(128),
    fallback_score FLOAT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
"""

with engine.begin() as conn:
    conn.execute(text(create_sql))

    db_df.to_sql(
    "fallback_decision_log",
    con=engine,
    if_exists="append",
    index=False,
    chunksize=2000,
    method="multi"
)

with engine.connect() as conn:
    n = conn.execute(text("SELECT COUNT(*) FROM fallback_decision_log")).scalar()
    print("row_count:", n)

row_count: 180
