# Core 9 — Feature Engineering with Accumulation Windows

본 노트북은 예측 모델을 만들지 않는다.

목적은 Core 9에서 사용할 **입력 특징량(feature)**을
“누적 기반 / 지속성 기반” 철학에 맞게 고정하는 것이다.

- 순간 반응형(feature at t) 금지
- 이동 평균, 누적 기울기, 지속 구간(streak)만 허용
- Core 8에서 정의한 정규화/가중치 규칙을 재사용하여
  거버넌스 판단과 예측 입력의 의미적 일관성을 유지한다.

In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd

# Input artifacts produced by Core 8.
TRACE_PATH = Path("../artifact/core8/core8_03_refusal_state_trace_counterfactual.csv")
PARAMS_PATH = Path("../artifact/core8/core8_06_fallback_params.json")

# Stop immediately when either input artifact is absent.
assert TRACE_PATH.exists(), "state trace CSV not found"
assert PARAMS_PATH.exists(), "fallback params not found"

# Output directory for the Core 9 artifacts (created on first run).
EXPORT_DIR = Path("../artifact/core9")
EXPORT_DIR.mkdir(exist_ok=True)

# Load the fallback parameters so the normalisation rules can be reused here.
with PARAMS_PATH.open(encoding="utf-8") as params_file:
    PARAMS = json.load(params_file)

# Shorthand handles used by the feature functions below.
TH = PARAMS["thresholds"]
W = PARAMS["weights"]

In [3]:
# Read the state trace CSV.
df = pd.read_csv(TRACE_PATH)

# Columns this notebook requires from the trace.
expected_cols = [
    "run_id", "case_id", "antibody_id", "step",
    "blocked_rate_window", "veto_streak",
    "action_toggle_rate", "SoMS_cumsum_window",
]

missing = [name for name in expected_cols if name not in df.columns]
assert not missing, f"Missing columns: {missing}"

# Work on an independent copy restricted to the required columns.
work = df.loc[:, expected_cols].copy()

# dtype normalisation: unparseable values are coerced to missing.
# Counters become nullable integers, rates/sums become floats.
for int_col in ("step", "veto_streak"):
    work[int_col] = pd.to_numeric(work[int_col], errors="coerce").astype("Int64")

for float_col in ("blocked_rate_window", "action_toggle_rate", "SoMS_cumsum_window"):
    work[float_col] = pd.to_numeric(work[float_col], errors="coerce")

# Sort into time-series order (required by the rolling features below).
sort_keys = ["run_id", "case_id", "antibody_id", "step"]
work = work.sort_values(sort_keys).reset_index(drop=True)

In [4]:
# Window sizes, in steps (fixed contract).
# NOTE(review): "long" is declared but not used by the feature pipeline
# visible in this notebook — confirm whether that is intentional.
WINDOWS = dict(short=5, mid=10, long=20)

In [5]:
def add_trend_features(g, k):
    """Return a copy of ``g`` with k-step trend features appended.

    Columns added (``k`` is embedded in each name):

    - ``soms_slope_{k}``: slope of a degree-1 least-squares fit of
      ``SoMS_cumsum_window`` against positions ``0..k-1`` of each window.
    - ``toggle_rate_ma_{k}``: rolling mean of ``action_toggle_rate``.
    - ``blocked_rate_ma_{k}``: rolling mean of ``blocked_rate_window``.

    Every rolling window uses ``min_periods=k``, so a row is NaN until ``k``
    valid observations are available.

    Parameters
    ----------
    g : pandas.DataFrame
        One trace; must contain the three source columns named above.
    k : int
        Window length in rows.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``g`` is left untouched.
    """
    # Work on a copy: mutating the object handed in by groupby.apply is
    # documented by pandas as unsupported.
    out = g.copy()

    # The x-axis is identical for every window, so build it once instead of
    # once per window.
    positions = np.arange(k)

    # raw=True hands each window to the callback as an ndarray, avoiding the
    # construction of a Series per window.
    out[f"soms_slope_{k}"] = (
        out["SoMS_cumsum_window"]
        .rolling(k, min_periods=k)
        .apply(lambda window: np.polyfit(positions, window, 1)[0], raw=True)
    )

    # Moving averages.
    out[f"toggle_rate_ma_{k}"] = out["action_toggle_rate"].rolling(k, min_periods=k).mean()
    out[f"blocked_rate_ma_{k}"] = out["blocked_rate_window"].rolling(k, min_periods=k).mean()

    return out

In [6]:
def rolling_streak(series, th):
    """Count, per position, the current run of consecutive values >= ``th``.

    A missing value or a value below ``th`` resets the run to zero.

    Returns a list of ints with one entry per element of ``series``.
    """
    run_lengths = []
    current = 0
    for value in series:
        at_or_over = not pd.isna(value) and value >= th
        current = current + 1 if at_or_over else 0
        run_lengths.append(current)
    return run_lengths


def add_streak_features(g, k):
    """Return a copy of ``g`` with over-threshold streak features appended.

    Each streak is counted on the k-step rolling maximum (``min_periods=1``)
    of the source column rather than on the raw value, so a single
    over-threshold step keeps the streak running for up to ``k`` rows.

    Columns added:

    - ``toggle_over_th_streak_{k}``: ``action_toggle_rate`` rolling max
      compared against ``TH["toggle_rate"]``.
    - ``blocked_over_partialseal_streak_{k}``: ``blocked_rate_window``
      rolling max compared against ``TH["partial_seal"]``.

    Parameters
    ----------
    g : pandas.DataFrame
        One trace containing both source columns.
    k : int
        Rolling-max window length in rows.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``g`` is left untouched.
    """
    # Work on a copy: mutating the object handed in by groupby.apply is
    # documented by pandas as unsupported.
    out = g.copy()

    toggle_peak = out["action_toggle_rate"].rolling(k, min_periods=1).max()
    blocked_peak = out["blocked_rate_window"].rolling(k, min_periods=1).max()

    out[f"toggle_over_th_streak_{k}"] = rolling_streak(toggle_peak, TH["toggle_rate"])
    out[f"blocked_over_partialseal_streak_{k}"] = rolling_streak(
        blocked_peak, TH["partial_seal"]
    )

    return out

In [7]:
def estimate_recovery_time(series, th):
    """Return, per position, the steps elapsed since the last value >= ``th``.

    The entry is NaN while the value itself is at or over ``th`` and for
    every position before the first such value. Missing values are treated
    as not over the threshold.

    Returns a list with one entry per element of ``series``.
    """
    elapsed = []
    anchor = None  # position of the most recent over-threshold value

    for idx, value in enumerate(series):
        if not pd.isna(value) and value >= th:
            anchor = idx
            elapsed.append(np.nan)
            continue
        elapsed.append(np.nan if anchor is None else idx - anchor)

    return elapsed


def add_recovery_feature(g):
    """Return a copy of ``g`` with a ``recovery_time_est`` column appended.

    The column is ``estimate_recovery_time`` applied to
    ``action_toggle_rate`` with threshold ``TH["toggle_rate"]``.

    Parameters
    ----------
    g : pandas.DataFrame
        One trace containing ``action_toggle_rate``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``g`` is left untouched.
    """
    # Work on a copy: mutating the object handed in by groupby.apply is
    # documented by pandas as unsupported.
    out = g.copy()
    out["recovery_time_est"] = estimate_recovery_time(
        out["action_toggle_rate"], TH["toggle_rate"]
    )
    return out

In [8]:
def clip01(x):
    """Clamp ``x`` to the closed interval [0, 1]; missing input yields NaN."""
    if pd.isna(x):
        return np.nan
    return float(np.clip(x, 0, 1))


def add_pressure_index(g):
    """Return a copy of ``g`` with a weighted ``pressure_index`` column.

    Each of the four signals is divided by its threshold in ``TH`` and
    clamped to [0, 1]; the clamped scores are then combined with the weights
    in ``W``:

    =====================  ====================  ==============
    source column          threshold             weight
    =====================  ====================  ==============
    blocked_rate_window    TH["block_rate"]      W["block"]
    veto_streak            TH["veto_streak"]     W["veto"]
    action_toggle_rate     TH["toggle_rate"]     W["toggle"]
    SoMS_cumsum_window     TH["soms_cumsum"]     W["soms"]
    =====================  ====================  ==============

    A missing score contributes 0 to the sum, so rows with missing inputs
    get a lower index rather than NaN.

    Parameters
    ----------
    g : pandas.DataFrame
        One trace containing the four source columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``g`` is left untouched.
    """
    # Work on a copy: mutating the object handed in by groupby.apply is
    # documented by pandas as unsupported.
    out = g.copy()

    # Series.clip clamps the whole column at once and leaves NaN as NaN,
    # which is what the element-wise clamp did one value at a time.
    block_s = (out["blocked_rate_window"] / TH["block_rate"]).clip(lower=0, upper=1)
    veto_s = (out["veto_streak"].astype(float) / TH["veto_streak"]).clip(lower=0, upper=1)
    tog_s = (out["action_toggle_rate"] / TH["toggle_rate"]).clip(lower=0, upper=1)
    soms_s = (out["SoMS_cumsum_window"] / TH["soms_cumsum"]).clip(lower=0, upper=1)

    out["pressure_index"] = (
        W["block"] * block_s.fillna(0)
        + W["veto"] * veto_s.fillna(0)
        + W["toggle"] * tog_s.fillna(0)
        + W["soms"] * soms_s.fillna(0)
    )
    return out

In [9]:
# Build the feature table one (run_id, case_id, antibody_id) trace at a time.
#
# The groups are iterated explicitly instead of going through
# DataFrameGroupBy.apply: apply operating on the grouping columns is
# deprecated in recent pandas releases, and explicit iteration keeps those
# key columns in the output without relying on that behaviour.
_GROUP_KEYS = ["run_id", "case_id", "antibody_id"]


def _build_group_features(g):
    """Run every feature step, in order, on a single trace."""
    return (
        g
        .pipe(add_recovery_feature)
        .pipe(add_pressure_index)
        .pipe(add_trend_features, WINDOWS["short"])
        .pipe(add_trend_features, WINDOWS["mid"])
        .pipe(add_streak_features, WINDOWS["mid"])
    )


# Each group is copied first so the feature steps never write into `work`.
_group_frames = [
    _build_group_features(group.copy())
    for _, group in work.groupby(_GROUP_KEYS, sort=False)
]

if _group_frames:
    # sort_index restores the row order of `work`.
    features = pd.concat(_group_frames).sort_index()
else:
    # No groups (empty input): still produce an empty frame that carries the
    # full feature schema, since pd.concat rejects an empty list.
    features = _build_group_features(work.copy())

  .apply(


In [10]:
# Optionally merge the core9_01 target preview, when the file is available.
# NOTE(review): this path ("../artifact_core9_01/...") does not follow the
# "../artifact/<core>/" layout of the other paths in this notebook — confirm
# it is the intended location.
TARGET_PATH = Path("../artifact_core9_01/core9_01_targets_preview.csv")

if TARGET_PATH.exists():
    targets = pd.read_csv(TARGET_PATH)
    # Left join: every feature row is kept, target columns are NaN where no
    # matching key exists.
    features = features.merge(
        targets,
        on=["run_id", "case_id", "antibody_id", "step"],
        how="left",
    )
else:
    # Make the skip visible instead of silently exporting without targets.
    print(f"Targets not found at {TARGET_PATH}; merge skipped")

In [11]:
# Write the feature table to the Core 9 artifact directory (row index omitted).
EXPORT_PATH = EXPORT_DIR.joinpath("core9_02_features.csv")
features.to_csv(EXPORT_PATH, index=False)

# Report the destination and the (rows, columns) size of the export.
print(f"Exported features → {EXPORT_PATH}")
print("shape:", features.shape)

Exported features → ../artifact/core9/core9_02_features.csv
shape: (180, 18)
