"""
core3_01_feature_builder_for_instability.ipynb

목적:
- Core 2 prediction_trace를 기반으로
  decision instability를 설명하기 위한 step-state feature를 생성한다.
- 이 단계에서는 학습/모델링을 하지 않는다.
"""


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Core 2 artifact 경로
ARTIFACT_DIR = Path("../artifact/core2")

prediction_trace_path = ARTIFACT_DIR / "prediction_trace.csv"

prediction_trace = pd.read_csv(prediction_trace_path)

prediction_trace.head()

Unnamed: 0,antibody_key,step,sequence_current,pred_score,pred_score_delta,decision,mutation_id_applied,intervention_count_cum
0,GDPa1-001,1,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,2.013943,0.013943,MUTATE,GDPa1-001_mut0_1,1
1,GDPa1-001,2,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.966444,-0.047499,HOLD,,1
2,GDPa1-001,3,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.943947,-0.022497,HOLD,,1
3,GDPa1-001,4,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.916268,-0.027679,HOLD,,1
4,GDPa1-001,5,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.939915,0.023647,MUTATE,GDPa1-001_mut1_5,2


In [2]:
print("컬럼 리스트:")
print(prediction_trace.columns.tolist())

print("\nRow 수:", len(prediction_trace))

컬럼 리스트:
['antibody_key', 'step', 'sequence_current', 'pred_score', 'pred_score_delta', 'decision', 'mutation_id_applied', 'intervention_count_cum']

Row 수: 60


In [3]:
TARGET_ANTIBODIES = [
    "GDPa1-001",  # abagovomab
    "GDPa1-045",  # cixutumumab
    "GDPa1-183",  # prolgolimab
]

prediction_trace = prediction_trace[
    prediction_trace["antibody_key"].isin(TARGET_ANTIBODIES)
].reset_index(drop=True)

prediction_trace[["antibody_key"]].drop_duplicates()

Unnamed: 0,antibody_key
0,GDPa1-001
20,GDPa1-045
40,GDPa1-183


In [4]:
prediction_trace = prediction_trace.sort_values(
    ["antibody_key", "step"]
).reset_index(drop=True)

prediction_trace.head(10) # time-series 정렬

Unnamed: 0,antibody_key,step,sequence_current,pred_score,pred_score_delta,decision,mutation_id_applied,intervention_count_cum
0,GDPa1-001,1,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,2.013943,0.013943,MUTATE,GDPa1-001_mut0_1,1
1,GDPa1-001,2,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.966444,-0.047499,HOLD,,1
2,GDPa1-001,3,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.943947,-0.022497,HOLD,,1
3,GDPa1-001,4,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.916268,-0.027679,HOLD,,1
4,GDPa1-001,5,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.939915,0.023647,MUTATE,GDPa1-001_mut1_5,2
5,GDPa1-001,6,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPEPNYKCRV...,1.957585,0.01767,MUTATE,GDPa1-001_mut2_6,3
6,GDPa1-001,7,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.996803,0.039218,MUTATE,GDPa1-001_mut1_7,4
7,GDPa1-001,8,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.955497,-0.041306,HOLD,,4
8,GDPa1-001,9,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.947689,-0.007808,HOLD,,4
9,GDPa1-001,10,EAKIIFEVDWQCADHITYAVHVQIRWKAGQMKFHMEDPENNYKCRV...,1.900669,-0.04702,HOLD,,4


기본 decision 파생 feature

생성 항목
	•	prev_decision
	•	toggle_event (타깃 후보)

In [5]:
prediction_trace["prev_decision"] = (
    prediction_trace
    .groupby("antibody_key")["decision"]
    .shift(1)
)

prediction_trace["toggle_event"] = (
    prediction_trace["decision"] != prediction_trace["prev_decision"]
).astype(int)

prediction_trace[[
    "antibody_key",
    "step",
    "decision",
    "prev_decision",
    "toggle_event"
]].head(10) #  toggle_event가 Core 3의 기본 타깃

Unnamed: 0,antibody_key,step,decision,prev_decision,toggle_event
0,GDPa1-001,1,MUTATE,,1
1,GDPa1-001,2,HOLD,MUTATE,1
2,GDPa1-001,3,HOLD,HOLD,0
3,GDPa1-001,4,HOLD,HOLD,0
4,GDPa1-001,5,MUTATE,HOLD,1
5,GDPa1-001,6,MUTATE,MUTATE,0
6,GDPa1-001,7,MUTATE,MUTATE,0
7,GDPa1-001,8,HOLD,MUTATE,1
8,GDPa1-001,9,HOLD,HOLD,0
9,GDPa1-001,10,HOLD,HOLD,0


rolling prediction feature (연속성만 반영)

cutoff 없음
단순 rolling만 허용

In [6]:
ROLLING_WINDOW = 3  # 고정

prediction_trace["rolling_mean_pred_score"] = (
    prediction_trace
    .groupby("antibody_key")["pred_score"]
    .rolling(ROLLING_WINDOW)
    .mean()
    .reset_index(level=0, drop=True)
)

prediction_trace["rolling_std_pred_score"] = (
    prediction_trace
    .groupby("antibody_key")["pred_score"]
    .rolling(ROLLING_WINDOW)
    .std()
    .reset_index(level=0, drop=True)
)

prediction_trace[[
    "antibody_key",
    "step",
    "pred_score",
    "rolling_mean_pred_score",
    "rolling_std_pred_score"
]].head(10)

Unnamed: 0,antibody_key,step,pred_score,rolling_mean_pred_score,rolling_std_pred_score
0,GDPa1-001,1,2.013943,,
1,GDPa1-001,2,1.966444,,
2,GDPa1-001,3,1.943947,1.974778,0.035734
3,GDPa1-001,4,1.916268,1.942219,0.025133
4,GDPa1-001,5,1.939915,1.933376,0.014953
5,GDPa1-001,6,1.957585,1.937922,0.02073
6,GDPa1-001,7,1.996803,1.964768,0.029116
7,GDPa1-001,8,1.955497,1.969961,0.023269
8,GDPa1-001,9,1.947689,1.966663,0.026392
9,GDPa1-001,10,1.900669,1.934618,0.029659


최근 mutation 빈도 feature

mutation 발생 여부는 decision == MUTATE 로 대체
(Core 2 정책 정의와 일치)

In [7]:
K_RECENT = 3  # 최근 step 범위

prediction_trace["is_mutation"] = (
    prediction_trace["decision"] == "MUTATE"
).astype(int)

prediction_trace["recent_mutation_count_k"] = (
    prediction_trace
    .groupby("antibody_key")["is_mutation"]
    .rolling(K_RECENT)
    .sum()
    .reset_index(level=0, drop=True)
)

prediction_trace[[
    "antibody_key",
    "step",
    "is_mutation",
    "recent_mutation_count_k"
]].head(10)

Unnamed: 0,antibody_key,step,is_mutation,recent_mutation_count_k
0,GDPa1-001,1,1,
1,GDPa1-001,2,0,
2,GDPa1-001,3,0,1.0
3,GDPa1-001,4,0,0.0
4,GDPa1-001,5,1,1.0
5,GDPa1-001,6,1,2.0
6,GDPa1-001,7,1,3.0
7,GDPa1-001,8,0,2.0
8,GDPa1-001,9,0,1.0
9,GDPa1-001,10,0,0.0


cooldown_left (정책 확장 대비, 현재는 0)

Core 2 baseline에서는 cooldown = 0
→ 구조만 유지

In [8]:
prediction_trace["cooldown_left"] = 0

prediction_trace[[
    "antibody_key",
    "step",
    "cooldown_left"
]].head()

Unnamed: 0,antibody_key,step,cooldown_left
0,GDPa1-001,1,0
1,GDPa1-001,2,0
2,GDPa1-001,3,0
3,GDPa1-001,4,0
4,GDPa1-001,5,0


In [9]:
FEATURE_COLUMNS = [
    "pred_score",
    "rolling_mean_pred_score",
    "rolling_std_pred_score",
    "recent_mutation_count_k",
    "cooldown_left"
]

TARGET_COLUMN = "toggle_event"

core3_ml_df = prediction_trace[[
    "antibody_key",
    "step",
    *FEATURE_COLUMNS,
    TARGET_COLUMN
]].copy()

core3_ml_df.head() # ML 입력용 feature 테이블 구성

Unnamed: 0,antibody_key,step,pred_score,rolling_mean_pred_score,rolling_std_pred_score,recent_mutation_count_k,cooldown_left,toggle_event
0,GDPa1-001,1,2.013943,,,,0,1
1,GDPa1-001,2,1.966444,,,,0,1
2,GDPa1-001,3,1.943947,1.974778,0.035734,1.0,0,0
3,GDPa1-001,4,1.916268,1.942219,0.025133,0.0,0,0
4,GDPa1-001,5,1.939915,1.933376,0.014953,1.0,0,1


In [None]:
print(core3_ml_df.isna().sum()) # 결측치 확인, drop 여부는 다음 Core에서 결정

antibody_key               0
step                       0
pred_score                 0
rolling_mean_pred_score    6
rolling_std_pred_score     6
recent_mutation_count_k    6
cooldown_left              0
toggle_event               0
dtype: int64


In [11]:
OUTPUT_DIR = Path("../artifact/core3")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

output_path = OUTPUT_DIR / "core3_ml_dataset.csv"
core3_ml_df.to_csv(output_path, index=False)

output_path

PosixPath('../artifact/core3/core3_ml_dataset.csv')