# Core10_04 — Survivability Baselines (Operational Usefulness)

목적
- 모델 성능이 아니라 **운영 유용성**(lead time / 안정성 / 비용)을 기준으로 survivability baseline 산출
- Core10_03d에서 생성된 fallback pool을 입력으로 사용
- 규칙 기반 점수를 주 계약으로 유지하고, 모델은 설명 가능한 보조 지표로만 사용

입력
- core10_03d_fallback_pool.csv
- Antibody_Developability.csv

출력
- core10_04_survivability_scores.csv

In [29]:
from pathlib import Path
import numpy as np
import pandas as pd
import re

ART_DIR = Path("../artifact/core10")
ART_DIR.mkdir(parents=True, exist_ok=True)

POOL_PATH = ART_DIR / "core10_03d_fallback_pool.csv"
DEV_PATH  = Path("../../data_csv/Antibody_Developability.csv")
OUT_PATH  = ART_DIR / "core10_04_survivability_scores.csv"

assert POOL_PATH.exists(), f"Missing: {POOL_PATH.resolve()}"
assert DEV_PATH.exists(),  f"Missing: {DEV_PATH.resolve()}"

print("POOL:", POOL_PATH)
print("DEV :", DEV_PATH)
print("OUT :", OUT_PATH)

POOL: ../artifact/core10/core10_03d_fallback_pool.csv
DEV : ../../data_csv/Antibody_Developability.csv
OUT : ../artifact/core10/core10_04_survivability_scores.csv


In [30]:
pool = pd.read_csv(POOL_PATH)
dev  = pd.read_csv(DEV_PATH)

print("pool rows:", len(pool))
print("dev  rows:", len(dev))

pool.head(3)

pool rows: 19
dev  rows: 246


Unnamed: 0,antibody_id,signature,proxy_survivability_score,tie_break_risk,core10_operational_risk,cluster_size,hc_subtype,lc_subtype,hierarchical_cluster_IgG_isotype_stratified_fold
0,GDPa1-060,IgG1|Kappa|0,0.858995,1.27875,0.156673,54,IgG1,Kappa,0
1,GDPa1-021,IgG1|Kappa|1,0.844606,1.27625,0.17266,49,IgG1,Kappa,1
2,GDPa1-085,IgG1|Kappa|4,0.844606,1.28125,0.17266,49,IgG1,Kappa,4


In [31]:
cand = pool.merge(
    dev,
    on="antibody_id",
    how="left",
    validate="one_to_one"
)

print("merged rows:", len(cand))
cand.head(3) # Merge Pool ↔ Developability DB

merged rows: 19


Unnamed: 0,antibody_id,signature,proxy_survivability_score,tie_break_risk,core10_operational_risk,cluster_size,hc_subtype_x,lc_subtype_x,hierarchical_cluster_IgG_isotype_stratified_fold_x,antibody_name,vh_protein_sequence,vl_protein_sequence,light_aligned_aho,heavy_aligned_aho,hc_subtype_y,lc_subtype_y,hierarchical_cluster_IgG_isotype_stratified_fold_y
0,GDPa1-060,IgG1|Kappa|0,0.858995,1.27875,0.156673,54,IgG1,Kappa,0,domagrozumab,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCKASQDVSTAVAWYQQKPGKAPKL...,DIQMTQSPSSLSASVGDRVTITCKAS--QDVS------TAVAWYQQ...,EVQLLES-GGGLVQPGGSLRLSCAASG-FTFSS-----YAMSWVRQ...,IgG1,Kappa,0
1,GDPa1-021,IgG1|Kappa|1,0.844606,1.27625,0.17266,49,IgG1,Kappa,1,bemarituzumab,QVQLVQSGAEVKKPGSSVKVSCKASGYIFTTYNVHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCKASQGVSNDVAWYQQKPGKAPKL...,DIQMTQSPSSLSASVGDRVTITCKAS--QGVS------NDVAWYQQ...,QVQLVQS-GAEVKKPGSSVKVSCKASG-YIFTT-----YNVHWVRQ...,IgG1,Kappa,1
2,GDPa1-085,IgG1|Kappa|4,0.844606,1.28125,0.17266,49,IgG1,Kappa,4,ficlatuzumab,QVQLVQPGAEVKKPGTSVKLSCKASGYTFTTYWMHWVRQAPGQGLE...,DIVMTQSPDSLAMSLGERVTLNCKASENVVSYVSWYQQKPGQSPKL...,DIVMTQSPDSLAMSLGERVTLNCKAS--ENVV------SYVSWYQQ...,QVQLVQP-GAEVKKPGTSVKLSCKASG-YTFTT-----YWMHWVRQ...,IgG1,Kappa,4


In [32]:
GLYCO_PATTERN = re.compile(r"N[^P][ST]")

KD = {
    "A": 1.8, "C": 2.5, "D": -3.5, "E": -3.5, "F": 2.8,
    "G": -0.4, "H": -3.2, "I": 4.5, "K": -3.9, "L": 3.8,
    "M": 1.9, "N": -3.5, "P": -1.6, "Q": -3.5, "R": -4.5,
    "S": -0.8, "T": -0.7, "V": 4.2, "W": -0.9, "Y": -1.3,
}

def safe(seq):
    return "" if pd.isna(seq) else str(seq)

def glyco_count(seq):
    return len(GLYCO_PATTERN.findall(safe(seq)))

def cys_count(seq):
    return safe(seq).count("C")

def seq_len(seq):
    return len(safe(seq))

def kd_mean(seq):
    vals = [KD[c] for c in safe(seq) if c in KD]
    return float(np.mean(vals)) if vals else 0.0

def kd_p90(seq):
    vals = [KD[c] for c in safe(seq) if c in KD]
    return float(np.percentile(vals, 90)) if vals else 0.0 # Sequence Utility Functions

In [33]:
# glycosylation
cand["glyco_total"] = (
    cand["vh_protein_sequence"].apply(glyco_count) +
    cand["vl_protein_sequence"].apply(glyco_count)
)

# cysteine
cand["cys_total"] = (
    cand["vh_protein_sequence"].apply(cys_count) +
    cand["vl_protein_sequence"].apply(cys_count)
)

# length
cand["seq_len_total"] = (
    cand["vh_protein_sequence"].apply(seq_len) +
    cand["vl_protein_sequence"].apply(seq_len)
)

# hydrophobicity
cand["hydro_mean"] = (
    cand["vh_protein_sequence"].apply(kd_mean) +
    cand["vl_protein_sequence"].apply(kd_mean)
)

cand["hydro_p90"] = (
    cand["vh_protein_sequence"].apply(kd_p90) +
    cand["vl_protein_sequence"].apply(kd_p90)
)

cand[[
    "antibody_id","glyco_total","cys_total",
    "seq_len_total","hydro_mean","hydro_p90"
]].head() # Deterministic Feature Engineering

Unnamed: 0,antibody_id,glyco_total,cys_total,seq_len_total,hydro_mean,hydro_p90
0,GDPa1-060,0,4,223,-0.637987,7.76
1,GDPa1-021,0,4,221,-0.840761,8.16
2,GDPa1-085,0,4,225,-0.870149,8.12
3,GDPa1-025,0,4,226,-0.607701,8.0
4,GDPa1-165,0,4,229,-0.575714,8.0


In [34]:
# canonical antibody: cys count는 분해력 없음
# anomaly flag만 사용
CYS_LOW, CYS_HIGH = 8, 10

cand["cys_anomaly"] = (
    (cand["cys_total"] < CYS_LOW) |
    (cand["cys_total"] > CYS_HIGH)
).astype(int)

cand["risk_cys"] = cand["cys_anomaly"]

cand[["antibody_id","cys_total","cys_anomaly"]].head() # Cys 처리 

Unnamed: 0,antibody_id,cys_total,cys_anomaly
0,GDPa1-060,4,1
1,GDPa1-021,4,1
2,GDPa1-085,4,1
3,GDPa1-025,4,1
4,GDPa1-165,4,1


In [35]:
# 1) hc_subtype 복구 (y 우선, 없으면 x)
if "hc_subtype_y" in cand.columns:
    cand["hc_subtype"] = cand["hc_subtype_y"]
elif "hc_subtype_x" in cand.columns:
    cand["hc_subtype"] = cand["hc_subtype_x"]
else:
    raise KeyError("No hc_subtype column found (_x or _y)")

# 2) lc_subtype 복구
if "lc_subtype_y" in cand.columns:
    cand["lc_subtype"] = cand["lc_subtype_y"]
elif "lc_subtype_x" in cand.columns:
    cand["lc_subtype"] = cand["lc_subtype_x"]

# 3) cluster id 복구
if "hierarchical_cluster_IgG_isotype_stratified_fold_y" in cand.columns:
    cand["hierarchical_cluster_IgG_isotype_stratified_fold"] = \
        cand["hierarchical_cluster_IgG_isotype_stratified_fold_y"]
elif "hierarchical_cluster_IgG_isotype_stratified_fold_x" in cand.columns:
    cand["hierarchical_cluster_IgG_isotype_stratified_fold"] = \
        cand["hierarchical_cluster_IgG_isotype_stratified_fold_x"]

# 4) *_x, *_y 컬럼 제거 (완전 정리)
cand = cand.drop(
    columns=[c for c in cand.columns if c.endswith("_x") or c.endswith("_y")],
    errors="ignore"
)

print("✅ normalized columns:")
print(cand.columns.tolist())

✅ normalized columns:
['antibody_id', 'signature', 'proxy_survivability_score', 'tie_break_risk', 'core10_operational_risk', 'cluster_size', 'antibody_name', 'vh_protein_sequence', 'vl_protein_sequence', 'light_aligned_aho', 'heavy_aligned_aho', 'glyco_total', 'cys_total', 'seq_len_total', 'hydro_mean', 'hydro_p90', 'cys_anomaly', 'risk_cys', 'hc_subtype', 'lc_subtype', 'hierarchical_cluster_IgG_isotype_stratified_fold']


In [36]:
CONTEXT_PENALTY = {
    "IgG1": 0.0,
    "IgG4": 0.05,
    "IgG2": 0.15,
    "IgG3": 0.30
}

cand["context_penalty"] = (
    cand["hc_subtype"]
    .map(CONTEXT_PENALTY)
    .fillna(0.2)
)

cand[["antibody_id", "hc_subtype", "context_penalty"]].head()

Unnamed: 0,antibody_id,hc_subtype,context_penalty
0,GDPa1-060,IgG1,0.0
1,GDPa1-021,IgG1,0.0
2,GDPa1-085,IgG1,0.0
3,GDPa1-025,IgG1,0.0
4,GDPa1-165,IgG1,0.0


In [37]:
def norm01(s):
    s = pd.to_numeric(s, errors="coerce").fillna(0.0)
    return s / s.max() if s.max() > 0 else s

cand["risk_glyco"] = norm01(cand["glyco_total"])
cand["risk_len"]   = norm01(cand["seq_len_total"])
cand["risk_hydro"] = norm01(norm01(cand["hydro_mean"]) + norm01(cand["hydro_p90"]))

cand["tie_break_risk"] = (
    0.35 * cand["risk_glyco"] +
    0.20 * cand["risk_len"] +
    0.25 * cand["risk_hydro"] +
    0.20 * norm01(cand["context_penalty"]) +
    1.00 * cand["risk_cys"]   # anomaly 강한 패널티
)

cand[["antibody_id","tie_break_risk"]].head() # Normalization + Tie-break Risk

Unnamed: 0,antibody_id,tie_break_risk
0,GDPa1-060,1.308744
1,GDPa1-021,1.244671
2,GDPa1-085,1.234429
3,GDPa1-025,1.335
4,GDPa1-165,1.350459


In [38]:
cand["rule_risk"] = (
    0.7 * cand["core10_operational_risk"] +
    0.3 * norm01(cand["tie_break_risk"])
)

cand["rule_survivability_score"] = (1.0 - cand["rule_risk"]).clip(0, 1)

def hazard(x):
    if x >= 0.75: return "LOW"
    if x >= 0.50: return "MID"
    return "HIGH"

cand["hazard_rule"] = cand["rule_survivability_score"].map(hazard)

cand[["antibody_id","rule_survivability_score","hazard_rule"]].head() # Rule-based Survivability

Unnamed: 0,antibody_id,rule_survivability_score,hazard_rule
0,GDPa1-060,0.65053,MID
1,GDPa1-021,0.651079,MID
2,GDPa1-085,0.652956,MID
3,GDPa1-025,0.634528,MID
4,GDPa1-165,0.631695,MID


In [39]:
q = cand["core10_operational_risk"].quantile(0.30)

cand["pseudo_safe"] = (
    (cand["core10_operational_risk"] <= q) &
    (cand["cys_anomaly"] == 0) &
    (cand["risk_hydro"] <= cand["risk_hydro"].quantile(0.6))
).astype(int)

cand["pseudo_safe"].value_counts() # Pseudo Label

pseudo_safe
0    19
Name: count, dtype: int64

In [41]:
# ---------- 0. 방어적 feature 생성 ----------

def norm01(s):
    s = pd.to_numeric(s, errors="coerce").fillna(0.0)
    return s / s.max() if s.max() > 0 else s

# glyco / len / hydro는 cand에 이미 있음
cand["risk_glyco"] = norm01(cand["glyco_total"])
cand["risk_len"]   = norm01(cand["seq_len_total"])
cand["risk_hydro"] = norm01(
    norm01(cand["hydro_mean"]) + norm01(cand["hydro_p90"])
)

# context_penalty가 없다면 hc_subtype으로 복구
if "context_penalty" not in cand.columns:
    CONTEXT_PENALTY = {
        "IgG1": 0.0,
        "IgG4": 0.05,
        "IgG2": 0.15,
        "IgG3": 0.30
    }
    cand["context_penalty"] = (
        cand["hc_subtype"]
        .map(CONTEXT_PENALTY)
        .fillna(0.2)
    )

# pseudo label (Core10 gate 통과 여부)
if "pseudo_safe" not in cand.columns:
    cand["pseudo_safe"] = (cand["proxy_survivability_score"] > 0).astype(int)

cand[[
    "antibody_id",
    "core10_operational_risk",
    "tie_break_risk",
    "risk_glyco",
    "risk_len",
    "risk_hydro",
    "risk_cys",
    "context_penalty",
    "pseudo_safe"
]].head()

Unnamed: 0,antibody_id,core10_operational_risk,tie_break_risk,risk_glyco,risk_len,risk_hydro,risk_cys,context_penalty,pseudo_safe
0,GDPa1-060,0.156673,1.308744,0,0.969565,0.459325,1,0.0,0
1,GDPa1-021,0.17266,1.244671,0,0.96087,0.209987,1,0.0,0
2,GDPa1-085,0.17266,1.234429,0,0.978261,0.155107,1,0.0,0
3,GDPa1-025,0.17266,1.335,0,0.982609,0.553912,1,0.0,0
4,GDPa1-165,0.17266,1.350459,0,0.995652,0.605316,1,0.0,0


In [None]:
# 하위 30%를 safe로 간주 (조절 가능: 0.2 ~ 0.4)
SAFE_QUANTILE = 0.30

threshold = cand["core10_operational_risk"].quantile(SAFE_QUANTILE)

cand["pseudo_safe_soft"] = (
    cand["core10_operational_risk"] <= threshold
).astype(int)

# 분포 확인 (반드시 0과 1 둘 다 있어야 함)
cand["pseudo_safe_soft"].value_counts() #  Soft pseudo-label 생성

pseudo_safe_soft
0    13
1     6
Name: count, dtype: int64

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

FEATURES = [
    "core10_operational_risk",
    "tie_break_risk",
    "context_penalty",
    "risk_glyco",
    "risk_len",
    "risk_hydro",
    "risk_cys",
    "cluster_size"
]

X = cand[FEATURES].fillna(0.0)
y = cand["pseudo_safe_soft"]   # ✅ soft pseudo-label 사용

logit = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        solver="lbfgs"
    ))
])

logit.fit(X, y) # Logistic Regression 

0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [49]:
print("\nFEATURE sanity:")
for c in FEATURES:
    s = pd.to_numeric(cand[c], errors="coerce")
    nun = s.nunique(dropna=True)
    nanr = s.isna().mean()
    print(f"- {c:25s} nunique={nun:4d}  nan_rate={nanr:.3f}")

# Logistic 기반 survivability probability
cand["logit_safe_prob"] = logit.predict_proba(X)[:, 1]

# Core10 hazard 정의 (확률 → 위험)
cand["hazard_logit"] = 1.0 - cand["logit_safe_prob"]

cand[[
    "antibody_id",
    "pseudo_safe_soft",
    "logit_safe_prob",
    "hazard_logit"
]].head(10) # Survivability / Hazard 산출


FEATURE sanity:
- core10_operational_risk   nunique=  12  nan_rate=0.000
- tie_break_risk            nunique=  19  nan_rate=0.000
- context_penalty           nunique=   3  nan_rate=0.000
- risk_glyco                nunique=   1  nan_rate=0.000
- risk_len                  nunique=   9  nan_rate=0.000
- risk_hydro                nunique=  19  nan_rate=0.000
- risk_cys                  nunique=   1  nan_rate=0.000
- cluster_size              nunique=   4  nan_rate=0.000


Unnamed: 0,antibody_id,pseudo_safe_soft,logit_safe_prob,hazard_logit
0,GDPa1-060,1,0.87386,0.12614
1,GDPa1-021,1,0.842986,0.157014
2,GDPa1-085,1,0.919651,0.080349
3,GDPa1-025,1,0.776621,0.223379
4,GDPa1-165,1,0.818172,0.181828
5,GDPa1-017,1,0.938738,0.061262
6,GDPa1-010,0,0.531072,0.468928
7,GDPa1-138,0,0.457319,0.542681
8,GDPa1-039,0,0.040565,0.959435
9,GDPa1-050,0,0.132322,0.867678


In [None]:
cand[[
    "antibody_id",
    "core10_operational_risk",
    "hazard_logit",
    "tie_break_risk"
]].sort_values("hazard_logit").head(10) # Rule vs Logistic 비교 체크

Unnamed: 0,antibody_id,core10_operational_risk,hazard_logit,tie_break_risk
5,GDPa1-017,0.176257,0.061262,1.154242
2,GDPa1-085,0.17266,0.080349,1.234429
0,GDPa1-060,0.156673,0.12614,1.308744
1,GDPa1-021,0.17266,0.157014,1.244671
4,GDPa1-165,0.17266,0.181828,1.350459
3,GDPa1-025,0.17266,0.223379,1.335
6,GDPa1-010,0.18392,0.468928,1.336067
7,GDPa1-138,0.549561,0.542681,1.335473
11,GDPa1-229,0.569145,0.656668,1.391663
9,GDPa1-050,0.565548,0.867678,1.426338


In [50]:
from sklearn.tree import DecisionTreeClassifier

# X, y 준비
X = cand[FEATURES].fillna(0.0)
y = cand["pseudo_safe_soft"]

# ✅ 후보 수에 따라 leaf 자동 조절 (너무 크면 split이 막힘)
n = len(cand)
min_leaf = max(5, int(0.03 * n))   # 대략 3% (최소 5)
min_split = max(2*min_leaf, 10)

tree = DecisionTreeClassifier(
    max_depth=4,                # 기존 3 -> 4로 약간 완화
    min_samples_leaf=min_leaf,  # 자동
    min_samples_split=min_split,
    class_weight="balanced",
    random_state=42
)

tree.fit(X, y)

print("tree params:", {
    "n": n,
    "max_depth": tree.get_depth(),
    "n_leaves": tree.get_n_leaves(),
    "min_samples_leaf": min_leaf,
    "min_samples_split": min_split
})

tree params: {'n': 19, 'max_depth': 1, 'n_leaves': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}


In [51]:
# 확률 산출
cand["tree_safe_prob"] = tree.predict_proba(X)[:, 1]

# hazard가 문자열 반환이면 그대로, 아니면 1-p로 위험도로 써도 됨
cand["hazard_tree"] = cand["tree_safe_prob"].map(hazard)

# ✅ 0.5로 쏠리는지 체크
print("tree_safe_prob describe:\n", cand["tree_safe_prob"].describe())
print("\nmost common probs:\n", cand["tree_safe_prob"].value_counts().head(10))

cand[["antibody_id", "tree_safe_prob", "hazard_tree"]].head(10)

tree_safe_prob describe:
 count    19.000000
mean      0.315789
std       0.477567
min       0.000000
25%       0.000000
50%       0.000000
75%       1.000000
max       1.000000
Name: tree_safe_prob, dtype: float64

most common probs:
 tree_safe_prob
0.0    13
1.0     6
Name: count, dtype: int64


Unnamed: 0,antibody_id,tree_safe_prob,hazard_tree
0,GDPa1-060,1.0,LOW
1,GDPa1-021,1.0,LOW
2,GDPa1-085,1.0,LOW
3,GDPa1-025,1.0,LOW
4,GDPa1-165,1.0,LOW
5,GDPa1-017,1.0,LOW
6,GDPa1-010,0.0,HIGH
7,GDPa1-138,0.0,HIGH
8,GDPa1-039,0.0,HIGH
9,GDPa1-050,0.0,HIGH


In [52]:
# 리프별 샘플 수/클래스 비율 확인
leaf_id = tree.apply(X)
tmp = cand.assign(leaf_id=leaf_id)

leaf_stats = (
    tmp.groupby("leaf_id")["pseudo_safe_soft"]
       .agg(n="size", pos_rate="mean")
       .sort_values(["n"], ascending=False)
)

leaf_stats.head(20)

Unnamed: 0_level_0,n,pos_rate
leaf_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,13,0.0
1,6,1.0


In [56]:
if "hc_subtype_y" in cand.columns:
    cand["hc_subtype"] = cand["hc_subtype_y"]
elif "hc_subtype_x" in cand.columns:
    cand["hc_subtype"] = cand["hc_subtype_x"]

if "lc_subtype_y" in cand.columns:
    cand["lc_subtype"] = cand["lc_subtype_y"]
elif "lc_subtype_x" in cand.columns:
    cand["lc_subtype"] = cand["lc_subtype_x"]

# 불필요한 중복 컬럼 제거
cand = cand.drop(
    columns=[c for c in cand.columns if c.endswith("_x") or c.endswith("_y")],
    errors="ignore"
)

print("Resolved subtype columns:")
print(cand[["hc_subtype", "lc_subtype"]].head())

Resolved subtype columns:
  hc_subtype lc_subtype
0       IgG1      Kappa
1       IgG1      Kappa
2       IgG1      Kappa
3       IgG1     Lambda
4       IgG1     Lambda


In [57]:
EXPORT_COLS = [
    "antibody_id",
    "signature",

    # Core10 rule-based
    "core10_operational_risk",
    "proxy_survivability_score",

    # Tie-break
    "tie_break_risk",

    # Logistic baseline
    "logit_safe_prob",
    "hazard_logit",

    # Tree baseline
    "tree_safe_prob",
    "hazard_tree",

    # Interpretation helpers
    "hc_subtype",
    "lc_subtype",
    "cluster_size",
]

In [58]:
OUT_PATH = Path("../artifact/core10/core10_04_survivability_scores.csv")

cand[EXPORT_COLS] \
    .sort_values(
        ["proxy_survivability_score", "tie_break_risk"],
        ascending=[False, True]
    ) \
    .to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH.resolve())

Saved: /Users/mac/Desktop/De/Developability_Data/core/artifact/core10/core10_04_survivability_scores.csv
