In [22]:
from pathlib import Path
import pandas as pd
import numpy as np

# Input CSV locations, given relative to the kernel's working directory.
# Each path is checked right after it is defined so that a missing file stops
# the notebook here rather than inside a later read.
POST_PATH = Path("../artifact/core10/core10_02_shutdown_snapshot.csv")
assert POST_PATH.exists(), f"POST not found: {POST_PATH}"

DEV_PATH = Path("../../data_csv/Antibody_Developability.csv")
assert DEV_PATH.exists(), f"DEV not found: {DEV_PATH}"

# Echo the resolved absolute locations so the output records which files were used.
print("POST:", POST_PATH.resolve())
print("DEV :", DEV_PATH.resolve())

POST: /Users/mac/Desktop/De/Developability_Data/core/artifact/core10/core10_02_shutdown_snapshot.csv
DEV : /Users/mac/Desktop/De/Developability_Data/data_csv/Antibody_Developability.csv


In [20]:
# Read the CSV at DEV_PATH and report its row count and column names.
dev = pd.read_csv(DEV_PATH)

print("dev rows:", dev.shape[0])
print("dev cols:", list(dev.columns))

# Last expression of the cell: rich preview of the first rows.
dev.head()

dev rows: 246
dev cols: ['antibody_id', 'antibody_name', 'vh_protein_sequence', 'vl_protein_sequence', 'light_aligned_aho', 'heavy_aligned_aho', 'hc_subtype', 'lc_subtype', 'hierarchical_cluster_IgG_isotype_stratified_fold']


Unnamed: 0,antibody_id,antibody_name,vh_protein_sequence,vl_protein_sequence,light_aligned_aho,heavy_aligned_aho,hc_subtype,lc_subtype,hierarchical_cluster_IgG_isotype_stratified_fold
0,GDPa1-001,abagovomab,QVKLQESGAELARPGASVKLSCKASGYTFTNYWMQWVKQRPGQGLD...,DIELTQSPASLSASVGETVTITCQASENIYSYLAWHQQKQGKSPQL...,DIELTQSPASLSASVGETVTITCQAS--ENIY------SYLAWHQQ...,QVKLQES-GAELARPGASVKLSCKASG-YTFTN-----YWMQWVKQ...,IgG1,Kappa,2
1,GDPa1-002,abituzumab,QVQLQQSGGELAKPGASVKVSCKASGYTFSSFWMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQDISNYLAWYQQKPGKAPKL...,DIQMTQSPSSLSASVGDRVTITCRAS--QDIS------NYLAWYQQ...,QVQLQQS-GGELAKPGASVKVSCKASG-YTFSS-----FWMHWVRQ...,IgG2,Kappa,0
2,GDPa1-003,abrezekimab,QVTLKESGPVLVKPTETLTLTCTVSGFSLTNYHVQWIRQPPGKALE...,DIQMTQSPSSLSASVGDRVTITCLASEDISNYLAWYQQKPGKAPKL...,DIQMTQSPSSLSASVGDRVTITCLAS--EDIS------NYLAWYQQ...,QVTLKES-GPVLVKPTETLTLTCTVSG-FSLTN-----YHVQWIRQ...,IgG4,Kappa,2
3,GDPa1-004,abrilumab,QVQLVQSGAEVKKPGASVKVSCKVSGYTLSDLSIHWVRQAPGKGLE...,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,DIQMTQSPSSVSASVGDRVTITCRAS--QGIS------SWLAWYQQ...,QVQLVQS-GAEVKKPGASVKVSCKVSG-YTLSD-----LSIHWVRQ...,IgG2,Kappa,0
4,GDPa1-005,adalimumab,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,DIQMTQSPSSLSASVGDRVTITCRAS--QGIR------NYLAWYQQ...,EVQLVES-GGGLVQPGRSLRLSCAASG-FTFDD-----YAMHWVRQ...,IgG1,Kappa,0


In [17]:
# shutdown_step distribution: per-case level vs. per-antibody level.
# NOTE(review): `df` is not defined in any cell visible here — presumably the
# snapshot read from POST_PATH; confirm it is loaded on a fresh kernel.
KEY_CASE = ["run_id", "case_id"]
KEY_AB = KEY_CASE + ["antibody_id"]

print("rows:", df.shape[0])
print("unique (run,case):", len(df.drop_duplicates(KEY_CASE)))
print("unique (run,case,antibody):", len(df.drop_duplicates(KEY_AB)))

# Diversity of shutdown_step per case: number of distinct values in each (run, case).
tmp = (
    df.groupby(KEY_CASE)["shutdown_step"]
    .nunique()
    .rename("n_unique_shutdown_step")
    .reset_index()
)
print(tmp)

rows: 9
unique (run,case): 1
unique (run,case,antibody): 1
                run_id     case_id  n_unique_shutdown_step
0  core7_04_1767776352  B_GOVERNED                       1


In [9]:
# Aggregate the step-level rows into one row per (run, case, antibody).
# NOTE(review): `df` is not defined in any cell visible here — confirm it is
# loaded before this cell on a fresh kernel.
rank_base = (
    df
    .groupby(["run_id", "case_id", "antibody_id"], as_index=False)
    .agg(
        soft_score=("soft_survivability_score", "mean"),
        max_step=("step", "max"),
        mean_soms=("SoMS_cumsum_window", "mean"),
        reentry_rate=("reentry_flag", "mean"),
    )
)

# mean_soms is scaled by its largest value over all rows of rank_base.
# If that maximum is exactly 0 the division would yield NaN/inf and propagate
# into final_rank_score, so the SoMS term contributes 0 in that case.
# (Assumes SoMS values are non-negative — TODO confirm.)
_soms_max = rank_base["mean_soms"].max()
_soms_scaled = rank_base["mean_soms"] / _soms_max if _soms_max != 0 else 0.0

# Risk penalty: equal-weight blend of the re-entry rate and the scaled SoMS.
rank_base["risk_penalty"] = 0.5 * rank_base["reentry_rate"] + 0.5 * _soms_scaled

# Risk-adjusted score: the penalty is subtracted, so a lower value means riskier.
rank_base["final_rank_score"] = (
    rank_base["soft_score"] - rank_base["risk_penalty"]
)

# Relative ranking among antibodies: best final_rank_score first within each (run, case).
rank_base = rank_base.sort_values(
    ["run_id", "case_id", "final_rank_score"],
    ascending=[True, True, False],
)

In [10]:
# Eligibility thresholds.
MIN_SOFT_SCORE = 0.1    # minimum soft score to say a candidate operationally "holds up"
MAX_REENTRY_RATE = 0.2  # candidates that re-enter more often than this are dropped

# A row is eligible only when it satisfies both thresholds.
rank_base["eligible"] = (
    rank_base["soft_score"].ge(MIN_SOFT_SCORE)
    & rank_base["reentry_rate"].le(MAX_REENTRY_RATE)
)

# Human-readable label derived from the eligibility flag.
rank_base["selection_status"] = np.where(rank_base["eligible"], "CANDIDATE", "REJECTED")

In [11]:
# Number of steps granted to a candidate before its next review.
TTL_STEPS = 5

# Candidates receive TTL_STEPS; every other row receives 0.
is_candidate = rank_base["selection_status"].eq("CANDIDATE")
rank_base["ttl_steps"] = np.where(is_candidate, TTL_STEPS, 0)

# Next review falls ttl_steps after max_step.
rank_base["next_review_step"] = rank_base["max_step"].add(rank_base["ttl_steps"])

In [12]:
# Work on a copy so the ranking table itself is left untouched.
final = rank_base.copy()

# Case-level verdict broadcast to every row of the case: SELECTABLE when at
# least one row in the (run, case) is eligible, otherwise NO_VALID_FALLBACK.
eligible_by_case = final.groupby(["run_id", "case_id"])["eligible"]
final["final_decision"] = eligible_by_case.transform(
    lambda flags: "SELECTABLE" if flags.any() else "NO_VALID_FALLBACK"
)

# NOTE(review): OUT_RANKING is not defined in any cell visible here — confirm
# it is set before this cell on a fresh kernel.
final.to_csv(OUT_RANKING, index=False)
print("Exported:", OUT_RANKING)

Exported: ../artifact/core10/core10_03c_candidate_ranking.csv


In [13]:
# Select the single top-ranked eligible row per (run, case).
# The ordering is applied here explicitly so that head(1) is the row with the
# highest final_rank_score regardless of the row order `final` arrives in;
# when `final` is already sorted this way the result is unchanged.
decision = (
    final[final["eligible"]]
    .sort_values(
        ["run_id", "case_id", "final_rank_score"],
        ascending=[True, True, False],
    )
    .groupby(["run_id", "case_id"], as_index=False)
    .head(1)   # top-ranked one per case
)

# NOTE(review): OUT_DECISION is not defined in any cell visible here — confirm
# it is set before this cell on a fresh kernel.
decision.to_csv(OUT_DECISION, index=False)
print("Exported:", OUT_DECISION)

Exported: ../artifact/core10/core10_03c_selection_decision.csv
