In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Root directory of the inputs; each antibody's CSVs are read from BASE_PATH / <antibody id>.
BASE_PATH = Path("../../data_csv/curieus")
# Zero-padded antibody ids; used verbatim as the sub-directory name and as the file-name prefix.
ANTIBODIES = ["001", "045", "183"]


# File-name suffixes appended directly to the antibody id (e.g. "001" + PRED_SUFFIX).
PRED_SUFFIX = "Prediction_Result.csv"
VISCO_SUFFIX = "DeepViscosity_classes.csv"

# NOTE(review): OUT_DIR is not referenced by any code visible in this notebook, and the two
# output paths below are relative, so the CSVs are written to the current working directory.
# Confirm whether they were meant to live under OUT_DIR.
OUT_DIR = Path("../../data_csv/curieus") 
OUT_CORE6_TRACE = Path("core6_state_trace.csv")
OUT_DIAG = Path("core6_run_diagnostics.csv")


def load_curieus_outputs(ab_id):
    """Read the prediction and viscosity CSVs for one antibody.

    The files are looked up under ``BASE_PATH / ab_id`` and are named
    ``<ab_id><PRED_SUFFIX>`` and ``<ab_id><VISCO_SUFFIX>``.

    Args:
        ab_id: antibody id string (e.g. ``"001"``); used as both the
            directory name and the file-name prefix.

    Returns:
        ``(pred, visco)`` - two DataFrames, each with an extra
        ``antibody_key`` column set to ``ab_id``.

    Raises:
        Whatever ``pd.read_csv`` raises; there is no error handling here.

    Note:
        A function with the same name is defined again in a later cell;
        once that cell runs it replaces this definition.
    """
    base = BASE_PATH / ab_id

    # Build the names from the shared suffix constants instead of repeating
    # the literals, so a renamed output file only has to be changed once.
    pred = pd.read_csv(base / f"{ab_id}{PRED_SUFFIX}")
    visco = pd.read_csv(base / f"{ab_id}{VISCO_SUFFIX}")

    pred["antibody_key"] = ab_id
    visco["antibody_key"] = ab_id

    return pred, visco

# Load every configured antibody once; each value is the (pred, visco) pair
# returned by load_curieus_outputs, keyed by antibody id.
raw_events = {ab: load_curieus_outputs(ab) for ab in ANTIBODIES}

- 001은 baseline
- 045 / 183만 state trace 후보

In [2]:
def _safe_read_csv(path: Path) -> pd.DataFrame:
    """Read a CSV without raising on a missing or unreadable file.

    Args:
        path: location of the CSV file.

    Returns:
        The parsed DataFrame on success. Otherwise a one-row DataFrame whose
        only column, ``__read_error__``, holds either ``missing:<file name>``
        or ``read_fail:<file name>:<exception type>:<message>``.
    """

    def _error_frame(reason: str) -> pd.DataFrame:
        # Single-row marker frame that callers detect via the column name.
        return pd.DataFrame({"__read_error__": [reason]})

    if path.exists():
        try:
            # A header-only file parses fine and simply yields an empty
            # frame (df.empty is True); that is returned as-is.
            return pd.read_csv(path)
        except Exception as exc:
            return _error_frame(f"read_fail:{path.name}:{type(exc).__name__}:{exc}")

    return _error_frame(f"missing:{path.name}")

def load_curieus_outputs(ab_id: str):
    """Load the prediction and viscosity tables for one antibody, never raising on bad files.

    Both files live in ``BASE_PATH / ab_id`` and are named ``<ab_id><PRED_SUFFIX>``
    and ``<ab_id><VISCO_SUFFIX>``. Each is read through ``_safe_read_csv``, so a
    missing or unreadable file comes back as a ``__read_error__`` marker frame.

    Args:
        ab_id: antibody id string, used as directory name and file-name prefix.

    Returns:
        ``(pred, visco)`` - each frame is tagged with ``antibody_key`` (the id)
        and ``__source_file__`` (the file name it was read from).
    """
    folder = BASE_PATH / ab_id
    sources = (
        folder / f"{ab_id}{PRED_SUFFIX}",
        folder / f"{ab_id}{VISCO_SUFFIX}",
    )

    tagged = []
    for src in sources:
        frame = _safe_read_csv(src)
        # Provenance columns are added to error-marker frames as well.
        frame["antibody_key"] = ab_id
        frame["__source_file__"] = src.name
        tagged.append(frame)

    pred, visco = tagged
    return pred, visco

raw_events = {}
for ab in ANTIBODIES:
    raw_events[ab] = load_curieus_outputs(ab)

# Quick check: row count and the first ten column names of each table.
for ab, frames in raw_events.items():
    pred, visco = frames
    print(f"[{ab}] pred_rows={len(pred)} pred_cols={list(pred.columns)[:10]}")
    print(f"     visco_rows={len(visco)} visco_cols={list(visco.columns)[:10]}")

[001] pred_rows=1 pred_cols=['Name', 'ACSINS_transformed', 'AS', 'BVP', 'CIC_transformed', 'CSI_transformed', 'ELISA', 'HIC', 'HEK', 'PSR']
     visco_rows=1 visco_cols=['Name', 'Prob_Mean', 'Prob_Std', 'DeepViscosity_classes', 'antibody_key', '__source_file__']
[045] pred_rows=1 pred_cols=['Name', 'ACSINS_transformed', 'AS', 'BVP', 'CIC_transformed', 'CSI_transformed', 'ELISA', 'HIC', 'HEK', 'PSR']
     visco_rows=1 visco_cols=['Name', 'Prob_Mean', 'Prob_Std', 'DeepViscosity_classes', 'antibody_key', '__source_file__']
[183] pred_rows=1 pred_cols=['Name', 'ACSINS_transformed', 'AS', 'BVP', 'CIC_transformed', 'CSI_transformed', 'ELISA', 'HIC', 'HEK', 'PSR']
     visco_rows=1 visco_cols=['Name', 'Prob_Mean', 'Prob_Std', 'DeepViscosity_classes', 'antibody_key', '__source_file__']


In [3]:
def summarize_df(df: pd.DataFrame, kind: str):
    """Build a one-row diagnostic record describing a loaded table.

    Args:
        df: table to describe (may be a ``__read_error__`` marker frame).
        kind: free-form label stored unchanged under ``"kind"``.

    Returns:
        dict with keys ``kind``, ``rows``, ``has_step``, ``has_read_error``,
        ``columns`` (column names joined with ``|``) and ``read_error``
        (first ``__read_error__`` value as a string, or ``""`` when the column
        is absent or the frame has no rows).
    """
    column_names = [*df.columns]
    row_count = len(df)
    step_present = "step" in column_names
    error_present = "__read_error__" in column_names

    # Only look at the error cell when there is a row to read it from.
    if error_present and row_count > 0:
        error_text = str(df["__read_error__"].iloc[0])
    else:
        error_text = ""

    record = {}
    record["kind"] = kind
    record["rows"] = row_count
    record["has_step"] = bool(step_present)
    record["has_read_error"] = bool(error_present)
    record["columns"] = "|".join(column_names)
    record["read_error"] = error_text
    return record

# One diagnostic record per (antibody, file kind); prediction first, then viscosity.
diag_rows = []
for ab, (pred, visco) in raw_events.items():
    for frame, kind in ((pred, "Prediction_Result"), (visco, "DeepViscosity_classes")):
        record = summarize_df(frame, kind)
        record["antibody_key"] = ab
        diag_rows.append(record)

diag = pd.DataFrame(diag_rows)

# Criterion for an "eventful" run (False is the expected result for the current data).
def is_eventful_run(pred_df: pd.DataFrame) -> bool:
    """Decide whether a prediction table describes more than a single snapshot.

    Args:
        pred_df: prediction table, possibly a ``__read_error__`` marker frame.

    Returns:
        ``False`` for a read-error frame; otherwise ``True`` when a ``step``
        column exists or the table has more than one row.
    """
    columns = pred_df.columns
    if "__read_error__" in columns:
        return False
    # Even without a step column, several rows can be treated as events.
    return "step" in columns or len(pred_df) > 1

# Only the prediction table (first element of each pair) decides the flag.
eventful_flags = {ab: is_eventful_run(raw_events[ab][0]) for ab in raw_events}

print("eventful_flags:", eventful_flags)
# Per-file diagnostics: the evidence for why no event trace was produced.
diag

eventful_flags: {'001': False, '045': False, '183': False}


Unnamed: 0,kind,rows,has_step,has_read_error,columns,read_error,antibody_key
0,Prediction_Result,1,False,False,Name|ACSINS_transformed|AS|BVP|CIC_transformed...,,1
1,DeepViscosity_classes,1,False,False,Name|Prob_Mean|Prob_Std|DeepViscosity_classes|...,,1
2,Prediction_Result,1,False,False,Name|ACSINS_transformed|AS|BVP|CIC_transformed...,,45
3,DeepViscosity_classes,1,False,False,Name|Prob_Mean|Prob_Std|DeepViscosity_classes|...,,45
4,Prediction_Result,1,False,False,Name|ACSINS_transformed|AS|BVP|CIC_transformed...,,183
5,DeepViscosity_classes,1,False,False,Name|Prob_Mean|Prob_Std|DeepViscosity_classes|...,,183


In [4]:
# Columns (required) that give every Core6 event table a minimal common format.
CORE_META_COLS = ["antibody_key", "step", "mutation_applied"]

# Bookkeeping columns to leave out when picking the prediction-metric columns of pred.
EXCLUDE_COLS = {"antibody_key", "__source_file__", "__read_error__"}

def normalize_prediction_as_event(pred_df: pd.DataFrame) -> pd.DataFrame:
    """Bring a prediction table into the common event-table layout.

    Args:
        pred_df: prediction table as loaded, possibly a ``__read_error__``
            marker frame. It is copied, not modified.

    Returns:
        * Read-error input: a fresh one-row frame with ``antibody_key``
          (``None`` if unavailable), ``step=0``, ``mutation_applied=False`` and
          ``__event_note__`` carrying the error text (``"read_error_empty"``
          when the input has no rows).
        * Input without a ``step`` column: the copy with ``step=0``,
          ``mutation_applied=False`` and note ``"no_step_single_row_run"``.
        * Input with a ``step`` column: the copy with ``mutation_applied=True``
          on every row and note ``"has_step_run"``.

        In the last two cases every column outside ``EXCLUDE_COLS``,
        ``CORE_META_COLS`` and ``__event_note__`` is passed through
        ``pd.to_numeric(..., errors="coerce")``.

    NOTE(review): because of that coercion, text columns that are not listed
    as meta (e.g. ``Name``) end up as NaN in the result; confirm this is
    acceptable before relying on them downstream.
    """
    events = pred_df.copy()
    present = events.columns
    n_rows = len(events)

    # Read failure / missing file: no real event can be built, keep a stub row.
    if "__read_error__" in present:
        key = events["antibody_key"].iloc[0] if ("antibody_key" in present and n_rows) else None
        note = events["__read_error__"].iloc[0] if n_rows else "read_error_empty"
        return pd.DataFrame({
            "antibody_key": [key],
            "step": [0],
            "mutation_applied": [False],
            "__event_note__": [note],
        })

    if "step" in present:
        events["mutation_applied"] = True
        events["__event_note__"] = "has_step_run"
    else:
        # No step information: force a single step-0 event.
        events["step"] = 0
        events["mutation_applied"] = False
        events["__event_note__"] = "no_step_single_row_run"

    # Column cleanup: prediction candidates are kept numeric only.
    skip = set(EXCLUDE_COLS) | set(CORE_META_COLS) | {"__event_note__"}
    metric_cols = [name for name in events.columns if name not in skip]
    for name in metric_cols:
        events[name] = pd.to_numeric(events[name], errors="coerce")

    return events

# Only the prediction table (first element of each pair) becomes an event table.
event_tables = {
    ab: normalize_prediction_as_event(frames[0])
    for ab, frames in raw_events.items()
}

# Event-table standardisation check (there is no step column at the moment,
# so only a forced step=0 row is produced).
preview_cols = ["antibody_key", "step", "mutation_applied", "__event_note__"]
for ab, df in event_tables.items():
    print(ab, df[preview_cols].head(3).to_dict("records"))

001 [{'antibody_key': '001', 'step': 0, 'mutation_applied': False, '__event_note__': 'no_step_single_row_run'}]
045 [{'antibody_key': '045', 'step': 0, 'mutation_applied': False, '__event_note__': 'no_step_single_row_run'}]
183 [{'antibody_key': '183', 'step': 0, 'mutation_applied': False, '__event_note__': 'no_step_single_row_run'}]


In [5]:
def compute_state_trace(event_df: pd.DataFrame) -> pd.DataFrame:
    """Accumulate an event table into a per-step state trace with SoD / SoMS.

    Rows are ordered by ``step`` and these columns are added:

    * ``mutation_count`` - running total of ``mutation_applied``
      (missing values count as ``False``).
    * ``pred_variance`` - sample variance across the numeric prediction
      columns of each row; ``0.0`` when there are no such columns or the
      variance is undefined.
    * ``SoD`` - ``mutation_count + 0.5 * rolling mean (window 3) of pred_variance``.
    * ``SoMS`` - rolling mean (window 2) of ``SoD``.

    Args:
        event_df: event table with at least ``step`` and ``mutation_applied``.
            It is not modified.

    Returns:
        A new DataFrame sorted by ``step`` with a fresh 0..n-1 index and the
        four columns above appended.

    Raises:
        KeyError: if ``step`` or ``mutation_applied`` is missing.

    NOTE(review): ``pred_variance`` mixes metrics of very different magnitude
    (the recorded runs have HEK around 150 next to PSR around 0.4), so it is
    dominated by the largest ones and every recorded run lands far above the
    governance thresholds. Confirm whether metrics should be put on a common
    scale first.
    """
    df = event_df.sort_values("step").reset_index(drop=True)

    # Cumulative number of mutations (all False -> stays 0).
    df["mutation_count"] = df["mutation_applied"].fillna(False).astype(int).cumsum()

    # Prediction metrics = everything that is not bookkeeping ...
    meta = {
        "antibody_key", "step", "mutation_applied", "__source_file__",
        "__event_note__", "__read_error__", "mutation_count",
    }
    candidates = [c for c in df.columns if c not in meta]
    # ... and is actually numeric. Without this filter a leftover text column
    # makes the row-wise variance fail (or be silently dropped, depending on
    # the pandas version) instead of being ignored.
    pred_cols = df[candidates].select_dtypes(include=["number", "bool"]).columns.tolist()

    if not pred_cols:
        # Nothing numeric to measure (e.g. read-error stub): variance is 0.
        df["pred_variance"] = 0.0
    else:
        # Row-wise variance; undefined values (single usable metric, all NaN) become 0.
        df["pred_variance"] = df[pred_cols].var(axis=1).fillna(0.0)

    # SoD / SoMS: deliberately simple definitions, the goal is only to record the run.
    df["SoD"] = (1.0 * df["mutation_count"]) + (0.5 * df["pred_variance"].rolling(3, min_periods=1).mean())
    df["SoMS"] = df["SoD"].rolling(2, min_periods=1).mean()

    return df

# Event -> state accumulation. With only a single step=0 row per antibody,
# SoD and SoMS each collapse to a single value, which is expected for now.
state_traces = {}
for ab, df in event_tables.items():
    state_traces[ab] = compute_state_trace(df)

In [6]:
def governance_label(sod: float, warning_threshold: float = 1.0, critical_threshold: float = 2.5) -> str:
    """Translate a SoD score into a governance signal.

    Args:
        sod: score to classify.
        warning_threshold: scores strictly below this are ``"NORMAL"``.
            Defaults to 1.0.
        critical_threshold: scores from ``warning_threshold`` up to, but not
            including, this value are ``"WARNING"``. Defaults to 2.5.

    Returns:
        ``"NORMAL"``, ``"WARNING"`` or ``"CRITICAL"``.

    Note:
        A NaN score compares false against both thresholds and therefore
        falls through to ``"CRITICAL"``.
    """
    if sod < warning_threshold:
        return "NORMAL"
    if sod < critical_threshold:
        return "WARNING"
    return "CRITICAL"

# Label every trace row in place from its SoD score.
for df in state_traces.values():
    df["governance_signal"] = df["SoD"].astype(float).apply(governance_label)

# Governance signal preview (mostly NORMAL was the expectation at this stage).
# NOTE(review): the recorded output below shows CRITICAL for all three runs,
# so either this expectation or the SoD definition needs revisiting.
preview = ["step", "mutation_applied", "SoD", "SoMS", "governance_signal", "__event_note__"]
for ab, df in state_traces.items():
    print(ab, df[preview].head(5))

001    step  mutation_applied          SoD         SoMS governance_signal  \
0     0             False  1065.795654  1065.795654          CRITICAL   

           __event_note__  
0  no_step_single_row_run  
045    step  mutation_applied        SoD       SoMS governance_signal  \
0     0             False  730.51756  730.51756          CRITICAL   

           __event_note__  
0  no_step_single_row_run  
183    step  mutation_applied        SoD       SoMS governance_signal  \
0     0             False  730.51756  730.51756          CRITICAL   

           __event_note__  
0  no_step_single_row_run  


In [7]:
# Stack the per-antibody traces into one table with a fresh 0..n-1 index.
core6_trace = pd.concat(state_traces.values(), ignore_index=True)

# Readable column order: preferred columns first when present, everything else after.
front = ["antibody_key","step","mutation_applied","mutation_count","pred_variance","SoD","SoMS","governance_signal","__event_note__"]
# Keep only the preferred columns that actually exist; indexing with a missing
# name would raise KeyError instead of just leaving that column out.
front_present = [c for c in front if c in core6_trace.columns]
cols = front_present + [c for c in core6_trace.columns if c not in front_present]
core6_trace = core6_trace[cols]

# Both paths are relative, so the files land in the current working directory.
core6_trace.to_csv(OUT_CORE6_TRACE, index=False)
diag.to_csv(OUT_DIAG, index=False)

print("saved:", OUT_CORE6_TRACE, OUT_DIAG)
core6_trace.head(20)

saved: core6_state_trace.csv core6_run_diagnostics.csv


Unnamed: 0,antibody_key,step,mutation_applied,mutation_count,pred_variance,SoD,SoMS,governance_signal,__event_note__,Name,...,CIC_transformed,CSI_transformed,ELISA,HIC,HEK,PSR,SGAC_transformed,SMAC_transformed,Tm,__source_file__
0,1,0,False,0,2131.591308,1065.795654,1065.795654,CRITICAL,no_step_single_row_run,,...,0.041564,1.081533,2.640164,11.734008,150.169973,0.399758,-0.593735,0.91534,77.583333,001Prediction_Result.csv
1,45,0,False,0,1461.035119,730.51756,730.51756,CRITICAL,no_step_single_row_run,,...,0.128222,0.722424,1.374193,8.961372,121.586311,0.248792,-0.297045,0.942202,68.666667,045Prediction_Result.csv
2,183,0,False,0,1461.035119,730.51756,730.51756,CRITICAL,no_step_single_row_run,,...,0.128222,0.722424,1.374193,8.961372,121.586311,0.248792,-0.297045,0.942202,68.666667,183Prediction_Result.csv


In [8]:
# One human-readable verdict per antibody, based on its prediction table only.
summary = []
for ab in ANTIBODIES:
    pred = raw_events[ab][0]
    flag = eventful_flags[ab]

    if flag:
        note = "eventful"
    else:
        note = "step/rows 조건 불충족 → 단일 step=0 baseline 이벤트로 기록"

    summary.append(dict(
        antibody_key=ab,
        eventful=flag,
        pred_rows=len(pred),
        pred_has_step="step" in pred.columns,
        pred_has_read_error="__read_error__" in pred.columns,
        note=note,
    ))

pd.DataFrame(summary)

Unnamed: 0,antibody_key,eventful,pred_rows,pred_has_step,pred_has_read_error,note
0,1,False,1,False,False,step/rows 조건 불충족 → 단일 step=0 baseline 이벤트로 기록
1,45,False,1,False,False,step/rows 조건 불충족 → 단일 step=0 baseline 이벤트로 기록
2,183,False,1,False,False,step/rows 조건 불충족 → 단일 step=0 baseline 이벤트로 기록
