In [29]:
import pandas as pd
from metrics import *

In [9]:
def calculate_mean_std(results, cat):
    """Aggregate one category's metrics across several evaluation runs.

    Args:
        results: Non-empty, re-iterable sequence of per-run metric mappings.
            Every entry must provide ``result[cat]`` with the keys
            ``"precision"``, ``"recall"``, ``"f1"``, ``"support"`` and
            ``"num_errors"``.
        cat: Category key to aggregate (e.g. ``"T1"``).

    Returns:
        dict with the mean and *population* standard deviation (divides by
        ``n``, not ``n - 1``) of precision/recall/F1 rounded to 3 decimals,
        the summed support and error counts, and the unrounded means under
        the ``raw_mean_*`` keys.

    Raises:
        ValueError: If ``results`` is empty (previously surfaced as an
            unexplained ``ZeroDivisionError``).
        KeyError: If a run lacks ``cat`` or one of the metric keys.
    """
    if len(results) == 0:
        raise ValueError("calculate_mean_std() needs at least one run in `results`")

    def _mean_and_std(values):
        # Same arithmetic as before, written once instead of three times.
        mean = sum(values) / len(values)
        std = (sum([(x - mean) ** 2 for x in values]) / len(values)) ** 0.5
        return mean, std

    precision_list = [result[cat]["precision"] for result in results]
    recall_list = [result[cat]["recall"] for result in results]
    f1_list = [result[cat]["f1"] for result in results]
    support_list = [result[cat]["support"] for result in results]
    num_errors_list = [result[cat]["num_errors"] for result in results]

    mean_precision, std_precision = _mean_and_std(precision_list)
    mean_recall, std_recall = _mean_and_std(recall_list)
    mean_f1, std_f1 = _mean_and_std(f1_list)

    return {
        "mean_precision": round(mean_precision, 3),
        "mean_recall": round(mean_recall, 3),
        "mean_f1": round(mean_f1, 3),
        "std_precision": round(std_precision, 3),
        "std_recall": round(std_recall, 3),
        "std_f1": round(std_f1, 3),
        "sum_support": sum(support_list),
        "sum_num_errors": sum(num_errors_list),
        # Unrounded means, kept so macro averages are not computed from rounded values.
        "raw_mean_precision": mean_precision,
        "raw_mean_recall": mean_recall,
        "raw_mean_f1": mean_f1,
    }


def output_tabular_performance(results, categories=["T1", "T2", "T3", "T4"]):
    """Print one ``mean(std)`` row per category, then the macro-average row.

    Args:
        results: Per-run metric mappings, as accepted by ``calculate_mean_std``.
        categories: Category keys to report, in print order. The default list
            is only read, never mutated.

    Each category row shows precision, recall and F1 as ``mean(std)``. The final
    ``MacroAvg.`` row averages the unrounded per-category means.
    """
    metric_names = ("precision", "recall", "f1")
    row_template = "{} {:.3f}({:.3f}) {:.3f}({:.3f}) {:.3f}({:.3f})"
    raw_means = {metric: [] for metric in metric_names}

    for category in categories:
        # `stats` rather than `eval`, so the builtin is not shadowed.
        stats = calculate_mean_std(results, category)

        cells = []
        for metric in metric_names:
            cells.append(stats["mean_" + metric])
            cells.append(stats["std_" + metric])
        print(row_template.format(category, *cells))

        # for calculating macro average
        for metric in metric_names:
            raw_means[metric].append(stats["raw_mean_" + metric])

    macro_values = [
        round(sum(raw_means[metric]) / len(raw_means[metric]), 3)
        for metric in metric_names
    ]
    print("MacroAvg. {:.3f} {:.3f} {:.3f}".format(*macro_values))

# KEPA (reported in the draft)

In [10]:
kepa_t_results = []
zs_t_results = []
zscot_t_results = []


kepa_run_lst = [0, 1, 2, 3, 4, 5, 6, 8]

# Prediction column evaluated for every KEPA run file.
pred_column = "cmem_t_40reports_ans_str"

# The zs / zscot baseline files do not depend on the run index, so they are
# read and sorted once here instead of on every loop iteration.
t_zs_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/0716_t14_zs_test_800.csv"
).sort_values(by="patient_filename")
t_zscot_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/0716_t14_zscot_test_800.csv"
).sort_values(by="patient_filename")

for run in kepa_run_lst:
    t_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_t14_dynamic_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    # Patients of this run's test split; used to select the matching baseline rows.
    split_ids = t_test_df.patient_filename

    # NOTE(review): labels come from the run file while the baseline predictions
    # come from other files. Pairing them presumably relies on both sides holding
    # the same duplicate-free patient set in the same sorted order -- confirm.
    label_column = t_test_df["t"]
    t_test_pred_df = t_test_df[t_test_df.patient_filename.isin(split_ids)][pred_column]
    kepa_t_results.append(
        t14_calculate_metrics(true_labels=label_column, predictions=t_test_pred_df)
    )

    t_zs_pred_df = t_zs_df[t_zs_df.patient_filename.isin(split_ids)]["zs_t_ans_str"]
    zs_t_results.append(
        t14_calculate_metrics(true_labels=label_column, predictions=t_zs_pred_df)
    )

    # NOTE(review): the zscot file is read through the "zs_t_ans_str" column --
    # confirm that this is the column name actually used in that file.
    t_zscot_pred_df = t_zscot_df[t_zscot_df.patient_filename.isin(split_ids)][
        "zs_t_ans_str"
    ]
    zscot_t_results.append(
        t14_calculate_metrics(true_labels=label_column, predictions=t_zscot_pred_df)
    )

In [11]:
# Per-category mean(std) table for whatever runs are currently held in kepa_t_results.
output_tabular_performance(kepa_t_results)

T1 0.904(0.017) 0.812(0.040) 0.855(0.018)
T2 0.882(0.022) 0.938(0.018) 0.909(0.005)
T3 0.834(0.054) 0.810(0.058) 0.818(0.018)
T4 0.807(0.082) 0.634(0.038) 0.707(0.029)
MacroAvg. 0.857 0.799 0.822


In [12]:
kepa_n_results = []
zs_n_results = []
zscot_n_results = []

kepa_run_lst = [0, 1, 3, 4, 5, 6, 7, 9]

# Prediction column evaluated for every KEPA run file.
pred_column = "cmem_n_40reports_ans_str"

# The zs / zscot baseline files do not depend on the run index, so they are
# read and sorted once here instead of on every loop iteration.
n_zs_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/0716_n03_zs_test_800.csv"
).sort_values(by="patient_filename")
n_zscot_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/0716_n03_zscot_test_800.csv"
).sort_values(by="patient_filename")

for run in kepa_run_lst:
    n_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_n03_dynamic_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    # Patients of this run's test split; used to select the matching baseline rows.
    split_ids = n_test_df.patient_filename

    # NOTE(review): labels come from the run file while the baseline predictions
    # come from other files. Pairing them presumably relies on both sides holding
    # the same duplicate-free patient set in the same sorted order -- confirm.
    label_column = n_test_df["n"]
    n_test_pred_df = n_test_df[n_test_df.patient_filename.isin(split_ids)][pred_column]
    kepa_n_results.append(
        n03_calculate_metrics(true_labels=label_column, predictions=n_test_pred_df)
    )

    n_zs_pred_df = n_zs_df[n_zs_df.patient_filename.isin(split_ids)]["zs_n_ans_str"]
    zs_n_results.append(
        n03_calculate_metrics(true_labels=label_column, predictions=n_zs_pred_df)
    )

    # NOTE(review): the zscot file is read through the "zs_n_ans_str" column --
    # confirm that this is the column name actually used in that file.
    n_zscot_pred_df = n_zscot_df[n_zscot_df.patient_filename.isin(split_ids)][
        "zs_n_ans_str"
    ]
    zscot_n_results.append(
        n03_calculate_metrics(true_labels=label_column, predictions=n_zscot_pred_df)
    )

In [13]:
# Per-category mean(std) table for the N-stage runs currently held in kepa_n_results.
output_tabular_performance(kepa_n_results, categories=["N0", "N1", "N2", "N3"])

N0 0.944(0.008) 0.952(0.018) 0.948(0.011)
N1 0.885(0.020) 0.883(0.026) 0.884(0.010)
N2 0.713(0.031) 0.745(0.054) 0.727(0.022)
N3 0.886(0.058) 0.784(0.042) 0.830(0.017)
MacroAvg. 0.857 0.841 0.847


# GPT

In [14]:
# NOTE: this reuses (and overwrites) the name `kepa_t_results` for the
# "gpt4o_t_stage" predictions.
kepa_t_results = []
kepa_run_lst = [0, 1, 2, 3, 4, 5, 6, 8]
pred_column = "gpt4o_t_stage"

for run in kepa_run_lst:
    run_csv = f"/home/yl3427/cylab/selfCorrectionAgent/result/1112_t14_gpt_test_{run}_outof_8runs.csv"
    t_test_df = pd.read_csv(run_csv).sort_values(by="patient_filename")

    # Labels and predictions are taken from the same sorted frame.
    label_column = t_test_df["t"]
    t_test_pred_df = t_test_df[pred_column]

    run_metrics = t14_calculate_metrics(
        true_labels=label_column, predictions=t_test_pred_df
    )
    kepa_t_results.append(run_metrics)

In [15]:
# Per-category mean(std) table for whatever runs are currently held in kepa_t_results.
output_tabular_performance(kepa_t_results)

T1 0.902(0.009) 0.903(0.025) 0.902(0.013)
T2 0.935(0.023) 0.939(0.015) 0.936(0.007)
T3 0.905(0.048) 0.813(0.080) 0.852(0.039)
T4 0.622(0.136) 0.728(0.052) 0.659(0.052)
MacroAvg. 0.841 0.846 0.837


In [16]:
# Inspect the available columns. `n_test_df` is whatever the most recently
# executed N-stage loop left behind, so this cell depends on kernel state.
n_test_df.columns

Index(['patient_filename', 'text', 'n', 'cmem_n_10reports_is_parsed',
       'cmem_n_10reasoning', 'cmem_n_10reports_ans_str',
       'cmem_n_20reports_is_parsed', 'cmem_n_20reasoning',
       'cmem_n_20reports_ans_str', 'cmem_n_30reports_is_parsed',
       'cmem_n_30reasoning', 'cmem_n_30reports_ans_str',
       'cmem_n_40reports_is_parsed', 'cmem_n_40reasoning',
       'cmem_n_40reports_ans_str', 'cmem_n_50reports_is_parsed',
       'cmem_n_50reasoning', 'cmem_n_50reports_ans_str',
       'cmem_n_60reports_is_parsed', 'cmem_n_60reasoning',
       'cmem_n_60reports_ans_str', 'cmem_n_70reports_is_parsed',
       'cmem_n_70reasoning', 'cmem_n_70reports_ans_str',
       'cmem_n_80reports_is_parsed', 'cmem_n_80reasoning',
       'cmem_n_80reports_ans_str', 'cmem_n_90reports_is_parsed',
       'cmem_n_90reasoning', 'cmem_n_90reports_ans_str',
       'cmem_n_100reports_is_parsed', 'cmem_n_100reasoning',
       'cmem_n_100reports_ans_str', 'cmem_n_10reports_reasoning',
       'cmem_n_20repor

In [17]:
# NOTE: this reuses (and overwrites) the name `kepa_n_results` for the
# "gpt4o_n_stage" predictions.
kepa_n_results = []
kepa_run_lst = [0, 1, 3, 4, 5, 6, 7, 9]
pred_column = "gpt4o_n_stage"

for run in kepa_run_lst:
    run_csv = f"/home/yl3427/cylab/selfCorrectionAgent/result/1112_n03_gpt_test_{run}_outof_8runs.csv"
    n_test_df = pd.read_csv(run_csv).sort_values(by="patient_filename")

    # Labels and predictions are taken from the same sorted frame.
    label_column = n_test_df["n"]
    n_test_pred_df = n_test_df[pred_column]

    run_metrics = n03_calculate_metrics(
        true_labels=label_column, predictions=n_test_pred_df
    )
    kepa_n_results.append(run_metrics)

In [18]:
# Per-category mean(std) table for the N-stage runs currently held in kepa_n_results.
output_tabular_performance(kepa_n_results, categories=["N0", "N1", "N2", "N3"])

N0 0.928(0.006) 0.962(0.051) 0.944(0.026)
N1 0.921(0.006) 0.875(0.013) 0.897(0.008)
N2 0.777(0.110) 0.786(0.034) 0.778(0.077)
N3 0.855(0.047) 0.850(0.016) 0.852(0.031)
MacroAvg. 0.870 0.868 0.868


# other models

In [19]:
# Evaluate the "kepa_t_ans_str" predictions stored in the 1114 med42_v2 run files.
kepa_t_results = []
kepa_run_lst = [0, 1, 2, 3, 4, 5, 6, 8]
pred_column = "kepa_t_ans_str"

for run in kepa_run_lst:
    run_csv = f"/home/yl3427/cylab/selfCorrectionAgent/result/1114_t14_med42_v2_test_{run}_outof_10runs.csv"
    t_test_df = pd.read_csv(run_csv).sort_values(by="patient_filename")

    # Labels and predictions are taken from the same sorted frame.
    label_column = t_test_df["t"]
    t_test_pred_df = t_test_df[pred_column]

    run_metrics = t14_calculate_metrics(
        true_labels=label_column, predictions=t_test_pred_df
    )
    kepa_t_results.append(run_metrics)

In [20]:
# Per-category mean(std) table for whatever runs are currently held in kepa_t_results.
output_tabular_performance(kepa_t_results)

T1 0.813(0.073) 0.759(0.076) 0.783(0.064)
T2 0.855(0.031) 0.913(0.023) 0.882(0.016)
T3 0.869(0.063) 0.703(0.099) 0.770(0.065)
T4 0.630(0.046) 0.615(0.057) 0.621(0.042)
MacroAvg. 0.792 0.747 0.764


In [21]:
# Evaluate the "kepa_n_ans_str" predictions stored in the 1114 med42_v2 run files.
kepa_n_results = []
kepa_run_lst = [0, 1, 3, 4, 5, 6, 7, 9]
pred_column = "kepa_n_ans_str"

for run in kepa_run_lst:
    run_csv = f"/home/yl3427/cylab/selfCorrectionAgent/result/1114_n03_med42_v2_test_{run}_outof_10runs.csv"
    n_test_df = pd.read_csv(run_csv).sort_values(by="patient_filename")

    # Labels and predictions are taken from the same sorted frame.
    label_column = n_test_df["n"]
    n_test_pred_df = n_test_df[pred_column]

    run_metrics = n03_calculate_metrics(
        true_labels=label_column, predictions=n_test_pred_df
    )
    kepa_n_results.append(run_metrics)

In [22]:
# Per-category mean(std) table for the N-stage runs currently held in kepa_n_results.
output_tabular_performance(kepa_n_results, categories=["N0", "N1", "N2", "N3"])

N0 0.950(0.011) 0.821(0.059) 0.879(0.032)
N1 0.775(0.056) 0.821(0.032) 0.795(0.022)
N2 0.657(0.067) 0.711(0.076) 0.675(0.018)
N3 0.759(0.103) 0.858(0.029) 0.800(0.062)
MacroAvg. 0.785 0.803 0.787


### ZSCOT

In [23]:
# One zscot result file per staging task (T and N).
zscot_t_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1118_t14_med42_v2_test_800.csv"
)
zscot_n_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1118_n03_med42_v2_test_800.csv"
)

# Gold labels and predictions come from the same frame, so rows stay aligned.
t_label_column, t_pred_column = zscot_t_df["t"], zscot_t_df["zscot_t_ans_str"]
n_label_column, n_pred_column = zscot_n_df["n"], zscot_n_df["zscot_n_ans_str"]

t_results = t14_calculate_metrics(
    true_labels=t_label_column, predictions=t_pred_column
)
n_results = n03_calculate_metrics(
    true_labels=n_label_column, predictions=n_pred_column
)

In [24]:
# Show only the aggregate ("overall") entry of the T-stage metrics.
t_results["overall"]

{'macro_precision': 0.746,
 'macro_recall': 0.678,
 'macro_f1': 0.703,
 'support': 800,
 'num_errors': 370}

In [25]:
# Show only the aggregate ("overall") entry of the N-stage metrics.
n_results["overall"]

{'macro_precision': 0.748,
 'macro_recall': 0.723,
 'macro_f1': 0.724,
 'support': 800,
 'num_errors': 420}

### RAG

In [27]:
# t14 -- rag_raw predictions
test_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1120_t14_rag_raw_med42_v2_800.csv"
)
t_label_column, t_pred_column = test_df["t"], test_df["t14_rag_raw_t_pred"]

# Last expression of the cell: display only the aggregate metrics.
t14_calculate_metrics(true_labels=t_label_column, predictions=t_pred_column)["overall"]

{'macro_precision': 0.786,
 'macro_recall': 0.748,
 'macro_f1': 0.764,
 'support': 800,
 'num_errors': 262}

In [28]:
# n03 -- rag_raw predictions
test_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1120_n03_rag_raw_med42_v2_800.csv"
)
n_label_column, n_pred_column = test_df["n"], test_df["n03_rag_raw_n_pred"]

# Last expression of the cell: display only the aggregate metrics.
n03_calculate_metrics(true_labels=n_label_column, predictions=n_pred_column)["overall"]

{'macro_precision': 0.76,
 'macro_recall': 0.799,
 'macro_f1': 0.759,
 'support': 800,
 'num_errors': 336}

In [36]:
# t14 -- ltm_zs predictions
test_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_zs_med42_v2_800.csv"
)
t_label_column, t_pred_column = test_df["t"], test_df["t14_ltm_zs_t_pred"]

# Last expression of the cell: display only the aggregate metrics.
t14_calculate_metrics(true_labels=t_label_column, predictions=t_pred_column)["overall"]

{'macro_precision': 0.837,
 'macro_recall': 0.799,
 'macro_f1': 0.816,
 'support': 800,
 'num_errors': 206}

In [37]:
# n03 -- ltm_zs predictions
test_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1128_n03_ltm_zs_med42_v2_800.csv"
)
n_label_column, n_pred_column = test_df["n"], test_df["n03_ltm_zs_n_pred"]

# Last expression of the cell: display only the aggregate metrics.
n03_calculate_metrics(true_labels=n_label_column, predictions=n_pred_column)["overall"]

{'macro_precision': 0.801,
 'macro_recall': 0.826,
 'macro_f1': 0.807,
 'support': 800,
 'num_errors': 256}

In [30]:
# t14 -- ltm_rag1 predictions
test_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag1_med42_v2_800.csv"
)
t_label_column, t_pred_column = test_df["t"], test_df["t14_ltm_rag1_t_pred"]

# Last expression of the cell: display only the aggregate metrics.
t14_calculate_metrics(true_labels=t_label_column, predictions=t_pred_column)["overall"]

{'macro_precision': 0.838,
 'macro_recall': 0.793,
 'macro_f1': 0.812,
 'support': 800,
 'num_errors': 194}

In [31]:
# n03 -- ltm_rag1 predictions
test_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1128_n03_ltm_rag1_med42_v2_800.csv"
)
n_label_column, n_pred_column = test_df["n"], test_df["n03_ltm_rag1_n_pred"]

# Last expression of the cell: display only the aggregate metrics.
n03_calculate_metrics(true_labels=n_label_column, predictions=n_pred_column)["overall"]

{'macro_precision': 0.845,
 'macro_recall': 0.849,
 'macro_f1': 0.846,
 'support': 800,
 'num_errors': 192}

In [32]:
# t14 -- ltm_rag2 predictions
test_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag2_med42_v2_800.csv"
)
t_label_column, t_pred_column = test_df["t"], test_df["t14_ltm_rag2_t_pred"]

# Last expression of the cell: display only the aggregate metrics.
t14_calculate_metrics(true_labels=t_label_column, predictions=t_pred_column)["overall"]

{'macro_precision': 0.835,
 'macro_recall': 0.772,
 'macro_f1': 0.799,
 'support': 800,
 'num_errors': 220}

In [33]:
# n03 -- ltm_rag2 predictions
test_df = pd.read_csv(
    "/home/yl3427/cylab/selfCorrectionAgent/result/1128_n03_ltm_rag2_med42_v2_800.csv"
)
n_label_column, n_pred_column = test_df["n"], test_df["n03_ltm_rag2_n_pred"]

# Last expression of the cell: display only the aggregate metrics.
n03_calculate_metrics(true_labels=n_label_column, predictions=n_pred_column)["overall"]

{'macro_precision': 0.766,
 'macro_recall': 0.763,
 'macro_f1': 0.761,
 'support': 800,
 'num_errors': 294}

# Re-Run Parsing Error Cases

In [1]:
# The star imports stay first so that the explicit imports below win any name
# clash with whatever `agent` / `prompt` happen to export.
from agent import *
from prompt import *

import json
import logging
from typing import Literal

import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field

### KEPA

In [None]:
# Structured answer expected from the model for T-stage prediction.
# Deliberately documented with comments only: a class docstring would be copied
# into the JSON schema produced by model_json_schema().
class Response_T(BaseModel):
    reasoning: str = Field(
        description="Step-by-step explanation of how you interpreted the report to determine the T stage."
    )
    # The closing period sits outside the quotes so the description names 'T4',
    # matching the Literal, rather than the non-existent label 'T4.'.
    stage: Literal["T1", "T2", "T3", "T4"] = Field(
        description="The T stage determined from the report. Stage must be one of 'T1', 'T2', 'T3' or 'T4'."
    )


# Structured answer expected from the model for N-stage prediction.
# Deliberately documented with comments only: a class docstring would be copied
# into the JSON schema produced by model_json_schema().
class Response_N(BaseModel):
    reasoning: str = Field(
        description="Step-by-step explanation of how you interpreted the report to determine the N stage."
    )
    # The closing period sits outside the quotes so the description names 'N3',
    # matching the Literal, rather than the non-existent label 'N3.'.
    stage: Literal["N0", "N1", "N2", "N3"] = Field(
        description="The N stage determined from the report. Stage must be one of 'N0', 'N1', 'N2' or 'N3'."
    )


# OpenAI-compatible client pointed at a server on localhost; the key is a placeholder.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="empty")

# JSON schemas of the response models, passed to the server as `guided_json`.
testing_schema_n03 = Response_N.model_json_schema()
testing_schema_t14 = Response_T.model_json_schema()

In [None]:
def test_individual_report(
    dataset: pd.DataFrame,
    patient_filename: str,
    memory: str,
    label: str,
    testing_schema: dict,
    model: str = "m42-health/Llama3-Med42-70B",
):
    """Re-run the KEPA prediction for one report and store the answer in ``dataset``.

    Args:
        dataset: Frame with ``patient_filename`` and ``text`` columns. It is
            updated **in place**.
        patient_filename: Identifies the row(s) to re-run.
        memory: Memory string inserted into the prompt.
        label: Stage family. A value starting with ``"n"``/``"N"`` selects the
            N03 prompt, anything else the T14 prompt. Also used in the output
            column names ``kepa_{label}_is_parsed`` / ``_ans_str`` / ``_reasoning``.
        testing_schema: JSON schema sent to the server as ``guided_json``.
        model: Model name passed to the chat-completions endpoint.

    Returns:
        ``dataset`` on success, or ``None`` when the reply is not valid JSON or
        lacks the ``stage`` / ``reasoning`` keys. In that case ``dataset`` is
        left untouched.

    Raises:
        IndexError: If no row matches ``patient_filename``.
    """
    # Computed once and reused; adding columns below does not change the index.
    patient_rows = dataset["patient_filename"] == patient_filename
    report = dataset.loc[patient_rows, "text"].values[0]

    if label.lower()[0] == "n":
        prompt = testing_predict_prompt_n03.format(memory=memory, report=report)
    else:
        prompt = testing_predict_prompt_t14.format(memory=memory, report=report)

    filled_prompt = system_instruction + "\n" + prompt
    messages = [{"role": "user", "content": filled_prompt}]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        extra_body={"guided_json": testing_schema},
        temperature=0.1,  # 0.3, 0.5, 0.7, 0.9
    )

    # Validate the whole payload BEFORE touching the frame, so a malformed reply
    # can no longer leave a row flagged as parsed without an answer.
    try:
        parsed = json.loads(completion.choices[0].message.content)
        stage = parsed["stage"]
        reasoning = parsed["reasoning"]
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    dataset.loc[patient_rows, f"kepa_{label}_is_parsed"] = True
    dataset.loc[patient_rows, f"kepa_{label}_ans_str"] = stage
    dataset.loc[patient_rows, f"kepa_{label}_reasoning"] = reasoning

    return dataset

In [None]:
# T14  [0, 1, 2, 3, 4, 5, 6, 8]
for run in [0, 1, 2, 3, 4, 5, 6, 8]:
    print(f"{run}th split")

    # Extract memory for t14: map (row index + 1) -> memory string, take entry 40.
    t_train_file_path = (
        f"/home/yl3427/cylab/selfCorrectionAgent/result/t14_memory_dataset{run}.csv"
    )
    t_train_data = pd.read_csv(t_train_file_path)
    t_memory_dict = {
        row_idx + 1: train_row["cmem_t_memory_str"]
        for row_idx, train_row in t_train_data.iterrows()
    }
    t_memory = t_memory_dict.get(40, "")  # empty memory when there is no entry 40

    # The same CSV is both the input and the checkpoint target.
    rerun_df_path = f"/home/yl3427/cylab/selfCorrectionAgent/result/1114_t14_med42_v2_test_{run}_outof_10runs.csv"
    test_df = pd.read_csv(rerun_df_path)

    # Rows to redo: flagged as not parsed, or carrying no answer.
    not_parsed = ~test_df["kepa_t_is_parsed"].astype(bool)
    no_answer = test_df["kepa_t_ans_str"].isna()
    unparsed_df = test_df[not_parsed | no_answer]

    for idx, row in unparsed_df.iterrows():
        patient_filename = row["patient_filename"]
        print(f"Processing patient: {patient_filename} (Index: {idx})")
        print(f"Before: {row['kepa_t_ans_str']}")

        updated_df = test_individual_report(
            dataset=test_df,
            patient_filename=patient_filename,
            memory=t_memory,
            label="t",
            testing_schema=testing_schema_t14,
        )
        if updated_df is None:
            print(f"Failed to process patient: {patient_filename}. Skipping...")
            continue
        test_df = updated_df  # Only assign if not None

        patient_rows = test_df["patient_filename"] == patient_filename
        after_stage = test_df.loc[patient_rows, "kepa_t_ans_str"].values
        after_reasoning = test_df.loc[patient_rows, "kepa_t_reasoning"].values
        label_value = test_df.loc[patient_rows, "t"].values

        print(f"After Stage: {after_stage}")
        print(f"After Reasoning: {after_reasoning}")
        print(f"Label: {label_value}")

        # Checkpoint after every successful re-run.
        test_df.to_csv(rerun_df_path, index=False)

In [None]:
# N03 [0, 1, 3, 4, 5, 6, 7, 9]
for run in [0, 1, 3, 4, 5, 6, 7, 9]:
    print(f"{run}th split")

    # Extract memory for n03: map (row index + 1) -> memory string, take entry 40.
    n_train_file_path = (
        f"/home/yl3427/cylab/selfCorrectionAgent/result/n03_memory_dataset{run}.csv"
    )
    n_train_data = pd.read_csv(n_train_file_path)
    n_memory_dict = {
        row_idx + 1: train_row["cmem_n_memory_str"]
        for row_idx, train_row in n_train_data.iterrows()
    }
    n_memory = n_memory_dict[40]  # raises KeyError when there is no entry 40

    # The same CSV is both the input and the checkpoint target.
    rerun_df_path = f"/home/yl3427/cylab/selfCorrectionAgent/result/1114_n03_med42_v2_test_{run}_outof_10runs.csv"
    test_df = pd.read_csv(rerun_df_path)

    # Rows to redo: flagged as not parsed, or carrying no answer.
    not_parsed = ~test_df["kepa_n_is_parsed"].astype(bool)
    no_answer = test_df["kepa_n_ans_str"].isna()
    unparsed_df = test_df[not_parsed | no_answer]

    for idx, row in unparsed_df.iterrows():
        patient_filename = row["patient_filename"]
        print(f"Processing patient: {patient_filename} (Index: {idx})")
        print(f"Before: {row['kepa_n_ans_str']}")

        updated_df = test_individual_report(
            dataset=test_df,
            patient_filename=patient_filename,
            memory=n_memory,
            label="n",
            testing_schema=testing_schema_n03,
        )
        if updated_df is None:
            print(f"Failed to process patient: {patient_filename}. Skipping...")
            continue
        test_df = updated_df

        patient_rows = test_df["patient_filename"] == patient_filename
        after_stage = test_df.loc[patient_rows, "kepa_n_ans_str"].values
        after_reasoning = test_df.loc[patient_rows, "kepa_n_reasoning"].values
        label_value = test_df.loc[patient_rows, "n"].values

        print(f"After Stage: {after_stage}")
        print(f"After Reasoning: {after_reasoning}")
        print(f"Label: {label_value}")

        # Checkpoint after every successful re-run.
        test_df.to_csv(rerun_df_path, index=False)

### ZSCOT

In [2]:
# Structured answer expected from the model for T-stage prediction.
# Deliberately documented with comments only: a class docstring would be copied
# into the JSON schema produced by model_json_schema().
class Response_T(BaseModel):
    reasoning: str = Field(
        description="Step-by-step explanation of how you interpreted the report to determine the T stage."
    )
    # The closing period sits outside the quotes so the description names 'T4',
    # matching the Literal, rather than the non-existent label 'T4.'.
    stage: Literal["T1", "T2", "T3", "T4"] = Field(
        description="The T stage determined from the report. Stage must be one of 'T1', 'T2', 'T3' or 'T4'."
    )


# Structured answer expected from the model for N-stage prediction.
# Deliberately documented with comments only: a class docstring would be copied
# into the JSON schema produced by model_json_schema().
class Response_N(BaseModel):
    reasoning: str = Field(
        description="Step-by-step explanation of how you interpreted the report to determine the N stage."
    )
    # The closing period sits outside the quotes so the description names 'N3',
    # matching the Literal, rather than the non-existent label 'N3.'.
    stage: Literal["N0", "N1", "N2", "N3"] = Field(
        description="The N stage determined from the report. Stage must be one of 'N0', 'N1', 'N2' or 'N3'."
    )


# OpenAI-compatible client pointed at a server on localhost; the key is a placeholder.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="empty")

# JSON schemas of the response models, passed to the server as `guided_json`.
testing_schema_n03 = Response_N.model_json_schema()
testing_schema_t14 = Response_T.model_json_schema()

In [None]:
def test_individual_report(
    dataset: pd.DataFrame,
    patient_filename: str,
    label: str,
    testing_schema: dict,
    model: str = "m42-health/Llama3-Med42-70B",
):
    """Re-run the zscot prediction for one report and store the answer in ``dataset``.

    Args:
        dataset: Frame with ``patient_filename`` and ``text`` columns. It is
            updated **in place**.
        patient_filename: Identifies the row(s) to re-run.
        label: Stage family. A value starting with ``"n"``/``"N"`` selects the
            N03 prompt, anything else the T14 prompt. Also used in the output
            column names ``zscot_{label}_is_parsed`` / ``_ans_str`` / ``_reasoning``.
        testing_schema: JSON schema sent to the server as ``guided_json``.
        model: Model name passed to the chat-completions endpoint.

    Returns:
        ``dataset`` on success, or ``None`` when the reply is not valid JSON or
        lacks the ``stage`` / ``reasoning`` keys. In that case ``dataset`` is
        left untouched.

    Raises:
        IndexError: If no row matches ``patient_filename``.
    """
    # Computed once and reused; adding columns below does not change the index.
    patient_rows = dataset["patient_filename"] == patient_filename
    report = dataset.loc[patient_rows, "text"].values[0]

    if label.lower()[0] == "n":
        prompt = zscot_predict_prompt_n03.format(report=report)
    else:
        prompt = zscot_predict_prompt_t14.format(report=report)

    filled_prompt = prompt_template.format(
        system_instruction=system_instruction, prompt=prompt
    )
    messages = [{"role": "user", "content": filled_prompt}]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        extra_body={"guided_json": testing_schema},
        temperature=0.1,  # 0.3, 0.5, 0.7, 0.9,
        # top_p = 1.0,
    )

    # Validate the whole payload BEFORE touching the frame, so a malformed reply
    # can no longer leave a row flagged as parsed without an answer.
    try:
        parsed = json.loads(completion.choices[0].message.content)
        stage = parsed["stage"]
        reasoning = parsed["reasoning"]
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    dataset.loc[patient_rows, f"zscot_{label}_is_parsed"] = True
    dataset.loc[patient_rows, f"zscot_{label}_ans_str"] = stage
    dataset.loc[patient_rows, f"zscot_{label}_reasoning"] = reasoning

    return dataset

In [14]:
### T14
# The same CSV is both the input and the checkpoint target.
rerun_df_path = (
    "/home/yl3427/cylab/selfCorrectionAgent/result/1118_t14_med42_v2_test_800.csv"
)
test_df = pd.read_csv(rerun_df_path)

# Rows to redo: flagged as not parsed, or carrying no answer.
not_parsed = ~test_df["zscot_t_is_parsed"].astype(bool)
no_answer = test_df["zscot_t_ans_str"].isna()
unparsed_df = test_df[not_parsed | no_answer]

for idx, row in unparsed_df.iterrows():
    patient_filename = row["patient_filename"]
    print(f"Processing patient: {patient_filename} (Index: {idx})")
    print(f"Before: {row['zscot_t_ans_str']}")

    updated_df = test_individual_report(
        dataset=test_df,
        patient_filename=patient_filename,
        label="t",
        testing_schema=testing_schema_t14,
    )
    if updated_df is None:
        print(f"Failed to process patient: {patient_filename}. Skipping...")
        continue
    test_df = updated_df  # Only assign if not None

    patient_rows = test_df["patient_filename"] == patient_filename
    after_stage = test_df.loc[patient_rows, "zscot_t_ans_str"].values
    after_reasoning = test_df.loc[patient_rows, "zscot_t_reasoning"].values
    label_value = test_df.loc[patient_rows, "t"].values

    print(f"After Stage: {after_stage}")
    print(f"After Reasoning: {after_reasoning}")
    print(f"Label: {label_value}")

    # Checkpoint after every successful re-run.
    test_df.to_csv(rerun_df_path, index=False)

In [15]:
### N03
# The same CSV is both the input and the checkpoint target.
rerun_df_path = (
    "/home/yl3427/cylab/selfCorrectionAgent/result/1118_n03_med42_v2_test_800.csv"
)
test_df = pd.read_csv(rerun_df_path)

# Rows to redo: flagged as not parsed, or carrying no answer.
not_parsed = ~test_df["zscot_n_is_parsed"].astype(bool)
no_answer = test_df["zscot_n_ans_str"].isna()
unparsed_df = test_df[not_parsed | no_answer]

for idx, row in unparsed_df.iterrows():
    patient_filename = row["patient_filename"]
    print(f"Processing patient: {patient_filename} (Index: {idx})")
    print(f"Before: {row['zscot_n_ans_str']}")

    updated_df = test_individual_report(
        dataset=test_df,
        patient_filename=patient_filename,
        label="n",
        testing_schema=testing_schema_n03,
    )
    if updated_df is None:
        print(f"Failed to process patient: {patient_filename}. Skipping...")
        continue
    test_df = updated_df  # Only assign if not None

    patient_rows = test_df["patient_filename"] == patient_filename
    after_stage = test_df.loc[patient_rows, "zscot_n_ans_str"].values
    after_reasoning = test_df.loc[patient_rows, "zscot_n_reasoning"].values
    label_value = test_df.loc[patient_rows, "n"].values

    print(f"After Stage: {after_stage}")
    print(f"After Reasoning: {after_reasoning}")
    print(f"Label: {label_value}")

    # Checkpoint after every successful re-run.
    test_df.to_csv(rerun_df_path, index=False)

### RAG

In [None]:
# Structured answer expected from the model for T-stage prediction.
# Deliberately documented with comments only: a class docstring would be copied
# into the JSON schema produced by model_json_schema().
class Response_T(BaseModel):
    reasoning: str = Field(
        description="Step-by-step explanation of how you interpreted the report to determine the T stage."
    )
    # The closing period sits outside the quotes so the description names 'T4',
    # matching the Literal, rather than the non-existent label 'T4.'.
    stage: Literal["T1", "T2", "T3", "T4"] = Field(
        description="The T stage determined from the report. Stage must be one of 'T1', 'T2', 'T3' or 'T4'."
    )


# Structured answer expected from the model for N-stage prediction.
# Deliberately documented with comments only: a class docstring would be copied
# into the JSON schema produced by model_json_schema().
class Response_N(BaseModel):
    reasoning: str = Field(
        description="Step-by-step explanation of how you interpreted the report to determine the N stage."
    )
    # The closing period sits outside the quotes so the description names 'N3',
    # matching the Literal, rather than the non-existent label 'N3.'.
    stage: Literal["N0", "N1", "N2", "N3"] = Field(
        description="The N stage determined from the report. Stage must be one of 'N0', 'N1', 'N2' or 'N3'."
    )


# OpenAI-compatible client pointed at a server on localhost; the key is a placeholder.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="empty")

# JSON schemas of the response models, passed to the server as `guided_json`.
testing_schema_n03 = Response_N.model_json_schema()
testing_schema_t14 = Response_T.model_json_schema()

In [None]:
def test_individual_report(
    dataset: pd.DataFrame,
    patient_filename: str,
    label: str,
    model: str = "m42-health/Llama3-Med42-70B",
):
    """Re-run the rag_raw prediction for one report and store the answer in ``dataset``.

    Args:
        dataset: Frame with ``patient_filename`` and ``text`` columns. It is
            updated **in place**.
        patient_filename: Identifies the row(s) to re-run.
        label: Stage family. A value starting with ``"n"``/``"N"`` selects the
            N03 prompt, schema and the ``n03_rag_raw`` column prefix; anything
            else selects the T14 counterparts (``t14_rag_raw``).
        model: Model name passed to the chat-completions endpoint.

    Returns:
        ``dataset`` on success, or ``None`` when the reply is not valid JSON or
        lacks the ``stage`` / ``reasoning`` keys. In that case ``dataset`` is
        left untouched.

    Raises:
        IndexError: If no row matches ``patient_filename``.
    """
    # Computed once and reused; adding columns below does not change the index.
    patient_rows = dataset["patient_filename"] == patient_filename
    report = dataset.loc[patient_rows, "text"].values[0]

    if label.lower()[0] == "n":
        prompt = rag_n03.format(report=report)
        test_name = "n03_rag_raw"
        testing_schema = testing_schema_n03
    else:
        prompt = rag_t14.format(report=report)
        test_name = "t14_rag_raw"
        testing_schema = testing_schema_t14

    filled_prompt = prompt_template_med42.format(
        system_instruction=system_instruction, prompt=prompt
    )
    messages = [{"role": "user", "content": filled_prompt}]

    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        extra_body={"guided_json": testing_schema},
        temperature=0.1,  # 0.3, 0.5, 0.7, 0.9,
        # top_p = 1.0,
    )

    # Validate the whole payload BEFORE touching the frame, so a malformed reply
    # can no longer leave a row flagged as parsed without an answer.
    try:
        parsed = json.loads(completion.choices[0].message.content)
        stage = parsed["stage"]
        reasoning = parsed["reasoning"]
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    dataset.loc[patient_rows, f"{test_name}_{label}_is_parsed"] = True
    dataset.loc[patient_rows, f"{test_name}_{label}_pred"] = stage
    dataset.loc[patient_rows, f"{test_name}_{label}_reasoning"] = reasoning

    return dataset

In [None]:
### T14
# The same CSV is both the input and the checkpoint target.
rerun_df_path = (
    "/home/yl3427/cylab/selfCorrectionAgent/result/1120_t14_rag_raw_med42_v2_800.csv"
)
test_df = pd.read_csv(rerun_df_path)

# Rows to redo: flagged as not parsed, or carrying no prediction.
not_parsed = ~test_df["t14_rag_raw_t_is_parsed"].astype(bool)
no_answer = test_df["t14_rag_raw_t_pred"].isna()
unparsed_df = test_df[not_parsed | no_answer]

for idx, row in unparsed_df.iterrows():
    patient_filename = row["patient_filename"]
    print(f"Processing patient: {patient_filename} (Index: {idx})")
    print(f"Before: {row['t14_rag_raw_t_pred']}")

    updated_df = test_individual_report(
        dataset=test_df,
        patient_filename=patient_filename,
        label="t",
    )
    if updated_df is None:
        print(f"Failed to process patient: {patient_filename}. Skipping...")
        continue
    test_df = updated_df  # Only assign if not None

    patient_rows = test_df["patient_filename"] == patient_filename
    after_stage = test_df.loc[patient_rows, "t14_rag_raw_t_pred"].values
    after_reasoning = test_df.loc[patient_rows, "t14_rag_raw_t_reasoning"].values
    label_value = test_df.loc[patient_rows, "t"].values

    print(f"After Stage: {after_stage}")
    print(f"After Reasoning: {after_reasoning}")
    print(f"Label: {label_value}")

    # Checkpoint after every successful re-run.
    test_df.to_csv(rerun_df_path, index=False)

In [None]:
### N03
# The same CSV is both the input and the checkpoint target.
rerun_df_path = (
    "/home/yl3427/cylab/selfCorrectionAgent/result/1120_n03_rag_raw_med42_v2_800.csv"
)
test_df = pd.read_csv(rerun_df_path)

# Rows to redo: flagged as not parsed, or carrying no prediction.
not_parsed = ~test_df["n03_rag_raw_n_is_parsed"].astype(bool)
no_answer = test_df["n03_rag_raw_n_pred"].isna()
unparsed_df = test_df[not_parsed | no_answer]

for idx, row in unparsed_df.iterrows():
    patient_filename = row["patient_filename"]
    print(f"Processing patient: {patient_filename} (Index: {idx})")
    print(f"Before: {row['n03_rag_raw_n_pred']}")

    updated_df = test_individual_report(
        dataset=test_df,
        patient_filename=patient_filename,
        label="n",
    )
    if updated_df is None:
        print(f"Failed to process patient: {patient_filename}. Skipping...")
        continue
    test_df = updated_df  # Only assign if not None

    patient_rows = test_df["patient_filename"] == patient_filename
    after_stage = test_df.loc[patient_rows, "n03_rag_raw_n_pred"].values
    after_reasoning = test_df.loc[patient_rows, "n03_rag_raw_n_reasoning"].values
    label_value = test_df.loc[patient_rows, "n"].values

    print(f"After Stage: {after_stage}")
    print(f"After Reasoning: {after_reasoning}")
    print(f"Label: {label_value}")

    # Checkpoint after every successful re-run.
    test_df.to_csv(rerun_df_path, index=False)

### LTM

In [2]:
# Structured answer expected from the model for T-stage prediction.
# Deliberately documented with comments only: a class docstring would be copied
# into the JSON schema produced by model_json_schema().
class Response_T(BaseModel):
    reasoning: str = Field(
        description="Step-by-step explanation of how you interpreted the report to determine the T stage."
    )
    # The closing period sits outside the quotes so the description names 'T4',
    # matching the Literal, rather than the non-existent label 'T4.'.
    stage: Literal["T1", "T2", "T3", "T4"] = Field(
        description="The T stage determined from the report. Stage must be one of 'T1', 'T2', 'T3' or 'T4'."
    )


# Structured-output model for N staging: a free-text rationale plus a stage
# restricted to N0-N3. Its JSON schema is what gets passed as `guided_json`.
# Documented with `#` comments rather than a docstring because pydantic copies a
# class docstring into the generated schema as its description.
class Response_N(BaseModel):
    # Declared before `stage`, so it is listed first in the generated schema.
    # NOTE(review): presumably so the model writes its rationale before
    # committing to a stage — confirm the decoding backend honours field order.
    reasoning: str = Field(
        description="Step-by-step explanation of how you interpreted the report to determine the N stage."
    )
    # NOTE(review): the description ends with 'N3.' (period inside the quotes),
    # which reads as if 'N3.' were an allowed value; the Literal allows "N3".
    stage: Literal["N0", "N1", "N2", "N3"] = Field(
        description="The N stage determined from the report. Stage must be one of 'N0', 'N1', 'N2' or 'N3.'"
    )


# JSON schemas of the two response models, in T-then-N order; these are the
# objects later sent to the server under `guided_json`.
testing_schema_t14, testing_schema_n03 = (
    response_model.model_json_schema() for response_model in (Response_T, Response_N)
)

# OpenAI-compatible client aimed at a server on localhost:8000; the key is a
# placeholder string.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="empty",
)

In [3]:
# Load the prompt-context entries from the JSON file one directory above the
# notebook's working directory. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so decoding does not depend on the machine's locale.
with open("../context.json", "r", encoding="utf-8") as f:
    context = json.load(f)

# One module-level name per context entry; a missing key fails here with a
# KeyError instead of later, inside a prompt build.
rag_raw_t14 = context["rag_raw_t14"]
rag_raw_n03 = context["rag_raw_n03"]
ltm_zs_t14 = context["ltm_zs_t14"]
ltm_zs_n03 = context["ltm_zs_n03"]
# The ltm_rag1_* entries are the ones substituted into the prompt by
# `test_individual_report` in this section.
ltm_rag1_t14 = context["ltm_rag1_t14"]
ltm_rag1_n03 = context["ltm_rag1_n03"]
ltm_rag2_t14 = context["ltm_rag2_t14"]
ltm_rag2_n03 = context["ltm_rag2_n03"]

In [5]:
def test_individual_report(
    dataset: pd.DataFrame,
    patient_filename: str,
    label: str,
    model: str = "m42-health/Llama3-Med42-70B",
):

    report = dataset[dataset.patient_filename == patient_filename]["text"].values[0]

    if label.lower()[0] == "n":
        prompt = ltm_n03.format(report=report, context=ltm_rag1_n03)
        test_name="n03_ltm_rag1" 
        testing_schema = testing_schema_n03
    else:
        prompt = ltm_t14.format(report=report, context=ltm_rag1_t14)
        test_name="t14_ltm_rag1"
        testing_schema = testing_schema_t14

    filled_prompt = prompt_template_med42.format(
        system_instruction=system_instruction, prompt=prompt
    )
    messages = [{"role": "user", "content": filled_prompt}]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        extra_body={"guided_json": testing_schema},
        temperature=0.1,  # 0.3, 0.5, 0.7, 0.9,
        # top_p = 1.0,
    )

    try:
        response = json.loads(response.choices[0].message.content)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    dataset.loc[
        dataset["patient_filename"] == patient_filename, f"{test_name}_{label}_is_parsed"
    ] = True
    dataset.loc[
        dataset["patient_filename"] == patient_filename, f"{test_name}_{label}_pred"
    ] = response["stage"]
    dataset.loc[
        dataset["patient_filename"] == patient_filename, f"{test_name}_{label}_reasoning"
    ] = response["reasoning"]

    return dataset

In [6]:
### T14
# Re-run the LTM/RAG1 T-stage prompt for every report that still lacks a parsed
# prediction, saving the results file after each successful re-run.
rerun_df_path = "/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag1_med42_v2_800.csv"
test_df = pd.read_csv(rerun_df_path)

# `.eq(True)` instead of `.astype(bool)`: astype turns a missing (NaN) flag into
# True, so a row whose flag was never written would be counted as parsed.
parsed_flag = test_df["t14_ltm_rag1_t_is_parsed"].eq(True)
unparsed_df = test_df[~parsed_flag | test_df["t14_ltm_rag1_t_pred"].isna()]

for idx, row in unparsed_df.iterrows():
    patient_filename = row["patient_filename"]
    print(f"Processing patient: {patient_filename} (Index: {idx})")
    print(f"Before: {row['t14_ltm_rag1_t_pred']}")

    updated_df = test_individual_report(
        dataset=test_df,
        patient_filename=patient_filename,
        label="t",
    )
    if updated_df is None:
        print(f"Failed to process patient: {patient_filename}. Skipping...")
        continue
    test_df = updated_df  # only replaced when the re-run succeeded

    # One mask for all three look-ups of this patient's row(s).
    patient_mask = test_df["patient_filename"] == patient_filename
    after_stage = test_df.loc[patient_mask, "t14_ltm_rag1_t_pred"].values
    after_reasoning = test_df.loc[patient_mask, "t14_ltm_rag1_t_reasoning"].values
    label_value = test_df.loc[patient_mask, "t"].values

    print(f"After Stage: {after_stage}")
    print(f"After Reasoning: {after_reasoning}")
    print(f"Label: {label_value}")

    # Written back to the same file after every patient, so an interrupted run
    # keeps the rows that were already re-processed.
    test_df.to_csv(rerun_df_path, index=False)

In [8]:
### N03
# Re-run the LTM/RAG1 N-stage prompt for every report that still lacks a parsed
# prediction, saving the results file after each successful re-run.
rerun_df_path = "/home/yl3427/cylab/selfCorrectionAgent/result/1128_n03_ltm_rag1_med42_v2_800.csv"
test_df = pd.read_csv(rerun_df_path)

# `.eq(True)` instead of `.astype(bool)`: astype turns a missing (NaN) flag into
# True, so a row whose flag was never written would be counted as parsed.
parsed_flag = test_df["n03_ltm_rag1_n_is_parsed"].eq(True)
unparsed_df = test_df[~parsed_flag | test_df["n03_ltm_rag1_n_pred"].isna()]

for idx, row in unparsed_df.iterrows():
    patient_filename = row["patient_filename"]
    print(f"Processing patient: {patient_filename} (Index: {idx})")
    print(f"Before: {row['n03_ltm_rag1_n_pred']}")

    updated_df = test_individual_report(
        dataset=test_df,
        patient_filename=patient_filename,
        label="n",
    )
    if updated_df is None:
        print(f"Failed to process patient: {patient_filename}. Skipping...")
        continue
    test_df = updated_df  # only replaced when the re-run succeeded

    # One mask for all three look-ups of this patient's row(s).
    patient_mask = test_df["patient_filename"] == patient_filename
    after_stage = test_df.loc[patient_mask, "n03_ltm_rag1_n_pred"].values
    after_reasoning = test_df.loc[patient_mask, "n03_ltm_rag1_n_reasoning"].values
    label_value = test_df.loc[patient_mask, "n"].values

    print(f"After Stage: {after_stage}")
    print(f"After Reasoning: {after_reasoning}")
    print(f"Label: {label_value}")

    # Written back to the same file after every patient, so an interrupted run
    # keeps the rows that were already re-processed.
    test_df.to_csv(rerun_df_path, index=False)

In [1]:
import pandas as pd
# Load the T14 rag_raw results file from disk into `test_df`.
t14_rag_raw_csv = "/home/yl3427/cylab/selfCorrectionAgent/result/1120_t14_rag_raw_med42_v2_800.csv"
test_df = pd.read_csv(t14_rag_raw_csv)

In [2]:
# Total of the `t14_rag_raw_t_is_parsed` column; for a boolean column this is
# the number of rows flagged as parsed.
test_df["t14_rag_raw_t_is_parsed"].sum()

800

In [24]:
# Scratch check of the locally served model: send one free-form question and
# keep the raw completion in `response`.
client = OpenAI(api_key="empty", base_url="http://localhost:8000/v1")
messages = [{"role": "user", "content": "What is SOAP format?"}]

# Single-field response model; only referenced by the commented-out
# `guided_json` line below, so the request is currently unconstrained.
# NOTE(review): "Assesssment" / "assesssment" are spelled with three s's; the
# field name would become a schema key if guided decoding is re-enabled —
# confirm nothing else relies on the current spelling before renaming.
class Assesssment(BaseModel):
    assesssment: str = Field(
        description="Explain the SOAP format."
    )

response = client.chat.completions.create(
    model="m42-health/Llama3-Med42-70B",
    messages=messages,
    # extra_body={"guided_json": Assesssment.model_json_schema()},
    temperature=0.1,
)