In [1]:
import pandas as pd
from src.metrics import *

In [None]:
# NOTE(review): no earlier cell in this notebook assigns `df`; on a fresh kernel this
# cell only works after a later cell that loads a CSV into `df` has been run.
df.columns

In [None]:
# Load the Mixtral results file and compare column 'zscot_t_stage' against column 't'.
df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv') # This is the one; all the Mixtral results are in here.
t14_calculate_metrics(df['t'], df['zscot_t_stage'])

In [None]:
# NOTE(review): called with no arguments, while every other call in this notebook passes
# two Series (labels, predictions) — presumably this raises a TypeError; confirm or delete.
t14_calculate_metrics()

In [6]:
def calculate_mean_std(results, cat):
    """Summarise one category's metrics across several evaluation runs.

    Args:
        results: sequence of per-run metric mappings; each must contain
            ``results[i][cat]`` with the keys "precision", "recall", "f1",
            "support" and "num_errors".
        cat: category key to summarise (e.g. "T1").

    Returns:
        dict with the mean and population standard deviation (divide by N,
        not N-1) of precision/recall/f1 rounded to 3 decimals, the summed
        support and error counts, and the unrounded means under
        ``raw_mean_*``.

    Raises:
        ZeroDivisionError: if ``results`` is empty.
        KeyError: if ``cat`` or one of the metric keys is missing.
    """

    def _collect(metric):
        # One value per run for the requested metric of this category.
        return [run[cat][metric] for run in results]

    def _average(values):
        return sum(values) / len(values)

    def _population_std(values, center):
        squared_dev = [(value - center) ** 2 for value in values]
        return (sum(squared_dev) / len(values)) ** 0.5

    precisions = _collect("precision")
    recalls = _collect("recall")
    f1_scores = _collect("f1")
    supports = _collect("support")
    error_counts = _collect("num_errors")

    avg_precision = _average(precisions)
    avg_recall = _average(recalls)
    avg_f1 = _average(f1_scores)

    summary = {
        "mean_precision": round(avg_precision, 3),
        "mean_recall": round(avg_recall, 3),
        "mean_f1": round(avg_f1, 3),
        "std_precision": round(_population_std(precisions, avg_precision), 3),
        "std_recall": round(_population_std(recalls, avg_recall), 3),
        "std_f1": round(_population_std(f1_scores, avg_f1), 3),
        "sum_support": sum(supports),
        "sum_num_errors": sum(error_counts),
        # Unrounded means, kept so callers can macro-average without rounding drift.
        "raw_mean_precision": avg_precision,
        "raw_mean_recall": avg_recall,
        "raw_mean_f1": avg_f1,
    }
    return summary


def output_tabular_performance(results, categories=("T1", "T2", "T3", "T4")):
    """Print a per-category mean(std) table followed by the macro average.

    For every category one line is printed in the form
    ``<cat> P(std) R(std) F1(std)`` using ``calculate_mean_std``; a final
    ``MacroAvg.`` line averages the unrounded per-category means.

    Args:
        results: per-run metric mappings, passed through to
            ``calculate_mean_std``.
        categories: iterable of category keys to report. The default is an
            immutable tuple so the shared default can never be mutated.

    Returns:
        None. The table is written to stdout.

    Raises:
        ZeroDivisionError: if ``categories`` is empty (nothing to average).
    """
    macro_precisions = []
    macro_recalls = []
    macro_f1s = []

    for category in categories:
        # Named `summary` rather than `eval` to avoid shadowing the builtin.
        summary = calculate_mean_std(results, category)
        print(
            "{} {:.3f}({:.3f}) {:.3f}({:.3f}) {:.3f}({:.3f})".format(
                category,
                summary["mean_precision"],
                summary["std_precision"],
                summary["mean_recall"],
                summary["std_recall"],
                summary["mean_f1"],
                summary["std_f1"],
            )
        )

        # Macro average uses the unrounded means to avoid compounding rounding.
        macro_precisions.append(summary["raw_mean_precision"])
        macro_recalls.append(summary["raw_mean_recall"])
        macro_f1s.append(summary["raw_mean_f1"])

    print(
        "MacroAvg. {:.3f} {:.3f} {:.3f}".format(
            round(sum(macro_precisions) / len(macro_precisions), 3),
            round(sum(macro_recalls) / len(macro_recalls), 3),
            round(sum(macro_f1s) / len(macro_f1s), 3),
        )
    )

Index(['patient_filename', 't', 'text', 'n', 'zscot_t_reasoning',
       'zscot_t_stage', 'zscot_n_reasoning', 'zscot_n_stage',
       'rag_raw_t_reasoning', 'rag_raw_t_stage', 'rag_raw_n_reasoning',
       'rag_raw_n_stage', 'ltm_zs_t_reasoning', 'ltm_zs_t_stage',
       'ltm_zs_n_reasoning', 'ltm_zs_n_stage', 'ltm_rag1_t_reasoning',
       'ltm_rag1_t_stage', 'ltm_rag1_n_reasoning', 'ltm_rag1_n_stage',
       'ltm_rag2_t_reasoning', 'ltm_rag2_t_stage', 'ltm_rag2_n_reasoning',
       'ltm_rag2_n_stage', 'zscot_t_flag', 'zscot_n_flag', 'rag_raw_t_flag',
       'rag_raw_n_flag', 'ltm_zs_t_flag', 'ltm_zs_n_flag', 'ltm_rag1_t_flag',
       'ltm_rag1_n_flag', 'ltm_rag2_t_flag', 'ltm_rag2_n_flag'],
      dtype='object')

In [36]:
# kewltm_t_df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_t14_dynamic_test_{run}_outof_10runs.csv")
# Load the Mixtral results CSV; the "For T" cell below reads the same file into the same name again.
others_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')

In [None]:
# For T
# Mixtral, T stage: score every method on each run's test split, then print
# per-class mean(std) over the runs.
print("Mixtral")
zscot_t_results = []
rag_t_results = []
ltm_rag_t_results = []
kewltm_t_results = []

# Label column and the prediction column used for each method.
t_label = 't'
zscot_t_stage = 'zscot_t_stage'
rag_t_stage = 'rag_raw_t_stage'
ltm_rag_t_stage = 'ltm_rag1_t_stage'
kewltm_t_stage = "cmem_t_40reports_ans_str"

# ZSCoT / RAG / LTM-RAG predictions all come from this one file. It is kept
# unfiltered so that every run can select its own split from the complete frame.
others_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')

# NOTE(review): runs 7 and 9 are not in this list — confirm that is intentional.
run_lst = [0, 1, 2, 3, 4, 5, 6, 8]

for run in run_lst:
    t_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_t14_dynamic_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_t_results.append(
        t14_calculate_metrics(t_test_df[t_label], t_test_df[kewltm_t_stage])
    )

    split_ids = t_test_df.patient_filename
    # Filter into a per-run frame. Re-binding `others_t_df` here (as the
    # original did) made run k see only the intersection of splits 0..k.
    split_t_df = others_t_df[others_t_df["patient_filename"].isin(split_ids)]
    zscot_t_results.append(t14_calculate_metrics(split_t_df[t_label], split_t_df[zscot_t_stage]))
    rag_t_results.append(t14_calculate_metrics(split_t_df[t_label], split_t_df[rag_t_stage]))
    ltm_rag_t_results.append(t14_calculate_metrics(split_t_df[t_label], split_t_df[ltm_rag_t_stage]))

print("ZSCoT")
output_tabular_performance(zscot_t_results)

print("RAG")
output_tabular_performance(rag_t_results)

print("LTM-RAG")
output_tabular_performance(ltm_rag_t_results)

print("KEW-LTM")
output_tabular_performance(kewltm_t_results)

In [None]:
# For N
# Mixtral, N stage: score every method on each run's test split, then print
# per-class mean(std) over the runs.
print("Mixtral")
zscot_n_results = []
rag_n_results = []
ltm_rag_n_results = []
kewltm_n_results = []

# Label column and the prediction column used for each method.
n_label = 'n'
zscot_n_stage = 'zscot_n_stage'
rag_n_stage = 'rag_raw_n_stage'
ltm_rag_n_stage = 'ltm_rag1_n_stage'
kewltm_n_stage = "cmem_n_40reports_ans_str"

# ZSCoT / RAG / LTM-RAG predictions all come from this one file. It is kept
# unfiltered so that every run can select its own split from the complete frame.
others_n_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')

# NOTE(review): runs 2 and 8 are not in this list — confirm that is intentional.
run_lst = [0, 1, 3, 4, 5, 6, 7, 9]

for run in run_lst:
    n_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_n03_dynamic_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_n_results.append(
        n03_calculate_metrics(n_test_df[n_label], n_test_df[kewltm_n_stage])
    )

    split_ids = n_test_df.patient_filename
    # Filter into a per-run frame. Re-binding `others_n_df` here (as the
    # original did) made run k see only the intersection of splits 0..k.
    split_n_df = others_n_df[others_n_df["patient_filename"].isin(split_ids)]
    zscot_n_results.append(n03_calculate_metrics(split_n_df[n_label], split_n_df[zscot_n_stage]))
    rag_n_results.append(n03_calculate_metrics(split_n_df[n_label], split_n_df[rag_n_stage]))
    ltm_rag_n_results.append(n03_calculate_metrics(split_n_df[n_label], split_n_df[ltm_rag_n_stage]))

print("ZSCoT")
output_tabular_performance(zscot_n_results, categories=["N0", "N1", "N2", "N3"])

print("RAG")
output_tabular_performance(rag_n_results, categories=["N0", "N1", "N2", "N3"])

print("LTM-RAG")
output_tabular_performance(ltm_rag_n_results, categories=["N0", "N1", "N2", "N3"])

print("KEW-LTM")
output_tabular_performance(kewltm_n_results, categories=["N0", "N1", "N2", "N3"])

In [None]:
# Scratch check: list.append mutates in place, so the displayed list ends with "".
lst = ["N0", "N1", "N2", "N3"]
lst.append("")
lst

In [None]:
# Inspect the columns of this results file; the "Med42" cell further down reads the same
# file again as `ltm_rag_t_df`.
df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag1_med42_v2_800.csv')
df.columns


In [None]:
# Metrics for column 't14_ltm_rag1_t_pred' against column 't' over the whole of `df`
# (`df` is whatever frame was assigned last — presumably the cell directly above).
t14_calculate_metrics(df['t'], df['t14_ltm_rag1_t_pred'])

In [None]:
# NOTE(review): `t_test_df` is only assigned inside the per-run loops of the T-stage cells,
# so this shows the columns of the last split that was loaded there.
t_test_df.columns

In [None]:
# For T
# Med42, T stage: score every method on each run's test split, then print
# per-class mean(std) over the runs.
print("Med42")
zscot_t_results = []
rag_t_results = []
ltm_rag_t_results = []
kewltm_t_results = []

t_label = 't'
kewltm_t_stage = 'kepa_t_ans_str'
# One results file per baseline, reduced to id, label and prediction columns.
# These frames are kept unfiltered so every run selects from the complete data.
zscot_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1118_t14_med42_v2_test_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 'zscot_t_ans_str']]
rag_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1120_t14_rag_raw_med42_v2_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 't14_rag_raw_t_pred']]
ltm_rag_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag1_med42_v2_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 't14_ltm_rag1_t_pred']]

# NOTE(review): runs 7 and 9 are not in this list — confirm that is intentional.
run_lst = [0, 1, 2, 3, 4, 5, 6, 8]

for run in run_lst:
    t_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/1114_t14_med42_v2_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_t_results.append(
        t14_calculate_metrics(t_test_df[t_label], t_test_df[kewltm_t_stage])
    )

    split_ids = t_test_df.patient_filename
    # Filter into per-run frames. Re-binding the full frames here (as the
    # original did) made run k see only the intersection of splits 0..k.
    zscot_split_df = zscot_t_df[zscot_t_df["patient_filename"].isin(split_ids)]
    rag_split_df = rag_t_df[rag_t_df["patient_filename"].isin(split_ids)]
    ltm_rag_split_df = ltm_rag_t_df[ltm_rag_t_df["patient_filename"].isin(split_ids)]

    zscot_t_results.append(t14_calculate_metrics(zscot_split_df[t_label], zscot_split_df['zscot_t_ans_str']))
    rag_t_results.append(t14_calculate_metrics(rag_split_df[t_label], rag_split_df['t14_rag_raw_t_pred']))
    ltm_rag_t_results.append(t14_calculate_metrics(ltm_rag_split_df[t_label], ltm_rag_split_df['t14_ltm_rag1_t_pred']))

print("ZSCoT")
output_tabular_performance(zscot_t_results)

print("RAG")
output_tabular_performance(rag_t_results)

print("LTM-RAG")
output_tabular_performance(ltm_rag_t_results)

print("KEW-LTM")
output_tabular_performance(kewltm_t_results)

In [1]:
import pandas as pd
# Load the BRCA summary and keep id, T label and report text for the rows
# whose 'n' value is not -1.
brca_report = pd.read_csv(
    "/secure/shared_data/rag_tnm_results/summary/5_folds_summary/brca_df.csv"
)
df = brca_report.loc[brca_report["n"] != -1, ["patient_filename", "t", "text"]]

In [None]:
# Load 1221_result.csv (path relative to the notebook's working directory) and list its columns.
df = pd.read_csv("1221_result.csv")
df.columns

In [None]:
from src.metrics import *
# t14_calculate_metrics(df['t'], df['memory_test_pred'])
# Restrict `df` to the rows selected by 'memory_test_parsed' (presumably a boolean
# column — confirm), then score 'memory_test_pred' against 't' on those rows only.
# NOTE(review): this rebinds `df`, so the unfiltered frame is gone until the CSV is re-read.
df = df[df['memory_test_parsed']]
t14_calculate_metrics2(df['t'], df['memory_test_pred'])

In [None]:
# Load baseline_results.csv (path relative to the notebook's working directory) and list its columns.
df = pd.read_csv("baseline_results.csv")
df.columns

In [None]:
# Metrics for column 't14_baseline_pred' against column 't' over the whole of `df`
# (`df` is whatever frame was assigned last — presumably baseline_results.csv from the cell above).
t14_calculate_metrics(df['t'], df['t14_baseline_pred'])

In [None]:
# Load 1221_result_2.csv (path relative to the notebook's working directory) and list its columns.
df = pd.read_csv("1221_result_2.csv")
df.columns

In [None]:
# NOTE(review): the column selection on the next line is evaluated but never shown,
# because a notebook cell only displays its last expression.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 'overall' entry of the metrics for column '1221_test_pred' against column 't'.
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall']

In [None]:
# 'overall' metrics for the four prediction columns, displayed as one tuple in
# the order: full, zs, rules, memory_only.
tuple(
    t14_calculate_metrics(df['t'], df[pred_col])['overall']
    for pred_col in (
        '1221_test_pred',
        '1221_test_pred_zs',
        '1221_test_pred_rules',
        '1221_test_pred_memory_only',
    )
)

In [None]:
# First 10 rows of 1221_result_3.csv.
df = pd.read_csv("1221_result_3.csv")[:10]
# Evaluated but not displayed: only the cell's last expression is shown.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 'overall' metrics as a tuple in the order: full, zs, rules, memory_only.
tuple(
    t14_calculate_metrics(df['t'], df[pred_col])['overall']
    for pred_col in (
        '1221_test_pred',
        '1221_test_pred_zs',
        '1221_test_pred_rules',
        '1221_test_pred_memory_only',
    )
)

In [None]:
# Rows 10-19 of 1221_result_3.csv.
df = pd.read_csv("1221_result_3.csv")[10:20]
# Evaluated but not displayed: only the cell's last expression is shown.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 'overall' metrics as a tuple in the order: full, zs, rules, memory_only.
tuple(
    t14_calculate_metrics(df['t'], df[pred_col])['overall']
    for pred_col in (
        '1221_test_pred',
        '1221_test_pred_zs',
        '1221_test_pred_rules',
        '1221_test_pred_memory_only',
    )
)

In [None]:
# Rows 20-29 of 1221_result_3.csv.
df = pd.read_csv("1221_result_3.csv")[20:30]
# Evaluated but not displayed: only the cell's last expression is shown.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 'overall' metrics as a tuple in the order: full, zs, rules, memory_only.
tuple(
    t14_calculate_metrics(df['t'], df[pred_col])['overall']
    for pred_col in (
        '1221_test_pred',
        '1221_test_pred_zs',
        '1221_test_pred_rules',
        '1221_test_pred_memory_only',
    )
)

In [None]:
# Rows 30-39 of 1221_result_3.csv.
df = pd.read_csv("1221_result_3.csv")[30:40]
# Evaluated but not displayed: only the cell's last expression is shown.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 'overall' metrics as a tuple in the order: full, zs, rules, memory_only.
tuple(
    t14_calculate_metrics(df['t'], df[pred_col])['overall']
    for pred_col in (
        '1221_test_pred',
        '1221_test_pred_zs',
        '1221_test_pred_rules',
        '1221_test_pred_memory_only',
    )
)

In [None]:
# Everything from row 40 onward in 1221_result_3.csv.
df = pd.read_csv("1221_result_3.csv")[40:]
# Evaluated but not displayed: only the cell's last expression is shown.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 'overall' metrics as a tuple in the order: full, zs, rules, memory_only.
tuple(
    t14_calculate_metrics(df['t'], df[pred_col])['overall']
    for pred_col in (
        '1221_test_pred',
        '1221_test_pred_zs',
        '1221_test_pred_rules',
        '1221_test_pred_memory_only',
    )
)

In [None]:
# Print the row count of each consecutive 10-row window of the Mixtral results.
# NOTE(review): the hard-coded 790 means rows at position >= 790 are never visited — confirm
# against the file length.
df = pd.read_csv("1221_result_mixtral.csv")
for start in range(0, 790, 10):
    sub_df = df.iloc[start:start + 10]
    print(len(sub_df))

In [None]:
import pandas as pd
from src.metrics import *

# Per-chunk F1 report: full method first, then (zs) (rules) (memory_only) in parentheses.
# NOTE(review): 790 and 80 are hard-coded; the last window is rows 720-799, so rows at
# position >= 800 are never scored — confirm against the file length.
data_df = pd.read_csv("1221_result_mixtral.csv")
for start in range(0, 790, 80):
    df = data_df[start:start + 80]
    main, zs, rules, memory = (
        t14_calculate_metrics2(df['t'], df[pred_col])['overall']
        for pred_col in (
            '1221_test_pred',
            '1221_test_pred_zs',
            '1221_test_pred_rules',
            '1221_test_pred_memory_only',
        )
    )
    for title, key in (("macro F1", "macro_f1"), ("micro F1", "micro_f1"), ("weighted F1", "weighted_f1")):
        print(f"{title}: {main[key]:.3f} ({zs[key]:.3f}) ({rules[key]:.3f}) ({memory[key]:.3f})")
    print("-----------------")



In [None]:
import pandas as pd
from src.metrics import *
# Load 1222_result_llama_4_30.csv (path relative to the notebook's working directory) and list its columns.
data_df = pd.read_csv("1222_result_llama_4_30.csv")
data_df.columns

In [2]:
# Load the T-stage run-0 results and list the columns; the 'experiment_t_stage_pred_*'
# names shown in the output are the prediction columns scored by later cells.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_0.csv")
df.columns

Index(['patient_filename', 't', 'text', 'current_memory_from_llm',
       'current_memory_from_rag', 'experiment_t_stage_pred_zs',
       'experiment_t_stage_reasoning_zs',
       'experiment_t_stage_pred_llm_rulesonly',
       'experiment_t_stage_reasoning_llm_rulesonly',
       'experiment_t_stage_pred_rag_rulesonly',
       'experiment_t_stage_reasoning_rag_rulesonly',
       'experiment_t_stage_pred_memonly',
       'experiment_t_stage_reasoning_memonly',
       'experiment_t_stage_pred_memonly2',
       'experiment_t_stage_reasoning_memonly2',
       'experiment_t_stage_pred_rag_only',
       'experiment_t_stage_reasoning_rag_only',
       'experiment_t_stage_pred_llm_mem',
       'experiment_t_stage_reasoning_llm_mem',
       'experiment_t_stage_pred_rag_mem',
       'experiment_t_stage_reasoning_rag_mem', 'llm_rules', 'rag_rules',
       'final_memory_from_llm', 'final_memory_from_rag'],
      dtype='object')

In [None]:
# Whole-file F1 report: full method first, then (zs) (rules) (memory_only) in parentheses.
df = pd.read_csv("1222_result_llama_4_30.csv")

main, zs, rules, memory = (
    t14_calculate_metrics2(df['t'], df[pred_col])['overall']
    for pred_col in (
        'ltm_test_pred',
        'ltm_test_pred_zs',
        'ltm_test_pred_rules',
        'ltm_test_pred_memory_only',
    )
)
for title, key in (("macro F1", "macro_f1"), ("micro F1", "micro_f1"), ("weighted F1", "weighted_f1")):
    print(f"{title}: {main[key]:.3f} ({zs[key]:.3f}) ({rules[key]:.3f}) ({memory[key]:.3f})")


In [None]:
import pandas as pd
from src.metrics import *

# Per-chunk F1 report: full method first, then (zs) (rules) (memory_only) in parentheses.
# NOTE(review): 790 and 80 are hard-coded; the last window is rows 720-799, so rows at
# position >= 800 are never scored — confirm against the file length.
data_df = pd.read_csv("1222_result_llama_4_30.csv")
for start in range(0, 790, 80):
    df = data_df[start:start + 80]
    main, zs, rules, memory = (
        t14_calculate_metrics2(df['t'], df[pred_col])['overall']
        for pred_col in (
            'ltm_test_pred',
            'ltm_test_pred_zs',
            'ltm_test_pred_rules',
            'ltm_test_pred_memory_only',
        )
    )
    for title, key in (("macro F1", "macro_f1"), ("micro F1", "micro_f1"), ("weighted F1", "weighted_f1")):
        print(f"{title}: {main[key]:.3f} ({zs[key]:.3f}) ({rules[key]:.3f}) ({memory[key]:.3f})")
    print("-----------------")


In [None]:
import pandas as pd
from src.metrics import *

# Per-chunk F1 report: full method first, then (zs) (rules) (memory_only) in parentheses.
# NOTE(review): 790 and 80 are hard-coded; the last window is rows 720-799, so rows at
# position >= 800 are never scored — confirm against the file length.
data_df = pd.read_csv("1222_result_llama_1_1.csv")
for start in range(0, 790, 80):
    df = data_df[start:start + 80]
    main, zs, rules, memory = (
        t14_calculate_metrics2(df['t'], df[pred_col])['overall']
        for pred_col in (
            'ltm_test_pred',
            'ltm_test_pred_zs',
            'ltm_test_pred_rules',
            'ltm_test_pred_memory_only',
        )
    )
    for title, key in (("macro F1", "macro_f1"), ("micro F1", "micro_f1"), ("weighted F1", "weighted_f1")):
        print(f"{title}: {main[key]:.3f} ({zs[key]:.3f}) ({rules[key]:.3f}) ({memory[key]:.3f})")
    print("-----------------")


In [None]:
# Whole-file F1 report: full method first, then (zs) (rules) (memory_only) in parentheses.
df = pd.read_csv("1222_result_llama_1_1.csv")

main, zs, rules, memory = (
    t14_calculate_metrics2(df['t'], df[pred_col])['overall']
    for pred_col in (
        'ltm_test_pred',
        'ltm_test_pred_zs',
        'ltm_test_pred_rules',
        'ltm_test_pred_memory_only',
    )
)
for title, key in (("macro F1", "macro_f1"), ("micro F1", "micro_f1"), ("weighted F1", "weighted_f1")):
    print(f"{title}: {main[key]:.3f} ({zs[key]:.3f}) ({rules[key]:.3f}) ({memory[key]:.3f})")


In [None]:
# Whole-file F1 report: full method first, then (zs) (rules) (memory_only) in parentheses.
df = pd.read_csv("1223_result_mixtral_1_1.csv")

main, zs, rules, memory = (
    t14_calculate_metrics2(df['t'], df[pred_col])['overall']
    for pred_col in (
        'ltm_test_pred',
        'ltm_test_pred_zs',
        'ltm_test_pred_rules',
        'ltm_test_pred_memory_only',
    )
)
for title, key in (("macro F1", "macro_f1"), ("micro F1", "micro_f1"), ("weighted F1", "weighted_f1")):
    print(f"{title}: {main[key]:.3f} ({zs[key]:.3f}) ({rules[key]:.3f}) ({memory[key]:.3f})")


In [None]:
# Whole-file F1 report: full method first, then (zs) (rules) (memory_only) in parentheses.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/result_t/1223_result_llama_1_1_increased_model_len_65536_full.csv")

main, zs, rules, memory = (
    t14_calculate_metrics2(df['t'], df[pred_col])['overall']
    for pred_col in (
        'ltm_test_pred',
        'ltm_test_pred_zs',
        'ltm_test_pred_rules',
        'ltm_test_pred_memory_only',
    )
)
for title, key in (("macro F1", "macro_f1"), ("micro F1", "micro_f1"), ("weighted F1", "weighted_f1")):
    print(f"{title}: {main[key]:.3f} ({zs[key]:.3f}) ({rules[key]:.3f}) ({memory[key]:.3f})")


In [None]:
# Display `main` — the 'overall' metrics mapping left behind by whichever F1 cell ran last.
main

In [None]:
# Display `zs` — the 'overall' metrics mapping left behind by whichever F1 cell ran last.
zs

In [None]:
# Display `rules` — the 'overall' metrics mapping left behind by whichever F1 cell ran last.
rules

In [2]:
# N stage, run 0: print "<label>: macro_f1, micro_f1, weighted_f1" for each prediction column.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_0.csv")

# Each name ends up bound to the 'overall' metrics of its prediction column.
main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag = (
    n03_calculate_metrics2(df['n'], df[pred_col])['overall']
    for pred_col in (
        'experiment_n_stage_pred_llm_mem',
        'experiment_n_stage_pred_rag_mem',
        'experiment_n_stage_pred_zs',
        'experiment_n_stage_pred_llm_rulesonly',
        'experiment_n_stage_pred_rag_rulesonly',
        'experiment_n_stage_pred_memonly',
        'experiment_n_stage_pred_memonly2',
        'experiment_n_stage_pred_rag_only',
    )
)

for label, overall in zip(
    ("main_1", "main_2", "zs", "rules_1", "rules_2", "memory_1", "memory_2", "rag"),
    (main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag),
):
    print(f"{label}: {overall['macro_f1']}, {overall['micro_f1']}, {overall['weighted_f1']}")

main_1: 0.81, 0.869, 0.859
main_2: 0.858, 0.89, 0.889
zs: 0.839, 0.889, 0.883
rules_1: 0.779, 0.858, 0.841
rules_2: 0.848, 0.886, 0.885
memory_1: 0.84, 0.887, 0.882
memory_2: 0.839, 0.887, 0.882
rag: 0.867, 0.899, 0.897


In [2]:
from tabulate import tabulate

# N stage, run 0: one GitHub-style F1 table per consecutive 40-row window.
# NOTE(review): 790 and 40 are hard-coded; the last window is rows 760-799, so rows at
# position >= 800 are never scored — confirm against the file length.
full_df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_0.csv")
for start in range(0, 790, 40):
    df = full_df[start:start + 40]

    # Each name ends up bound to the 'overall' metrics of its prediction column.
    main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag = (
        n03_calculate_metrics2(df['n'], df[pred_col])['overall']
        for pred_col in (
            'experiment_n_stage_pred_llm_mem',
            'experiment_n_stage_pred_rag_mem',
            'experiment_n_stage_pred_zs',
            'experiment_n_stage_pred_llm_rulesonly',
            'experiment_n_stage_pred_rag_rulesonly',
            'experiment_n_stage_pred_memonly',
            'experiment_n_stage_pred_memonly2',
            'experiment_n_stage_pred_rag_only',
        )
    )

    table_data = [
        [label, overall['macro_f1'], overall['micro_f1'], overall['weighted_f1']]
        for label, overall in zip(
            ("main_1", "main_2", "zs", "rules_1", "rules_2", "memory_1", "memory_2", "rag"),
            (main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag),
        )
    ]

    rendered = tabulate(
        table_data,
        headers=["Method", "Macro F1", "Micro F1", "Weighted F1"],
        floatfmt=".4f",  # 4 decimal places
        tablefmt="github",
    )
    print(rendered)


| Method   |   Macro F1 |   Micro F1 |   Weighted F1 |
|----------|------------|------------|---------------|
| main_1   |     0.4440 |     0.8000 |        0.7340 |
| main_2   |     0.7830 |     0.9000 |        0.8890 |
| zs       |     0.7210 |     0.8750 |        0.8690 |
| rules_1  |     0.4580 |     0.8250 |        0.7570 |
| rules_2  |     0.7830 |     0.9000 |        0.8890 |
| memory_1 |     0.6220 |     0.8250 |        0.8190 |
| memory_2 |     0.5980 |     0.8000 |        0.8000 |
| rag      |     0.8650 |     0.9250 |        0.9230 |
| Method   |   Macro F1 |   Micro F1 |   Weighted F1 |
|----------|------------|------------|---------------|
| main_1   |     0.6970 |     0.8750 |        0.8410 |
| main_2   |     0.8910 |     0.9000 |        0.8970 |
| zs       |     0.8960 |     0.9000 |        0.8990 |
| rules_1  |     0.6660 |     0.8500 |        0.8150 |
| rules_2  |     0.8910 |     0.9000 |        0.8970 |
| memory_1 |     0.8290 |     0.9000 |        0.8890 |
| memory_2

In [3]:
# N stage, run 4: print "<label>: macro_f1, micro_f1, weighted_f1" for each prediction column.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_4.csv")

# Each name ends up bound to the 'overall' metrics of its prediction column.
main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag = (
    n03_calculate_metrics2(df['n'], df[pred_col])['overall']
    for pred_col in (
        'experiment_n_stage_pred_llm_mem',
        'experiment_n_stage_pred_rag_mem',
        'experiment_n_stage_pred_zs',
        'experiment_n_stage_pred_llm_rulesonly',
        'experiment_n_stage_pred_rag_rulesonly',
        'experiment_n_stage_pred_memonly',
        'experiment_n_stage_pred_memonly2',
        'experiment_n_stage_pred_rag_only',
    )
)

for label, overall in zip(
    ("main_1", "main_2", "zs", "rules_1", "rules_2", "memory_1", "memory_2", "rag"),
    (main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag),
):
    print(f"{label}: {overall['macro_f1']}, {overall['micro_f1']}, {overall['weighted_f1']}")

main_1: 0.813, 0.869, 0.861
main_2: 0.858, 0.896, 0.895
zs: 0.84, 0.889, 0.883
rules_1: 0.782, 0.858, 0.841
rules_2: 0.841, 0.881, 0.878
memory_1: 0.834, 0.886, 0.88
memory_2: 0.847, 0.891, 0.886
rag: 0.865, 0.897, 0.896


In [4]:
# T stage, run 0: print "<label>: macro_f1, micro_f1, weighted_f1" for each prediction column.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_0.csv")

# Each name ends up bound to the 'overall' metrics of its prediction column.
main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag = (
    t14_calculate_metrics2(df['t'], df[pred_col])['overall']
    for pred_col in (
        'experiment_t_stage_pred_llm_mem',
        'experiment_t_stage_pred_rag_mem',
        'experiment_t_stage_pred_zs',
        'experiment_t_stage_pred_llm_rulesonly',
        'experiment_t_stage_pred_rag_rulesonly',
        'experiment_t_stage_pred_memonly',
        'experiment_t_stage_pred_memonly2',
        'experiment_t_stage_pred_rag_only',
    )
)

for label, overall in zip(
    ("main_1", "main_2", "zs", "rules_1", "rules_2", "memory_1", "memory_2", "rag"),
    (main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag),
):
    print(f"{label}: {overall['macro_f1']}, {overall['micro_f1']}, {overall['weighted_f1']}")


main_1: 0.772, 0.866, 0.871
main_2: 0.659, 0.751, 0.747
zs: 0.756, 0.837, 0.838
rules_1: 0.769, 0.862, 0.869
rules_2: 0.65, 0.748, 0.746
memory_1: 0.723, 0.819, 0.819
memory_2: 0.711, 0.803, 0.803
rag: 0.712, 0.809, 0.805


In [5]:
# T stage, run 1: print "<label>: macro_f1, micro_f1, weighted_f1" for each prediction column.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_1.csv")

# Each name ends up bound to the 'overall' metrics of its prediction column.
main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag = (
    t14_calculate_metrics2(df['t'], df[pred_col])['overall']
    for pred_col in (
        'experiment_t_stage_pred_llm_mem',
        'experiment_t_stage_pred_rag_mem',
        'experiment_t_stage_pred_zs',
        'experiment_t_stage_pred_llm_rulesonly',
        'experiment_t_stage_pred_rag_rulesonly',
        'experiment_t_stage_pred_memonly',
        'experiment_t_stage_pred_memonly2',
        'experiment_t_stage_pred_rag_only',
    )
)

for label, overall in zip(
    ("main_1", "main_2", "zs", "rules_1", "rules_2", "memory_1", "memory_2", "rag"),
    (main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag),
):
    print(f"{label}: {overall['macro_f1']}, {overall['micro_f1']}, {overall['weighted_f1']}")


main_1: 0.757, 0.855, 0.86
main_2: 0.682, 0.763, 0.758
zs: 0.75, 0.83, 0.831
rules_1: 0.767, 0.859, 0.865
rules_2: 0.66, 0.758, 0.756
memory_1: 0.716, 0.809, 0.809
memory_2: 0.712, 0.812, 0.809
rag: 0.718, 0.812, 0.809


In [2]:
# T stage, run 2: print "<label>: macro_f1, micro_f1, weighted_f1" for each prediction column.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_2.csv")

# Each name ends up bound to the 'overall' metrics of its prediction column.
main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag = (
    t14_calculate_metrics2(df['t'], df[pred_col])['overall']
    for pred_col in (
        'experiment_t_stage_pred_llm_mem',
        'experiment_t_stage_pred_rag_mem',
        'experiment_t_stage_pred_zs',
        'experiment_t_stage_pred_llm_rulesonly',
        'experiment_t_stage_pred_rag_rulesonly',
        'experiment_t_stage_pred_memonly',
        'experiment_t_stage_pred_memonly2',
        'experiment_t_stage_pred_rag_only',
    )
)

for label, overall in zip(
    ("main_1", "main_2", "zs", "rules_1", "rules_2", "memory_1", "memory_2", "rag"),
    (main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag),
):
    print(f"{label}: {overall['macro_f1']}, {overall['micro_f1']}, {overall['weighted_f1']}")

main_1: 0.769, 0.862, 0.868
main_2: 0.684, 0.767, 0.764
zs: 0.767, 0.842, 0.842
rules_1: 0.768, 0.86, 0.866
rules_2: 0.658, 0.755, 0.753
memory_1: 0.735, 0.821, 0.822
memory_2: 0.705, 0.808, 0.81
rag: 0.713, 0.809, 0.805


In [3]:
# T stage, run 3: print "<label>: macro_f1, micro_f1, weighted_f1" for each prediction column.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_3.csv")

# Each name ends up bound to the 'overall' metrics of its prediction column.
main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag = (
    t14_calculate_metrics2(df['t'], df[pred_col])['overall']
    for pred_col in (
        'experiment_t_stage_pred_llm_mem',
        'experiment_t_stage_pred_rag_mem',
        'experiment_t_stage_pred_zs',
        'experiment_t_stage_pred_llm_rulesonly',
        'experiment_t_stage_pred_rag_rulesonly',
        'experiment_t_stage_pred_memonly',
        'experiment_t_stage_pred_memonly2',
        'experiment_t_stage_pred_rag_only',
    )
)

for label, overall in zip(
    ("main_1", "main_2", "zs", "rules_1", "rules_2", "memory_1", "memory_2", "rag"),
    (main_1, main_2, zs, rules_1, rules_2, memory_1, memory_2, rag),
):
    print(f"{label}: {overall['macro_f1']}, {overall['micro_f1']}, {overall['weighted_f1']}")

main_1: 0.793, 0.879, 0.882
main_2: 0.686, 0.768, 0.766
zs: 0.767, 0.838, 0.838
rules_1: 0.769, 0.86, 0.867
rules_2: 0.658, 0.755, 0.753
memory_1: 0.722, 0.82, 0.817
memory_2: 0.708, 0.808, 0.809
rag: 0.715, 0.807, 0.803


In [6]:
# Show the distinct values in the 't' column of luad_df.csv (the output below lists 1, 2, 0, 3).
df = pd.read_csv("/secure/shared_data/rag_tnm_results/summary/5_folds_summary/luad_df.csv")
df.t.unique()

array([1, 2, 0, 3])