In [1]:
import pandas as pd
from src.metrics import *

In [None]:
# Display the column names of `df`.
# NOTE(review): in the visible notebook `df` is first assigned in the following cell,
# so this cell depends on kernel state from an out-of-order run — confirm, or move it below the load.
df.columns

In [None]:
# Load the Mixtral result CSV and score the 'zscot_t_stage' predictions against the 't' labels.
df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv') # This is the one; all the Mixtral results are in here.
t14_calculate_metrics(df['t'], df['zscot_t_stage'])

In [None]:
# NOTE(review): called with no arguments, whereas every other call in this notebook passes
# a label series and a prediction series — presumably a leftover scratch call; confirm and remove.
t14_calculate_metrics()

In [6]:
def calculate_mean_std(results, cat):
    """Aggregate one category's metrics over a list of per-run result dicts.

    Args:
        results: Sequence of mappings. Each must contain ``cat``, mapping to a
            dict with the keys "precision", "recall", "f1", "support" and
            "num_errors".
        cat: Category key to aggregate (for example "T1").

    Returns:
        A dict holding, for precision/recall/f1, the mean and the population
        standard deviation (divides by the number of runs) rounded to three
        decimals, the summed support and error counts, and the unrounded
        means under the ``raw_mean_*`` keys.

    Raises:
        ZeroDivisionError: If ``results`` is empty.
        KeyError: If a result lacks ``cat`` or one of the metric keys.
    """

    def _collect(metric):
        # One value per run for the requested metric of this category.
        return [entry[cat][metric] for entry in results]

    def _mean_and_std(values):
        count = len(values)
        mean = sum(values) / count
        variance = sum([(value - mean) ** 2 for value in values]) / count
        return mean, variance ** 0.5

    # Gather every metric column before doing any arithmetic.
    precision_values = _collect("precision")
    recall_values = _collect("recall")
    f1_values = _collect("f1")
    support_values = _collect("support")
    error_values = _collect("num_errors")

    precision_mean, precision_std = _mean_and_std(precision_values)
    recall_mean, recall_std = _mean_and_std(recall_values)
    f1_mean, f1_std = _mean_and_std(f1_values)

    summary = {}
    summary["mean_precision"] = round(precision_mean, 3)
    summary["mean_recall"] = round(recall_mean, 3)
    summary["mean_f1"] = round(f1_mean, 3)
    summary["std_precision"] = round(precision_std, 3)
    summary["std_recall"] = round(recall_std, 3)
    summary["std_f1"] = round(f1_std, 3)
    summary["sum_support"] = sum(support_values)
    summary["sum_num_errors"] = sum(error_values)
    # Unrounded means are kept so callers can macro-average without rounding drift.
    summary["raw_mean_precision"] = precision_mean
    summary["raw_mean_recall"] = recall_mean
    summary["raw_mean_f1"] = f1_mean
    return summary


def output_tabular_performance(results, categories=("T1", "T2", "T3", "T4")):
    """Print a per-category ``mean(std)`` table followed by macro averages.

    For each category one line is printed in the form
    ``<category> P(std) R(std) F1(std)``, using the rounded values returned by
    ``calculate_mean_std``. A final ``MacroAvg.`` line averages the unrounded
    per-category means of precision, recall and F1.

    Args:
        results: Sequence of per-run result dicts accepted by
            ``calculate_mean_std``.
        categories: Iterable of category keys to report. The default is an
            immutable tuple so the default object cannot be mutated between
            calls.

    Returns:
        None. The table is written to stdout.

    Raises:
        ZeroDivisionError: If ``categories`` or ``results`` is empty.
    """
    precisions = []
    recalls = []
    f1s = []

    for category in categories:
        # Named `stats` rather than `eval` so the builtin is not shadowed.
        stats = calculate_mean_std(results, category)
        print(
            "{} {:.3f}({:.3f}) {:.3f}({:.3f}) {:.3f}({:.3f})".format(
                category,
                stats["mean_precision"],
                stats["std_precision"],
                stats["mean_recall"],
                stats["std_recall"],
                stats["mean_f1"],
                stats["std_f1"],
            )
        )

        # Macro average uses the unrounded means.
        precisions.append(stats["raw_mean_precision"])
        recalls.append(stats["raw_mean_recall"])
        f1s.append(stats["raw_mean_f1"])

    print(
        "MacroAvg. {:.3f} {:.3f} {:.3f}".format(
            round(sum(precisions) / len(precisions), 3),
            round(sum(recalls) / len(recalls), 3),
            round(sum(f1s) / len(f1s), 3),
        )
    )

Index(['patient_filename', 't', 'text', 'n', 'zscot_t_reasoning',
       'zscot_t_stage', 'zscot_n_reasoning', 'zscot_n_stage',
       'rag_raw_t_reasoning', 'rag_raw_t_stage', 'rag_raw_n_reasoning',
       'rag_raw_n_stage', 'ltm_zs_t_reasoning', 'ltm_zs_t_stage',
       'ltm_zs_n_reasoning', 'ltm_zs_n_stage', 'ltm_rag1_t_reasoning',
       'ltm_rag1_t_stage', 'ltm_rag1_n_reasoning', 'ltm_rag1_n_stage',
       'ltm_rag2_t_reasoning', 'ltm_rag2_t_stage', 'ltm_rag2_n_reasoning',
       'ltm_rag2_n_stage', 'zscot_t_flag', 'zscot_n_flag', 'rag_raw_t_flag',
       'rag_raw_n_flag', 'ltm_zs_t_flag', 'ltm_zs_n_flag', 'ltm_rag1_t_flag',
       'ltm_rag1_n_flag', 'ltm_rag2_t_flag', 'ltm_rag2_n_flag'],
      dtype='object')

In [36]:
# Load the Mixtral result CSV into `others_t_df`.
# The commented-out line is an f-string that needs a `run` value, which this cell does not define.
# kewltm_t_df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_t14_dynamic_test_{run}_outof_10runs.csv")
others_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')

In [None]:
# For T
# Per-run T-stage metrics for four methods. KEW-LTM predictions come from each
# run's own CSV; the other three come from one shared CSV restricted to the
# patients present in that run.
print("Mixtral")
zscot_t_results = []
rag_t_results = []
ltm_rag_t_results = []
kewltm_t_results = []

t_label = 't'
zscot_t_stage = 'zscot_t_stage'
rag_t_stage = 'rag_raw_t_stage'
ltm_rag_t_stage = 'ltm_rag1_t_stage'
kewltm_t_stage = "cmem_t_40reports_ans_str"

others_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')

# NOTE(review): runs 7 and 9 are not listed — the reason is not recorded in this notebook.
run_lst = [0, 1, 2, 3, 4, 5, 6, 8]

for run in run_lst:
    t_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_t14_dynamic_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_t_results.append(
        t14_calculate_metrics(t_test_df[t_label], t_test_df[kewltm_t_stage])
    )

    # Filter into a per-run frame. Re-binding `others_t_df` here would shrink it
    # cumulatively, so later runs would only see the intersection of all earlier
    # runs' patient ids.
    split_ids = t_test_df.patient_filename
    run_others_t_df = others_t_df[others_t_df["patient_filename"].isin(split_ids)]
    zscot_t_results.append(t14_calculate_metrics(run_others_t_df[t_label], run_others_t_df[zscot_t_stage]))
    rag_t_results.append(t14_calculate_metrics(run_others_t_df[t_label], run_others_t_df[rag_t_stage]))
    ltm_rag_t_results.append(t14_calculate_metrics(run_others_t_df[t_label], run_others_t_df[ltm_rag_t_stage]))

print("ZSCoT")
output_tabular_performance(zscot_t_results)

print("RAG")
output_tabular_performance(rag_t_results)

print("LTM-RAG")
output_tabular_performance(ltm_rag_t_results)

print("KEW-LTM")
output_tabular_performance(kewltm_t_results)

In [None]:
# For N
# Per-run N-stage metrics for four methods. KEW-LTM predictions come from each
# run's own CSV; the other three come from one shared CSV restricted to the
# patients present in that run.
print("Mixtral")
zscot_n_results = []
rag_n_results = []
ltm_rag_n_results = []
kewltm_n_results = []

n_label = 'n'
zscot_n_stage = 'zscot_n_stage'
rag_n_stage = 'rag_raw_n_stage'
ltm_rag_n_stage = 'ltm_rag1_n_stage'
kewltm_n_stage = "cmem_n_40reports_ans_str"

others_n_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')

# NOTE(review): runs 2 and 8 are not listed — the reason is not recorded in this notebook.
run_lst = [0, 1, 3, 4, 5, 6, 7, 9]

for run in run_lst:
    n_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_n03_dynamic_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_n_results.append(
        n03_calculate_metrics(n_test_df[n_label], n_test_df[kewltm_n_stage])
    )

    # Filter into a per-run frame. Re-binding `others_n_df` here would shrink it
    # cumulatively, so later runs would only see the intersection of all earlier
    # runs' patient ids.
    split_ids = n_test_df.patient_filename
    run_others_n_df = others_n_df[others_n_df["patient_filename"].isin(split_ids)]
    zscot_n_results.append(n03_calculate_metrics(run_others_n_df[n_label], run_others_n_df[zscot_n_stage]))
    rag_n_results.append(n03_calculate_metrics(run_others_n_df[n_label], run_others_n_df[rag_n_stage]))
    ltm_rag_n_results.append(n03_calculate_metrics(run_others_n_df[n_label], run_others_n_df[ltm_rag_n_stage]))

print("ZSCoT")
output_tabular_performance(zscot_n_results, categories=["N0", "N1", "N2", "N3"])

print("RAG")
output_tabular_performance(rag_n_results, categories=["N0", "N1", "N2", "N3"])

print("LTM-RAG")
output_tabular_performance(ltm_rag_n_results, categories=["N0", "N1", "N2", "N3"])

print("KEW-LTM")
output_tabular_performance(kewltm_n_results, categories=["N0", "N1", "N2", "N3"])

In [None]:
# Scratch check: the four N-stage labels followed by an empty string, then display the list.
lst = ["N0", "N1", "N2", "N3"] + [""]
lst

In [None]:
# Load the Med42 LTM-RAG1 T-stage result CSV into `df` (replacing any earlier `df`) and list its columns.
df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag1_med42_v2_800.csv')
df.columns


In [None]:
# Score the 't14_ltm_rag1_t_pred' column against the 't' labels of the current `df`.
t14_calculate_metrics(df['t'], df['t14_ltm_rag1_t_pred'])

In [None]:
# Display the columns of `t_test_df`, which is only assigned inside the per-run loops
# of the "For T" cells — so this shows whichever run was loaded last in the kernel.
t_test_df.columns

In [None]:
# For T
# Per-run T-stage metrics for four methods on Med42. KEW-LTM predictions come
# from each run's own CSV; ZSCoT, RAG and LTM-RAG each come from a single CSV
# restricted to the patients present in that run.
print("Med42")
zscot_t_results = []
rag_t_results = []
ltm_rag_t_results = []
kewltm_t_results = []

t_label = 't'
kewltm_t_stage = 'kepa_t_ans_str'
zscot_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1118_t14_med42_v2_test_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 'zscot_t_ans_str']]
rag_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1120_t14_rag_raw_med42_v2_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 't14_rag_raw_t_pred']]
ltm_rag_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag1_med42_v2_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 't14_ltm_rag1_t_pred']]

# NOTE(review): runs 7 and 9 are not listed — the reason is not recorded in this notebook.
run_lst = [0, 1, 2, 3, 4, 5, 6, 8]

for run in run_lst:
    t_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/1114_t14_med42_v2_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_t_results.append(
        t14_calculate_metrics(t_test_df[t_label], t_test_df[kewltm_t_stage])
    )

    # Filter into per-run frames. Re-binding the full frames here would shrink
    # them cumulatively, so later runs would only see the intersection of all
    # earlier runs' patient ids.
    split_ids = t_test_df.patient_filename
    run_zscot_t_df = zscot_t_df[zscot_t_df["patient_filename"].isin(split_ids)]
    run_rag_t_df = rag_t_df[rag_t_df["patient_filename"].isin(split_ids)]
    run_ltm_rag_t_df = ltm_rag_t_df[ltm_rag_t_df["patient_filename"].isin(split_ids)]

    zscot_t_results.append(t14_calculate_metrics(run_zscot_t_df[t_label], run_zscot_t_df['zscot_t_ans_str']))
    rag_t_results.append(t14_calculate_metrics(run_rag_t_df[t_label], run_rag_t_df['t14_rag_raw_t_pred']))
    ltm_rag_t_results.append(t14_calculate_metrics(run_ltm_rag_t_df[t_label], run_ltm_rag_t_df['t14_ltm_rag1_t_pred']))

print("ZSCoT")
output_tabular_performance(zscot_t_results)

print("RAG")
output_tabular_performance(rag_t_results)

print("LTM-RAG")
output_tabular_performance(ltm_rag_t_results)

print("KEW-LTM")
output_tabular_performance(kewltm_t_results)

In [1]:
import pandas as pd
# Load the BRCA report summary, drop rows whose "n" value equals -1, and keep
# only the patient id, "t" label and report text columns.
# NOTE(review): the filter is on "n" while the retained label is "t" — confirm this is intended.
# NOTE(review): -1 presumably marks a missing label — confirm against the data dictionary.
brca_report = pd.read_csv(
        "/secure/shared_data/rag_tnm_results/summary/5_folds_summary/brca_df.csv"
    )
df = brca_report[brca_report["n"] != -1][["patient_filename", "t", "text"]]

In [2]:
# Load "1221_result.csv" (relative to the working directory) into `df`, replacing
# the frame built in the previous cell, and list its columns.
df = pd.read_csv("1221_result.csv")
df.columns

Index(['patient_filename', 't', 'text', 'memory_test_parsed',
       'memory_test_reasoning', 'memory_test_pred',
       'memory_test_error_analysis', 'memory_test_refine_memory_count',
       'final_memory'],
      dtype='object')

In [4]:
from src.metrics import *
# Earlier variant kept for reference: score every row with t14_calculate_metrics.
# t14_calculate_metrics(df['t'], df['memory_test_pred'])
# Keep only rows selected by the 'memory_test_parsed' mask (assumes the column is boolean — TODO confirm).
# This re-binds `df`, so the unfiltered frame is gone until the CSV is reloaded.
df = df[df['memory_test_parsed']]
# Score the remaining rows with the second metrics variant.
t14_calculate_metrics2(df['t'], df['memory_test_pred'])

{'T2': {'precision': 0.857,
  'recall': 0.908,
  'f1': 0.882,
  'support': 468,
  'num_errors': 114},
 'T1': {'precision': 0.766,
  'recall': 0.786,
  'f1': 0.776,
  'support': 187,
  'num_errors': 85},
 'T4': {'precision': 0.65,
  'recall': 0.361,
  'f1': 0.464,
  'support': 36,
  'num_errors': 30},
 'T3': {'precision': 0.868,
  'recall': 0.731,
  'f1': 0.794,
  'support': 108,
  'num_errors': 41},
 'overall': {'macro_precision': 0.785,
  'macro_recall': 0.697,
  'macro_f1': 0.729,
  'micro_precision': 0.831,
  'micro_recall': 0.831,
  'micro_f1': 0.831,
  'weighted_f1': 0.826,
  'support': 799,
  'num_errors': 270}}

In [None]:
# Load "baseline_results.csv" (relative to the working directory) into `df` and list its columns.
df = pd.read_csv("baseline_results.csv")
df.columns

In [None]:
# Score the 't14_baseline_pred' column against the 't' labels of the current `df`.
t14_calculate_metrics(df['t'], df['t14_baseline_pred'])

In [None]:
# Load "1221_result_2.csv" (relative to the working directory) into `df` and list its columns.
df = pd.read_csv("1221_result_2.csv")
df.columns

In [None]:
# Column selection; it is not the last expression in the cell, so its value is not displayed.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# Show only the 'overall' entry of the metrics for the '1221_test_pred' column.
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall']

In [None]:
# Display a 4-tuple of 'overall' metrics, in this order:
# '1221_test_pred', '1221_test_pred_zs', '1221_test_pred_rules', '1221_test_pred_memory_only'.
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
# Load "1221_result_3.csv" and keep the first 10 rows (positional slice).
df = pd.read_csv("1221_result_3.csv")[:10]
# Column selection; it is not the last expression in the cell, so its value is not displayed.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 4-tuple of 'overall' metrics in the order: pred, pred_zs, pred_rules, pred_memory_only.
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
# Load "1221_result_3.csv" and keep rows at positions 10-19 (positional slice).
df = pd.read_csv("1221_result_3.csv")[10:20]
# Column selection; it is not the last expression in the cell, so its value is not displayed.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 4-tuple of 'overall' metrics in the order: pred, pred_zs, pred_rules, pred_memory_only.
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
# Load "1221_result_3.csv" and keep rows at positions 20-29 (positional slice).
df = pd.read_csv("1221_result_3.csv")[20:30]
# Column selection; it is not the last expression in the cell, so its value is not displayed.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 4-tuple of 'overall' metrics in the order: pred, pred_zs, pred_rules, pred_memory_only.
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
# Load "1221_result_3.csv" and keep rows at positions 30-39 (positional slice).
df = pd.read_csv("1221_result_3.csv")[30:40]
# Column selection; it is not the last expression in the cell, so its value is not displayed.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 4-tuple of 'overall' metrics in the order: pred, pred_zs, pred_rules, pred_memory_only.
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
# Load "1221_result_3.csv" and keep every row from position 40 onward (positional slice).
df = pd.read_csv("1221_result_3.csv")[40:]
# Column selection; it is not the last expression in the cell, so its value is not displayed.
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
# 4-tuple of 'overall' metrics in the order: pred, pred_zs, pred_rules, pred_memory_only.
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
# Sanity check: print the row count of each consecutive 10-row slice of the Mixtral results.
df = pd.read_csv("1221_result_mixtral.csv")
# range(0, 790, 10) visits start offsets 0..780, so only rows at positions 0-789 are covered.
# NOTE(review): 790 is hard-coded — confirm it matches the file length.
for i in range(0, 790, 10):
    sub_df = df[i:i+10]
    print(len(sub_df))

In [3]:
import pandas as pd
from src.metrics import *

# Per-chunk F1 comparison of the four prediction columns in "1221_result_mixtral.csv".
data_df = pd.read_csv("1221_result_mixtral.csv")
# range(0, 790, 80) yields ten start offsets (0..720); each slice holds up to 80 rows.
# NOTE(review): 790 is hard-coded — rows at position 800 or later would be skipped; confirm file length.
for i in range(0, 790, 80):
    df = data_df[i:i+80]
    # 'overall' summaries for each prediction column of this slice.
    main = t14_calculate_metrics2(df['t'],df['1221_test_pred'])['overall']
    zs = t14_calculate_metrics2(df['t'], df['1221_test_pred_zs'])['overall']
    rules = t14_calculate_metrics2(df['t'], df['1221_test_pred_rules'])['overall']
    memory = t14_calculate_metrics2(df['t'], df['1221_test_pred_memory_only'])['overall']
    # Each line prints the main column first, then zs, rules and memory-only in parentheses.
    print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
    print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
    print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")
    print("-----------------")



macro F1: 0.793 (0.825) (0.726) (0.706)
micro F1: 0.825 (0.825) (0.800) (0.787)
weighted F1: 0.829 (0.824) (0.813) (0.783)
-----------------
macro F1: 0.911 (0.901) (0.927) (0.908)
micro F1: 0.950 (0.938) (0.963) (0.931)
weighted F1: 0.947 (0.937) (0.960) (0.929)
-----------------
macro F1: 0.813 (0.962) (0.860) (0.722)
micro F1: 0.937 (0.988) (0.925) (0.875)
weighted F1: 0.933 (0.987) (0.931) (0.872)
-----------------
macro F1: 0.733 (0.617) (0.685) (0.442)
micro F1: 0.838 (0.800) (0.812) (0.717)
weighted F1: 0.848 (0.792) (0.825) (0.715)
-----------------
macro F1: 0.594 (0.706) (0.571) (0.720)
micro F1: 0.684 (0.825) (0.750) (0.787)
weighted F1: 0.678 (0.815) (0.739) (0.775)
-----------------
macro F1: 0.676 (0.747) (0.704) (0.488)
micro F1: 0.812 (0.812) (0.838) (0.713)
weighted F1: 0.801 (0.809) (0.830) (0.689)
-----------------
macro F1: 0.703 (0.923) (0.751) (0.710)
micro F1: 0.812 (0.912) (0.875) (0.750)
weighted F1: 0.819 (0.912) (0.878) (0.755)
-----------------
macro F1: 0.7

In [3]:
import pandas as pd
from src.metrics import *
# Load "1222_result_llama_4_30.csv" (relative to the working directory) and list its columns.
data_df = pd.read_csv("1222_result_llama_4_30.csv")
data_df.columns

Index(['patient_filename', 't', 'text', 'current_memory', 'ltm_test_pred_zs',
       'ltm_test_pred_rules', 'ltm_test_pred_memory_only', 'ltm_test_parsed',
       'ltm_test_reasoning', 'ltm_test_pred', 'ltm_test_refine_memory_count',
       'base_rules', 'final_memory'],
      dtype='object')

In [None]:
# Bare name: displays the repr of the function object (inspection only, no call is made).
t14_calculate_metrics2

In [5]:
# Whole-file F1 comparison of the four prediction columns in "1222_result_llama_4_30.csv".
df = pd.read_csv("1222_result_llama_4_30.csv")

# 'overall' summaries for each prediction column.
main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
# Each line prints the main column first, then zs, rules and memory-only in parentheses.
print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")


macro F1: 0.780 (0.769) (0.765) (0.753)
micro F1: 0.863 (0.835) (0.856) (0.838)
weighted F1: 0.867 (0.836) (0.864) (0.841)


In [4]:
import pandas as pd
from src.metrics import *

# Per-chunk F1 comparison of the four prediction columns in "1222_result_llama_4_30.csv".
data_df = pd.read_csv("1222_result_llama_4_30.csv")
# range(0, 790, 80) yields ten start offsets (0..720); each slice holds up to 80 rows.
# NOTE(review): 790 is hard-coded — rows at position 800 or later would be skipped; confirm file length.
for i in range(0, 790, 80):
    df = data_df[i:i+80]
    # 'overall' summaries for each prediction column of this slice.
    main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
    zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
    rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
    memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
    # Each line prints the main column first, then zs, rules and memory-only in parentheses.
    print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
    print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
    print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")
    print("-----------------")


macro F1: 0.720 (0.637) (0.686) (0.662)
micro F1: 0.825 (0.750) (0.800) (0.775)
weighted F1: 0.840 (0.774) (0.822) (0.792)
-----------------
macro F1: 0.877 (0.925) (0.866) (0.861)
micro F1: 0.912 (0.950) (0.912) (0.912)
weighted F1: 0.914 (0.950) (0.913) (0.912)
-----------------
macro F1: 0.915 (0.930) (0.915) (0.881)
micro F1: 0.963 (0.963) (0.963) (0.938)
weighted F1: 0.967 (0.964) (0.967) (0.945)
-----------------
macro F1: 0.704 (0.666) (0.735) (0.631)
micro F1: 0.850 (0.812) (0.875) (0.825)
weighted F1: 0.851 (0.806) (0.883) (0.828)
-----------------
macro F1: 0.659 (0.652) (0.617) (0.680)
micro F1: 0.787 (0.725) (0.738) (0.805)
weighted F1: 0.779 (0.727) (0.744) (0.802)
-----------------
macro F1: 0.799 (0.687) (0.798) (0.720)
micro F1: 0.850 (0.738) (0.838) (0.752)
weighted F1: 0.850 (0.733) (0.843) (0.751)
-----------------
macro F1: 0.702 (0.765) (0.677) (0.740)
micro F1: 0.850 (0.900) (0.838) (0.875)
weighted F1: 0.861 (0.908) (0.854) (0.887)
-----------------
macro F1: 0.8

In [1]:
import pandas as pd
from src.metrics import *

# Per-chunk F1 comparison of the four prediction columns in "1222_result_llama_1_1.csv".
data_df = pd.read_csv("1222_result_llama_1_1.csv")
# range(0, 790, 80) yields ten start offsets (0..720); each slice holds up to 80 rows.
# NOTE(review): 790 is hard-coded — rows at position 800 or later would be skipped; confirm file length.
for i in range(0, 790, 80):
    df = data_df[i:i+80]
    # 'overall' summaries for each prediction column of this slice.
    main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
    zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
    rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
    memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
    # Each line prints the main column first, then zs, rules and memory-only in parentheses.
    print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
    print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
    print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")
    print("-----------------")


macro F1: 0.690 (0.671) (0.708) (0.644)
micro F1: 0.800 (0.775) (0.825) (0.762)
weighted F1: 0.816 (0.794) (0.841) (0.777)
-----------------
macro F1: 0.890 (0.925) (0.854) (0.861)
micro F1: 0.925 (0.950) (0.900) (0.900)
weighted F1: 0.926 (0.950) (0.902) (0.896)
-----------------
macro F1: 0.936 (0.929) (0.915) (0.921)
micro F1: 0.975 (0.963) (0.963) (0.963)
weighted F1: 0.977 (0.964) (0.967) (0.964)
-----------------
macro F1: 0.722 (0.657) (0.735) (0.568)
micro F1: 0.875 (0.800) (0.875) (0.812)
weighted F1: 0.882 (0.795) (0.883) (0.796)
-----------------
macro F1: 0.631 (0.654) (0.589) (0.650)
micro F1: 0.750 (0.725) (0.700) (0.775)
weighted F1: 0.745 (0.728) (0.708) (0.763)
-----------------
macro F1: 0.837 (0.695) (0.798) (0.679)
micro F1: 0.887 (0.750) (0.838) (0.762)
weighted F1: 0.890 (0.745) (0.843) (0.750)
-----------------
macro F1: 0.718 (0.765) (0.735) (0.635)
micro F1: 0.863 (0.900) (0.863) (0.863)
weighted F1: 0.884 (0.908) (0.878) (0.856)
-----------------
macro F1: 0.7

In [2]:
# Whole-file F1 comparison of the four prediction columns in "1222_result_llama_1_1.csv".
df = pd.read_csv("1222_result_llama_1_1.csv")

# 'overall' summaries for each prediction column.
main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
# Each line prints the main column first, then zs, rules and memory-only in parentheses.
print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")


macro F1: 0.783 (0.774) (0.771) (0.747)
micro F1: 0.870 (0.840) (0.859) (0.838)
weighted F1: 0.875 (0.842) (0.866) (0.836)


In [2]:
# Whole-file F1 comparison of the four prediction columns in "1223_result_mixtral_1_1.csv".
df = pd.read_csv("1223_result_mixtral_1_1.csv")

# 'overall' summaries for each prediction column.
main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
# Each line prints the main column first, then zs, rules and memory-only in parentheses.
print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")


macro F1: 0.746 (0.812) (0.789) (0.749)
micro F1: 0.834 (0.883) (0.861) (0.851)
weighted F1: 0.836 (0.881) (0.859) (0.846)


In [2]:
# Whole-file F1 comparison of the four prediction columns in the
# "1223_result_llama_1_1_increased_model_len_65536_full.csv" results.
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/1223_result_llama_1_1_increased_model_len_65536_full.csv")

# 'overall' summaries for each prediction column.
main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
# Each line prints the main column first, then zs, rules and memory-only in parentheses.
print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")


macro F1: 0.764 (0.741) (0.761) (0.721)
micro F1: 0.864 (0.823) (0.857) (0.817)
weighted F1: 0.870 (0.824) (0.865) (0.816)


In [3]:
# Per-chunk F1 comparison of the four prediction columns in the
# "1223_result_llama_1_1_increased_model_len_65536_full.csv" results.
data_df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/1223_result_llama_1_1_increased_model_len_65536_full.csv")

# range(0, 790, 80) yields ten start offsets (0..720); each slice holds up to 80 rows.
# NOTE(review): 790 is hard-coded — rows at position 800 or later would be skipped; confirm file length.
for i in range(0, 790, 80):
    df = data_df[i:i+80]
    # 'overall' summaries for each prediction column of this slice.
    main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
    zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
    rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
    memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
    # Each line prints the main column first, then zs, rules and memory-only in parentheses.
    print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
    print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
    print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")
    print("-----------------")


macro F1: 0.758 (0.703) (0.776) (0.650)
micro F1: 0.838 (0.775) (0.863) (0.738)
weighted F1: 0.845 (0.780) (0.875) (0.737)
-----------------
macro F1: 0.678 (0.583) (0.671) (0.745)
micro F1: 0.825 (0.762) (0.825) (0.838)
weighted F1: 0.834 (0.766) (0.839) (0.829)
-----------------
macro F1: 0.842 (0.808) (0.817) (0.783)
micro F1: 0.912 (0.863) (0.887) (0.825)
weighted F1: 0.912 (0.862) (0.890) (0.816)
-----------------
macro F1: 0.844 (0.772) (0.811) (0.763)
micro F1: 0.938 (0.875) (0.900) (0.850)
weighted F1: 0.933 (0.870) (0.895) (0.844)
-----------------
macro F1: 0.764 (0.674) (0.781) (0.739)
micro F1: 0.850 (0.800) (0.863) (0.850)
weighted F1: 0.852 (0.790) (0.869) (0.846)
-----------------
macro F1: 0.811 (0.764) (0.793) (0.761)
micro F1: 0.875 (0.825) (0.850) (0.825)
weighted F1: 0.874 (0.818) (0.853) (0.818)
-----------------
macro F1: 0.822 (0.803) (0.814) (0.700)
micro F1: 0.900 (0.838) (0.887) (0.762)
weighted F1: 0.901 (0.829) (0.888) (0.757)
-----------------
macro F1: 0.8