In [1]:
import pandas as pd
from src.metrics import *

In [3]:
df.columns

Index(['patient_filename', 't', 'text', 'n', 'zscot_t_reasoning',
       'zscot_t_stage', 'zscot_n_reasoning', 'zscot_n_stage',
       'rag_raw_t_reasoning', 'rag_raw_t_stage', 'rag_raw_n_reasoning',
       'rag_raw_n_stage', 'ltm_zs_t_reasoning', 'ltm_zs_t_stage',
       'ltm_zs_n_reasoning', 'ltm_zs_n_stage', 'ltm_rag1_t_reasoning',
       'ltm_rag1_t_stage', 'ltm_rag1_n_reasoning', 'ltm_rag1_n_stage',
       'ltm_rag2_t_reasoning', 'ltm_rag2_t_stage', 'ltm_rag2_n_reasoning',
       'ltm_rag2_n_stage', 'zscot_t_flag', 'zscot_n_flag', 'rag_raw_t_flag',
       'rag_raw_n_flag', 'ltm_zs_t_flag', 'ltm_zs_n_flag', 'ltm_rag1_t_flag',
       'ltm_rag1_n_flag', 'ltm_rag2_t_flag', 'ltm_rag2_n_flag'],
      dtype='object')

In [47]:
import pandas as pd

def t14_calculate_metrics(true_labels: pd.Series, predictions: pd.Series) -> dict:
    """
    Calculates precision, recall, F1-score, and support for T-stage predictions.
    Includes both per-label, micro-average, and macro-average scores.

    Args:
        true_labels: A pandas Series of true labels (e.g., integers from 0 to N-1).
        predictions: A pandas Series of predicted labels (strings).

    Returns:
        A dictionary containing the metrics for each label and overall scores.
    """
    # Check for valid inputs
    if len(true_labels) != len(predictions):
        raise ValueError("The length of true_labels and predictions must be the same.")

    if not isinstance(true_labels, pd.Series) or not isinstance(predictions, pd.Series):
        raise TypeError("true_labels and predictions must be pandas Series.")

    if any(pd.isna(pred) or not isinstance(pred, str) for pred in predictions):
        raise ValueError("All predictions must be non-null strings.")

    # Standardize true labels to "T{x+1}" format
    true_labels = true_labels.apply(lambda x: f"T{int(x)+1}")

    metrics = {}
    label_counts = {}
    unique_true_labels = sorted(list(set(true_labels))) # Ensure consistent order

    for label in unique_true_labels:
        metrics[label] = {"tp": 0, "fp": 0, "fn": 0}
        label_counts[label] = 0

    for true_label, prediction in zip(true_labels, predictions):
        # Ensure prediction is a string and convert to uppercase
        prediction_str = str(prediction).upper()
        
        label_counts[true_label] += 1
        if true_label in prediction_str:
            metrics[true_label]["tp"] += 1
        else:
            metrics[true_label]["fn"] += 1

        # Calculate false positives
        # A prediction is a false positive for a label if:
        # 1. The label is present in the prediction string.
        # 2. The label is NOT the true_label.
        for label_to_check_fp in unique_true_labels:
            if label_to_check_fp in prediction_str and label_to_check_fp != true_label:
                metrics[label_to_check_fp]["fp"] += 1
    
    results = {}
    # Variables for micro-averaging
    total_tp_micro = 0
    total_fp_micro = 0
    total_fn_micro = 0
    
    # Variables for macro-averaging
    macro_precision_sum = 0.0
    macro_recall_sum = 0.0
    macro_f1_sum = 0.0
    
    total_instances = len(true_labels)

    for label in unique_true_labels: # Iterate in a defined order
        counts = metrics[label]
        tp = counts["tp"]
        fp = counts["fp"]
        fn = counts["fn"]

        # Precision: TP / (TP + FP)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        # Recall: TP / (TP + FN)
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        # F1-Score: 2 * (Precision * Recall) / (Precision + Recall)
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall) > 0
            else 0.0
        )
        support = label_counts[label]

        results[label] = {
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "f1": round(f1, 3),
            "support": support,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "num_errors": fp + fn, # Sum of false positives and false negatives for this class
        }

        # Accumulate for micro-averages
        total_tp_micro += tp
        total_fp_micro += fp
        total_fn_micro += fn
        
        # Accumulate for macro-averages
        macro_precision_sum += precision
        macro_recall_sum += recall
        macro_f1_sum += f1

    # Calculate macro-averaged metrics
    num_labels = len(unique_true_labels)
    macro_precision = macro_precision_sum / num_labels if num_labels > 0 else 0.0
    macro_recall = macro_recall_sum / num_labels if num_labels > 0 else 0.0
    macro_f1 = macro_f1_sum / num_labels if num_labels > 0 else 0.0 # Often calculated as harmonic mean of macro_precision and macro_recall

    # Calculate micro-averaged (overall) precision, recall, and F1 score
    # Micro-Precision: Sum of all TPs / (Sum of all TPs + Sum of all FPs)
    micro_precision = (
        total_tp_micro / (total_tp_micro + total_fp_micro) if (total_tp_micro + total_fp_micro) > 0 else 0.0
    )
    # Micro-Recall: Sum of all TPs / (Sum of all TPs + Sum of all FNs)
    micro_recall = (
        total_tp_micro / (total_tp_micro + total_fn_micro) if (total_tp_micro + total_fn_micro) > 0 else 0.0
    )
    # Micro-F1: 2 * (Micro-Precision * Micro-Recall) / (Micro-Precision + Micro-Recall)
    micro_f1 = (
        2 * micro_precision * micro_recall / (micro_precision + micro_recall)
        if (micro_precision + micro_recall) > 0
        else 0.0
    )

    # Calculate weighted F1 score
    weighted_f1_sum = 0.0
    for label in unique_true_labels:
        weighted_f1_sum += results[label]["f1"] * label_counts[label]
    weighted_f1 = weighted_f1_sum / total_instances if total_instances > 0 else 0.0

    results["overall"] = {
        "micro_precision": round(micro_precision, 3),
        "micro_recall": round(micro_recall, 3),
        "micro_f1": round(micro_f1, 3),
        "macro_precision": round(macro_precision, 3),
        "macro_recall": round(macro_recall, 3),
        "macro_f1": round(macro_f1, 3),
        "weighted_f1": round(weighted_f1, 3),
        "support": total_instances,
        "total_tp": total_tp_micro,
        "total_fp": total_fp_micro,
        "total_fn": total_fn_micro,
        "num_errors": total_fp_micro + total_fn_micro, # Sum of all false positives and false negatives
    }
    return results


def n03_calculate_metrics(true_labels: pd.Series, predictions: pd.Series) -> dict:
    """
    Calculates precision, recall, F1-score, and support for N-stage predictions.
    Includes both per-label, micro-average, and macro-average scores.
    Handles specific label replacements: "NO" to "N0", "NL" to "N1".

    Args:
        true_labels: A pandas Series of true labels (e.g., integers from 0 to N-1).
        predictions: A pandas Series of predicted labels (strings).

    Returns:
        A dictionary containing the metrics for each label and overall scores.
    """
    # Check for valid inputs
    if len(true_labels) != len(predictions):
        raise ValueError("The length of true_labels and predictions must be the same.")

    if not isinstance(true_labels, pd.Series) or not isinstance(predictions, pd.Series):
        raise TypeError("true_labels and predictions must be pandas Series.")

    if any(pd.isna(pred) or not isinstance(pred, str) for pred in predictions):
        raise ValueError("All predictions must be non-null strings.")

    # Standardize true labels to "N{x}" format
    true_labels = true_labels.apply(lambda x: f"N{int(x)}")

    metrics = {}
    label_counts = {}
    unique_true_labels = sorted(list(set(true_labels))) # Ensure consistent order

    for label in unique_true_labels:
        metrics[label] = {"tp": 0, "fp": 0, "fn": 0}
        label_counts[label] = 0

    for true_label, prediction in zip(true_labels, predictions):
        # Ensure prediction is a string, convert to uppercase, and apply replacements
        prediction_str = str(prediction).upper()
        prediction_str = prediction_str.replace("NO", "N0").replace("NL", "N1")
        
        label_counts[true_label] += 1
        if true_label in prediction_str:
            metrics[true_label]["tp"] += 1
        else:
            metrics[true_label]["fn"] += 1

        # Calculate false positives
        for label_to_check_fp in unique_true_labels:
            if label_to_check_fp in prediction_str and label_to_check_fp != true_label:
                metrics[label_to_check_fp]["fp"] += 1

    results = {}
    # Variables for micro-averaging
    total_tp_micro = 0
    total_fp_micro = 0
    total_fn_micro = 0
    
    # Variables for macro-averaging
    macro_precision_sum = 0.0
    macro_recall_sum = 0.0
    macro_f1_sum = 0.0

    total_instances = len(true_labels)

    for label in unique_true_labels: # Iterate in a defined order
        counts = metrics[label]
        tp = counts["tp"]
        fp = counts["fp"]
        fn = counts["fn"]

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = (
            2 * precision * recall / (precision + recall)
            if (precision + recall) > 0
            else 0.0
        )
        support = label_counts[label]

        results[label] = {
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "f1": round(f1, 3),
            "support": support,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "num_errors": fp + fn,
        }

        total_tp_micro += tp
        total_fp_micro += fp
        total_fn_micro += fn

        macro_precision_sum += precision
        macro_recall_sum += recall
        macro_f1_sum += f1

    # Calculate macro-averaged metrics
    num_labels = len(unique_true_labels)
    macro_precision = macro_precision_sum / num_labels if num_labels > 0 else 0.0
    macro_recall = macro_recall_sum / num_labels if num_labels > 0 else 0.0
    macro_f1 = macro_f1_sum / num_labels if num_labels > 0 else 0.0

    # Calculate micro-averaged (overall) precision, recall, and F1 score
    micro_precision = (
        total_tp_micro / (total_tp_micro + total_fp_micro) if (total_tp_micro + total_fp_micro) > 0 else 0.0
    )
    micro_recall = (
        total_tp_micro / (total_tp_micro + total_fn_micro) if (total_tp_micro + total_fn_micro) > 0 else 0.0
    )
    micro_f1 = (
        2 * micro_precision * micro_recall / (micro_precision + micro_recall)
        if (micro_precision + micro_recall) > 0
        else 0.0
    )

    # Calculate weighted F1 score
    weighted_f1_sum = 0.0
    for label in unique_true_labels:
        weighted_f1_sum += results[label]["f1"] * label_counts[label]
    weighted_f1 = weighted_f1_sum / total_instances if total_instances > 0 else 0.0
    
    results["overall"] = {
        "micro_precision": round(micro_precision, 3),
        "micro_recall": round(micro_recall, 3),
        "micro_f1": round(micro_f1, 3),
        "macro_precision": round(macro_precision, 3),
        "macro_recall": round(macro_recall, 3),
        "macro_f1": round(macro_f1, 3),
        "weighted_f1": round(weighted_f1, 3),
        "support": total_instances,
        "total_tp": total_tp_micro,
        "total_fp": total_fp_micro,
        "total_fn": total_fn_micro,
        "num_errors": total_fp_micro + total_fn_micro,
    }

    return results

if __name__ == '__main__':
    # Example Usage for t14_calculate_metrics
    print("--- Example for t14_calculate_metrics ---")
    true_t_labels = pd.Series([0, 1, 2, 0, 1, 2, 0, 1, 2, 3]) 
    # Corresponding to T1, T2, T3, T1, T2, T3, T1, T2, T3, T4
    predictions_t = pd.Series([
        "T1", "T2", "T3", 
        "T1", "T1", "T2", # T2 pred as T1, T3 pred as T2
        "T1 and T2", "T2", "T4", # T1 pred as T1 and T2 (FP for T2), T3 pred as T4 (FN for T3, FP for T4 if T4 is not true)
        "T4"
    ])
    
    # Test with mixed types in predictions (should raise ValueError)
    # predictions_t_invalid = pd.Series(["T1", "T2", None, "T1"])
    # try:
    #     t14_metrics_invalid = t14_calculate_metrics(true_t_labels, predictions_t_invalid)
    #     for key, value in t14_metrics_invalid.items():
    #         print(f"{key}: {value}")
    # except ValueError as e:
    #     print(f"Error: {e}")

    # Test with valid predictions
    t14_metrics = t14_calculate_metrics(true_t_labels, predictions_t)
    print("Per-label metrics:")
    for label, metrics_val in t14_metrics.items():
        if label != "overall":
            print(f"  {label}: {metrics_val}")
    print("Overall metrics:")
    print(f"  {t14_metrics['overall']}")
    print("\n")

    # Example Usage for n03_calculate_metrics
    print("--- Example for n03_calculate_metrics ---")
    true_n_labels = pd.Series([0, 1, 0, 1, 0, 0, 1, 1, 2, 2]) 
    # Corresponding to N0, N1, N0, N1, N0, N0, N1, N1, N2, N2
    predictions_n = pd.Series([
        "N0", "N1", "N0", "N1", 
        "NO", # N0, but "NO" should be converted to "N0"
        "N1", # True N0, predicted N1 (FP for N1, FN for N0)
        "NL", # N1, but "NL" should be converted to "N1"
        "N0", # True N1, predicted N0 (FP for N0, FN for N1)
        "N2", 
        "N2 and N0" # True N2, predicted N2 and N0 (FP for N0)
    ])
    n03_metrics = n03_calculate_metrics(true_n_labels, predictions_n)
    print("Per-label metrics:")
    for label, metrics_val in n03_metrics.items():
        if label != "overall":
            print(f"  {label}: {metrics_val}")
    print("Overall metrics:")
    print(f"  {n03_metrics['overall']}")

    # Example with empty inputs (should ideally be handled or raise error based on design)
    # print("\n--- Example with empty inputs ---")
    # empty_true = pd.Series([], dtype='object')
    # empty_preds = pd.Series([], dtype='object')
    # try:
    #     empty_metrics = t14_calculate_metrics(empty_true, empty_preds)
    #     print(empty_metrics) # Should produce all zeros or handle gracefully
    # except ValueError as e:
    #     print(f"Error with empty inputs: {e}") # Current code will raise ValueError due to len check

    # Example with labels not present in true_labels but in predictions
    # This scenario is implicitly handled as FPs for those "extra" predicted labels won't be counted against any true class directly,
    # but they might influence the FP count of actual classes if the prediction string contains multiple labels.
    # The current logic for FP:
    #   `if label in prediction and label != true_label: metrics[label]["fp"] += 1`
    # This means FP is only counted for labels that are actual true labels in the dataset.
    # If a prediction contains "T5" but "T5" is not in `unique_true_labels`, it won't get an FP count.
    # This is a common way to handle it, focusing on the predefined set of classes.

    print("\n--- Example for t14_calculate_metrics with all correct predictions ---")
    true_t_all_correct = pd.Series([0, 1, 2, 0])
    predictions_t_all_correct = pd.Series(["T1", "T2", "T3", "T1"])
    t14_metrics_correct = t14_calculate_metrics(true_t_all_correct, predictions_t_all_correct)
    print("Overall metrics (all correct):")
    print(f"  {t14_metrics_correct['overall']}")
    # Expected: micro_precision: 1.0, micro_recall: 1.0, micro_f1: 1.0
    # Expected: macro_precision: 1.0, macro_recall: 1.0, macro_f1: 1.0

    print("\n--- Example for t14_calculate_metrics with all incorrect predictions (different labels) ---")
    true_t_all_incorrect = pd.Series([0, 1, 2]) # T1, T2, T3
    predictions_t_all_incorrect = pd.Series(["T2", "T3", "T1"]) # Predicted T2 for T1, T3 for T2, T1 for T3
    t14_metrics_incorrect = t14_calculate_metrics(true_t_all_incorrect, predictions_t_all_incorrect)
    print("Overall metrics (all incorrect, different labels):")
    # For T1: TP=0, FN=1 (true T1, pred T2), FP=1 (pred T1 for true T3)
    # For T2: TP=0, FN=1 (true T2, pred T3), FP=1 (pred T2 for true T1)
    # For T3: TP=0, FN=1 (true T3, pred T1), FP=1 (pred T3 for true T2)
    # Micro: total_tp_micro=0, total_fp_micro=3, total_fn_micro=3. P=0, R=0, F1=0
    # Macro: P for T1=0, R for T1=0, F1 for T1=0. Same for T2, T3. Macro P,R,F1 = 0
    print(f"  {t14_metrics_incorrect['overall']}")
    for label in ['T1', 'T2', 'T3']:
        print(f"  {label}: {t14_metrics_incorrect[label]}")

    print("\n--- Example for t14_calculate_metrics with predictions containing non-existent labels ---")
    true_t_extra_pred = pd.Series([0, 1]) # T1, T2
    predictions_t_extra_pred = pd.Series(["T1 and T5", "T6"]) # T5 and T6 are not in true labels
    t14_metrics_extra = t14_calculate_metrics(true_t_extra_pred, predictions_t_extra_pred)
    print("Overall metrics (predictions with non-existent labels):")
    print(f"  {t14_metrics_extra['overall']}")
    # T1: true T1, pred "T1 and T5". TP=1 for T1. FN=0 for T1. FP=0 for T1 (T5 is not a known label to check against).
    # T2: true T2, pred "T6". TP=0 for T2. FN=1 for T2. FP=0 for T2.
    # Micro: total_tp_micro=1, total_fp_micro=0, total_fn_micro=1. P=1/(1+0)=1, R=1/(1+1)=0.5, F1=2*1*0.5/(1+0.5) = 1/1.5 = 0.667
    # Macro P: (P_T1 + P_T2)/2 = (1+0)/2 = 0.5
    # Macro R: (R_T1 + R_T2)/2 = (1+0)/2 = 0.5
    # Macro F1: (F1_T1 + F1_T2)/2 = (1+0)/2 = 0.5
    # Wait, my FP logic: `if label_to_check_fp in prediction_str and label_to_check_fp != true_label:`
    # `label_to_check_fp` comes from `unique_true_labels`. So T5 and T6 won't cause FPs for T1 or T2.
    # This is correct.
    print(f"  T1: {t14_metrics_extra['T1']}")
    print(f"  T2: {t14_metrics_extra['T2']}")


    # Example for n03 with a mix
    print("\n--- Example for n03_calculate_metrics ---")
    true_n = pd.Series([0, 0, 1, 1, 2]) # N0, N0, N1, N1, N2
    pred_n = pd.Series(["N0", "N1", "N0", "N2", "N2"])
    # N0: True=N0, Pred=N0 -> TP for N0
    # N0: True=N0, Pred=N1 -> FN for N0, FP for N1
    # N1: True=N1, Pred=N0 -> FN for N1, FP for N0
    # N1: True=N1, Pred=N2 -> FN for N1, FP for N2
    # N2: True=N2, Pred=N2 -> TP for N2

    # N0: TP=1, FN=1, FP=1 -> P=0.5, R=0.5, F1=0.5
    # N1: TP=0, FN=2, FP=1 -> P=0, R=0, F1=0
    # N2: TP=1, FN=0, FP=1 -> P=0.5, R=1, F1=0.667

    # Micro: total_tp_micro=2, total_fp_micro=3, total_fn_micro=3
    # Micro P = 2 / (2+3) = 2/5 = 0.4
    # Micro R = 2 / (2+3) = 2/5 = 0.4
    # Micro F1 = 2 * 0.4 * 0.4 / (0.4+0.4) = 0.32 / 0.8 = 0.4

    # Macro P = (0.5 + 0 + 0.5) / 3 = 1/3 = 0.333
    # Macro R = (0.5 + 0 + 1) / 3 = 1.5/3 = 0.5
    # Macro F1 = (0.5 + 0 + 0.667) / 3 = 1.167/3 = 0.389

    n03_res = n03_calculate_metrics(true_n, pred_n)
    print("Per-label metrics:")
    for label, metrics_val in n03_res.items():
        if label != "overall":
            print(f"  {label}: {metrics_val}")
    print("Overall metrics:")
    print(f"  {n03_res['overall']}")



--- Example for t14_calculate_metrics ---
Per-label metrics:
  T1: {'precision': 0.75, 'recall': 1.0, 'f1': 0.857, 'support': 3, 'tp': 3, 'fp': 1, 'fn': 0, 'num_errors': 1}
  T2: {'precision': 0.5, 'recall': 0.667, 'f1': 0.571, 'support': 3, 'tp': 2, 'fp': 2, 'fn': 1, 'num_errors': 3}
  T3: {'precision': 1.0, 'recall': 0.333, 'f1': 0.5, 'support': 3, 'tp': 1, 'fp': 0, 'fn': 2, 'num_errors': 2}
  T4: {'precision': 0.5, 'recall': 1.0, 'f1': 0.667, 'support': 1, 'tp': 1, 'fp': 1, 'fn': 0, 'num_errors': 1}
Overall metrics:
  {'micro_precision': 0.636, 'micro_recall': 0.7, 'micro_f1': 0.667, 'macro_precision': 0.688, 'macro_recall': 0.75, 'macro_f1': 0.649, 'weighted_f1': 0.645, 'support': 10, 'total_tp': 7, 'total_fp': 4, 'total_fn': 3, 'num_errors': 7}


--- Example for n03_calculate_metrics ---
Per-label metrics:
  N0: {'precision': 0.6, 'recall': 0.75, 'f1': 0.667, 'support': 4, 'tp': 3, 'fp': 2, 'fn': 1, 'num_errors': 3}
  N1: {'precision': 0.75, 'recall': 0.75, 'f1': 0.75, 'support': 

In [22]:
df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv') # 이거야. 여기에 mixtral결과 다있어
print(t14_calculate_metrics(df['t'], df['zscot_t_stage'])['overall'])
print()
print(t14_calculate_metrics(df['t'], df['rag_raw_t_stage'])['overall'])
print()
print(t14_calculate_metrics(df['t'], df['ltm_rag1_t_stage'])['overall'])

{'micro_precision': 0.85, 'micro_recall': 0.863, 'micro_f1': 0.856, 'macro_precision': 0.831, 'macro_recall': 0.765, 'macro_f1': 0.792, 'weighted_f1': 0.854, 'support': 800, 'total_tp': 690, 'total_fp': 122, 'total_fn': 110, 'num_errors': 232}

{'micro_precision': 0.81, 'micro_recall': 0.815, 'micro_f1': 0.812, 'macro_precision': 0.771, 'macro_recall': 0.73, 'macro_f1': 0.743, 'weighted_f1': 0.812, 'support': 800, 'total_tp': 652, 'total_fp': 153, 'total_fn': 148, 'num_errors': 301}

{'micro_precision': 0.853, 'micro_recall': 0.848, 'micro_f1': 0.85, 'macro_precision': 0.792, 'macro_recall': 0.728, 'macro_f1': 0.746, 'weighted_f1': 0.846, 'support': 800, 'total_tp': 678, 'total_fp': 117, 'total_fn': 122, 'num_errors': 239}


In [23]:
print(n03_calculate_metrics(df['n'], df['zscot_n_stage'])['overall'])
print()
print(n03_calculate_metrics(df['n'], df['rag_raw_n_stage'])['overall'])
print()
print(n03_calculate_metrics(df['n'], df['ltm_rag1_n_stage'])['overall'])

{'micro_precision': 0.874, 'micro_recall': 0.873, 'micro_f1': 0.873, 'macro_precision': 0.843, 'macro_recall': 0.822, 'macro_f1': 0.832, 'weighted_f1': 0.872, 'support': 800, 'total_tp': 698, 'total_fp': 101, 'total_fn': 102, 'num_errors': 203}

{'micro_precision': 0.841, 'micro_recall': 0.835, 'micro_f1': 0.838, 'macro_precision': 0.803, 'macro_recall': 0.799, 'macro_f1': 0.797, 'weighted_f1': 0.84, 'support': 800, 'total_tp': 668, 'total_fp': 126, 'total_fn': 132, 'num_errors': 258}

{'micro_precision': 0.853, 'micro_recall': 0.859, 'micro_f1': 0.856, 'macro_precision': 0.807, 'macro_recall': 0.814, 'macro_f1': 0.81, 'weighted_f1': 0.856, 'support': 800, 'total_tp': 687, 'total_fp': 118, 'total_fn': 113, 'num_errors': 231}


In [35]:
def calculate_mean_std(results, cat):
    precision_list = [result[cat]["precision"] for result in results]
    recall_list = [result[cat]["recall"] for result in results]
    f1_list = [result[cat]["f1"] for result in results]
    support_list = [result[cat]["support"] for result in results]
    num_errors_list = [result[cat]["num_errors"] for result in results]

    mean_precision = sum(precision_list) / len(precision_list)
    mean_recall = sum(recall_list) / len(recall_list)
    mean_f1 = sum(f1_list) / len(f1_list)

    std_precision = (
        sum([(x - mean_precision) ** 2 for x in precision_list]) / len(precision_list)
    ) ** 0.5
    std_recall = (
        sum([(x - mean_recall) ** 2 for x in recall_list]) / len(recall_list)
    ) ** 0.5
    std_f1 = (sum([(x - mean_f1) ** 2 for x in f1_list]) / len(f1_list)) ** 0.5

    return {
        "mean_precision": round(mean_precision, 3),
        "mean_recall": round(mean_recall, 3),
        "mean_f1": round(mean_f1, 3),
        "std_precision": round(std_precision, 3),
        "std_recall": round(std_recall, 3),
        "std_f1": round(std_f1, 3),
        "sum_support": sum(support_list),
        "sum_num_errors": sum(num_errors_list),
        "raw_mean_precision": mean_precision,
        "raw_mean_recall": mean_recall,
        "raw_mean_f1": mean_f1,
    }


def output_tabular_performance(results, categories=["T1", "T2", "T3", "T4"]):
    precisions = []
    recalls = []
    f1s = []

    for category in categories:
        eval = calculate_mean_std(results, category)
        print(
            "{} {:.3f}({:.3f}) {:.3f}({:.3f}) {:.3f}({:.3f})".format(
                category,
                eval["mean_precision"],
                eval["std_precision"],
                eval["mean_recall"],
                eval["std_recall"],
                eval["mean_f1"],
                eval["std_f1"],
            )
        )

        # for calculating macro average
        precisions.append(eval["raw_mean_precision"])
        recalls.append(eval["raw_mean_recall"])
        f1s.append(eval["raw_mean_f1"])

    print(
        "MacroAvg. {:.3f} {:.3f} {:.3f}".format(
            round(sum(precisions) / len(precisions), 3),
            round(sum(recalls) / len(recalls), 3),
            round(sum(f1s) / len(f1s), 3),
        )
    )

Index(['patient_filename', 't', 'text', 'n', 'zscot_t_reasoning',
       'zscot_t_stage', 'zscot_n_reasoning', 'zscot_n_stage',
       'rag_raw_t_reasoning', 'rag_raw_t_stage', 'rag_raw_n_reasoning',
       'rag_raw_n_stage', 'ltm_zs_t_reasoning', 'ltm_zs_t_stage',
       'ltm_zs_n_reasoning', 'ltm_zs_n_stage', 'ltm_rag1_t_reasoning',
       'ltm_rag1_t_stage', 'ltm_rag1_n_reasoning', 'ltm_rag1_n_stage',
       'ltm_rag2_t_reasoning', 'ltm_rag2_t_stage', 'ltm_rag2_n_reasoning',
       'ltm_rag2_n_stage', 'zscot_t_flag', 'zscot_n_flag', 'rag_raw_t_flag',
       'rag_raw_n_flag', 'ltm_zs_t_flag', 'ltm_zs_n_flag', 'ltm_rag1_t_flag',
       'ltm_rag1_n_flag', 'ltm_rag2_t_flag', 'ltm_rag2_n_flag'],
      dtype='object')

In [44]:
# kewltm_t_df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_t14_dynamic_test_{run}_outof_10runs.csv")
# others_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')


###############################################################################
# Updated helper: calculate_mean_std                                          #
###############################################################################
def calculate_mean_std(
        results: list[dict],
        cat: str | None,
        level: str = "label"     # "label", "micro", or "macro"
    ) -> dict:
    """
    Compute mean ± std for a single class ("label" level) or for the
    overall micro / macro aggregates produced by *t14_calculate_metrics*.

    Args
    ----
    results : list of metrics dicts (output of t14_calculate_metrics)
    cat     : class label (e.g. "T1") – ignored for micro/macro levels
    level   : "label" | "micro" | "macro"

    Returns
    -------
    dict with keys:
        mean_precision, mean_recall, mean_f1,
        std_precision,  std_recall,  std_f1,
        (plus sums and raw means used elsewhere)
    """
    # ------------------------------------------------------------------ #
    # Gather the three score lists                                       #
    # ------------------------------------------------------------------ #
    precision_list, recall_list, f1_list = [], [], []
    support_list, num_errors_list = [], []

    for res in results:
        if level == "label":
            src = res[cat]                                 # per‑class block
        elif level == "micro":
            src = {                                        # NEW ↓
                "precision": res["overall"]["micro_precision"],
                "recall":    res["overall"]["micro_recall"],
                "f1":        res["overall"]["micro_f1"],
                "support":   res["overall"]["support"],
                "num_errors": res["overall"]["num_errors"],
            }
        elif level == "macro":
            src = {                                        # NEW ↓
                "precision": res["overall"]["macro_precision"],
                "recall":    res["overall"]["macro_recall"],
                "f1":        res["overall"]["macro_f1"],
                "support":   res["overall"]["support"],
                "num_errors": res["overall"]["num_errors"],
            }
        else:
            raise ValueError(f"Unknown level: {level}")

        precision_list.append(src["precision"])
        recall_list.append(src["recall"])
        f1_list.append(src["f1"])
        support_list.append(src["support"])
        num_errors_list.append(src["num_errors"])

    # ------------------------------------------------------------------ #
    # Mean / std                                                         #
    # ------------------------------------------------------------------ #
    mean_p = sum(precision_list) / len(precision_list)
    mean_r = sum(recall_list)    / len(recall_list)
    mean_f = sum(f1_list)        / len(f1_list)

    std_p = (sum((x - mean_p) ** 2 for x in precision_list) / len(precision_list)) ** 0.5
    std_r = (sum((x - mean_r) ** 2 for x in recall_list)    / len(recall_list))    ** 0.5
    std_f = (sum((x - mean_f) ** 2 for x in f1_list)        / len(f1_list))        ** 0.5

    return {
        "mean_precision": round(mean_p, 3),
        "mean_recall":    round(mean_r, 3),
        "mean_f1":        round(mean_f, 3),
        "std_precision":  round(std_p, 3),
        "std_recall":     round(std_r, 3),
        "std_f1":         round(std_f, 3),
        "sum_support":    sum(support_list),
        "sum_num_errors": sum(num_errors_list),
        "raw_mean_precision": mean_p,   # keep raw for higher‑level macro
        "raw_mean_recall":    mean_r,
        "raw_mean_f1":        mean_f,
    }


###############################################################################
# Updated helper: output_tabular_performance                                  #
###############################################################################
def output_tabular_performance(
        results: list[dict],
        categories: list[str] = ("T1", "T2", "T3", "T4"),
        show_overall: bool = True         # NEW flag
    ) -> None:
    """
    Print mean ± std precision/recall/F1 for each class, followed by:
      • category‑macro average (same as before)
      • micro‑average (overall)
      • macro‑average (overall)
    """
    # ------------------------------------------------------------------ #
    # Per‑class lines                                                    #
    # ------------------------------------------------------------------ #
    label_means_p, label_means_r, label_means_f = [], [], []

    for cat in categories:
        stats = calculate_mean_std(results, cat, level="label")
        print(f"{cat:8s} "
              f"{stats['mean_precision']:.3f}({stats['std_precision']:.3f}) "
              f"{stats['mean_recall']:.3f}({stats['std_recall']:.3f}) "
              f"{stats['mean_f1']:.3f}({stats['std_f1']:.3f})")

        label_means_p.append(stats["raw_mean_precision"])
        label_means_r.append(stats["raw_mean_recall"])
        label_means_f.append(stats["raw_mean_f1"])

    # ------------------------------------------------------------------ #
    # Category‑macro (average of label means)                            #
    # ------------------------------------------------------------------ #
    print(f"{'Cat‑Macro':8s} "
          f"{sum(label_means_p)/len(label_means_p):.3f} "
          f"{sum(label_means_r)/len(label_means_r):.3f} "
          f"{sum(label_means_f)/len(label_means_f):.3f}")

    # ------------------------------------------------------------------ #
    # Overall micro / macro                                              #
    # ------------------------------------------------------------------ #
    if show_overall:
        micro = calculate_mean_std(results, None, level="micro")
        macro = calculate_mean_std(results, None, level="macro")

        print(f"{'MicroAvg.':8s} "
              f"{micro['mean_precision']:.3f}({micro['std_precision']:.3f}) "
              f"{micro['mean_recall']:.3f}({micro['std_recall']:.3f}) "
              f"{micro['mean_f1']:.3f}({micro['std_f1']:.3f})")

        print(f"{'MacroAvg.':8s} "
              f"{macro['mean_precision']:.3f}({macro['std_precision']:.3f}) "
              f"{macro['mean_recall']:.3f}({macro['std_recall']:.3f}) "
              f"{macro['mean_f1']:.3f}({macro['std_f1']:.3f})")


In [45]:
# For T
print("Mixtral")
# zscot_t_results = []
# rag_t_results = []
# ltm_rag_t_results = []
kewltm_t_results = []

t_label = 't'
# zscot_t_stage = 'zscot_t_stage'
# rag_t_stage = 'rag_raw_t_stage'
# ltm_rag_t_stage = 'ltm_rag1_t_stage'
kewltm_t_stage = "cmem_t_40reports_ans_str"

# others_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')

run_lst = [0, 1, 2, 3, 4, 5, 6, 8]

for run in run_lst:
    t_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_t14_dynamic_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_t_results.append(
        t14_calculate_metrics(t_test_df[t_label], t_test_df[kewltm_t_stage])
    )

    # split_ids = t_test_df.patient_filename
    # others_t_df = others_t_df[others_t_df["patient_filename"].isin(split_ids)]
    # zscot_t_results.append(t14_calculate_metrics(others_t_df[t_label], others_t_df[zscot_t_stage]))
    # rag_t_results.append(t14_calculate_metrics(others_t_df[t_label], others_t_df[rag_t_stage]))
    # ltm_rag_t_results.append(t14_calculate_metrics(others_t_df[t_label], others_t_df[ltm_rag_t_stage]))

# print("ZSCoT")
# output_tabular_performance(zscot_t_results)

# print("RAG")
# output_tabular_performance(rag_t_results)

# print("LTM-RAG")
# output_tabular_performance(ltm_rag_t_results)

print("KEW-LTM")
output_tabular_performance(kewltm_t_results)

Mixtral
KEW-LTM
T1       0.904(0.017) 0.812(0.040) 0.855(0.018)
T2       0.882(0.022) 0.938(0.018) 0.909(0.005)
T3       0.834(0.054) 0.810(0.058) 0.818(0.018)
T4       0.807(0.082) 0.634(0.038) 0.707(0.029)
Cat‑Macro 0.857 0.799 0.822
MicroAvg. 0.876(0.006) 0.878(0.007) 0.877(0.007)
MacroAvg. 0.857(0.022) 0.799(0.020) 0.822(0.010)


In [46]:
# For N
print("Mixtral")
# zscot_n_results = []
# rag_n_results = []
# ltm_rag_n_results = []
kewltm_n_results = []

n_label = 'n'
# zscot_n_stage = 'zscot_n_stage'
# rag_n_stage = 'rag_raw_n_stage'
# ltm_rag_n_stage = 'ltm_rag1_n_stage'
kewltm_n_stage = "cmem_n_40reports_ans_str"

# others_n_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/mixtral_rag_result/0929_ltm_rag2.csv')

run_lst = [0, 1, 3, 4, 5, 6, 7, 9]

for run in run_lst:
    n_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/0718_n03_dynamic_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_n_results.append(
        n03_calculate_metrics(n_test_df[n_label], n_test_df[kewltm_n_stage])
    )

    # split_ids = n_test_df.patient_filename
    # others_n_df = others_n_df[others_n_df["patient_filename"].isin(split_ids)]
    # zscot_n_results.append(n03_calculate_metrics(others_n_df[n_label], others_n_df[zscot_n_stage]))
    # rag_n_results.append(n03_calculate_metrics(others_n_df[n_label], others_n_df[rag_n_stage]))
    # ltm_rag_n_results.append(n03_calculate_metrics(others_n_df[n_label], others_n_df[ltm_rag_n_stage]))

# print("ZSCoT")
# output_tabular_performance(zscot_n_results, categories=["N0", "N1", "N2", "N3"])

# print("RAG")
# output_tabular_performance(rag_n_results, categories=["N0", "N1", "N2", "N3"])

# print("LTM-RAG")
# output_tabular_performance(ltm_rag_n_results, categories=["N0", "N1", "N2", "N3"])

print("KEW-LTM")
output_tabular_performance(kewltm_n_results, categories=["N0", "N1", "N2", "N3"])

Mixtral
KEW-LTM
N0       0.944(0.008) 0.952(0.018) 0.948(0.011)
N1       0.885(0.020) 0.883(0.026) 0.884(0.010)
N2       0.713(0.031) 0.745(0.054) 0.727(0.022)
N3       0.886(0.058) 0.784(0.042) 0.830(0.017)
Cat‑Macro 0.857 0.841 0.847
MicroAvg. 0.883(0.007) 0.883(0.007) 0.883(0.007)
MacroAvg. 0.857(0.011) 0.841(0.011) 0.847(0.008)


In [None]:
lst = ["N0", "N1", "N2", "N3"]
lst.append("")
lst

In [None]:
df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag1_med42_v2_800.csv')
df.columns


In [None]:
t14_calculate_metrics(df['t'], df['t14_ltm_rag1_t_pred'])

In [None]:
t_test_df.columns

In [17]:
# For T
print("Med42")
zscot_t_results = []
rag_t_results = []
ltm_rag_t_results = []
kewltm_t_results = []

t_label = 't'
kewltm_t_stage = 'kepa_t_ans_str'
zscot_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1118_t14_med42_v2_test_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 'zscot_t_ans_str']]
rag_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1120_t14_rag_raw_med42_v2_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 't14_rag_raw_t_pred']]
ltm_rag_t_df = pd.read_csv('/home/yl3427/cylab/selfCorrectionAgent/result/1128_t14_ltm_rag1_med42_v2_800.csv').sort_values(by="patient_filename")[["patient_filename", 't', 't14_ltm_rag1_t_pred']]

run_lst = [0, 1, 2, 3, 4, 5, 6, 8]

for run in run_lst:
    t_test_df = pd.read_csv(
        f"/home/yl3427/cylab/selfCorrectionAgent/result/1114_t14_med42_v2_test_{run}_outof_10runs.csv"
    ).sort_values(by="patient_filename")

    kewltm_t_results.append(
        t14_calculate_metrics(t_test_df[t_label], t_test_df[kewltm_t_stage])
    )

    split_ids = t_test_df.patient_filename
    zscot_t_df = zscot_t_df[zscot_t_df["patient_filename"].isin(split_ids)]
    rag_t_df = rag_t_df[rag_t_df["patient_filename"].isin(split_ids)]
    ltm_rag_t_df = ltm_rag_t_df[ltm_rag_t_df["patient_filename"].isin(split_ids)]

    zscot_t_results.append(t14_calculate_metrics(zscot_t_df[t_label], zscot_t_df['zscot_t_ans_str']))
    rag_t_results.append(t14_calculate_metrics(rag_t_df[t_label], rag_t_df['t14_rag_raw_t_pred']))
    ltm_rag_t_results.append(t14_calculate_metrics(ltm_rag_t_df[t_label], ltm_rag_t_df['t14_ltm_rag1_t_pred']))

print("ZSCoT")
output_tabular_performance(zscot_t_results)

print("RAG")
output_tabular_performance(rag_t_results)

print("LTM-RAG")
output_tabular_performance(ltm_rag_t_results)

print("KEW-LTM")
output_tabular_performance(kewltm_t_results)

Med42
ZSCoT
T1 0.586(0.012) 0.705(0.018) 0.640(0.013)
T2 0.822(0.015) 0.801(0.015) 0.811(0.015)
T3 0.827(0.011) 0.703(0.026) 0.760(0.015)
T4 0.832(0.078) 0.545(0.059) 0.658(0.064)
MacroAvg. 0.767 0.689 0.717
RAG
T1 0.840(0.004) 0.782(0.023) 0.810(0.014)
T2 0.834(0.015) 0.897(0.008) 0.864(0.012)
T3 0.841(0.012) 0.672(0.021) 0.747(0.014)
T4 0.546(0.071) 0.541(0.073) 0.543(0.070)
MacroAvg. 0.765 0.723 0.741
LTM-RAG
T1 0.918(0.015) 0.807(0.008) 0.859(0.010)
T2 0.896(0.008) 0.947(0.008) 0.920(0.007)
T3 0.842(0.021) 0.859(0.012) 0.850(0.014)
T4 0.692(0.065) 0.601(0.052) 0.643(0.055)
MacroAvg. 0.837 0.803 0.818
KEW-LTM
T1 0.813(0.073) 0.759(0.076) 0.783(0.064)
T2 0.855(0.031) 0.913(0.023) 0.882(0.016)
T3 0.869(0.063) 0.703(0.099) 0.770(0.065)
T4 0.630(0.046) 0.615(0.057) 0.621(0.042)
MacroAvg. 0.792 0.747 0.764


In [None]:
import pandas as pd
brca_report = pd.read_csv(
        "/secure/shared_data/rag_tnm_results/summary/5_folds_summary/brca_df.csv"
    )
df = brca_report[brca_report["n"] != -1][["patient_filename", "t", "text"]]

In [None]:
df = pd.read_csv("1221_result.csv")
df.columns

In [None]:
from src.metrics import *
# t14_calculate_metrics(df['t'], df['memory_test_pred'])
df = df[df['memory_test_parsed']]
t14_calculate_metrics2(df['t'], df['memory_test_pred'])

In [None]:
df = pd.read_csv("baseline_results.csv")
df.columns

In [None]:
t14_calculate_metrics(df['t'], df['t14_baseline_pred'])

In [None]:
df = pd.read_csv("1221_result_2.csv")
df.columns

In [None]:
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall']

In [None]:
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
df = pd.read_csv("1221_result_3.csv")[:10]
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
df = pd.read_csv("1221_result_3.csv")[10:20]
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
df = pd.read_csv("1221_result_3.csv")[20:30]
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
df = pd.read_csv("1221_result_3.csv")[30:40]
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
df = pd.read_csv("1221_result_3.csv")[40:]
df[['t', '1221_test_pred_zs','1221_test_pred_rules', '1221_test_pred_memory_only','1221_test_pred']]
t14_calculate_metrics(df['t'], df['1221_test_pred'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_zs'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_rules'])['overall'], t14_calculate_metrics(df['t'], df['1221_test_pred_memory_only'])['overall']

In [None]:
df = pd.read_csv("1221_result_mixtral.csv")
for i in range(0, 790, 10):
    sub_df = df[i:i+10]
    print(len(sub_df))

In [None]:
import pandas as pd
from src.metrics import *

data_df = pd.read_csv("1221_result_mixtral.csv")
for i in range(0, 790, 80):
    df = data_df[i:i+80]
    main = t14_calculate_metrics2(df['t'],df['1221_test_pred'])['overall']
    zs = t14_calculate_metrics2(df['t'], df['1221_test_pred_zs'])['overall']
    rules = t14_calculate_metrics2(df['t'], df['1221_test_pred_rules'])['overall']
    memory = t14_calculate_metrics2(df['t'], df['1221_test_pred_memory_only'])['overall']
    print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
    print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
    print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")
    print("-----------------")



In [None]:
import pandas as pd
from src.metrics import *
data_df = pd.read_csv("1222_result_llama_4_30.csv")
data_df.columns

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_0.csv")
df.columns

In [None]:
df = pd.read_csv("1222_result_llama_4_30.csv")

main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")


In [None]:
import pandas as pd
from src.metrics import *

data_df = pd.read_csv("1222_result_llama_4_30.csv")
for i in range(0, 790, 80):
    df = data_df[i:i+80]
    main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
    zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
    rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
    memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
    print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
    print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
    print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")
    print("-----------------")


In [None]:
import pandas as pd
from src.metrics import *

data_df = pd.read_csv("1222_result_llama_1_1.csv")
for i in range(0, 790, 80):
    df = data_df[i:i+80]
    main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
    zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
    rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
    memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
    print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
    print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
    print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")
    print("-----------------")


In [None]:
df = pd.read_csv("1222_result_llama_1_1.csv")

main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")


In [None]:
df = pd.read_csv("1223_result_mixtral_1_1.csv")

main = t14_calculate_metrics2(df['t'],df['ltm_test_pred'])['overall']
zs = t14_calculate_metrics2(df['t'], df['ltm_test_pred_zs'])['overall']
rules = t14_calculate_metrics2(df['t'], df['ltm_test_pred_rules'])['overall']
memory = t14_calculate_metrics2(df['t'], df['ltm_test_pred_memory_only'])['overall']
print(f"macro F1: {main['macro_f1']:.3f} ({zs['macro_f1']:.3f}) ({rules['macro_f1']:.3f}) ({memory['macro_f1']:.3f})")
print(f"micro F1: {main['micro_f1']:.3f} ({zs['micro_f1']:.3f}) ({rules['micro_f1']:.3f}) ({memory['micro_f1']:.3f})")
print(f"weighted F1: {main['weighted_f1']:.3f} ({zs['weighted_f1']:.3f}) ({rules['weighted_f1']:.3f}) ({memory['weighted_f1']:.3f})")


In [None]:
main

In [None]:
zs

In [None]:
rules

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_0.csv")
main_1 = 'experiment_n_stage_pred_llm_mem'
main_2 = 'experiment_n_stage_pred_rag_mem'
zs = 'experiment_n_stage_pred_zs'
rules_1 = 'experiment_n_stage_pred_llm_rulesonly'
rules_2 = 'experiment_n_stage_pred_rag_rulesonly'
memory_1 = 'experiment_n_stage_pred_memonly'
memory_2 = 'experiment_n_stage_pred_memonly2'
rag = 'experiment_n_stage_pred_rag_only'

main_1 = n03_calculate_metrics2(df['n'],df[main_1])['overall']
main_2 = n03_calculate_metrics2(df['n'],df[main_2])['overall']
zs = n03_calculate_metrics2(df['n'], df[zs])['overall']
rules_1 = n03_calculate_metrics2(df['n'], df[rules_1])['overall']
rules_2 = n03_calculate_metrics2(df['n'], df[rules_2])['overall']
memory_1 = n03_calculate_metrics2(df['n'], df[memory_1])['overall']
memory_2 = n03_calculate_metrics2(df['n'], df[memory_2])['overall']
rag = n03_calculate_metrics2(df['n'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
from tabulate import tabulate

full_df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_0.csv")
for i in range(0, 790, 40):
    df = full_df[i:i+40]

    main_1 = 'experiment_n_stage_pred_llm_mem'
    main_2 = 'experiment_n_stage_pred_rag_mem'
    zs = 'experiment_n_stage_pred_zs'
    rules_1 = 'experiment_n_stage_pred_llm_rulesonly'
    rules_2 = 'experiment_n_stage_pred_rag_rulesonly'
    memory_1 = 'experiment_n_stage_pred_memonly'
    memory_2 = 'experiment_n_stage_pred_memonly2'
    rag = 'experiment_n_stage_pred_rag_only'

    main_1 = n03_calculate_metrics2(df['n'],df[main_1])['overall']
    main_2 = n03_calculate_metrics2(df['n'],df[main_2])['overall']
    zs = n03_calculate_metrics2(df['n'], df[zs])['overall']
    rules_1 = n03_calculate_metrics2(df['n'], df[rules_1])['overall']
    rules_2 = n03_calculate_metrics2(df['n'], df[rules_2])['overall']
    memory_1 = n03_calculate_metrics2(df['n'], df[memory_1])['overall']
    memory_2 = n03_calculate_metrics2(df['n'], df[memory_2])['overall']
    rag = n03_calculate_metrics2(df['n'], df[rag])['overall']

    table_data = [
        ["main_1",   main_1['macro_f1'],   main_1['micro_f1'],   main_1['weighted_f1']],
        ["main_2",   main_2['macro_f1'],   main_2['micro_f1'],   main_2['weighted_f1']],
        ["zs",       zs['macro_f1'],       zs['micro_f1'],       zs['weighted_f1']],
        ["rules_1",  rules_1['macro_f1'],  rules_1['micro_f1'],  rules_1['weighted_f1']],
        ["rules_2",  rules_2['macro_f1'],  rules_2['micro_f1'],  rules_2['weighted_f1']],
        ["memory_1", memory_1['macro_f1'], memory_1['micro_f1'], memory_1['weighted_f1']],
        ["memory_2", memory_2['macro_f1'], memory_2['micro_f1'], memory_2['weighted_f1']],
        ["rag",      rag['macro_f1'],      rag['micro_f1'],      rag['weighted_f1']],
    ]

    print(tabulate(
        table_data,
        headers=["Method", "Macro F1", "Micro F1", "Weighted F1"],
        floatfmt=".4f",  # 4 decimal places
        tablefmt="github"
    ))


In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_4.csv")
main_1 = 'experiment_n_stage_pred_llm_mem'
main_2 = 'experiment_n_stage_pred_rag_mem'
zs = 'experiment_n_stage_pred_zs'
rules_1 = 'experiment_n_stage_pred_llm_rulesonly'
rules_2 = 'experiment_n_stage_pred_rag_rulesonly'
memory_1 = 'experiment_n_stage_pred_memonly'
memory_2 = 'experiment_n_stage_pred_memonly2'
rag = 'experiment_n_stage_pred_rag_only'

main_1 = n03_calculate_metrics2(df['n'],df[main_1])['overall']
main_2 = n03_calculate_metrics2(df['n'],df[main_2])['overall']
zs = n03_calculate_metrics2(df['n'], df[zs])['overall']
rules_1 = n03_calculate_metrics2(df['n'], df[rules_1])['overall']
rules_2 = n03_calculate_metrics2(df['n'], df[rules_2])['overall']
memory_1 = n03_calculate_metrics2(df['n'], df[memory_1])['overall']
memory_2 = n03_calculate_metrics2(df['n'], df[memory_2])['overall']
rag = n03_calculate_metrics2(df['n'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_0.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")


In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_1.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")


In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_2.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_3.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
df = pd.read_csv("/secure/shared_data/rag_tnm_results/summary/5_folds_summary/luad_df.csv")
df.n.unique()

In [None]:
df.columns

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_0_lung.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")


In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_1_lung.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_2_lung.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_3_lung.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_4_lung.csv")
main_1 = 'experiment_t_stage_pred_llm_mem'
main_2 = 'experiment_t_stage_pred_rag_mem'
zs = 'experiment_t_stage_pred_zs'
rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
memory_1 = 'experiment_t_stage_pred_memonly'
memory_2 = 'experiment_t_stage_pred_memonly2'
rag = 'experiment_t_stage_pred_rag_only'

main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
rag = t14_calculate_metrics2(df['t'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_0_lung.csv")
main_1 = 'experiment_n_stage_pred_llm_mem'
main_2 = 'experiment_n_stage_pred_rag_mem'
zs = 'experiment_n_stage_pred_zs'
rules_1 = 'experiment_n_stage_pred_llm_rulesonly'
rules_2 = 'experiment_n_stage_pred_rag_rulesonly'
memory_1 = 'experiment_n_stage_pred_memonly'
memory_2 = 'experiment_n_stage_pred_memonly2'
rag = 'experiment_n_stage_pred_rag_only'

main_1 = n03_calculate_metrics2(df['n'],df[main_1])['overall']
main_2 = n03_calculate_metrics2(df['n'],df[main_2])['overall']
zs = n03_calculate_metrics2(df['n'], df[zs])['overall']
rules_1 = n03_calculate_metrics2(df['n'], df[rules_1])['overall']
rules_2 = n03_calculate_metrics2(df['n'], df[rules_2])['overall']
memory_1 = n03_calculate_metrics2(df['n'], df[memory_1])['overall']
memory_2 = n03_calculate_metrics2(df['n'], df[memory_2])['overall']
rag = n03_calculate_metrics2(df['n'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
df = pd.read_csv("/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_1_lung.csv")
main_1 = 'experiment_n_stage_pred_llm_mem'
main_2 = 'experiment_n_stage_pred_rag_mem'
zs = 'experiment_n_stage_pred_zs'
rules_1 = 'experiment_n_stage_pred_llm_rulesonly'
rules_2 = 'experiment_n_stage_pred_rag_rulesonly'
memory_1 = 'experiment_n_stage_pred_memonly'
memory_2 = 'experiment_n_stage_pred_memonly2'
rag = 'experiment_n_stage_pred_rag_only'

main_1 = n03_calculate_metrics2(df['n'],df[main_1])['overall']
main_2 = n03_calculate_metrics2(df['n'],df[main_2])['overall']
zs = n03_calculate_metrics2(df['n'], df[zs])['overall']
rules_1 = n03_calculate_metrics2(df['n'], df[rules_1])['overall']
rules_2 = n03_calculate_metrics2(df['n'], df[rules_2])['overall']
memory_1 = n03_calculate_metrics2(df['n'], df[memory_1])['overall']
memory_2 = n03_calculate_metrics2(df['n'], df[memory_2])['overall']
rag = n03_calculate_metrics2(df['n'], df[rag])['overall']

print(f"main_1: {main_1['macro_f1']}, {main_1['micro_f1']}, {main_1['weighted_f1']}")
print(f"main_2: {main_2['macro_f1']}, {main_2['micro_f1']}, {main_2['weighted_f1']}")
print(f"zs: {zs['macro_f1']}, {zs['micro_f1']}, {zs['weighted_f1']}")
print(f"rules_1: {rules_1['macro_f1']}, {rules_1['micro_f1']}, {rules_1['weighted_f1']}")
print(f"rules_2: {rules_2['macro_f1']}, {rules_2['micro_f1']}, {rules_2['weighted_f1']}")
print(f"memory_1: {memory_1['macro_f1']}, {memory_1['micro_f1']}, {memory_1['weighted_f1']}")
print(f"memory_2: {memory_2['macro_f1']}, {memory_2['micro_f1']}, {memory_2['weighted_f1']}")
print(f"rag: {rag['macro_f1']}, {rag['micro_f1']}, {rag['weighted_f1']}")

In [None]:
from tabulate import tabulate
print("T14: Breast Cancer")
print()
for i in range(5):
    df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/result_breast/results_t_stage_run_{i}.csv")
    main_1 = 'experiment_t_stage_pred_llm_mem'
    main_2 = 'experiment_t_stage_pred_rag_mem'
    zs = 'experiment_t_stage_pred_zs'
    rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
    rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
    memory_1 = 'experiment_t_stage_pred_memonly'
    memory_2 = 'experiment_t_stage_pred_memonly2'
    rag = 'experiment_t_stage_pred_rag_only'

    main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
    main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
    zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
    rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
    rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
    memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
    memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
    rag = t14_calculate_metrics2(df['t'], df[rag])['overall']


    table_data = [
        ["ZS",       zs['macro_f1']],
        ["Rule1 (from nothing)",  rules_1['macro_f1']],
        ["Memory1 (from Rule1)", memory_1['macro_f1']],
        ["Rule1 + Memory1",   main_1['macro_f1']],
        [],
        ["RAG",      rag['macro_f1']],
        ["Rule2 (from RAG excerpt)",  rules_2['macro_f1']],
        ["Memory2 (from Rule2)", memory_2['macro_f1']],
        ["Rule2 + Memory2",   main_2['macro_f1']],
    ]
    print(f"Run {i}")
    print(tabulate(
        table_data,
        headers=["Method", "Macro F1"],
        floatfmt=".3f",  
        tablefmt="github"
    ))
    print()
    print()

In [None]:
from tabulate import tabulate
print("T14: Lung Cancer")
for i in range(5):
    df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_{i}_lung.csv")
    main_1 = 'experiment_t_stage_pred_llm_mem'
    main_2 = 'experiment_t_stage_pred_rag_mem'
    zs = 'experiment_t_stage_pred_zs'
    rules_1 = 'experiment_t_stage_pred_llm_rulesonly'
    rules_2 = 'experiment_t_stage_pred_rag_rulesonly'
    memory_1 = 'experiment_t_stage_pred_memonly'
    memory_2 = 'experiment_t_stage_pred_memonly2'
    rag = 'experiment_t_stage_pred_rag_only'

    main_1 = t14_calculate_metrics2(df['t'],df[main_1])['overall']
    main_2 = t14_calculate_metrics2(df['t'],df[main_2])['overall']
    zs = t14_calculate_metrics2(df['t'], df[zs])['overall']
    rules_1 = t14_calculate_metrics2(df['t'], df[rules_1])['overall']
    rules_2 = t14_calculate_metrics2(df['t'], df[rules_2])['overall']
    memory_1 = t14_calculate_metrics2(df['t'], df[memory_1])['overall']
    memory_2 = t14_calculate_metrics2(df['t'], df[memory_2])['overall']
    rag = t14_calculate_metrics2(df['t'], df[rag])['overall']


    table_data = [
        ["ZS",       zs['macro_f1']],
        ["Rule1 (from nothing)",  rules_1['macro_f1']],
        ["Memory1 (from Rule1)", memory_1['macro_f1']],
        ["Rule1 + Memory1",   main_1['macro_f1']],
        [],
        ["RAG",      rag['macro_f1']],
        ["Rule2 (from RAG excerpt)",  rules_2['macro_f1']],
        ["Memory2 (from Rule2)", memory_2['macro_f1']],
        ["Rule2 + Memory2",   main_2['macro_f1']],
    ]
    print(f"Run {i}")
    print(tabulate(
        table_data,
        headers=["Method", "Macro F1"],
        floatfmt=".3f",  
        tablefmt="github"
    ))
    print()
    print()

In [None]:
from tabulate import tabulate
print("N03: Breast Cancer")
for i in range(5):
    df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/result_breast/results_n_stage_run_{i}.csv")
    main_1 = 'experiment_n_stage_pred_llm_mem'
    main_2 = 'experiment_n_stage_pred_rag_mem'
    zs = 'experiment_n_stage_pred_zs'
    rules_1 = 'experiment_n_stage_pred_llm_rulesonly'
    rules_2 = 'experiment_n_stage_pred_rag_rulesonly'
    memory_1 = 'experiment_n_stage_pred_memonly'
    memory_2 = 'experiment_n_stage_pred_memonly2'
    rag = 'experiment_n_stage_pred_rag_only'

    main_1 = n03_calculate_metrics2(df['n'],df[main_1])['overall']
    main_2 = n03_calculate_metrics2(df['n'],df[main_2])['overall']
    zs = n03_calculate_metrics2(df['n'], df[zs])['overall']
    rules_1 = n03_calculate_metrics2(df['n'], df[rules_1])['overall']
    rules_2 = n03_calculate_metrics2(df['n'], df[rules_2])['overall']
    memory_1 = n03_calculate_metrics2(df['n'], df[memory_1])['overall']
    memory_2 = n03_calculate_metrics2(df['n'], df[memory_2])['overall']
    rag = n03_calculate_metrics2(df['n'], df[rag])['overall']


    table_data = [
        ["ZS",       zs['macro_f1']],
        ["Rule1 (from nothing)",  rules_1['macro_f1']],
        ["Memory1 (from Rule1)", memory_1['macro_f1']],
        ["Rule1 + Memory1",   main_1['macro_f1']],
        [],
        ["RAG",      rag['macro_f1']],
        ["Rule2 (from RAG excerpt)",  rules_2['macro_f1']],
        ["Memory2 (from Rule2)", memory_2['macro_f1']],
        ["Rule2 + Memory2",   main_2['macro_f1']],
    ]
    print(f"Run {i}")
    print(tabulate(
        table_data,
        headers=["Method", "Macro F1"],
        floatfmt=".3f",  
        tablefmt="github"
    ))
    print()
    print()

In [None]:
from tabulate import tabulate
print("N03: Lung Cancer")
for i in range(5):
    df = pd.read_csv(f"/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_{i}_lung.csv")
    main_1 = 'experiment_n_stage_pred_llm_mem'
    main_2 = 'experiment_n_stage_pred_rag_mem'
    zs = 'experiment_n_stage_pred_zs'
    rules_1 = 'experiment_n_stage_pred_llm_rulesonly'
    rules_2 = 'experiment_n_stage_pred_rag_rulesonly'
    memory_1 = 'experiment_n_stage_pred_memonly'
    memory_2 = 'experiment_n_stage_pred_memonly2'
    rag = 'experiment_n_stage_pred_rag_only'

    main_1 = n03_calculate_metrics2(df['n'],df[main_1])['overall']
    main_2 = n03_calculate_metrics2(df['n'],df[main_2])['overall']
    zs = n03_calculate_metrics2(df['n'], df[zs])['overall']
    rules_1 = n03_calculate_metrics2(df['n'], df[rules_1])['overall']
    rules_2 = n03_calculate_metrics2(df['n'], df[rules_2])['overall']
    memory_1 = n03_calculate_metrics2(df['n'], df[memory_1])['overall']
    memory_2 = n03_calculate_metrics2(df['n'], df[memory_2])['overall']
    rag = n03_calculate_metrics2(df['n'], df[rag])['overall']


    table_data = [
        ["ZS",       zs['macro_f1']],
        ["Rule1 (from nothing)",  rules_1['macro_f1']],
        ["Memory1 (from Rule1)", memory_1['macro_f1']],
        ["Rule1 + Memory1",   main_1['macro_f1']],
        [],
        ["RAG",      rag['macro_f1']],
        ["Rule2 (from RAG excerpt)",  rules_2['macro_f1']],
        ["Memory2 (from Rule2)", memory_2['macro_f1']],
        ["Rule2 + Memory2",   main_2['macro_f1']],
    ]
    print(f"Run {i}")
    print(tabulate(
        table_data,
        headers=["Method", "Macro F1"],
        floatfmt=".3f",  
        tablefmt="github"
    ))
    print()
    print()

In [None]:
df = pd.read_csv("/secure/shared_data/rag_tnm_results/summary/5_folds_summary/brca_df.csv")
df[df.n != -1].shape

In [None]:
df = pd.read_csv("/secure/shared_data/rag_tnm_results/summary/5_folds_summary/luad_df.csv")
df[df.n != -1].shape

In [None]:
import pandas as pd
import numpy as np
from tabulate import tabulate

cancer = "Breast"
tnm = "T14"
results_path = "/home/yl3427/cylab/selfCorrectionAgent/result_breast/results_t_stage_run_{i}.csv"

###
results_tracker = {
    "ZS": [],
    "Rule1 (from nothing)": [],
    "Memory1 (from Rule1)": [],
    "Rule1 + Memory1": [],
    "RAG": [],
    "Rule2 (from RAG excerpt)": [],
    "Memory2 (from Rule2)": [],
    "Rule2 + Memory2": []
}

print(f"{tnm.upper()}: {cancer} Cancer\n")

if tnm.lower()[0] == 't':
    calculate_metrics = t14_calculate_metrics2
elif tnm.lower()[0] == 'n':
    calculate_metrics = n03_calculate_metrics2
# Loop through the 5 runs

for i in range(5):
    df = pd.read_csv(results_path.format(i=i))
    
    main_1_col = f"experiment_{tnm.lower()[0]}_stage_pred_llm_mem"
    main_2_col = f"experiment_{tnm.lower()[0]}_stage_pred_rag_mem"
    zs_col     = f"experiment_{tnm.lower()[0]}_stage_pred_zs"
    rules_1_col= f"experiment_{tnm.lower()[0]}_stage_pred_llm_rulesonly"
    rules_2_col= f"experiment_{tnm.lower()[0]}_stage_pred_rag_rulesonly"
    memory_1_col=f"experiment_{tnm.lower()[0]}_stage_pred_memonly"
    memory_2_col=f"experiment_{tnm.lower()[0]}_stage_pred_memonly2"
    rag_col    = f"experiment_{tnm.lower()[0]}_stage_pred_rag_only"

    main_1   = calculate_metrics(df[tnm.lower()[0]], df[main_1_col])['overall']
    main_2   = calculate_metrics(df[tnm.lower()[0]], df[main_2_col])['overall']
    zs       = calculate_metrics(df[tnm.lower()[0]], df[zs_col])['overall']
    rules_1  = calculate_metrics(df[tnm.lower()[0]], df[rules_1_col])['overall']
    rules_2  = calculate_metrics(df[tnm.lower()[0]], df[rules_2_col])['overall']
    memory_1 = calculate_metrics(df[tnm.lower()[0]], df[memory_1_col])['overall']
    memory_2 = calculate_metrics(df[tnm.lower()[0]], df[memory_2_col])['overall']
    rag      = calculate_metrics(df[tnm.lower()[0]], df[rag_col])['overall']

    
    # what calcuate_metrics returns
    # results["overall"] = {
    #     "macro_precision": round(macro_precision, 3),
    #     "macro_recall":    round(macro_recall, 3),
    #     "macro_f1":        round(macro_f1, 3),
    #     "micro_precision": round(micro_precision, 3),
    #     "micro_recall":    round(micro_recall, 3),
    #     "micro_f1":        round(micro_f1, 3),
    #     "weighted_f1":     round(weighted_f1, 3),
    #     "support":         total_instances,
    #     "num_errors":      total_fp + total_fn,
    # }
    
    # Append the macro_f1 to our tracker
    results_tracker["ZS"].append(zs['macro_f1'])
    results_tracker["Rule1 (from nothing)"].append(rules_1['macro_f1'])
    results_tracker["Memory1 (from Rule1)"].append(memory_1['macro_f1'])
    results_tracker["Rule1 + Memory1"].append(main_1['macro_f1'])
    results_tracker["RAG"].append(rag['macro_f1'])
    results_tracker["Rule2 (from RAG excerpt)"].append(rules_2['macro_f1'])
    results_tracker["Memory2 (from Rule2)"].append(memory_2['macro_f1'])
    results_tracker["Rule2 + Memory2"].append(main_2['macro_f1'])
    
    # Print a run-specific table
    table_data = [
        ["ZS", zs['macro_f1']],
        ["Rule1 (from nothing)", rules_1['macro_f1']],
        ["Memory1 (from Rule1)", memory_1['macro_f1']],
        ["Rule1 + Memory1", main_1['macro_f1']],
        [],
        ["RAG", rag['macro_f1']],
        ["Rule2 (from RAG excerpt)", rules_2['macro_f1']],
        ["Memory2 (from Rule2)", memory_2['macro_f1']],
        ["Rule2 + Memory2", main_2['macro_f1']],
    ]
    print(f"Run {i}")
    print(tabulate(
        table_data,
        headers=["Method", "Macro F1"],
        floatfmt=".3f",  
        tablefmt="github"
    ))
    print()
    print()

# Now build a summary table (averages and std) across all runs
summary_table = []
methods_order = [
    "ZS",
    "Rule1 (from nothing)",
    "Memory1 (from Rule1)",
    "Rule1 + Memory1",
    "RAG",
    "Rule2 (from RAG excerpt)",
    "Memory2 (from Rule2)",
    "Rule2 + Memory2"
]

for method in methods_order:
    scores = results_tracker[method]
    mean_val = np.mean(scores)
    std_val  = np.std(scores, ddof=1)  # sample standard deviation
    summary_table.append([method, mean_val, std_val])

print("## Summary Across All Runs")
print(tabulate(
    summary_table,
    headers=["Method", "Mean Macro F1", "Std Macro F1"],
    floatfmt=".3f",
    tablefmt="github"
))


In [None]:
import pandas as pd
import numpy as np
from tabulate import tabulate
from src.metrics import *

def run_experiments(cancer, tnm, results_path, num_runs=5):

    # Decide which metric function to use based on T vs. N
    if tnm.lower().startswith('t'):
        calculate_metrics = t14_calculate_metrics2
    elif tnm.lower().startswith('n'):
        calculate_metrics = n03_calculate_metrics2
    else:
        raise ValueError("Only T or N staging is supported in this template.")

    # Methods you want to evaluate:
    # (keys = descriptive labels, values = column name in CSV)
    methods = {
        "ZS":                 f"experiment_{tnm.lower()[0]}_stage_pred_zs",
        "Rule1 (from nothing)": f"experiment_{tnm.lower()[0]}_stage_pred_llm_rulesonly",
        "Memory1 (from Rule1)": f"experiment_{tnm.lower()[0]}_stage_pred_memonly",
        "Rule1 + Memory1":   f"experiment_{tnm.lower()[0]}_stage_pred_llm_mem",
        "RAG":               f"experiment_{tnm.lower()[0]}_stage_pred_rag_only",
        "Rule2 (from RAG excerpt)": f"experiment_{tnm.lower()[0]}_stage_pred_rag_rulesonly",
        "Memory2 (from Rule2)": f"experiment_{tnm.lower()[0]}_stage_pred_memonly2",
        "Rule2 + Memory2":   f"experiment_{tnm.lower()[0]}_stage_pred_rag_mem"
    }

    # We want to track each method's macro/micro precision, recall, and F1 across runs
    # results_tracker[method]["macro_precision"] = [list of values across runs], etc.
    metric_keys = ["macro_precision", "macro_recall", "macro_f1",
                #    "micro_precision", "micro_recall", "micro_f1"
                   ]
    results_tracker = {
        m: {mk: [] for mk in metric_keys} for m in methods.keys()
    }

    print(f"{tnm.upper()}: {cancer} Cancer")

    # Loop over each run
    for run_index in range(num_runs):
        if run_index == 1:
            continue
        print(run_index)
        df = pd.read_csv(results_path.format(i=run_index))
        
        # Evaluate each method's performance
        method_results = {}
        for method_label, col_name in methods.items():
            # Make sure the column is in your dataframe
            if col_name not in df.columns:
                raise ValueError(f"Column '{col_name}' not found in CSV for method '{method_label}'.")

            # Calculate metrics against ground truth: df[tnm.lower()[0]]
            result = calculate_metrics(df[tnm.lower()[0]], df[col_name])['overall']
            method_results[method_label] = result

            # Store the metrics
            for mk in metric_keys:
                results_tracker[method_label][mk].append(result[mk])  

        # Build a run-specific table (show macro & micro metrics)
        table_data = []
        for method_label in methods.keys():
            res = method_results[method_label]
            table_data.append([
                method_label,
                res["macro_precision"], res["macro_recall"], res["macro_f1"],
                # res["micro_precision"], res["micro_recall"], res["micro_f1"]
            ])

        # print(f"Run {run_index}")
        # print(tabulate(
        #     table_data,
        #     headers=["Method", 
        #              "Macro P", "Macro R", "Macro F1", 
        #              "Micro P", "Micro R", "Micro F1"],
        #     floatfmt=".3f",
        #     tablefmt="github"
        # ))
        # print()

    # Build a final summary table (mean & std across all runs)
    summary_table = []
    for method_label in methods.keys():
        row = [method_label]
        for mk in metric_keys:
            scores = results_tracker[method_label][mk]
            mean_val = np.mean(scores)
            std_val  = np.std(scores, ddof=1)  # sample standard deviation
            row.append(f"{mean_val:.3f}±{std_val:.3f}")
        summary_table.append(row)

    # Print summary table
    print(f"## Summary Across {num_runs} Runs")
    print(tabulate(
        summary_table,
        headers=[
            "Method", 
            "Macro P (mean±std)", "Macro R (mean±std)", "Macro F1 (mean±std)",
            "Micro P (mean±std)", "Micro R (mean±std)", "Micro F1 (mean±std)"
        ],
        tablefmt="github"
    ))


In [None]:
cancer = "Breast"
tnm = "T14"
results_path = "/home/yl3427/cylab/selfCorrectionAgent/result_breast/results_t_stage_run_{i}.csv"
run_experiments(cancer, tnm, results_path, num_runs=5)
print("------------------------"*4)
print()
cancer = "Breast"
tnm = "N03"
results_path = "/home/yl3427/cylab/selfCorrectionAgent/result_breast/results_n_stage_run_{i}.csv"
run_experiments(cancer, tnm, results_path, num_runs=5)
print("------------------------"*4)
print()
cancer = "Lung"
tnm = "T14"
results_path = "/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_{i}_lung.csv"
run_experiments(cancer, tnm, results_path, num_runs=5)
print("------------------------"*4)
print()
cancer = "Lung"
tnm = "N03"
results_path = "/home/yl3427/cylab/selfCorrectionAgent/results_n_stage_run_{i}_lung.csv"
run_experiments(cancer, tnm, results_path, num_runs=5)
print("------------------------"*4)
print()

In [None]:
# mixtral
cancer = "Lung"
tnm = "T14"
results_path = "/home/yl3427/cylab/selfCorrectionAgent/results_t_stage_run_{i}_lung_mixtral.csv"
run_experiments(cancer, tnm, results_path, num_runs=5)
print("------------------------"*4)
print()

In [None]:
df = pd.read_csv(results_path.format(i=0))
'experiment_t_stage_pred_zs' in df.columns
