In [1]:
# Stand-in value used for zero entries so their reciprocal is finite.
minimum = 1e-4

def harmonic_mean(numbers):
    """Return the harmonic mean of ``numbers``.

    Zero entries cannot be inverted, so each one contributes ``1 / minimum``
    to the reciprocal sum instead. Non-zero entries are used unchanged.

    Args:
        numbers: Any iterable of numbers. Lists and tuples work as before;
            generators, iterators and array-like objects are accepted too
            because the input is materialised once before it is measured.
            ``None`` is treated as an empty input.

    Returns:
        ``len(numbers) / sum(1 / x)`` with the zero substitution described above.

    Raises:
        ValueError: If ``numbers`` is ``None`` or contains no elements.
    """
    # Materialise once: plain truthiness / len() fail on generators and are
    # ambiguous for array-like inputs.
    values = [] if numbers is None else list(numbers)
    if not values:
        raise ValueError("List is empty, cannot compute harmonic mean.")

    reciprocal_sum = sum(1 / x if x != 0 else 1 / minimum for x in values)
    return len(values) / reciprocal_sum

In [2]:
from pathlib import Path
import json
from tqdm import tqdm

# Root folder under which each algorithm's run directories are looked up.
RESULTS_DIR = Path("results")

# Numbers of edits at which metrics are recorded: 1, 2, 4, ..., 1024.
record_ckpt = [1 << exponent for exponent in range(11)]

# metrics[algorithm][metric_name][run] is a list filled with one value per
# checkpoint. Three runs are tracked per algorithm. "step" must remain the
# first key: the reporting cell skips the first key positionally.
metrics = {
    alg_name: {
        metric_name: [[] for _ in range(3)]
        for metric_name in ("step", "ES", "GS", "LS", "ERS", "ORS", "S")
    }
    for alg_name in ("MEND", "ROME", "MEMIT", "WilKE")
}

# Fill `metrics` from the JSON files found under RESULTS_DIR/<algorithm>/run_XXX.
for alg_name in metrics.keys():
    dir_name = alg_name
    # For three runs
    for run_round in range(3):
        # Determine run directory
        alg_dir = RESULTS_DIR / dir_name
        run_dir = alg_dir / f"run_{str(run_round).zfill(3)}"

        print(f"Current proecss folder: {run_dir}")
        files = list(run_dir.glob("edit_*.json"))
        # Numeric sort on the 4th-from-last "_"-separated token of the path string.
        files.sort(key=lambda x: int(str(x).split("_")[-4]))

        # Collect all results of the current algorithm
        results = []
        for case_file in tqdm(files):
            try:
                with open(case_file, "r") as f:
                    data = json.load(f)
            except json.JSONDecodeError:
                print(f"Could not decode {case_file} due to format error; skipping.")
            else:
                # Append only on success; otherwise the previous file's data
                # (or an undefined name) would be recorded for this case.
                results.append(data)

        for ckpt in record_ckpt:
            metrics[alg_name]["step"][run_round].append(ckpt)

            # Accumulate, over the first `ckpt` results, the fraction of
            # correct entries in each of the three "*_prompts_correct" lists.
            ES, GS, LS = 0, 0, 0
            for i in range(ckpt):
                post = results[i]["post"]
                ES += sum(post["rewrite_prompts_correct"]) / len(post["rewrite_prompts_correct"])
                GS += sum(post["paraphrase_prompts_correct"]) / len(post["paraphrase_prompts_correct"])
                LS += sum(post["neighborhood_prompts_correct"]) / len(post["neighborhood_prompts_correct"])

            retention_file = run_dir / f"retention_of_edit_{ckpt}.json"
            try:
                with open(retention_file, "r") as f:
                    retention = json.load(f)
            except json.JSONDecodeError:
                print(f"Could not decode {retention_file} due to format error; skipping.")
                # Record NaN instead of reusing the previous checkpoint's
                # retention data, so the gap stays visible in the results and
                # every run keeps one entry per checkpoint.
                ERS = ORS = float("nan")
            else:
                ERS = retention["edit_retention"] / retention["edit_length"]
                ORS = retention["orig_retention"] / retention["orig_length"]

            metrics[alg_name]["ES"][run_round].append(ES / ckpt)
            metrics[alg_name]["GS"][run_round].append(GS / ckpt)
            metrics[alg_name]["LS"][run_round].append(LS / ckpt)
            metrics[alg_name]["ERS"][run_round].append(ERS)
            metrics[alg_name]["ORS"][run_round].append(ORS)
            # S combines the five scores above through harmonic_mean.
            metrics[alg_name]["S"][run_round].append(harmonic_mean([ES / ckpt, GS / ckpt, LS / ckpt, ERS, ORS]))

Current proecss folder: results(gpt-j)/MEND/run_000(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 5421.42it/s]


Current proecss folder: results(gpt-j)/MEND/run_001(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 5637.39it/s]


Current proecss folder: results(gpt-j)/MEND/run_002(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 6992.14it/s]


Current proecss folder: results(gpt-j)/ROME/run_000(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 6713.42it/s]


Current proecss folder: results(gpt-j)/ROME/run_001(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 5574.88it/s]

Current proecss folder: results(gpt-j)/ROME/run_002(seed=48)



100%|██████████| 1025/1025 [00:00<00:00, 5927.31it/s]


Current proecss folder: results(gpt-j)/MEMIT/run_000(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 4785.84it/s]


Current proecss folder: results(gpt-j)/MEMIT/run_001(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 6821.05it/s]


Current proecss folder: results(gpt-j)/MEMIT/run_002(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 4395.02it/s]


Current proecss folder: results(gpt-j)/WilKE/run_000(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 2816.08it/s]


Current proecss folder: results(gpt-j)/WilKE/run_001(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 8685.49it/s]


Current proecss folder: results(gpt-j)/WilKE/run_002(seed=48)


100%|██████████| 1025/1025 [00:00<00:00, 7745.48it/s]


In [3]:
import numpy as np

def calculate_average_and_std(metric):
    """Return the mean and standard deviation of ``metric`` along axis 0.

    Args:
        metric: Array-like passed straight to ``np.mean`` / ``np.std``; in this
            notebook it is a list holding one list of values per run.

    Returns:
        A ``(mean, std)`` tuple, both reduced over axis 0. ``np.std`` is called
        with its default arguments.
    """
    return np.mean(metric, axis=0), np.std(metric, axis=0)

In [4]:
# Print, per algorithm, the across-run mean and standard deviation of each metric.
for alg_name, alg_metrics in metrics.items():
    print(f"=============> For Method {alg_name} <=============")
    # The first key is skipped by position; every remaining key is reported.
    reported = list(alg_metrics)[1:]
    for metric in reported:
        summary = calculate_average_and_std(alg_metrics[metric])
        print(f"Metric {metric}: {summary}")

Metric ES: (array([0.        , 0.5       , 0.75      , 0.625     , 0.4375    ,
       0.21875   , 0.109375  , 0.0546875 , 0.02734375, 0.02018229,
       0.0374349 ]), array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00663935,
       0.023658  ]))
Metric GS: (array([0.        , 0.5       , 0.5       , 0.3125    , 0.25      ,
       0.125     , 0.0703125 , 0.03515625, 0.01757812, 0.01106771,
       0.01757812]), array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00256315,
       0.00979001]))
Metric LS: (array([0.4       , 0.2       , 0.325     , 0.2375    , 0.19375   ,
       0.1625    , 0.09739583, 0.04869792, 0.02434896, 0.02109375,
       0.04785156]), array([5.55111512e-17, 2.77555756e-17, 0.00000000e+00, 0.00000000e+00,
       2.77555756e-17, 0.00000000e+00, 1.94877989e-03, 9.74389944e-04,
       4.87194972e-04, 1.09757979e-02, 3.079513