In [1]:
import json
from typing import List, Dict
from tqdm import tqdm
import os
from string import punctuation

In [2]:
# Parse every raw GPT-4 response file into a list of entity strings and save
# the result as one JSON object keyed by the raw file name.
exists_results = os.listdir('GPT4_results/raw')

save_dic = {}
for md5 in tqdm(exists_results):
    with open('GPT4_results/raw/' + md5, 'r', encoding='utf-8') as fraw:
        rsp = fraw.read()

    # Trim surrounding spaces, newlines, pipes and punctuation, then split on
    # the "||" separator and strip each piece.
    # NOTE: a response that is empty after trimming yields [''] (one empty
    # entity), because ''.split('||') returns [''].
    rsp = rsp.strip(' |\n' + punctuation)
    save_dic[md5] = [entity.strip() for entity in rsp.split('||')]

# Write through a context manager so the file is flushed and closed before
# any later cell reads it back.
with open('GPT4_results.json', 'w', encoding='utf-8') as fout:
    json.dump(save_dic, fout, indent=4)

100%|██████████| 4826/4826 [00:10<00:00, 474.47it/s]


In [3]:
# Parse every raw Gemini response file into a list of entity strings and save
# the result as one JSON object keyed by the raw file name.
exists_results = os.listdir('Gemini_results/raw')

save_dic = {}
for md5 in tqdm(exists_results):
    with open('Gemini_results/raw/' + md5, 'r', encoding='utf-8') as fraw:
        rsp = fraw.read()

    # Trim surrounding spaces, newlines, pipes and punctuation, then split on
    # the "||" separator and strip each piece.
    # NOTE: a response that is empty after trimming yields [''] (one empty
    # entity), because ''.split('||') returns [''].
    rsp = rsp.strip(' |\n' + punctuation)
    save_dic[md5] = [entity.strip() for entity in rsp.split('||')]

# Write through a context manager so the file is flushed and closed before
# any later cell reads it back.
with open('Gemini_results.json', 'w', encoding='utf-8') as fout:
    json.dump(save_dic, fout, indent=4)

100%|██████████| 4527/4527 [00:09<00:00, 471.21it/s]


In [21]:
# Parse every raw GPT-4 Turbo response file into a list of entity strings and
# save the result as one JSON object keyed by the raw file name.
exists_results = os.listdir('GPT4Turbo_results/raw')

save_dic = {}
for md5 in tqdm(exists_results):
    with open('GPT4Turbo_results/raw/' + md5, 'r', encoding='utf-8') as fraw:
        rsp = fraw.read()

    # Trim surrounding spaces, newlines, pipes and punctuation, then split on
    # the "||" separator and strip each piece.
    # NOTE: a response that is empty after trimming yields [''] (one empty
    # entity), because ''.split('||') returns [''].
    rsp = rsp.strip(' |\n' + punctuation)
    save_dic[md5] = [entity.strip() for entity in rsp.split('||')]

# Write through a context manager so the file is flushed and closed before
# any later cell reads it back.
with open('GPT4t_results.json', 'w', encoding='utf-8') as fout:
    json.dump(save_dic, fout, indent=4)

100%|██████████| 3486/3486 [00:00<00:00, 16908.31it/s]


In [22]:
# Load the parsed entity lists (file name -> list of entity strings) for each
# model. Each file is opened in a `with` block so its handle is closed
# immediately instead of being left to the garbage collector.
with open('Gemini_results.json', 'r', encoding='utf-8') as fin:
    Gemini_results = json.load(fin)
with open('GPT4_results.json', 'r', encoding='utf-8') as fin:
    GPT4_results = json.load(fin)
with open('GPT4t_results.json', 'r', encoding='utf-8') as fin:
    GPT4t_results = json.load(fin)

In [20]:
# Concatenate every document's GPT-4 entity list into one flat list,
# following the dict's iteration order.
ground_truth = []
for value in GPT4_results.values():
    ground_truth.extend(value)

In [21]:
# Concatenate every document's Gemini entity list into one flat list,
# following the dict's iteration order.
Gemini = []
for value in Gemini_results.values():
    Gemini.extend(value)

In [25]:
# Zeroed accumulators: summed per-document scores (macro) and summed raw
# counts (micro).
macro_sum = dict.fromkeys(("precision", "recall", "f1"), 0)
micro_total = dict.fromkeys(("true_positives", "predicted", "actual"), 0)

In [24]:
def calculate_precision_recall_f1(pred: List, gold: List) -> Dict[str, float]:
    """Score one document's predicted items against its gold items.

    Items are compared with ``==``. Each predicted item is matched against at
    most one not-yet-matched gold item, so a prediction repeated more often
    than it occurs in ``gold`` is only credited once per gold occurrence and
    recall can never exceed 1. For inputs without duplicates this is the same
    as counting the predicted items that appear in ``gold``.

    Args:
        pred: Predicted items for the document.
        gold: Reference items for the document. Not modified.

    Returns:
        A dict with keys ``"precision"``, ``"recall"`` and ``"f1"``. A metric
        whose denominator would be zero (empty ``pred``, empty ``gold``, or
        precision + recall == 0) is reported as 0.
    """
    # Work on a copy so matched items can be removed without touching `gold`.
    unmatched_gold = list(gold)
    true_positives = 0
    for item in pred:
        if item in unmatched_gold:
            unmatched_gold.remove(item)
            true_positives += 1

    precision = true_positives / len(pred) if pred else 0
    recall = true_positives / len(gold) if gold else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0
    return {"precision": precision, "recall": recall, "f1": f1}

In [26]:
# Score Gemini against GPT-4 (used as the reference) document by document.
# The accumulators are reset here so that re-running this cell does not add
# the same documents to the totals a second time.
macro_sum = {"precision": 0, "recall": 0, "f1": 0}
micro_total = {"true_positives": 0, "predicted": 0, "actual": 0}

for doc_id, pred in Gemini_results.items():
    # Raises KeyError if GPT-4 has no entry for this document.
    gold = GPT4_results[doc_id]

    # Per-document scores, summed for the macro average
    scores = calculate_precision_recall_f1(pred, gold)
    for metric in macro_sum:
        macro_sum[metric] += scores[metric]

    # Raw counts, summed for the micro average. precision == TP / len(pred),
    # so the integer true-positive count is recovered from the score rather
    # than recounted; this keeps micro and macro on the same matching rule.
    micro_total["true_positives"] += round(scores["precision"] * len(pred))
    micro_total["predicted"] += len(pred)
    micro_total["actual"] += len(gold)

In [27]:
# Macro average: mean of the per-document scores (empty dict when there are
# no documents).
num_docs = len(Gemini_results)
macro_avg = {metric: total / num_docs for metric, total in macro_sum.items()} if num_docs > 0 else {}

# Micro average: precision/recall/F1 computed from the pooled counts, each
# falling back to 0 when its denominator is zero.
if micro_total["predicted"]:
    micro_precision = micro_total["true_positives"] / micro_total["predicted"]
else:
    micro_precision = 0

if micro_total["actual"]:
    micro_recall = micro_total["true_positives"] / micro_total["actual"]
else:
    micro_recall = 0

if micro_precision + micro_recall:
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
else:
    micro_f1 = 0

micro_avg = dict(precision=micro_precision, recall=micro_recall, f1=micro_f1)

print({"macro_average": macro_avg, "micro_average": micro_avg})

{'macro_average': {'precision': 0.7294426307633526, 'recall': 0.5243439856440241, 'f1': 0.5986735429580398}, 'micro_average': {'precision': 0.7362313979019273, 'recall': 0.5125415308841169, 'f1': 0.604351934714721}}


In [32]:
# Start the next comparison from zeroed accumulators: summed per-document
# scores (macro) and summed raw counts (micro).
macro_sum = dict(precision=0, recall=0, f1=0)
micro_total = dict(true_positives=0, predicted=0, actual=0)

In [33]:
# Score GPT-4 Turbo against GPT-4 (used as the reference) document by
# document. The accumulators are reset here so that re-running this cell does
# not add the same documents to the totals a second time.
macro_sum = {"precision": 0, "recall": 0, "f1": 0}
micro_total = {"true_positives": 0, "predicted": 0, "actual": 0}

for doc_id, pred in GPT4t_results.items():
    # Raises KeyError if GPT-4 has no entry for this document.
    gold = GPT4_results[doc_id]

    # Per-document scores, summed for the macro average
    scores = calculate_precision_recall_f1(pred, gold)
    for metric in macro_sum:
        macro_sum[metric] += scores[metric]

    # Raw counts, summed for the micro average. precision == TP / len(pred),
    # so the integer true-positive count is recovered from the score rather
    # than recounted; this keeps micro and macro on the same matching rule.
    micro_total["true_positives"] += round(scores["precision"] * len(pred))
    micro_total["predicted"] += len(pred)
    micro_total["actual"] += len(gold)

In [34]:
# Macro average: mean of the per-document scores (empty dict when there are
# no documents).
num_docs = len(GPT4t_results)
macro_avg = {metric: total / num_docs for metric, total in macro_sum.items()} if num_docs > 0 else {}

# Micro average: precision/recall/F1 computed from the pooled counts, each
# falling back to 0 when its denominator is zero.
if micro_total["predicted"]:
    micro_precision = micro_total["true_positives"] / micro_total["predicted"]
else:
    micro_precision = 0

if micro_total["actual"]:
    micro_recall = micro_total["true_positives"] / micro_total["actual"]
else:
    micro_recall = 0

if micro_precision + micro_recall:
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
else:
    micro_f1 = 0

micro_avg = dict(precision=micro_precision, recall=micro_recall, f1=micro_f1)

print({"macro_average": macro_avg, "micro_average": micro_avg})

{'macro_average': {'precision': 0.7928396045414225, 'recall': 0.6294179771810295, 'f1': 0.6948323781821505}, 'micro_average': {'precision': 0.7890355469353427, 'recall': 0.6145228992326819, 'f1': 0.6909301329405487}}
