In [None]:
import ast
import json
import os

import pandas as pd
from tqdm.std import tqdm

## load data

In [None]:
# Directory whose files are read with ``pd.read_csv`` below; change it to your own path.
file_path = "covid-19/expansion/"

In [None]:
# Collect the files to load from ``file_path``.
# ``os.listdir`` returns entries in arbitrary order and also lists hidden files
# and sub-directories (e.g. ``.ipynb_checkpoints``, ``.DS_Store``), which would
# make ``pd.read_csv`` fail further down, so keep only regular, non-hidden files
# and sort them for a reproducible load order.
filename_list = sorted(
    name
    for name in os.listdir(file_path)
    if not name.startswith(".") and os.path.isfile(os.path.join(file_path, name))
)

In [None]:
filename_list

In [None]:
# Read every listed file into its own frame, then stack them into a single
# frame with a fresh 0..n-1 index.
df_list = [
    pd.read_csv(os.path.join(file_path, filename))
    for filename in filename_list
]

covid_df = pd.concat(df_list, ignore_index=True)

In [None]:
covid_df

In [None]:
## extract the columns needed for evaluation
# Take an explicit copy: this frame is modified in a later cell, and mutating a
# column subset of ``covid_df`` directly triggers pandas' SettingWithCopyWarning.
df = covid_df[["Query", "GoldName", "GoldID", "allResult"]].copy()

In [None]:
# Replace missing values with the literal string "null".
# Reassign instead of using ``inplace=True``: ``df`` is a column subset of
# ``covid_df``, and in-place mutation of such a frame raises
# SettingWithCopyWarning; reassignment also keeps this cell safe to re-run.
df = df.fillna("null")

In [None]:
df

In [None]:
## read the predicted concept id from one result entry
def get_predId(element):
    """Return the value stored under ``"conceptId"`` in ``element``.

    ``element`` must offer a ``.get`` method (e.g. a dict); ``None`` is
    returned when the key is absent.
    """
    return element.get("conceptId")

In [None]:
## get the actual results:
def filter_results(results):
    """Keep only the results whose ``"score"`` is strictly positive.

    Parameters
    ----------
    results : iterable of dict-like
        Result entries; each must offer a ``.get`` method.

    Returns
    -------
    list
        The entries of ``results`` with ``score > 0``, in their original order.
        Entries with a missing or ``None`` score are dropped.
    """
    result_list = []
    for result in results:
        score = result.get("score")
        # A missing/None score cannot be compared with 0 in Python 3
        # (``None > 0`` raises TypeError); treat it as "no match" instead.
        if score is not None and score > 0:
            result_list.append(result)
    return result_list

In [None]:
## extracting goldId and predId
def _parse_results(raw):
    """Turn one serialized ``allResult`` cell into a Python object.

    The text is parsed as JSON first. If that fails, it is parsed as a Python
    literal with ``ast.literal_eval`` after rewriting ``null`` to ``None``
    (the rewrite the notebook originally applied before ``eval``).
    ``ast.literal_eval`` only accepts literals, so file contents can no longer
    execute arbitrary code. Parsing errors from the fallback propagate.

    A non-string cell or a parsed ``None`` (e.g. the bare string "null")
    yields an empty list.
    """
    if not isinstance(raw, str):
        return []
    try:
        parsed = json.loads(raw)
    except ValueError:
        # Not valid JSON. NOTE: the blanket replace also touches "null" inside
        # string values, which is why JSON parsing is attempted first.
        parsed = ast.literal_eval(raw.replace("null", "None"))
    return [] if parsed is None else parsed


def extract_GoldIdAndPredId(df, top_n=10):
    """Build parallel lists of gold ids and predicted ids, one entry per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``GoldID`` and ``allResult``.
    top_n : int, default 10
        Number of leading entries of ``allResult`` that are considered before
        they are passed through ``filter_results``.

    Returns
    -------
    (goldIds, predIds) : tuple of list of list
        ``goldIds[i]`` is ``GoldID`` of row ``i`` split on ``"||"``.
        ``predIds[i]`` holds ``get_predId(result)`` for every retained result
        of row ``i``, or ``["Mapping_Not_Found"]`` when nothing is retained.
    """
    goldIds = []
    predIds = []
    for row in df.itertuples():
        # str() keeps ids splittable even if pandas parsed them as numbers.
        goldIds.append(str(row.GoldID).split("||"))

        candidates = _parse_results(row.allResult)[:top_n]
        retained = filter_results(candidates)
        if retained:
            predIds.append([get_predId(result) for result in retained])
        else:
            # Sentinel so that rows without any usable prediction can still be
            # matched against a gold label of "Mapping_Not_Found".
            predIds.append(["Mapping_Not_Found"])
    return goldIds, predIds
        

In [None]:
# Build the parallel gold / predicted id lists for every row of ``df``.
goldIds, predIds = extract_GoldIdAndPredId(df)

In [None]:
def calculate_acc_at_k(goldId, predId, k):
    """Fraction of queries with at least one gold id among the first ``k`` predictions.

    Parameters
    ----------
    goldId : list of list
        Gold ids per query.
    predId : list of list
        Ranked predicted ids per query, aligned with ``goldId``.
    k : int
        Number of leading predictions to inspect per query.

    Returns
    -------
    float or int
        ``hits / len(goldId)``; ``0`` when ``goldId`` is empty, matching the
        zero-denominator convention of the precision/recall helpers.
    """
    total = len(goldId)
    if total == 0:
        # An empty evaluation set would otherwise raise ZeroDivisionError.
        return 0
    correct = sum(
        1
        for gold, pred in zip(goldId, predId)
        if any(p in gold for p in pred[:k])
    )
    return correct / total


def calculate_precision_recall_f1(goldId, predId):
    """Micro-averaged precision, recall and F1 over the full prediction lists.

    For each aligned ``(gold, pred)`` pair the number of distinct ids present
    in both is counted as true positives; ``len(pred)`` and ``len(gold)`` are
    summed as the precision and recall denominators. Any ratio whose
    denominator is zero is reported as ``0``.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``.
    """
    hits = n_pred = n_gold = 0

    for gold_ids, pred_ids in zip(goldId, predId):
        hits += len(set(gold_ids).intersection(pred_ids))
        n_pred += len(pred_ids)
        n_gold += len(gold_ids)

    if n_pred > 0:
        precision = hits / n_pred
    else:
        precision = 0

    if n_gold > 0:
        recall = hits / n_gold
    else:
        recall = 0

    denom = precision + recall
    if denom > 0:
        f1 = 2 * (precision * recall) / denom
    else:
        f1 = 0

    return precision, recall, f1


def calculate_precision_recall_at_k(goldId, predId, k):
    """Micro-averaged precision, recall and F1 using only the first ``k`` predictions.

    Same counting as the untruncated variant, except that each prediction list
    is cut to ``pred[:k]`` before intersecting with the gold ids and before it
    contributes to the precision denominator. Any ratio whose denominator is
    zero is reported as ``0``.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``.
    """
    hits = n_pred = n_gold = 0

    for gold_ids, pred_ids in zip(goldId, predId):
        top_k = pred_ids[:k]
        hits += len(set(gold_ids).intersection(top_k))
        n_pred += min(len(top_k), k)
        n_gold += len(gold_ids)

    if n_pred > 0:
        precision = hits / n_pred
    else:
        precision = 0

    if n_gold > 0:
        recall = hits / n_gold
    else:
        recall = 0

    denom = precision + recall
    if denom > 0:
        f1 = 2 * (precision * recall) / denom
    else:
        f1 = 0

    return precision, recall, f1

In [None]:
# Cut-offs at which the ranked predictions are scored.
k_values = [1, 5, 10]

total_acc = {}

# Per cut-off: accuracy plus precision / recall / F1 over the first k predictions.
for k in k_values:
    avg_acc = calculate_acc_at_k(goldIds, predIds, k)
    precison, recall, f = calculate_precision_recall_at_k(goldIds, predIds, k)
    total_acc.update({
        f'acc@{k}': avg_acc,
        f"precision@{k}": precison,
        f"recall@{k}": recall,
        f"f1@{k}": f,
    })

# Precision / recall / F1 over the complete prediction lists.
p, r, f1 = calculate_precision_recall_f1(goldIds, predIds)
total_acc.update(precision=p, recall=r, f1=f1)

total_acc

## Evaluate only the queries that have a gold mapping (drop `Mapping_Not_Found`)

In [None]:
# Rows whose gold label is anything other than the "Mapping_Not_Found" sentinel.
df1 = df.loc[df["GoldID"] != "Mapping_Not_Found"]

In [None]:
# Rebuild the gold / predicted id lists from ``df1`` only (overwrites the previous lists).
goldIds, predIds = extract_GoldIdAndPredId(df1)

In [None]:
# Cut-offs at which the ranked predictions are scored.
k_values = [1, 5, 10]

new_total_acc = {}

# Per cut-off: accuracy plus precision / recall / F1 over the first k predictions.
for k in k_values:
    avg_acc = calculate_acc_at_k(goldIds, predIds, k)
    precison, recall, f = calculate_precision_recall_at_k(goldIds, predIds, k)
    new_total_acc.update({
        f'acc@{k}': avg_acc,
        f"precision@{k}": precison,
        f"recall@{k}": recall,
        f"f1@{k}": f,
    })

# Precision / recall / F1 over the complete prediction lists.
p, r, f1 = calculate_precision_recall_f1(goldIds, predIds)
new_total_acc.update(precision=p, recall=r, f1=f1)

new_total_acc

## Evaluate only the queries whose gold label is `Mapping_Not_Found`

In [None]:
# Rows whose gold label is exactly the "Mapping_Not_Found" sentinel.
df2 = df.loc[df["GoldID"] == "Mapping_Not_Found"]

In [None]:
df2

In [None]:
# Rebuild the gold / predicted id lists from ``df2`` only (overwrites the previous lists).
goldIds, predIds = extract_GoldIdAndPredId(df2)

In [None]:
# Cut-offs at which the ranked predictions are scored.
k_values = [1, 5, 10]

# NOTE: reuses the name ``new_total_acc``, so any earlier value bound to it is replaced.
new_total_acc = {}

# Per cut-off: accuracy plus precision / recall / F1 over the first k predictions.
for k in k_values:
    avg_acc = calculate_acc_at_k(goldIds, predIds, k)
    precison, recall, f = calculate_precision_recall_at_k(goldIds, predIds, k)
    new_total_acc.update({
        f'acc@{k}': avg_acc,
        f"precision@{k}": precison,
        f"recall@{k}": recall,
        f"f1@{k}": f,
    })

# Precision / recall / F1 over the complete prediction lists.
p, r, f1 = calculate_precision_recall_f1(goldIds, predIds)
new_total_acc.update(precision=p, recall=r, f1=f1)

new_total_acc