In [3]:
import os
import json
import pandas as pd

# Define the base directory where results are stored
base_dir = "Evaluation_results"

# Result-directory name -> short label used in the "Model" column.
# Directories that are not listed here are shown under their full name.
short_model_names = {
    "MBZUAI_LaMini_GPT_1.5B": "LaMini",
    "Qwen_Qwen2.5_1.5B_Instruct": "Qwen",
    "instruction_pretrain_InstructLM_1.3B": "InstructLM",
    "nvidia_AceInstruct_1.5B": "AceInst",
    "meta-llama-3.2-instruct-3b-chunking": "llama-i(3B)-chunking",
    "meta-llama-3.2-instruct-3b": "llama-i(3B)",
}

# Dataset key in results.json -> column header in the printed table.
# The order of this mapping is the column order of the table.
dataset_columns = {
    "gov_report": "Gov(R(1/2/L))",
    "summ_screen_fd": "SumScr(R(1/2/L))",
    "qmsum": "QMSum(R(1/2/L))",
    "qasper": "Qasper(F1)",
    "narrative_qa": "NarQA(F1)",
    "quality": "Qual(EM)",
}

# Cell text used when a dataset has no recognised metric.
not_available = "N/A"


def format_scores(metrics):
    """Render one dataset's metrics as the text shown in a table cell.

    Args:
        metrics: The value stored under a dataset key in results.json.
            Expected to be a dict; anything else is treated as "no metrics".

    Returns:
        "R1 / R2 / RL" when a "Rouge1" key is present, a single number when
        "f1_score" or "EM_score" is present (checked in that order), and
        "N/A" otherwise. Scores are multiplied by 100 and shown with one
        decimal place.
    """
    if not isinstance(metrics, dict):
        return not_available

    def as_percent(key):
        # A missing key has always been reported as 0.0; a null value is
        # treated the same way instead of raising TypeError.
        return f"{(metrics.get(key) or 0) * 100:.1f}"

    if "Rouge1" in metrics:
        return " / ".join(as_percent(key) for key in ("Rouge1", "Rouge2", "RougeL"))
    if "f1_score" in metrics:
        return as_percent("f1_score")
    if "EM_score" in metrics:
        return as_percent("EM_score")
    return not_available


def build_row(model_name, results):
    """Build the table row for one model.

    Args:
        model_name: Name of the model's result directory.
        results: Parsed content of that directory's results.json.

    Returns:
        A dict with a "Model" entry (short label when one is configured)
        and one formatted cell per key of ``dataset_columns``.
    """
    if not isinstance(results, dict):
        results = {}
    row = {"Model": short_model_names.get(model_name, model_name)}
    for dataset in dataset_columns:
        row[dataset] = format_scores(results.get(dataset))
    return row


def collect_rows(results_dir):
    """Read ``<results_dir>/<model>/results.json`` for every model directory.

    Entries without a results.json file are skipped. Models are visited in
    sorted order because os.listdir returns names in arbitrary order, which
    would otherwise make the table order vary between runs.

    Args:
        results_dir: Directory containing one sub-directory per model.

    Returns:
        A list of row dicts as produced by ``build_row``; empty when the
        directory does not exist or holds no results.
    """
    if not os.path.isdir(results_dir):
        print(f"Results directory not found: {results_dir}")
        return []

    rows = []
    for model_name in sorted(os.listdir(results_dir)):
        result_file = os.path.join(results_dir, model_name, "results.json")
        if not os.path.isfile(result_file):
            continue
        with open(result_file, "r", encoding="utf-8") as f:
            results = json.load(f)
        rows.append(build_row(model_name, results))
    return rows


def leading_score_mean(cells):
    """Average the first number of every cell that holds a score.

    For "R1 / R2 / RL" cells only the first value (Rouge-1) is used.
    "N/A" cells are left out of both the sum and the count, so a missing
    dataset does not drag the average down.

    Args:
        cells: Iterable of formatted cell strings.

    Returns:
        The mean as a float, or NaN when no cell holds a score.
    """
    scores = [float(cell.split("/")[0]) for cell in cells if cell != not_available]
    if not scores:
        return float("nan")
    return sum(scores) / len(scores)


# List of extracted rows, one dict per model
data = collect_rows(base_dir)

# Convert to Pandas DataFrame; naming the columns explicitly keeps the
# table well-formed even when no results were found.
df = pd.DataFrame(data, columns=["Model", *dataset_columns])

# Rename columns for better readability
df.rename(columns=dataset_columns, inplace=True)

# Built as an explicit list rather than DataFrame.apply(axis=1) so that an
# empty table yields an empty float column instead of an assignment error.
score_columns = list(dataset_columns.values())
df["Avg"] = pd.Series(
    [
        leading_score_mean(cells)
        for cells in df[score_columns].itertuples(index=False, name=None)
    ],
    index=df.index,
    dtype="float64",
)

# Print the table
print(df)

                  Model       Gov(R(1/2/L))   SumScr(R(1/2/L))  \
0  llama-i(3B)-chunking  49.4 / 17.2 / 22.0  30.3 / 6.4 / 15.7   
1           llama-i(3B)  53.7 / 21.4 / 24.1  29.2 / 6.3 / 15.6   

     QMSum(R(1/2/L)) Qasper(F1) NarQA(F1) Qual(EM)        Avg  
0  28.2 / 6.5 / 18.0       10.6       5.9     33.3  26.283333  
1  34.1 / 8.7 / 20.6       14.8      22.3     38.1  32.033333  
