In [2]:
import os
import json
import pandas as pd

# Define the base directory where results are stored
base_dir = "Evaluation_results"

short_model_names = {"MBZUAI_LaMini_GPT_1.5B": "LaMini","Qwen_Qwen2.5_1.5B_Instruct":"Qwen","instruction_pretrain_InstructLM_1.3B":"InstructLM","nvidia_AceInstruct_1.5B":"AceInst","_assets_models_meta_llama_3.2_instruct_3b":"llama-i(3B)"}

#print(short_model_names.keys())
# List to store extracted data
data = []

# Iterate over all model directories
for model_name in os.listdir(base_dir):
    model_path = os.path.join(base_dir, model_name)
    result_file = os.path.join(model_path, "results.json")
    #print(model_name)
    
    if os.path.isdir(model_path) and os.path.exists(result_file):
        with open(result_file, "r") as f:
            results = json.load(f)
            
            # Extract metrics and ensure all datasets have same format
            row = {"Model": short_model_names.get(model_name, model_name)}
            
            for dataset in ["gov_report", "summ_screen_fd", "qmsum", "qasper", "narrative_qa", "quality"]:
                metrics = results.get(dataset, {})
                
                if "Rouge1" in metrics:
                    row[f"{dataset}"] = f"{metrics.get('Rouge1', 0) * 100:.1f} / {metrics.get('Rouge2', 0) * 100:.1f} / {metrics.get('RougeL', 0) * 100:.1f}"
                elif "f1_score" in metrics:
                    row[f"{dataset}"] = f"{metrics.get('f1_score', 0) * 100:.1f}"
                elif "EM_score" in metrics:
                    row[f"{dataset}"] = f"{metrics.get('EM_score', 0) * 100:.1f}"
                else:
                    row[f"{dataset}"] = "N/A"
            
            data.append(row)

# Convert to Pandas DataFrame
df = pd.DataFrame(data)

# Rename columns for better readability
df.rename(columns={
    "gov_report": "Gov(R(1/2/L))",
    "summ_screen_fd": "SumScr(R(1/2/L))",
    "qmsum": "QMSum(R(1/2/L))",
    "qasper": "Qasper(F1)",
    "narrative_qa": "NarQA(F1)",
    "quality": "Qual(EM)"
}, inplace=True)

df["Avg"] = df.iloc[:, 1:].apply(lambda row: sum([float(x.split('/')[0]) if '/' in x else float(x) for x in row if x != "N/A"]) / len(row), axis=1)

# Print the table
print(df)

         Model       Gov(R(1/2/L))   SumScr(R(1/2/L))    QMSum(R(1/2/L))  \
0   InstructLM    12.0 / 3.5 / 7.6    6.8 / 0.5 / 5.2   11.3 / 2.0 / 8.4   
1      AceInst  45.9 / 14.4 / 20.8   17.8 / 2.9 / 9.9   16.1 / 3.5 / 9.6   
2       LaMini  29.3 / 11.1 / 16.8  20.4 / 2.3 / 12.6  21.8 / 6.7 / 16.1   
3  llama-i(3B)  50.3 / 19.7 / 24.1  28.1 / 6.5 / 15.2  29.2 / 7.1 / 18.1   
4         Qwen  44.3 / 17.2 / 21.5  18.6 / 2.8 / 10.5  30.6 / 7.0 / 18.7   

  Qasper(F1) NarQA(F1) Qual(EM)        Avg  
0        5.4       4.8      0.0   6.716667  
1        5.0       2.8      0.0  14.600000  
2        3.3       5.2      0.0  13.333333  
3       13.3      13.3      0.0  22.366667  
4        7.8      12.0      4.8  19.683333  
