In [1]:
import json
import os
from datetime import datetime

import pandas as pd

from supertrainer import SUPERTRAINER_PUBLIC_ROOT

# Define the directory containing the folders
dataset_dir = os.getenv(SUPERTRAINER_PUBLIC_ROOT) # Replace with the path to your directory

# Initialize lists to store metrics and results
metrics_list = []
results_list = []

# Define the filter date
date_filter = datetime(2024, 10, 28, 10, 14, 31)

# Loop through each folder in the directory
for folder_name in os.listdir(dataset_dir):
    folder_path = os.path.join(dataset_dir, folder_name)
    if os.path.isdir(folder_path):
        # Extract date, dataset name, split name, and model name from folder name (assuming the format contains these details)
        try:
            parts = folder_name.split('-')
            dataset_name = parts[0]
            split_name = parts[1]
            date_str = parts[2]
            folder_date = datetime.strptime(date_str, "%Y%m%d_%H%M%S")
            model_name = '-'.join(parts[3:])
        except (ValueError, IndexError):
            continue  # Skip folders without a valid format

        # Apply date filter
        if folder_date <= date_filter:
            continue

        # Load metrics.json if available
        metrics_path = os.path.join(folder_path, "metrics.json")
        if os.path.exists(metrics_path):
            with open(metrics_path, "r", encoding="utf-8") as f:
                metrics = json.load(f)
                metrics["folder_name"] = folder_name
                metrics["date"] = folder_date
                metrics["dataset_name"] = dataset_name
                metrics["split_name"] = split_name
                metrics["model_name"] = model_name
                metrics_list.append(metrics)

        # Load results.json if available
        results_path = os.path.join(folder_path, "results.json")
        if os.path.exists(results_path):
            with open(results_path, "r", encoding="utf-8") as f:
                results = json.load(f)
                for result in results:
                    result["folder_name"] = folder_name
                    result["date"] = folder_date
                    result["dataset_name"] = dataset_name
                    result["split_name"] = split_name
                    result["model_name"] = model_name
                    results_list.append(result)

# Sort metrics and results by date (most recent first)
metrics_list = sorted(metrics_list, key=lambda x: x["date"], reverse=True)
results_list = sorted(results_list, key=lambda x: x["date"], reverse=True)

# Create DataFrames for better visualization and manipulation
metrics_df = pd.DataFrame(metrics_list)
results_df = pd.DataFrame(results_list)

# Save merged metrics and results to JSON files
merged_metrics_path = os.path.join(dataset_dir, "merged_metrics.json")
merged_results_path = os.path.join(dataset_dir, "merged_results.json")

with open(merged_metrics_path, "w", encoding="utf-8") as f:
    json.dump(metrics_list, f, default=str, indent=4)

with open(merged_results_path, "w", encoding="utf-8") as f:
    json.dump(results_list, f, default=str, indent=4)

# Optionally, display the merged DataFrames
print("Merged Metrics DataFrame:")
print(metrics_df.head())
print("\nMerged Results DataFrame:")
print(results_df.head())

Merged Metrics DataFrame:
   accuracy  precision    recall  f1_score  \
0     0.320   0.295038  0.297607  0.273850   
1     0.320   0.295002  0.296524  0.274947   
2     0.345   0.289295  0.277037  0.269059   
3     0.285   0.306417  0.303875  0.261300   
4     0.360   0.318717  0.311111  0.295991   

                                         folder_name                date  \
0  fake_news_detection_dataset_cross_lingual_form... 2024-10-28 21:49:04   
1  fake_news_detection_dataset_cross_lingual_form... 2024-10-28 21:43:41   
2  fake_news_detection_dataset_cross_lingual_form... 2024-10-28 21:38:56   
3  fake_news_detection_dataset_cross_lingual_form... 2024-10-28 21:34:19   
4  fake_news_detection_dataset_cross_lingual_form... 2024-10-28 21:28:34   

                                        dataset_name  \
0  fake_news_detection_dataset_cross_lingual_form...   
1  fake_news_detection_dataset_cross_lingual_form...   
2  fake_news_detection_dataset_cross_lingual_form...   
3  fake_news_det

In [57]:
metrics_df.iloc[-1]

accuracy                                                      0.4
precision                                                 0.54373
recall                                                   0.588433
f1_score                                                 0.425829
folder_name     fake_news_detection_dataset_cross_lingual_form...
date                                          2024-10-28 10:14:32
dataset_name    fake_news_detection_dataset_cross_lingual_form...
split_name                             train_claim_en_evidence_en
model_name                             claude-3-5-sonnet-20240620
Name: 82, dtype: object

In [2]:
metrics_df["model_name"].unique()

array(['gemma-2-9b-bnb-4bit', 'gpt-4o-mini',
       'bert-base-multilingual-uncased', 'indobert-base-uncased',
       'xlm-roberta-base', 'bert-base-arabic', 'Llama-3.2-3B-Instruct',
       'mistral-7b-instruct-v0.3-bnb-4bit', 'Qwen2.5-7B-bnb-4bit',
       'claude-3-5-sonnet-20240620'], dtype=object)

In [5]:
bert_based_models = {"xlm-roberta-base": "Bilal", "bert-base-arabic": "Bilal", "bert-base-multilingual-uncased": "Erland", "indobert-base-uncased": "Erland"}
open_llms = {"Llama-3.2-3B-Instruct": "Bilal", "mistral-7b-instruct-v0.3-bnb-4bit": "Bilal", "gemma-2-9b-bnb-4bit": "Erland", "Qwen2.5-7B-bnb-4bit": "Erland"}
closed_llms = {"gpt-4o-mini": "Bilal", "claude-3-5-sonnet-20240620": "Erland"}

metrics_df["person"] = metrics_df["model_name"].apply(lambda x: bert_based_models.get(x) or open_llms.get(x) or closed_llms.get(x))

bilal_df = metrics_df[metrics_df["person"] == "Bilal"]
erland_df = metrics_df[metrics_df["person"] == "Erland"]

model_map_to_short = {
    "xlm-roberta-base": "XLM-R",
    "bert-base-arabic": "ArabicBERT",
    "bert-base-multilingual-uncased": "mBERT",
    "indobert-base-uncased": "IndoBERT",
    "Llama-3.2-3B-Instruct": "Llama",
    "mistral-7b-instruct-v0.3-bnb-4bit": "Mistral",
    "gemma-2-9b-bnb-4bit": "Gemma",
    "Qwen2.5-7B-bnb-4bit": "Qwen",
    "gpt-4o-mini": "GPT-4o",
    "claude-3-5-sonnet-20240620": "Claude"
}

In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Model categorization
bert_based_models = {
    "xlm-roberta-base": "BERT-based",
    "bert-base-arabic": "BERT-based", 
    "bert-base-multilingual-uncased": "BERT-based",
    "indobert-base-uncased": "BERT-based"
}

open_llms = {
    "Llama-3.2-3B-Instruct": "Open LLM",
    "mistral-7b-instruct-v0.3-bnb-4bit": "Open LLM",
    "gemma-2-9b-bnb-4bit": "Open LLM",
    "Qwen2.5-7B-bnb-4bit": "Open LLM"
}

closed_llms = {
    "gpt-4o-mini": "Closed LLM",
    "claude-3-5-sonnet-20240620": "Closed LLM"
}

# Model sizes (in billions of parameters) - approximate values
model_sizes = {
    "xlm-roberta-base": 0.125,
    "bert-base-arabic": 0.11,
    "bert-base-multilingual-uncased": 0.11,
    "indobert-base-uncased": 0.11,
    "Llama-3.2-3B-Instruct": 3,
    "mistral-7b-instruct-v0.3-bnb-4bit": 7,
    "gemma-2-9b-bnb-4bit": 2.9,
    "Qwen2.5-7B-bnb-4bit": 7,
    "gpt-4o-mini": 20,  
    "claude-3-5-sonnet-20240620": 20  
}

# Combine all models into a dictionary with their categories
all_models = {**bert_based_models, **open_llms, **closed_llms}

def process_data(df):
    # Filter for English evidence only
    df_en = df[df['folder_name'].str.contains('evidence_en')]
    
    # Add model category and size
    df_en['model_category'] = df_en['model_name'].map(lambda x: [v for k,v in all_models.items() if k in x][0])
    df_en['model_size'] = df_en['model_name'].map(lambda x: model_sizes[[k for k in model_sizes.keys() if k in x][0]])
    
    # Extract claim language
    df_en['claim_language'] = df_en['folder_name'].str.extract('claim_(\w+)_evidence')
    
    return df_en

def plot_performance_by_category(df_en):
    plt.figure(figsize=(12, 6))
    
    # Create grouped bar plot
    metrics = ['accuracy', 'precision', 'recall', 'f1_score']
    x = np.arange(len(df_en['model_category'].unique()))
    width = 0.2
    
    for i, metric in enumerate(metrics):
        means = df_en.groupby('model_category')[metric].mean()
        plt.bar(x + i*width, means, width, label=metric.replace('_', ' ').title())
    
    plt.xlabel('Model Category')
    plt.ylabel('Score')
    plt.title('Model Performance by Category (English Evidence)')
    plt.xticks(x + width*1.5, df_en['model_category'].unique(), rotation=0)
    plt.legend()
    plt.tight_layout()
    plt.savefig('category_performance.pdf', format='pdf', bbox_inches='tight', dpi=300)
    plt.close()

def plot_language_comparison(df_en):
    plt.figure(figsize=(10, 6))
    
    sns.boxplot(data=df_en, x='claim_language', y='f1_score', hue='model_category')
    plt.title('F1 Score by Claim Language and Model Category')
    plt.xlabel('Claim Language')
    plt.ylabel('F1 Score')
    plt.tight_layout()
    plt.savefig('language_comparison.pdf', format='pdf', bbox_inches='tight', dpi=300)
    plt.close()

def plot_size_impact(df_en):
    plt.figure(figsize=(10, 6))
    
    sns.scatterplot(data=df_en, x='model_size', y='f1_score', 
                    hue='model_category', style='claim_language',
                    s=100, alpha=0.7)
    
    plt.xscale('log')  # Log scale for model size
    plt.title('Impact of Model Size on Performance')
    plt.xlabel('Model Size (Billion Parameters)')
    plt.ylabel('F1 Score')
    plt.tight_layout()
    plt.savefig('size_impact.pdf', format='pdf', bbox_inches='tight', dpi=300)
    plt.close()

# Assuming df is your original DataFrame
# df_en = process_data(metrics_df)
# plot_performance_by_category(df_en)
# plot_language_comparison(df_en)
# plot_size_impact(df_en)

In [24]:
metrics_df

Unnamed: 0,accuracy,precision,recall,f1_score,folder_name,date,dataset_name,split_name,model_name,person
0,0.320,0.295038,0.297607,0.273850,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 21:49:04,fake_news_detection_dataset_cross_lingual_form...,train_claim_arb_evidence_arb,gemma-2-9b-bnb-4bit,Erland
1,0.320,0.295002,0.296524,0.274947,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 21:43:41,fake_news_detection_dataset_cross_lingual_form...,train_claim_arb_evidence_idn,gemma-2-9b-bnb-4bit,Erland
2,0.345,0.289295,0.277037,0.269059,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 21:38:56,fake_news_detection_dataset_cross_lingual_form...,train_claim_arb_evidence_en,gemma-2-9b-bnb-4bit,Erland
3,0.285,0.306417,0.303875,0.261300,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 21:34:19,fake_news_detection_dataset_cross_lingual_form...,train_claim_idn_evidence_arb,gemma-2-9b-bnb-4bit,Erland
4,0.360,0.318717,0.311111,0.295991,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 21:28:34,fake_news_detection_dataset_cross_lingual_form...,train_claim_idn_evidence_idn,gemma-2-9b-bnb-4bit,Erland
...,...,...,...,...,...,...,...,...,...,...
78,0.380,0.530741,0.587863,0.404891,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 11:36:01,fake_news_detection_dataset_cross_lingual_form...,train_claim_idn_evidence_idn,claude-3-5-sonnet-20240620,Erland
79,0.370,0.565988,0.599430,0.395954,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 11:10:11,fake_news_detection_dataset_cross_lingual_form...,train_claim_idn_evidence_en,claude-3-5-sonnet-20240620,Erland
80,0.485,0.552472,0.620171,0.492959,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 10:51:42,fake_news_detection_dataset_cross_lingual_form...,train_claim_en_evidence_arb,claude-3-5-sonnet-20240620,Erland
81,0.430,0.568176,0.588205,0.461038,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 10:33:23,fake_news_detection_dataset_cross_lingual_form...,train_claim_en_evidence_idn,claude-3-5-sonnet-20240620,Erland


In [30]:
def create_processed_dataframes(df):
    """
    Process the original dataframe to create filtered and summary dataframes
    """
    # Step 1: Create a model category mapping
    model_category_map = {**bert_based_models, **open_llms, **closed_llms}
    
    # Step 2: Filter for only English evidence
    mask_en_evidence = df['folder_name'].str.contains('evidence_en')
    df_filtered = df[mask_en_evidence].copy()
    
    # Step 3: Add model category and size
    df_filtered['model_category'] = df_filtered['model_name'].map(
        lambda x: next((v for k, v in model_category_map.items() if k in x), "Unknown")
    )
    df_filtered['model_size'] = df_filtered['model_name'].map(
        lambda x: next((v for k, v in model_sizes.items() if k in x), 0)
    )
    
    # Step 4: Extract claim language
    df_filtered['claim_language'] = df_filtered['folder_name'].str.extract('claim_(\w+)_evidence')
    
    # Step 5: Create summary dataframe
    df_summary = pd.DataFrame()
    
    # Calculate average performance for English and non-English claims
    for model in df_filtered['model_name'].unique():
        model_data = df_filtered[df_filtered['model_name'] == model]
        
        en_perf = model_data[model_data['claim_language'] == 'en']['f1_score'].iloc[0]
        non_en_perf = model_data[model_data['claim_language'] != 'en']['f1_score'].mean()
        
        model_category = model_data['model_category'].iloc[0]
        model_size = model_data['model_size'].iloc[0]
        
        # Get metrics from the English claim data
        precision = model_data[model_data['claim_language'] == 'en']['precision'].iloc[0]
        recall = model_data[model_data['claim_language'] == 'en']['recall'].iloc[0]
        
        df_summary = pd.concat([df_summary, pd.DataFrame({
            'model_name': [model],
            'model_category': [model_category],
            'model_size': [model_size],
            'en_performance': [en_perf],
            'avg_non_en_performance': [non_en_perf],
            'precision': [precision],
            'recall': [recall],
            'avg_f1': [model_data['f1_score'].mean()]
        })])
    
    return df_filtered, df_summary


In [25]:
df_summary

Unnamed: 0,model_name,model_category,model_size,en_performance,non_en_performance,precision,recall,avg_f1
0,gemma-2-9b-bnb-4bit,Open LLM,2.9,0.372865,0.288436,0.375214,0.389972,0.316579
0,gpt-4o-mini,Closed LLM,20.0,0.419517,0.396073,0.506056,0.577265,0.403888
0,bert-base-multilingual-uncased,BERT-based,0.11,0.115331,0.121334,0.071181,0.303704,0.119333
0,indobert-base-uncased,BERT-based,0.11,0.17284,0.185277,0.12395,0.34963,0.181131
0,xlm-roberta-base,BERT-based,0.125,0.074074,0.074074,0.041667,0.333333,0.074074
0,bert-base-arabic,BERT-based,0.11,0.122449,0.122449,0.075,0.333333,0.122449
0,Llama-3.2-3B-Instruct,Open LLM,3.0,0.285641,0.273346,0.31089,0.365128,0.277444
0,mistral-7b-instruct-v0.3-bnb-4bit,Open LLM,7.0,0.296002,0.282162,0.33116,0.326154,0.286776
0,Qwen2.5-7B-bnb-4bit,Open LLM,7.0,0.347996,0.332806,0.354342,0.35943,0.337869
0,claude-3-5-sonnet-20240620,Closed LLM,20.0,0.425829,0.395954,0.54373,0.588433,0.410892


In [26]:
df_filtered

Unnamed: 0,accuracy,precision,recall,f1_score,folder_name,date,dataset_name,split_name,model_name,person,model_category,model_size,claim_language
2,0.345,0.289295,0.277037,0.269059,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 21:38:56,fake_news_detection_dataset_cross_lingual_form...,train_claim_arb_evidence_en,gemma-2-9b-bnb-4bit,Erland,Open LLM,2.9,arb
5,0.37,0.330929,0.321083,0.307813,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 21:23:46,fake_news_detection_dataset_cross_lingual_form...,train_claim_idn_evidence_en,gemma-2-9b-bnb-4bit,Erland,Open LLM,2.9,idn
8,0.455,0.375214,0.389972,0.372865,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 21:09:02,fake_news_detection_dataset_cross_lingual_form...,train_claim_en_evidence_en,gemma-2-9b-bnb-4bit,Erland,Open LLM,2.9,en
11,0.38,0.486184,0.590541,0.389355,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 19:41:04,fake_news_detection_dataset_cross_lingual_form...,train_claim_arb_evidence_en,gpt-4o-mini,Bilal,Closed LLM,20.0,arb
14,0.39,0.521879,0.582222,0.402792,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 19:24:07,fake_news_detection_dataset_cross_lingual_form...,train_claim_idn_evidence_en,gpt-4o-mini,Bilal,Closed LLM,20.0,idn
17,0.415,0.506056,0.577265,0.419517,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 19:06:40,fake_news_detection_dataset_cross_lingual_form...,train_claim_en_evidence_en,gpt-4o-mini,Bilal,Closed LLM,20.0,en
20,0.22,0.073702,0.325926,0.120219,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 12:54:28,fake_news_detection_dataset_cross_lingual_form...,train_claim_arb_evidence_en,bert-base-multilingual-uncased,Erland,BERT-based,0.11,arb
23,0.225,0.075,0.333333,0.122449,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 12:53:47,fake_news_detection_dataset_cross_lingual_form...,train_claim_idn_evidence_en,bert-base-multilingual-uncased,Erland,BERT-based,0.11,idn
26,0.205,0.071181,0.303704,0.115331,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 12:53:06,fake_news_detection_dataset_cross_lingual_form...,train_claim_en_evidence_en,bert-base-multilingual-uncased,Erland,BERT-based,0.11,en
28,0.215,0.131639,0.371852,0.194238,fake_news_detection_dataset_cross_lingual_form...,2024-10-28 12:51:41,fake_news_detection_dataset_cross_lingual_form...,train_claim_arb_evidence_en,indobert-base-uncased,Erland,BERT-based,0.11,arb


In [33]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

def create_bubble_plot(df, x_metric, y_metric, title, filename, label_offsets=None):
    """
    Create bubble plot with manual control over label positions
    
    Parameters:
    - label_offsets: dict with model names as keys and (x_offset, y_offset) as values
                    e.g., {'Claude': (0.02, 0.01), 'GPT-4': (-0.02, 0.01)}
    """
    if label_offsets is None:
        label_offsets = {}
    
    plt.figure(figsize=(10, 8))
    plt.style.use('seaborn-v0_8-paper')

    plt.rcParams.update({
        'font.size': 12,          # Base font size
        'axes.titlesize': 16,     # Title font size
        'axes.labelsize': 14,     # Axis label font size
        'xtick.labelsize': 12,    # X-axis tick labels
        'ytick.labelsize': 12,    # Y-axis tick labels
        'legend.title_fontsize': 8,  # Legend title font size
        'legend.fontsize': 8     # Legend text font size
    })
    
    # Create color mapping for categories
    categories = ['BERT-based', 'Open LLM', 'Closed LLM']
    colors = ['#ff9999', '#66b3ff', '#99ff99']
    color_map = dict(zip(categories, colors))
    
    # Create scatter plot
    dummy_scatter_plot = []
    for category in categories:
        mask = df['model_category'] == category
        
        # Calculate bubble sizes
        sizes = df[mask]['model_size'] * 100
        
        plt.scatter(df[mask][x_metric], 
                   df[mask][y_metric],
                   s=sizes,
                   c=[color_map[category]],
                   alpha=0.6,
                   label=category)

        # Create a dummy scatter plot with uniform size for legend
        dummy_scatter = plt.scatter([], [], 
                                  s=200,  # Fixed size for legend
                                  c=[color_map[category]],
                                  alpha=0.6,
                                  label=category)
        dummy_scatter_plot.append(dummy_scatter)
        
        # Add model name labels
        for idx, row in df[mask].iterrows():
            model_name = model_map_to_short[row['model_name']]
            
            x_offset, y_offset = label_offsets.get(model_name, (5, 5))
        
            plt.annotate(model_name,
                        (row[x_metric], row[y_metric]),
                        xytext=(x_offset, y_offset),
                        textcoords='offset points',
                        fontsize=12)
    
    plt.xlabel(x_metric.replace('_', ' ').title())
    plt.ylabel(y_metric.replace('_', ' ').title())
    plt.title(title)

    # Set axes limits from 0 to 1
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    
    # Add grid for better readability
    plt.grid(True, linestyle='--', alpha=0.3)
    
    # Create legend with uniform-sized markers
    plt.legend(dummy_scatter_plot,
              categories,
              title="Model Category",
              bbox_to_anchor=(1.05, 1.05),
              loc='lower right',
              borderaxespad=0.5,
              framealpha=0.9,
              labelspacing=1.5)
    
    
    # Add size legend with increased spacing
    size_legend_elements = []
    sizes = [0.1, 3, 7, 20]
    labels = ['0.1B params', '3B params', '7B params', 'Proprietary (>7B params)']
    
    for size, label in zip(sizes, labels):
        size_legend_elements.append(
            plt.scatter([], [], s=size*100, c='gray', alpha=0.3, label=label)
        )
    
    # legend2 = plt.legend(handles=size_legend_elements,
    #                     title="Model Size",
    #                     bbox_to_anchor=(1.05, 0.5),
    #                     loc='center left',
    #                     borderaxespad=0,
    #                     labelspacing=5,
    #                     handletextpad=5)
    
    plt.tight_layout()
    plt.savefig(filename, format='pdf', bbox_inches='tight', dpi=300)
    plt.close()

# Example usage:
# Define custom offsets for specific models



# Create processed dataframes
df_filtered, df_summary = create_processed_dataframes(metrics_df)

# Now you can use these with the bubble plot function:
label_offsets = {
    # 'Claude': (10, 10),      # Move Claude label 10 points right, 10 points up
    # 'GPT-4': (-10, 5),       # Move GPT-4 label 10 points left, 5 points up
    # 'Llama': (5, -5),        # Move Llama label 5 points right, 5 points down
    # 'Mistral': (-5, -10),    # Move Mistral label 5 points left, 10 points down
    # 'Qwen': (0, 5),          # Move Qwen label 15 points up
    "XLM-R" : (-5, -10),
    "IndoBERT" : (0, 10),
    # "ArabicBERT": (0, 10),
    "mBERT" : (0, -10),
    "ArabicBERT" : (0, 10)
}
create_bubble_plot(df_summary, 
                  'precision', 
                  'recall',
                  'Precision-Recall Trade-off by Model Type and Size',
                  'precision_recall_bubble_adjusted.pdf',
                  label_offsets=label_offsets)

label_offsets = {
    'Claude': (10, 10),      # Move Claude label 10 points right, 10 points up
    'GPT-4': (-10, 5),       # Move GPT-4 label 10 points left, 5 points up
    # 'Llama': (5, -5),        # Move Llama label 5 points right, 5 points down
    'Mistral': (-5, -10),    # Move Mistral label 5 points left, 10 points down
    'Qwen': (0, 5),          # Move Qwen label 15 points up
    "XLM-R" : (-5, 10),
    "ArabicBERT": (0, 10),
}

create_bubble_plot(df_summary, 
                  'en_performance', 
                  'avg_non_en_performance',
                  'Cross-lingual Performance by Model Type and Size',
                  'crosslingual_bubble.pdf',
                  label_offsets=label_offsets)


In [69]:
erland_df["model_name"].unique()

array(['bert-base-multilingual-uncased', 'indobert-base-uncased',
       'Qwen2.5-7B-bnb-4bit', 'claude-3-5-sonnet-20240620'], dtype=object)

In [7]:
metrics_df.to_csv("metrics.csv", index=False)

In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Set style for publication-quality figures
plt.style.use('seaborn-v0_8-paper')
sns.set_context("paper", font_scale=1.2)

In [10]:
# 1. Model Performance Comparison across Metrics

plt.figure(figsize=(12, 6))
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
x = np.arange(len(metrics_df['model_name']))
width = 0.2

for i, metric in enumerate(metrics):
    plt.bar(x + i*width, metrics_df[metric], width, label=metric.capitalize())

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('Model Performance Comparison Across All Metrics')
plt.xticks(x + width*1.5, metrics_df['model_name'], rotation=45, ha='right')
plt.legend(loc='upper right', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.savefig('model_comparison.pdf', format='pdf', bbox_inches='tight', dpi=300)
plt.close()

# 2. Language Pair Performance Heatmap
# Assuming you have language pair data
# Create a pivot table for language pairs
lang_pairs = ['en-en', 'en-arb', 'en-idn', 'arb-arb', 'arb-en', 'arb-idn', 'idn-idn', 'idn-en', 'idn-arb']
languages = ['English', 'Arabic', 'Indonesian']
performance_matrix = np.random.rand(3, 3)  # Replace with actual data

plt.figure(figsize=(8, 6))
sns.heatmap(performance_matrix, annot=True, fmt='.3f', 
            xticklabels=languages, yticklabels=languages,
            cmap='YlOrRd', vmin=0, vmax=1)
plt.title('Cross-lingual Performance Matrix (F1-Score)')
plt.xlabel('Evidence Language')
plt.ylabel('Claim Language')
plt.tight_layout()
plt.savefig('language_heatmap.pdf', format='pdf', bbox_inches='tight', dpi=300)
plt.close()

# 3. Performance Distribution Box Plot
plt.figure(figsize=(10, 6))
df_melted = metrics_df.melt(id_vars=['model_name'], value_vars=metrics, 
                    var_name='Metric', value_name='Score')

sns.boxplot(data=df_melted, x='Metric', y='Score', width=0.5)
plt.title('Distribution of Model Performance by Metric')
plt.xlabel('Evaluation Metric')
plt.ylabel('Score')
plt.tight_layout()
plt.savefig('metric_distribution.pdf', format='pdf', bbox_inches='tight', dpi=300)
plt.close()

In [73]:
def generate_individual_latex_tables(df, person_name):
    categories = [
        ("BERT-Based Models", bert_based_models, "bert"),
        ("Open LLMs", open_llms, "open-llms"),
        ("Closed LLMs", closed_llms, "closed-llms")
    ]
    
    latex_str = ""
    
    for category_name, models_dict, label_suffix in categories:
        latex_str += "\\begin{table}[h]\n"
        latex_str += f"    \\caption{{Cross-Lingual Evaluation Results of {person_name}'s Evaluation ({category_name})}}\n"
        latex_str += f"    \\label{{tab:{label_suffix}-{person_name.lower()}}}\n"
        latex_str += "    \\small\n"
        latex_str += "    \\begin{tabularx}{\\columnwidth}{l l l X X X X}\n"
        latex_str += "    \\toprule\n"
        latex_str += "    \\textbf{Model} & \\textbf{Claim} & \\textbf{Evidence} & \\textbf{Acc} & \\textbf{Prec} & \\textbf{Rec} & \\textbf{F1} \\\\ \n"
        latex_str += "    \\midrule\n"
        
        # Filter dataframe for models in the current category
        category_models = df[df["model_name"].isin(models_dict.keys())]
        for _, row in category_models.iterrows():
            claim_lang = row["split_name"].split('_')[2]
            evidence_lang = row["split_name"].split('_')[4]
            claim_lang = "en" if claim_lang == "en" else "id" if claim_lang == "idn" else "ar"
            evidence_lang = "en" if evidence_lang == "en" else "id" if evidence_lang == "idn" else "ar"
            latex_str += f"    {model_map_to_short[row['model_name']]} & {claim_lang} & {evidence_lang} & {row['accuracy']:.2f} & {row['precision']:.2f} & {row['recall']:.2f} & {row['f1_score']:.2f} \\\\ \n"
        
        latex_str += "    \\bottomrule\n"
        latex_str += "    \\end{tabularx}\n"
        latex_str += "\\end{table}\n\n"
    
    return latex_str

# Generate LaTeX tables for Bilal and Erland
latex_bilal_tables = generate_individual_latex_tables(bilal_df, "Bilal")
latex_erland_tables = generate_individual_latex_tables(erland_df, "Erland")

# Print the LaTeX code for each person's tables
print("\nLaTeX Tables for Bilal's Evaluation:\n")
print(latex_bilal_tables)
print("\nLaTeX Tables for Erland's Evaluation:\n")
print(latex_erland_tables)



LaTeX Tables for Bilal's Evaluation:

\begin{table}[h]
    \caption{Cross-Lingual Evaluation Results of Bilal's Evaluation (BERT-Based Models)}
    \label{tab:bert-bilal}
    \small
    \begin{tabularx}{\columnwidth}{l l l X X X X}
    \toprule
    \textbf{Model} & \textbf{Claim} & \textbf{Evidence} & \textbf{Acc} & \textbf{Prec} & \textbf{Rec} & \textbf{F1} \\ 
    \midrule
    XLM-R & ar & ar & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    XLM-R & ar & id & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    XLM-R & ar & en & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    XLM-R & id & ar & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    XLM-R & id & id & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    XLM-R & id & en & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    XLM-R & en & ar & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    XLM-R & en & id & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    XLM-R & en & en & 0.12 & 0.04 & 0.33 & 0.07 \\ 
    ArabicBERT & ar & ar & 0.23 & 0.07 & 0.33 & 0.12 \\ 
    ArabicBERT & ar & id & 0.23 & 0.30 & 0.36 & 0.17 \\ 
    ArabicBERT & ar & en & 0.23 & 0.07 