In [15]:
import itertools
import json
import os

import numpy as np
from pathlib import Path
import pandas as pd

In [35]:
# Root of the evaluation outputs; results are read from
# <BASE_DIR>/<system>/<direction>/results.jsonl.
BASE_DIR = "../../evaluation/output_evals/commonAccent"

# Language-direction identifiers evaluated in this notebook.
DIRECTION_PAIRS = [
    "de_en",
    "en_de",
    "en_es",
    "en_fr",
    "en_it",
    "en_pt",
    "en_zh",
    "es_en",
    "it_en",
]

# System identifiers (one output sub-directory each), one entry per line.
SYSTEM_NAMES = [
    "aya_canary-v2",
    "aya_owsm4.0-ctc",
    "aya_seamlessm4t",
    "aya_whisper",
    "canary-v2",
    "desta2-8b",
    "gemma_canary-v2",
    "gemma_owsm4.0-ctc",
    "gemma_seamlessm4t",
    "gemma_whisper",
    "owsm4.0-ctc",
    "phi4multimodal",
    "qwen2audio-7b",
    "seamlessm4t",
    "tower_canary-v2",
    "tower_owsm4.0-ctc",
    "tower_seamlessm4t",
    "tower_whisper",
    "voxtral-small-24b",
    "whisper",
    "spirelm",
]

In [36]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    Each file is expected at ``<base_dir>/<system>/<direction>/results.jsonl``
    and is parsed as JSON Lines (one JSON document per non-blank line).

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}. The value for a
              (direction, system) pair is ``None`` when its file is missing
              or contains a line that is not valid JSON; a warning/error
              message is printed in both cases.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results.jsonl'

        # Create the per-direction dictionary on first use.
        per_direction = all_results.setdefault(direction, {})

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # Blank lines (typically a trailing empty line) are not valid
                # JSON; skip them so they do not invalidate the whole file.
                per_direction[system] = [
                    json.loads(line) for line in f if line.strip()
                ]

        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            per_direction[system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            # A genuinely malformed line still discards the whole file.
            print(f"Error decoding JSON in {summary_path}: {e}")
            per_direction[system] = None

    return all_results

In [18]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.

    The input is not modified, so the function can be called repeatedly on
    the same ``results_data`` and yields the same frame every time.

    Args:
        results_data (dict): Mapping {direction: {system: list of record
            dicts, or None}}. Systems whose value is None are skipped.

    Returns:
        pd.DataFrame: One row per record with 'direction' and 'system' as the
            first two columns, or an empty DataFrame when there are no records.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Read (do not pop) the metrics so the caller's records stay
                # intact; a missing or null "metrics" entry contributes nothing.
                metrics = record.get("metrics") or {}
                fields = {key: value for key, value in record.items() if key != "metrics"}
                # Merge everything into one flat dict; later entries win on
                # key collisions, so metrics override same-named record fields.
                all_records.append(
                    {
                        "direction": direction,
                        "system": system,
                        **fields,
                        **metrics,
                    }
                )

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    leading = ["direction", "system"]
    return df[leading + [c for c in df.columns if c not in leading]]

In [19]:
def add_accent_column(df, manifests_dir="../../manifests/commonAccent"):
    """
    Adds an 'accent' column to the DataFrame by reading all .jsonl files in the given directory.

    Every manifest line is parsed as JSON; its ``sample_id`` is mapped to the
    value found at ``benchmark_metadata.acc``. Lines that are not valid JSON,
    are not JSON objects, or lack a sample id or accent are ignored.
    Manifest files are read in sorted name order, so when the same sample id
    appears more than once the last occurrence in that order wins.

    Args:
        df (pd.DataFrame): The DataFrame containing at least a 'sample_id' column.
        manifests_dir (str or Path): Directory containing .jsonl manifest files.

    Returns:
        pd.DataFrame: A copy of ``df`` with a new 'accent' column. The input
            frame is never modified. Rows without a matching manifest entry
            get a missing value; if no accent data is found at all, the
            column is filled with None and a message is printed.
    """
    manifests_dir = Path(manifests_dir)
    accent_map = {}

    # Read all .jsonl files in the directory (sorted for a reproducible result)
    for file in sorted(manifests_dir.glob("*.jsonl")):
        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip bad lines just in case
                if not isinstance(record, dict):
                    continue

                sample_id = record.get("sample_id")
                if sample_id is None:
                    # str(None) would be the truthy string "None"; skip instead
                    # of registering a bogus key.
                    continue
                sid = str(sample_id)  # keep as string for safety

                metadata = record.get("benchmark_metadata")
                acc = metadata.get("acc") if isinstance(metadata, dict) else None
                if sid and acc:
                    accent_map[sid] = acc

    # Copy up front so both return paths leave the caller's frame untouched.
    df = df.copy()

    if not accent_map:
        print("No accent data found in manifest files.")
        df["accent"] = None
        return df

    # Map the accent values onto the DataFrame
    df["accent"] = df["sample_id"].astype(str).map(accent_map)

    return df

In [20]:
def compute_accent_strict_scores(df, penalty_by_metric=None):
    """
    Computes mean metric scores and strict scores grouped by (system, accent).

    For every metric, the "strict" value of a row is the row's own score when
    its linguapy flag equals 0 and the metric's penalty value otherwise.
    (Presumably a flag of 0 means the language check passed - confirm against
    the code that produces `linguapy_score`.)

    Expects columns:
      - system
      - accent
      - <metric>_score for every metric in `penalty_by_metric`
        (by default xcomet_qe_score and metricx_qe_score)
      - linguapy_score (list/tuple of [flag, lang])

    Args:
        df (pd.DataFrame): Per-sample records.
        penalty_by_metric (dict, optional): Maps a metric prefix to the value
            substituted for rows whose flag is not 0. Defaults to
            {"metricx_qe": 25, "xcomet_qe": 0}.

    Returns:
        pd.DataFrame: One row per (system, accent) with columns 'system',
            'accent', 'linguapy_avg' (mean flag multiplied by 100) and, per
            metric, '<metric>_score' and '<metric>_strict' means. Rows whose
            accent is missing are dropped by the group-by. An empty input
            yields an empty frame with the same columns.
    """
    if penalty_by_metric is None:
        penalty_by_metric = {
            "metricx_qe": 25,
            "xcomet_qe": 0,
        }

    # Output metric columns, in the order they appear in the result.
    metric_cols = []
    for metric in penalty_by_metric:
        metric_cols.extend([f"{metric}_score", f"{metric}_strict"])

    # Nothing to aggregate (e.g. a direction with no loaded results): return
    # an empty, correctly-shaped frame instead of failing on the column split.
    if df.empty:
        return pd.DataFrame(columns=["system", "accent", "linguapy_avg", *metric_cols])

    df = df.copy()

    # --- Split linguapy_score into two separate columns ---
    df[["linguapy_flag", "linguapy_lang"]] = pd.DataFrame(
        df["linguapy_score"].tolist(), index=df.index
    )

    # --- Strict score per row (vectorised) ---
    # Rows with flag == 0 keep their score; every other row gets the penalty.
    keeps_score = df["linguapy_flag"] == 0
    for metric, penalty in penalty_by_metric.items():
        df[f"{metric}_strict"] = df[f"{metric}_score"].where(keeps_score, penalty)

    # --- Aggregate by system × accent ---
    agg_cols = {"linguapy_flag": "mean"}  # average from 0–1
    agg_cols.update({col: "mean" for col in metric_cols})

    result = (
        df.groupby(["system", "accent"])
        .agg(agg_cols)
        .reset_index()
        .rename(columns={"linguapy_flag": "linguapy_avg"})
    )

    # Express the mean flag as a percentage.
    result["linguapy_avg"] = result["linguapy_avg"] * 100
    return result

In [37]:
# Read every (direction, system) results.jsonl under BASE_DIR; files that are
# missing or fail to decode are stored as None and reported on stdout.
results_full = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)



In [38]:
# Flatten the nested results into one row per record, with each record's
# metrics unpacked into top-level columns.
df=convert_results_to_dataframe(results_full)

In [39]:
# Attach an 'accent' column by looking up each row's sample_id in the
# commonAccent manifest files (default manifests_dir of add_accent_column).
df = add_accent_column(df)

In [40]:
# Names given to the aggregated columns in the exported CSV files.
col_map = {
    "linguapy_avg": "LinguaPy",
    "metricx_qe_strict": "QEMetricX_24-Strict-linguapy",
    "xcomet_qe_strict": "XCOMET-QE-Strict-linguapy",
}

# One CSV per direction: restrict to that direction, aggregate per
# (system, accent) with the linguapy-gated strict scores, then rename.
for pair in DIRECTION_PAIRS:
    sub_df = (
        df.loc[df["direction"] == pair]
        .pipe(compute_accent_strict_scores)
        .rename(columns=col_map)
    )
    sub_df.to_csv(f"commonAccent_{pair}.csv", index=False)

In [33]:
# Show the most recently computed per-accent table (sub_df is reassigned on
# every pass of the export loop above, so this is the last direction processed).
sub_df

Unnamed: 0,system,accent,LinguaPy,metricx_qe_score,QEMetricX_24-Strict-linguapy,xcomet_qe_score,XCOMET-QE-Strict-linguapy
0,canary-v2,BASILICATA TRENTINO,3.0,4.760072,5.370204,0.766789,0.74972
1,canary-v2,EMILIANO,3.333333,3.8634,4.67922,0.885539,0.852206
2,canary-v2,MERIDIONALE,2.631579,3.556093,4.138213,0.884554,0.858294
3,canary-v2,TENDENTE AL SICULO MA NON MARCATO,4.0,4.072699,4.824354,0.846406,0.817873
4,canary-v2,VENETO,4.0,4.032336,4.870938,0.858331,0.827708
5,desta2-8b,BASILICATA TRENTINO,2.0,6.81307,7.057817,0.534913,0.532062
6,desta2-8b,EMILIANO,0.0,5.185234,5.185234,0.70791,0.70791
7,desta2-8b,MERIDIONALE,2.631579,4.122293,4.580931,0.754748,0.740045
8,desta2-8b,TENDENTE AL SICULO MA NON MARCATO,3.0,5.502043,6.057432,0.697144,0.682545
9,desta2-8b,VENETO,3.0,9.424629,9.778428,0.487218,0.481436


In [41]:
import glob

# Suffix of the pivot files written at the end of this cell. They live in the
# same directory and also match "commonAccent_*.csv", so they must be excluded
# from the inputs; otherwise re-running the cell would ingest its own outputs
# as if they were per-direction tables.
pivot_suffix = "_pivot.csv"

# Read all CSVs produced per direction (sorted for a reproducible order)
csv_files = sorted(
    path
    for path in glob.glob("commonAccent_*.csv")
    if not path.endswith(pivot_suffix)
)
all_frames = []
for path in csv_files:
    tmp = pd.read_csv(path)
    # infer direction from filename: commonAccent_{pair}.csv
    pair = path.split("commonAccent_")[-1].rsplit(".csv", 1)[0]
    tmp["direction"] = pair
    all_frames.append(tmp)

if not all_frames:
    raise RuntimeError("No commonAccent_*.csv files found in the working directory.")

full = pd.concat(all_frames, ignore_index=True)

# Every column other than the identifiers is treated as a metric.
non_metric_cols = {"system", "accent", "direction"}
metric_columns = [c for c in full.columns if c not in non_metric_cols]

# Aggregate across accents for each (system, direction).
# Note: this is an unweighted mean over accents, not over samples.
agg = (
    full.groupby(["system", "direction"], as_index=False)[metric_columns]
        .mean()
)

# Desired direction column order (expanded for commonAccent)
desired_order = [
    "en_de", "en_es", "en_fr", "en_it", "en_pt", "en_zh",
    "de_en", "es_en", "it_en"
]

# Create and save one pivot per metric
for metric in metric_columns:
    pivot_df = agg.pivot_table(index="system", columns="direction", values=metric, aggfunc="mean")
    # Reorder columns based on desired order (keep only those present)
    cols = [c for c in desired_order if c in pivot_df.columns]
    # Append any remaining directions not listed to the end (stable order)
    remaining = [c for c in pivot_df.columns if c not in cols]
    pivot_df = pivot_df[cols + remaining]

    # # Add average column across available directions
    # pivot_df["average"] = pivot_df.mean(axis=1)

    # # Sort by average desc
    # pivot_df = pivot_df.sort_values("average", ascending=False)

    # Save
    out_name = f"commonAccent_{metric}{pivot_suffix}"
    pivot_df.to_csv(out_name)

    display(pd.Series({"saved": out_name, "rows": len(pivot_df), "cols": len(pivot_df.columns)}))


saved    commonAccent_LinguaPy_pivot.csv
rows                                  21
cols                                   9
dtype: object

saved    commonAccent_metricx_qe_score_pivot.csv
rows                                          21
cols                                           9
dtype: object

saved    commonAccent_QEMetricX_24-Strict-linguapy_pivo...
rows                                                    21
cols                                                     9
dtype: object

saved    commonAccent_xcomet_qe_score_pivot.csv
rows                                         21
cols                                          9
dtype: object

saved    commonAccent_XCOMET-QE-Strict-linguapy_pivot.csv
rows                                                   21
cols                                                    9
dtype: object