In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Root folder of the analysis outputs (relative path, resolved against the
# notebook's working directory).
base_dir = Path("../../analysis/")

# Sub-folders holding the per-language result CSVs for each condition.
clean_dir = base_dir.joinpath("fleurs")
noisy_dir = base_dir.joinpath("noisy_fleurs_ambient")

In [3]:
# Language-pair keys used to build the per-language CSV file names.
# Presumably "<source>_<target>" language codes -- confirm against the data.
langs = [
    # pairs starting with 'en'
    'en_es', 'en_fr', 'en_pt', 'en_it', 'en_de', 'en_nl', 'en_zh',
    # pairs ending with 'en' (note: there is no 'nl_en' entry)
    'es_en', 'fr_en', 'pt_en', 'it_en', 'de_en', 'zh_en',
]

In [4]:
def compute_diffs(clean_file, noisy_file, fill_missing_with_zero: bool = False) -> pd.DataFrame:
    """Compute per-system differences between two result tables: clean - noisy.

    Both inputs are read with ``pd.read_csv`` and outer-merged on the
    ``system`` column, so a system that appears in only one of the files is
    still kept. The ``_merge`` indicator column records where each row came
    from (``both``, ``left_only`` = clean only, ``right_only`` = noisy only).

    Parameters
    ----------
    clean_file, noisy_file
        Anything ``pd.read_csv`` accepts (path or file-like object). Each
        table must contain a ``system`` column.
    fill_missing_with_zero : bool, default False
        If True, values that are missing or cannot be parsed as numbers are
        replaced by 0 on both sides before subtracting. If False, such
        values make the corresponding difference NaN.

    Returns
    -------
    pd.DataFrame
        Columns ``system`` and ``_merge``, followed by one ``<name>_diff``
        column per paired column, rounded to 4 decimals.
    """
    clean_suffix, noisy_suffix = '_clean', '_noisy'

    clean_df = pd.read_csv(clean_file)
    noisy_df = pd.read_csv(noisy_file)

    # Columns present in both tables get the suffixes below; the indicator
    # adds the '_merge' column.
    merged = clean_df.merge(
        noisy_df,
        on='system',
        how='outer',
        suffixes=(clean_suffix, noisy_suffix),
        indicator=True,
    )

    # Start output table
    out = merged[['system', '_merge']].copy()

    for c_col in merged.columns:
        # Match on the *suffix* only. A substring match plus str.replace
        # mangles metric names that themselves contain '_clean'
        # (e.g. 'score_cleaned'), and the metric is then silently dropped.
        if not c_col.endswith(clean_suffix):
            continue
        base = c_col[:-len(clean_suffix)]
        n_col = base + noisy_suffix
        if n_col not in merged.columns:
            continue

        # Non-numeric cells become NaN rather than raising.
        clean_vals = pd.to_numeric(merged[c_col], errors='coerce')
        noisy_vals = pd.to_numeric(merged[n_col], errors='coerce')

        if fill_missing_with_zero:
            clean_vals = clean_vals.fillna(0)
            noisy_vals = noisy_vals.fillna(0)

        out[f'{base}_diff'] = (clean_vals - noisy_vals).round(4)

    return out

In [7]:
# For every language pair: diff the clean and noisy result tables, keep the
# table exactly as compute_diffs returns it in `result`, and write a copy to
# disk whose rows follow the order of the noisy CSV.
result = {}
for lang in langs:
    clean_file = clean_dir / f"fleurs_{lang}.csv"
    noisy_file = noisy_dir / f"noisy_fleurs_ambient_{lang}.csv"
    out_file = noisy_dir / f"diff_{lang}.csv"

    result[lang] = compute_diffs(clean_file, noisy_file)

    # Row order of the written file comes from the noisy CSV's 'system'
    # column; systems that are not listed there do not appear in the output.
    noisy_df = pd.read_csv(noisy_file)
    system_order = noisy_df['system'].tolist()
    diff_df = (
        result[lang]
        .set_index('system')
        .reindex(system_order)
        .reset_index()
    )
    diff_df.to_csv(out_file, index=False)

In [8]:
# Print every diff table under its language-pair key; the extra '\n\n'
# argument leaves blank lines between consecutive tables.
for k, v in result.items():
    print(f"{k}\n{v}", '\n\n')

en_es
               system      _merge  LinguaPy_diff  metricx_qe_score_diff  \
0       aya_canary-v2  right_only            NaN                    NaN   
1     aya_owsm4.0-ctc  right_only            NaN                    NaN   
2     aya_seamlessm4t  right_only            NaN                    NaN   
3         aya_whisper  right_only            NaN                    NaN   
4           canary-v2        both         0.1555                -2.4195   
5           desta2-8b  right_only            NaN                    NaN   
6     gemma_canary-v2  right_only            NaN                    NaN   
7   gemma_owsm4.0-ctc  right_only            NaN                    NaN   
8   gemma_seamlessm4t  right_only            NaN                    NaN   
9       gemma_whisper  right_only            NaN                    NaN   
10        owsm4.0-ctc  right_only            NaN                    NaN   
11     phi4multimodal        both         0.3110                -0.6161   
12      qwen2audio-