In [1]:
#!/usr/bin/env python3
import os, sys
from normalize_scores_utils import *

import logging
logger = logging.getLogger('logger')
logger.setLevel(logging.INFO)

# Main Part: Table and Correlations

In [2]:
# Root folder holding every score file used below. It can be overridden with the
# METRIC_SENSITIVITY_DATA_DIR environment variable; the fallback is the original
# hard-coded local path, so existing setups keep working unchanged.
data_folder = os.environ.get('METRIC_SENSITIVITY_DATA_DIR', '/mnt/c/Users/user/Desktop/work/metric_sensitivity_analysis')

# Load the ACES scores
ACES_scores_2022_path = os.path.join(data_folder, 'aces-scored-2022-all-scores.only.quote_errors_removed.tsv')
ACES_scores_2023_path = os.path.join(data_folder, 'aces-scored-2023-all-scores.quote_errors_removed.tsv')
ACES_scores_2022 = load_ACES_scores(ACES_scores_2022_path, good_token='.-good', bad_token='.-bad', mapping=METRIC_NAMES_MAPPING)
ACES_scores_2023 = load_ACES_scores(ACES_scores_2023_path, good_token='.-good', bad_token='.-bad', mapping=METRIC_NAMES_MAPPING, skip_metrics=[])
metrics_names_2022 = set(ACES_scores_2022.keys())
metrics_names_2023 = set(ACES_scores_2023.keys())

# Names of all metrics that have ACES scores in either year; both WMT loaders
# receive this same set, so it is built once.
all_ACES_metric_names = metrics_names_2022 | metrics_names_2023

# Load WMT22 metric scores
WMT22_scores_path = os.path.join(data_folder, 'WMT22-metric-scores')
WMT22_scores = load_WMT_scores(WMT22_scores_path, all_ACES_metric_names)

# Load WMT23 metric scores
WMT23_scores_path = os.path.join(data_folder, 'wmt23metrics-submissions-v2')
WMT23_scores = load_WMT_scores_23(WMT23_scores_path, all_ACES_metric_names)

# calculate sensitivities
# From here on metrics_names_<year> is a list of the metrics present in BOTH the
# ACES scores and the WMT scores of that year (it was a set of ACES metrics above).
metrics_names_2022 = list(set(ACES_scores_2022.keys()).intersection(set(WMT22_scores.keys())))
sensitivities_2022, _, _, phenomena_2022, means_good_2022, means_bad_2022 = calculate_sensitivities(ACES_scores_2022, WMT22_scores, mapping=PHENOMENA_MAPPING)
metrics_names_2023 = list(set(ACES_scores_2023.keys()).intersection(set(WMT23_scores.keys())))
sensitivities_2023, _, _, phenomena_2023, means_good_2023, means_bad_2023 = calculate_sensitivities(ACES_scores_2023, WMT23_scores, mapping=PHENOMENA_MAPPING)

# load the ACES scores from the paper
ACES_summary_2022 = load_ACES_scores_summary_2022()
ACES_summary_2023 = load_ACES_scores_summary_2023(skip_metrics=[])

100%|██████████| 21/21 [00:37<00:00,  1.78s/it]
100%|██████████| 83/83 [00:57<00:00,  1.44it/s]


Sensitivities


100%|██████████| 10/10 [00:36<00:00,  3.61s/it]


Sensitivities


100%|██████████| 10/10 [00:32<00:00,  3.27s/it]


### Overview Table

In [14]:
# LaTeX \colorbox prefixes, from strongest green through white to strongest red.
# Raw strings keep the backslash literal; the previous plain literals relied on
# Python passing the unrecognised escape "\c" through, which emits a
# SyntaxWarning on Python 3.12+. An 11-step list that was immediately
# overwritten by this 9-step one has been dropped.
COLORS = [r'\colorbox{green1}', r'\colorbox{green2}', r'\colorbox{green4}', r'\colorbox{green5}', r'\colorbox{white}', r'\colorbox{red1}', r'\colorbox{red2}', r'\colorbox{red4}', r'\colorbox{red5}']

# From the ACES 2022 Paper:
METRICS_GROUPING_SHORT_2022 = {"baseline": ["BLEURT-20", "COMET-20", "YISI-1"],
                        "reference-based": ["metricx_xl_DA_2019", 'metricx_xl_MQM_2020', "metricx_xxl_MQM_2020"],
                        "reference-free": ["COMETKiwi", 'Cross-QE', "UniTE-src"]
                    }
METRICS_GROUPING_SHORT_2023 = {"baseline": ["COMETKiwi", "MS-COMET-QE-22", "BLEURT-20"],
                        "reference-based": ["MetricX-23-c", 'MetricX-23', "XCOMET-Ensemble"],
                        "reference-free": ["GEMBA-MQM", 'MetricX-23-QE', "CometKiwi-XXL"]
                    }

# NOTE(review): this lowercase `phenomena` is assigned but the table call below
# passes the module-level PHENOMENA instead - confirm which one is intended.
phenomena = ['omission', 'mistranslation', 'untranslated', 'real-world knowledge', 'wrong language']
# phenomena = ["hallucination-number-level-1", "hallucination-number-level-2", "hallucination-number-level-3"]

# print(make_header(scores=ACES_scores_2022, ACES_column=True, p_header_1=PHENOMENA_HEADER_1, p_header_2=PHENOMENA_HEADER_2))
print(generate_summary_table(sensitivities_2023, metrics_groups=METRICS_GROUPING_SHORT_2023, phenomena=PHENOMENA, ACES_column=True, global_colors=False))
# print(make_footer(averages=SUMMARY_AVERAGES_2023, phenomena=PHENOMENA))

COMETKiwi					&	\phantom{-}0.196	&	\phantom{-}0.618	&	\phantom{-}0.721	&	-0.180	&	\phantom{-}0.285	&	\phantom{-}0.719	&	\phantom{-}0.439	&	\phantom{-}0.316	&	-0.767	&	\phantom{-}0.218	&	\phantom{-}13.14	 \\ 
MS-COMET-QE-22					&	-0.040	&	\phantom{-}0.391	&	\phantom{-}0.258	&	\colorbox[HTML]{B2EAB1}{\textbf{\phantom{-}2.730}}	&	\phantom{-}0.126	&	\phantom{-}0.257	&	\phantom{-}0.185	&	\phantom{-}0.095	&	-0.654	&	\phantom{-}0.226	&	\phantom{-}7.57	 \\ 
BLEURT-20					&	\phantom{-}0.094	&	\phantom{-}0.314	&	\phantom{-}0.177	&	\phantom{-}1.545	&	\phantom{-}0.353	&	\phantom{-}0.126	&	-0.002	&	\phantom{-}0.048	&	\colorbox[HTML]{B2EAB1}{\textbf{\phantom{-}0.732}}	&	\colorbox[HTML]{B2EAB1}{\textbf{\phantom{-}0.281}}	&	\phantom{-}6.25	 \\ 
\midrule 
MetricX-23-c					&	\phantom{-}0.021	&	\phantom{-}0.413	&	\phantom{-}0.645	&	\phantom{-}0.334	&	\phantom{-}0.399	&	\phantom{-}0.593	&	\phantom{-}0.330	&	\phantom{-}0.766	&	-0.728	&	\phantom{-}0.131	&	\phantom{-}10.79	 \\ 
MetricX-23					&	\phantom{-}0.

### Single and Double Tables for Mistranslation Groups: discourse, hallucination, other

In [3]:
# Maps each fine-grained mistranslation phenomenon label to one of three coarse
# groups: 'discourse', 'hallucination' or 'other'. The two
# 'ambiguous-translation-wrong-sense-*' labels are commented out and therefore
# belong to no group.
PHENOMENA_MAPPING_MISTRANSLATION = {
    'ambiguous-translation-wrong-discourse-connective-since-causal': 'discourse',
    'ambiguous-translation-wrong-discourse-connective-since-temporal': 'discourse',
    'ambiguous-translation-wrong-discourse-connective-while-contrast': 'discourse',
    'ambiguous-translation-wrong-discourse-connective-while-temporal': 'discourse',
    'ambiguous-translation-wrong-gender-female-anti': 'other',
    'ambiguous-translation-wrong-gender-female-pro': 'other',
    'ambiguous-translation-wrong-gender-male-anti': 'other',
    'ambiguous-translation-wrong-gender-male-pro': 'other',
    # 'ambiguous-translation-wrong-sense-frequent': 'other',
    # 'ambiguous-translation-wrong-sense-infrequent': 'other',
    'anaphoric_group_it-they:deletion': 'discourse',
    'anaphoric_group_it-they:substitution': 'discourse',
    'anaphoric_intra_non-subject_it:deletion': 'discourse',
    'anaphoric_intra_non-subject_it:substitution': 'discourse',
    'anaphoric_intra_subject_it:deletion': 'discourse',
    'anaphoric_intra_subject_it:substitution': 'discourse',
    'anaphoric_intra_they:deletion': 'discourse',
    'anaphoric_intra_they:substitution': 'discourse',
    'anaphoric_singular_they:deletion': 'discourse',
    'anaphoric_singular_they:substitution': 'discourse',
    'coreference-based-on-commonsense': 'discourse',
    'hallucination-date-time': 'hallucination',
    'hallucination-named-entity-level-1': 'hallucination',
    'hallucination-named-entity-level-2': 'hallucination',
    'hallucination-named-entity-level-3': 'hallucination',
    'hallucination-number-level-1': 'hallucination',
    'hallucination-number-level-2': 'hallucination',
    'hallucination-number-level-3': 'hallucination',
    'hallucination-real-data-vs-ref-word': 'hallucination',
    'hallucination-real-data-vs-synonym': 'hallucination',
    'hallucination-unit-conversion-amount-matches-ref': 'hallucination',
    'hallucination-unit-conversion-unit-matches-ref': 'hallucination',
    'lexical-overlap': 'other',
    'modal_verb:deletion': 'other',
    'modal_verb:substitution': 'other',
    'nonsense': 'hallucination',
    'ordering-mismatch': 'other',
    'overly-literal-vs-correct-idiom': 'other',
    'overly-literal-vs-explanation': 'other',
    'overly-literal-vs-ref-word': 'other',
    'overly-literal-vs-synonym': 'other',
    'pleonastic_it:deletion': 'discourse',
    'pleonastic_it:substitution': 'discourse',
    'xnli-addition-contradiction': 'other',
    'xnli-addition-neutral': 'other',
    'xnli-omission-contradiction': 'other',
    'xnli-omission-neutral': 'other'
}

In [4]:
# Metrics that have both ACES 2022 scores and WMT22 scores.
metrics_names_2022 = list(set(ACES_scores_2022.keys()).intersection(set(WMT22_scores.keys())))
# 2022 sensitivities computed with the three-group mistranslation mapping; only
# the first and fourth return values are kept.
sensitivities_2022_mistranslation, _, _, phenomena_2022_mistranslation, _, _ = calculate_sensitivities(ACES_scores_2022, WMT22_scores, mapping=PHENOMENA_MAPPING_MISTRANSLATION)
# Regroup the ACES scores with the same mapping, then compute the correlation
# summary over the group names (presumably Kendall tau, going by the helper's
# name - confirm in normalize_scores_utils).
ACES_scores_2022_mistranslation = map_to_higher(ACES_scores_2022, PHENOMENA_MAPPING_MISTRANSLATION)
ACES_summary_2022_mistranslation, phenomena_mistranslation = calculate_tau_correlations(ACES_scores_2022_mistranslation, phenomena=set(PHENOMENA_MAPPING_MISTRANSLATION.values()))

Sensitivities


100%|██████████| 3/3 [00:12<00:00,  4.26s/it]


In [5]:
# Metrics that have both ACES 2023 scores and WMT23 scores.
metrics_names_2023 = list(set(ACES_scores_2023.keys()).intersection(set(WMT23_scores.keys())))
# 2023 sensitivities computed with the three-group mistranslation mapping; only
# the first and fourth return values are kept.
sensitivities_2023_mistranslation, _, _, phenomena_2023_mistranslation, _, _ = calculate_sensitivities(ACES_scores_2023, WMT23_scores, mapping=PHENOMENA_MAPPING_MISTRANSLATION)
# Regroup the ACES scores with the same mapping, then compute the correlation
# summary over the group names. Note: this rebinds phenomena_mistranslation,
# which the 2022 cell also assigns.
ACES_scores_2023_mistranslation = map_to_higher(ACES_scores_2023, PHENOMENA_MAPPING_MISTRANSLATION)
ACES_summary_2023_mistranslation, phenomena_mistranslation = calculate_tau_correlations(ACES_scores_2023_mistranslation, phenomena=set(PHENOMENA_MAPPING_MISTRANSLATION.values()))

Sensitivities


100%|██████████| 3/3 [00:08<00:00,  2.90s/it]


In [6]:
# LaTeX \colorbox prefixes, from strongest green through white to strongest red.
# Raw strings avoid the invalid escape "\c" (SyntaxWarning on Python 3.12+); an
# 11-step list that was immediately overwritten by this one has been dropped.
COLORS = [r'\colorbox{green1}', r'\colorbox{green2}', r'\colorbox{green4}', r'\colorbox{green5}', r'\colorbox{white}', r'\colorbox{red1}', r'\colorbox{red2}', r'\colorbox{red4}', r'\colorbox{red5}']

# From the ACES 2022 Paper:
# NOTE(review): the SHORT groupings are defined here, but the table call below
# passes METRICS_GROUPING_2023 - confirm which grouping is intended.
METRICS_GROUPING_SHORT_2022 = {"baseline": ["BLEU", "COMET-20", "COMET-QE"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }
METRICS_GROUPING_SHORT_2023 = {"baseline": ["BLEU", "BERTScore", "BLEURT-20", "YISI-1"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }

# Column order of the table: the three mistranslation groups. (A first
# assignment that was overwritten on the very next line has been removed.)
phenomena = ["discourse", "hallucination", "other"]

PHENOMENA_HEADER_1_MISTRANSLATION = dict(zip(phenomena, ['\\textbf{disco.}', '\\textbf{halluci.}', '\\textbf{other}']))
PHENOMENA_HEADER_2_MISTRANSLATION = dict(zip(phenomena, ['', '',  '']))

# Per-group total, accumulated from the length of the first element stored for
# each fine-grained phenomenon under the "BLEU" entry of the 2022 ACES scores.
# NOTE(review): the counts come from the 2022 scores although the table printed
# below is for 2023 - confirm the sample counts are identical across years.
NUM_SAMPLES_MISTRANSLATION = {p:0 for p in phenomena}
for p,target in PHENOMENA_MAPPING_MISTRANSLATION.items():
    NUM_SAMPLES_MISTRANSLATION[target] += len(ACES_scores_2022["BLEU"][p][0])

# print(make_header(phenomena=phenomena, scores=ACES_summary_2022_mistranslation, ACES_column=False, p_header_1=PHENOMENA_HEADER_1_MISTRANSLATION, p_header_2=PHENOMENA_HEADER_2_MISTRANSLATION, num_samples=NUM_SAMPLES_MISTRANSLATION))
# print(generate_summary_table(sensitivities_2022_mistranslation, metrics_groups=METRICS_GROUPING_2022, phenomena=phenomena, ACES_column=False, global_colors=False))
print(generate_summary_table_double(ACES_summary_2023_mistranslation, sensitivities_2023_mistranslation, metrics_groups=METRICS_GROUPING_2023, phenomena=phenomena, ACES_column=False, global_colors=False))
# print(make_footer(averages=SUMMARY_AVERAGES_2023, phenomena=PHENOMENA))

BERTScore					&	\phantom{-}0.516	&	-0.183	&	\phantom{-}0.371	&	\phantom{-}0.011	&	-0.226	&	\phantom{-}0.207		 \\ 
BLEU					&	\phantom{-}0.142	&	-0.497	&	-0.187	&	-0.033	&	-0.331	&	\phantom{-}0.245		 \\ 
BLEURT-20					&	\phantom{-}0.589	&	\phantom{-}0.033	&	\phantom{-}0.390	&	\phantom{-}0.137	&	-0.071	&	\phantom{-}0.450		 \\ 
chrF					&	\phantom{-}0.372	&	-0.266	&	\phantom{-}0.210	&	-0.014	&	-0.310	&	\phantom{-}0.172		 \\ 
COMET-22					&	\phantom{-}0.600	&	\phantom{-}0.020	&	\phantom{-}0.394	&	\phantom{-}0.195	&	-0.031	&	\phantom{-}0.439		 \\ 
COMETKiwi					&	\phantom{-}0.737	&	\phantom{-}0.454	&	\phantom{-}0.719	&	\phantom{-}0.247	&	\phantom{-}0.317	&	\phantom{-}1.325		 \\ 
f200spBLEU					&	\phantom{-}0.153	&	-0.296	&	-0.205	&	-0.039	&	-0.337	&	\phantom{-}0.204		 \\ 
MS-COMET-QE-22					&	\phantom{-}0.609	&	\phantom{-}0.244	&	\phantom{-}0.505	&	\phantom{-}0.201	&	\phantom{-}0.125	&	\phantom{-}0.418		 \\ 
Random-sysname					&	-0.123	&	-0.114	&	-0.131	&	-0.004	&	\phantom{-}0.012	&	-0.005	

### Hallucination (Numbers): Double Table

In [12]:
# Identity mapping: each of the three number-hallucination levels stays its own
# group, so the other phenomena are absent from this mapping.
PHENOMENA_MAPPING_NUMBERS = {
    'hallucination-number-level-1': 'hallucination-number-level-1',
    'hallucination-number-level-2': 'hallucination-number-level-2',
    'hallucination-number-level-3': 'hallucination-number-level-3'
}
# Metrics that have both ACES 2022 scores and WMT22 scores.
metrics_names_2022 = list(set(ACES_scores_2022.keys()).intersection(set(WMT22_scores.keys())))
# 2022 sensitivities and correlation summary for the three levels. The ACES
# scores are passed without map_to_higher here (the mapping is an identity).
sensitivities_2022_numbers, _, _, phenomena_2022_numbers, _, _ = calculate_sensitivities(ACES_scores_2022, WMT22_scores, mapping=PHENOMENA_MAPPING_NUMBERS)
ACES_summary_2022_numbers, phenomena_numbers = calculate_tau_correlations(ACES_scores_2022, phenomena=PHENOMENA_MAPPING_NUMBERS.values())


Sensitivities


100%|██████████| 3/3 [00:11<00:00,  3.98s/it]


In [9]:
# Metrics that have both ACES 2023 scores and WMT23 scores.
metrics_names_2023 = list(set(ACES_scores_2023.keys()).intersection(set(WMT23_scores.keys())))
# 2023 sensitivities and correlation summary for the three number-hallucination
# levels. This cell needs PHENOMENA_MAPPING_NUMBERS, which is defined in the
# 2022 numbers cell.
sensitivities_2023_numbers, _, _, phenomena_2023_numbers, _, _ = calculate_sensitivities(ACES_scores_2023, WMT23_scores, mapping=PHENOMENA_MAPPING_NUMBERS)
ACES_summary_2023_numbers, phenomena_numbers = calculate_tau_correlations(ACES_scores_2023, phenomena=PHENOMENA_MAPPING_NUMBERS.values())

Sensitivities


100%|██████████| 3/3 [00:11<00:00,  3.96s/it]


In [16]:
# LaTeX \colorbox prefixes, from strongest green through white to strongest red.
# Raw strings avoid the invalid escape "\c" (SyntaxWarning on Python 3.12+); an
# 11-step list that was immediately overwritten by this one has been dropped.
COLORS = [r'\colorbox{green1}', r'\colorbox{green2}', r'\colorbox{green4}', r'\colorbox{green5}', r'\colorbox{white}', r'\colorbox{red1}', r'\colorbox{red2}', r'\colorbox{red4}', r'\colorbox{red5}']

# From the ACES 2022 Paper:
# NOTE(review): the SHORT groupings are defined here, but the table call below
# passes METRICS_GROUPING_2023 - confirm which grouping is intended.
METRICS_GROUPING_SHORT_2022 = {"baseline": ["BLEU", "COMET-20", "COMET-QE"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }
METRICS_GROUPING_SHORT_2023 = {"baseline": ["BLEU", "BERTScore", "BLEURT-20", "YISI-1"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }

# Column order of the table: the three number-hallucination levels. (A first
# assignment that was overwritten on the very next line has been removed.)
phenomena = ["hallucination-number-level-1", "hallucination-number-level-2", "hallucination-number-level-3"]

PHENOMENA_HEADER_1_NUMBERS = dict(zip(phenomena, ['\\textbf{Level 1}', '\\textbf{Level 2}', '\\textbf{Level 3}']))
PHENOMENA_HEADER_2_NUMBERS = dict(zip(phenomena, ['', '',  '']))

# NUM_SAMPLES_NUMBERS = {p:len(ACES_scores_2022["BLEU"][p][0]) for p in phenomena}

# print(make_header(phenomena=phenomena, ACES_column=False, p_header_1=PHENOMENA_HEADER_1_NUMBERS, p_header_2=PHENOMENA_HEADER_2_NUMBERS, num_samples=NUM_SAMPLES_NUMBERS))
print(generate_summary_table_double(ACES_summary_2023_numbers, sensitivities_2023_numbers, metrics_groups=METRICS_GROUPING_2023, phenomena=phenomena, ACES_column=False, global_colors=False))
# print(make_footer(averages=SUMMARY_AVERAGES_2023, phenomena=PHENOMENA))

BERTScore					&	\phantom{-}0.983	&	-0.171	&	-0.936	&	\phantom{-}0.127	&	-0.073	&	-0.988		 \\ 
BLEU					&	\phantom{-}0.738	&	-0.641	&	-0.989	&	\phantom{-}0.211	&	-0.267	&	-1.422		 \\ 
BLEURT-20					&	\phantom{-}0.976	&	\phantom{-}0.326	&	-0.895	&	\phantom{-}0.314	&	\phantom{-}0.110	&	-0.692		 \\ 
chrF					&	\phantom{-}0.983	&	-0.702	&	-1.000	&	\phantom{-}0.100	&	-0.214	&	-1.132		 \\ 
COMET-22					&	\phantom{-}0.936	&	\phantom{-}0.198	&	-0.616	&	\phantom{-}0.177	&	\phantom{-}0.061	&	-0.371		 \\ 
COMETKiwi					&	\phantom{-}0.968	&	\phantom{-}0.778	&	\phantom{-}0.550	&	\phantom{-}0.502	&	\phantom{-}0.519	&	\phantom{-}0.367		 \\ 
f200spBLEU					&	\phantom{-}0.745	&	-0.612	&	-1.000	&	\phantom{-}0.137	&	-0.258	&	-1.404		 \\ 
MS-COMET-QE-22					&	\phantom{-}0.700	&	\phantom{-}0.423	&	\phantom{-}0.287	&	\phantom{-}0.253	&	\phantom{-}0.245	&	\phantom{-}0.159		 \\ 
Random-sysname					&	-0.109	&	-0.102	&	-0.123	&	\phantom{-}0.014	&	\phantom{-}0.009	&	-0.010		 \\ 
YISI-1					&	\phantom{-}0.961	&	\p

### Sensitivities 2023 minus Sensitivities 2022
the delta tables for the sensitivities

In [23]:
# Inspect which metrics have 2022 sensitivities.
sensitivities_2022.keys()

dict_keys(['MS-COMET-QE-22', 'metricx_xl_MQM_2020', 'YISI-1', 'BLEURT-20', 'f101spBLEU', 'KG-BERTScore', 'COMET-20', 'UniTE', 'f200spBLEU', 'HWTSC-TLM', 'UniTE-ref', 'COMET-22', 'COMET-QE', 'COMETKiwi', 'Cross-QE', 'metricx_xxl_MQM_2020', 'MS-COMET-22', 'BERTScore', 'metricx_xxl_DA_2019', 'BLEU', 'UniTE-src', 'HWTSC-Teacher-Sim', 'chrF', 'metricx_xl_DA_2019'])

In [26]:
# Inspect the per-phenomenon 2022 sensitivities of COMET-22 (used as a 2022 baseline in the delta table).
sensitivities_2022["COMET-22"]

{'wrong language': -0.49683235214585464,
 'untranslated': 0.3989039346777838,
 'undertranslation': 0.21581542638160348,
 'do not translate': 0.3177422949702964,
 'mistranslation': 0.29188107829448373,
 'real-world knowledge': 0.2070908669286799,
 'addition': 0.04502006005601945,
 'punctuation': 0.2165785808252287,
 'overtranslation': 0.35213087613251653,
 'omission': 0.24999210937998778}

In [18]:
# Inspect which metrics have 2023 sensitivities.
sensitivities_2023.keys()

dict_keys(['MS-COMET-QE-22', 'CometKiwi-XXL', 'MEE4', 'cometoid22-wmt23', 'prismRef', 'tokengram_F', 'BLEURT-20', 'eBLEU', 'MetricX-23-QE-c', 'KG-BERTScore', 'MATESE', 'Calibri-COMET22', 'f200spBLEU', 'MetricX-23-b', 'XLsim', 'embed_llama', 'Calibri-COMET22-QE', 'XCOMET-XL', 'MetricX-23-c', 'COMET-22', 'XCOMET-XXL', 'CometKiwi-XL', 'MetricX-23', 'cometoid22-wmt21', 'COMETKiwi', 'XCOMET-QE-Ensemble', 'GEMBA-MQM', 'XCOMET-Ensemble', 'cometoid22-wmt22', 'Random-sysname', 'BLEU', 'MetricX-23-QE', 'MetricX-23-QE-b', 'YISI-1', 'chrF', 'BERTScore'])

In [32]:
# Each pair is (2023 metric, 2022 metric it is compared against); one table
# column per pair, in this order. Every cell is the 2023 sensitivity minus the
# 2022 sensitivity for the row's phenomenon.
delta_pairs = [
    ("CometKiwi-XL", "COMETKiwi"),
    ("CometKiwi-XXL", "COMETKiwi"),
    ("KG-BERTScore", "KG-BERTScore"),
    ("XCOMET-Ensemble", "COMET-22"),
    ("XCOMET-XL", "COMET-22"),
    ("XCOMET-XXL", "COMET-22"),
]

rows = []
for p in PHENOMENA:
    cells = "".join(
        '&\t' + format_number(sensitivities_2023[new][p] - sensitivities_2022[old][p], max_phenomena=False) + '\t'
        for new, old in delta_pairs
    )
    # Row layout: phenomenon name, the six delta cells, LaTeX line break.
    rows.append(p + "\t" + cells + "\\\\ \n")
out = "".join(rows)

In [33]:
# Emit the LaTeX rows of the delta table built in the previous cell.
print(out)

addition	&	-0.032	&	-0.008	&	\phantom{-}0.036	&	\phantom{-}0.025	&	\phantom{-}0.002	&	\phantom{-}0.012	\\ 
omission	&	\phantom{-}0.140	&	\phantom{-}0.078	&	\phantom{-}0.315	&	\phantom{-}0.092	&	-0.006	&	\phantom{-}0.010	\\ 
mistranslation	&	\phantom{-}0.109	&	\phantom{-}0.118	&	\phantom{-}0.210	&	\phantom{-}0.142	&	\phantom{-}0.011	&	\phantom{-}0.057	\\ 
untranslated	&	\phantom{-}1.349	&	\phantom{-}1.339	&	\phantom{-}0.605	&	-0.191	&	-0.524	&	-0.596	\\ 
do not translate	&	-0.008	&	-0.075	&	-0.258	&	-0.069	&	-0.091	&	-0.108	\\ 
overtranslation	&	-0.113	&	-0.022	&	\phantom{-}0.453	&	\phantom{-}0.110	&	-0.042	&	\phantom{-}0.124	\\ 
undertranslation	&	-0.075	&	-0.053	&	\phantom{-}0.264	&	\phantom{-}0.092	&	-0.018	&	\phantom{-}0.160	\\ 
real-world knowledge	&	\phantom{-}0.149	&	\phantom{-}0.275	&	\phantom{-}0.197	&	\phantom{-}0.151	&	\phantom{-}0.081	&	\phantom{-}0.091	\\ 
wrong language	&	-0.055	&	-0.174	&	-0.514	&	-0.216	&	-0.153	&	-0.193	\\ 
punctuation	&	-0.050	&	-0.060	&	-0.193	&	-0.06

In [28]:
# Inspect the per-phenomenon 2023 sensitivities of XCOMET-Ensemble.
sensitivities_2023["XCOMET-Ensemble"]

{'wrong language': -0.7131359808959523,
 'untranslated': 0.20837197950513164,
 'undertranslation': 0.30798213818223785,
 'do not translate': 0.24872770491622478,
 'mistranslation': 0.433750663311249,
 'real-world knowledge': 0.3580673467580501,
 'addition': 0.07011104559175352,
 'punctuation': 0.15120605855503308,
 'overtranslation': 0.4624780136824882,
 'omission': 0.342124205438191}

# Functions to Normalize summary scores and sensitivity scores to 0-1, then calculate the difference
Not a good idea: we do not know the ideal sensitivity score. We could scale the maximum observed sensitivity to 1, but our maximum is not the perfect sensitivity.

In [71]:
def scale_to_max_one(scores:Dict[str, Dict[str, float]]):
    """
    Min-max scale all scores jointly to the range [0, 1].

    The minimum and maximum are taken over every value of every metric, so the
    globally smallest score maps to 0 and the globally largest to 1.

    Args:
        scores: {metric: {phenomenon: score}}. Not modified.

    Returns:
        A deep copy of `scores` with every value rescaled. If `scores` holds no
        values at all the copy is returned unchanged; if all values are equal
        (zero range) every value becomes 0.0 instead of dividing by zero.
    """
    out = copy.deepcopy(scores)
    all_values = [value for per_metric in scores.values() for value in per_metric.values()]
    if not all_values:
        return out
    # Local names deliberately avoid shadowing the builtins min/max.
    lowest = np.min(all_values)
    highest = np.max(all_values)
    span = highest - lowest
    for m in out:
        for p in out[m]:
            out[m][p] = (out[m][p] - lowest) / span if span != 0 else 0.0
    return out

def scale_to_one(scores:Dict[str, Dict[str, float]]):
    """
    Divide every score by the grand total so that all values sum to 1.

    Args:
        scores: {metric: {phenomenon: score}}. Not modified.

    Returns:
        A deep copy of `scores` with every value divided by the sum of all
        values. If `scores` holds no values the copy is returned unchanged.

    Raises:
        ValueError: if there are values but they sum to zero, because the
            normalisation is undefined (it previously yielded inf/nan silently).
    """
    out = copy.deepcopy(scores)
    # `total` rather than `sum`, to avoid shadowing the builtin.
    total = 0.0
    has_values = False
    for per_metric in scores.values():
        if per_metric:
            has_values = True
        total += np.sum(list(per_metric.values()))
    if not has_values:
        return out
    if total == 0:
        raise ValueError("scale_to_one: scores sum to zero, cannot normalise")
    for m in out:
        for p in out[m]:
            out[m][p] /= total
    return out

def diff(scores1, scores2):
    """
    Per-metric, per-phenomenon difference `scores2 - scores1`.

    Metric names from `scores1` are first normalised through
    METRIC_NAMES_MAPPING. The counterpart in `scores2` is looked up under the
    normalised name and, failing that, under METRIC_MAPPING_BACK[name].
    Metrics with no counterpart are printed and skipped.

    Args:
        scores1: {metric: {phenomenon: score}}, the subtrahend.
        scores2: {metric: {phenomenon: score}}, the minuend.

    Returns:
        {normalised metric name: {phenomenon: difference}}, restricted to the
        phenomena present in both inputs for that metric.
    """
    scores_out = {}
    for metric1 in scores1:
        # Keep the original key for indexing scores1; only the lookup name is remapped.
        metric = METRIC_NAMES_MAPPING[metric1] if metric1 in METRIC_NAMES_MAPPING else metric1
        if metric in scores2:
            metric2 = metric
        elif metric in METRIC_MAPPING_BACK and METRIC_MAPPING_BACK[metric] in scores2:
            metric2 = METRIC_MAPPING_BACK[metric]
        else:
            print(metric)
            continue
        # Filter on scores2[metric2]: in the fallback branch scores2[metric] does not exist.
        scores_out[metric] = {p: scores2[metric2][p] - scores1[metric1][p] for p in scores1[metric1] if p in scores2[metric2]}

    return scores_out

# Plots

In [26]:
# create groups here:
means = {metric:[sensitivities_2022[metric][p] for p in PHENOMENA] for metric in metrics_names_2022}
tau = {metric:[ACES_summary_2022[metric][p] for p in PHENOMENA] for metric in metrics_names_2022 if metric in ACES_summary_2022}

In [21]:
# Group the metrics:
bleus = ["BLEU", "f101spBLEU", "f200spBLEU"]
comets = ["COMET-20", "COMET-22", "MS-COMET-22", "MS-COMET-QE-22"]
xl = ["metricx_xl_DA_2019", "metricx_xl_MQM_2020", "metricx_xxl_DA_2019", "metricx_xxl_MQM_2020"]
unite = ["UniTE", "UniTE-src", "UniTE-ref"]

In [28]:
# NOTE(review): grouped_line_plot is defined in a later cell of this notebook;
# on a fresh kernel that cell has to be executed before this one.
grouped_line_plot([means, tau], bleus, ["Mean(good-bad)", "tau"], PHENOMENA)

In [38]:
def grouped_line_plot(groups: List[Dict[str,list]], metrics_names: List[str], group_labels: List[str], phenomena: List[str]):
    '''
    Draw one line per (group, metric) pair over the phenomena and show the figure.

    Inputs:
        groups: one dict per score type (e.g. means and tau scores), each of the form
            {
                metric1: [score for phenomenon 1, score for phenomenon 2, ..]
            }
        metrics_names: the metrics to plot; each must be a key of every group.
        group_labels: one label per group (mean (good-bad), tau, ...), used in
            the trace names and joined into the figure title.
        phenomena: x-axis labels (the order is important because the score
            lists in the groups are ordered according to phenomena).

    Each group has its own colour family. Families and colours are reused
    cyclically, so more than two groups or more than four metrics no longer
    raise an IndexError.
    '''
    assert len(groups) > 0 and len(groups) == len(group_labels) and len(metrics_names) > 0
    fig = go.Figure()
    colors = [['lightsteelblue',  'aqua', 'aquamarine', 'darkturquoise'],
        ['chocolate', 'coral', 'crimson', 'orange']]
    for i,group in enumerate(groups):
        palette = colors[i % len(colors)]
        for j,metric in enumerate(metrics_names):
            fig.add_trace(go.Scatter(x=phenomena, y=group[metric],mode='lines',name=group_labels[i]+" - "+metric,
                          line=dict(color=palette[j % len(palette)])))
    fig.update_layout(
        title=" ".join(group_labels)
        )
    fig.show()

# Extra Latex Tables

In [None]:
def find_max_on_col(scores:Dict[str, Dict[str, Dict[str, int]]], metrics_names:List[str], phenomena:List[str]=PHENOMENA, k_highest:int=1) -> Dict[str,str]:
    """
    Find, per phenomenon column, which metrics hold the k highest scores.

    Metric names missing from `scores` are translated through
    METRIC_NAMES_MAPPING, then METRIC_MAPPING_BACK; metrics still missing count
    as -inf and are left out of the column average.

    Returns a tuple `(max_metrics, avgs)` (the return annotation is narrower
    than what is actually returned):
        max_metrics: {str(k): one list per entry of metrics_names, holding the
            indices of the phenomena for which that metric's score equals the
            (k+1)-th largest value of the column}.
        avgs: one average per phenomenon, over the metrics found in `scores`.
    """
    def _lookup_name(name):
        if name not in scores and name in METRIC_NAMES_MAPPING:
            return METRIC_NAMES_MAPPING[name]
        if name not in scores and name in METRIC_MAPPING_BACK:
            return METRIC_MAPPING_BACK[name]
        return name

    top_cells = {str(rank): [[] for _ in metrics_names] for rank in range(k_highest)}
    column_means = []
    for col_idx, phenomenon in enumerate(phenomena):
        column = np.array([
            scores[key][phenomenon] if key in scores else -np.inf
            for key in map(_lookup_name, metrics_names)
        ])
        for rank in range(k_highest):
            # Value sitting at position (rank+1) from the top of the column.
            threshold = np.partition(column, -rank-1)[-rank-1]
            for row_idx in np.where(column == threshold)[0]:
                top_cells[str(rank)][row_idx].append(col_idx)
        column_means.append(np.average(column[column > -np.inf]))
    return top_cells, column_means

def make_header(scores:Dict[str, Dict[str, Dict[str, int]]], phenomena:List[str]=PHENOMENA, p_header_1:dict=PHENOMENA_HEADER_1, p_header_2:dict=PHENOMENA_HEADER_2, num_samples:dict=None, phenomena_mapping:Dict[str,str]=None, ACES_column:bool=True) -> str:
    """
    Build the LaTeX preamble and header rows of a summary table.

    Args:
        scores: only read when `num_samples` is None, via
            scores["BLEU"][fine_grained_phenomenon][0].
        phenomena: column order of the table.
        p_header_1: first header line per phenomenon.
        p_header_2: second header line per phenomenon.
        num_samples: example count per phenomenon. When None it is accumulated
            from `scores` through `phenomena_mapping`.
        phenomena_mapping: fine-grained label -> column phenomenon; defaults to
            PHENOMENA_MAPPING.
        ACES_column: append the two-line "ACES-" / "Score" header column.

    Returns:
        The header as a string, ending with a midrule line.

    NOTE(review): the tabular column spec is fixed at one `l` plus eleven `c`
    columns regardless of len(phenomena).
    """
    if phenomena_mapping is None:
        phenomena_mapping = PHENOMENA_MAPPING
    if num_samples is None:
        num_samples = {p:0 for p in phenomena}
        for p,target in phenomena_mapping.items():
            # Only count groups that are shown; other targets used to raise KeyError.
            if target in num_samples:
                num_samples[target] += len(scores["BLEU"][p][0])
    # Every backslash is escaped explicitly: "\s", "\c" and "\m" are invalid
    # escape sequences and only worked because Python passes them through.
    res = "\\begin{table*}[ht] \n \\small \n \\setlength{\\tabcolsep}{3.75pt} \n \\centering \n \\begin{tabular}{@{}lccccccccccc@{}} \n \\\\\\toprule \n"
    for p in phenomena:
        res += " & "
        res += p_header_1[p]
    if ACES_column:
        res += " & \\textbf{ACES-}"
    res += " \\\\\n"
    for p in phenomena:
        res += " & "
        res += p_header_2[p]
    if ACES_column:
        res += " & \\textbf{Score}"
    res += "\\\\\n\\midrule\n\\textit{\\textbf{Examples}} "
    for p in phenomena:
        res += " & "
        res += '\\textit{' + str(num_samples[p]) + '}'
    res += "\\\\ \n \\midrule"
    return res

def make_footer(averages:List, phenomena:List[str]=PHENOMENA) -> str:
    """
    Build the closing "Average (all metrics)" row and the bottom rule of a table.

    Args:
        averages: indexed by phenomenon name (`averages[p]`), so in practice a
            mapping phenomenon -> average despite the `List` annotation.
        phenomena: column order of the table.

    Returns:
        A string holding a midrule line, the average row and a bottomrule.
    """
    # "\\midrule" instead of the invalid escape sequence "\m"; same characters.
    res = "\\midrule\nAverage (all metrics)\t"
    for p in phenomena:
        # The former `if p in phenomena` guard was always true and is dropped.
        res += " & "
        res += format_number(averages[p])
    res += "\\\\ \n \\bottomrule"
    return res

def generate_summary_table(scores:Dict[str, Dict[str, Dict[str, int]]], metrics_groups:Dict[str,list] = METRICS_GROUPING_2022, phenomena:List[str]=PHENOMENA, ACES_column:bool=True, global_colors:bool=True, k_highest:int=1, colors:List[str]=None) -> str:
    """
    Render the body rows of a LaTeX summary table.

    One row is emitted per metric, a midrule line after each metric group, and
    a final "Average" row. Metrics that are absent from `scores` (after name
    translation through METRIC_NAMES_MAPPING / METRIC_MAPPING_BACK) get "----"
    cells.

    Args:
        scores: {metric: {phenomenon: score}}.
        metrics_groups: {group name: [metric, ...]}; defines row order.
        phenomena: column order.
        ACES_column: append a column with comp_aces_score of each row.
        global_colors: colour every cell by its position between the global
            minimum and maximum. Forces k_highest to 1.
        k_highest: when global_colors is False, how many top ranks per column
            are highlighted. With 1, the column maximum is highlighted through
            format_number's max_phenomena flag.
        colors: one colour prefix per rank; required when global_colors is
            False and k_highest > 1.

    Returns:
        The table rows as one string.

    Raises:
        ValueError: if ranked highlighting is requested without at least
            k_highest entries in `colors`.
    """
    def _resolve(name):
        # Translate a metric name that is not a key of `scores`.
        if name not in scores and name in METRIC_NAMES_MAPPING:
            return METRIC_NAMES_MAPPING[name]
        if name not in scores and name in METRIC_MAPPING_BACK:
            return METRIC_MAPPING_BACK[name]
        return name

    if global_colors:
        k_highest = 1
    elif k_highest > 1 and (colors is None or len(colors) < k_highest):
        # Previously this surfaced later as a TypeError/IndexError on colors[k].
        raise ValueError("generate_summary_table: `colors` must provide at least k_highest entries when global_colors is False and k_highest > 1")

    out = ''
    metrics_names = [_resolve(metric) for group in metrics_groups.values() for metric in group]
    max_in_columns, avgs = find_max_on_col(scores, metrics_names=metrics_names, phenomena=phenomena, k_highest=k_highest)

    if ACES_column:
        aces_scores_col = []
        for metric in metrics_names:
            # The name is resolved a second time here, as in the original code
            # (find_max_on_col does the same).
            metric = _resolve(metric)
            row = {}
            for p in phenomena:
                # Missing metrics contribute 0.0 to the ACES score.
                row[p] = scores[metric][p] if metric in scores else 0.0
            aces_scores_col.append(comp_aces_score(row))
        aces_scores_col_colors = {m_id:"" for m_id in range(len(metrics_names))}

    if global_colors:
        maximum = np.max([np.max(list(p.values())) for metric,p in scores.items() if metric in metrics_names])
        minimum = np.min([np.min(list(p.values())) for metric,p in scores.items() if metric in metrics_names])

    if ACES_column:
        if global_colors:
            for i in range(len(aces_scores_col)):
                aces_scores_col_colors[i] = map_to_color(aces_scores_col[i], np.max(aces_scores_col), np.min(aces_scores_col))
        elif k_highest == 1:
            max_aces_ids = np.where(np.asarray(aces_scores_col) == np.max(aces_scores_col))[0]
            for i in max_aces_ids:
                aces_scores_col_colors[i] = '\\colorbox[HTML]{B2EAB1}'
        else:
            for k in range(k_highest):
                max_aces_ids = np.where(np.asarray(aces_scores_col) == np.partition(aces_scores_col, -k-1)[-k-1])[0]
                for i in max_aces_ids:
                    aces_scores_col_colors[i] = colors[k]

    for group, metrics in metrics_groups.items():
        for m_id, metric in enumerate(metrics):
            metric = _resolve(metric)
            out += format_metric(metric) + '\t\t\t\t\t'
            for p_id, p in enumerate(phenomena):
                if metric not in scores:
                    out += '&\t ---- \t'
                elif global_colors:
                    out += '&\t' + format_number(scores[metric][p], max_phenomena=True, color=map_to_color(scores[metric][p], maximum, minimum)) + '\t'
                elif k_highest == 1:
                    max_ids = max_in_columns['0'][metrics_names.index(metric)]
                    out += '&\t' + format_number(scores[metric][p], max_phenomena=p_id in max_ids) + '\t'
                else:
                    # Colour of the first rank this cell belongs to, if any.
                    # It is reset per cell: it used to be unbound for the first
                    # unranked cell and to leak from the previously ranked cell.
                    rank_color = None
                    for k in range(k_highest):
                        if p_id in max_in_columns[str(k)][metrics_names.index(metric)]:
                            rank_color = colors[k]
                            break
                    if rank_color is None:
                        out += '&\t' + format_number(scores[metric][p], max_phenomena=False) + '\t'
                    else:
                        out += '&\t' + format_number(scores[metric][p], max_phenomena=True, color=rank_color) + '\t'
            if ACES_column:
                tmp_color = aces_scores_col_colors[metrics_names.index(metric)]
                out += '&\t' + format_number(aces_scores_col[metrics_names.index(metric)], dec='0.00', max_phenomena=tmp_color!="", color=tmp_color)
            out += '\t \\\\ \n'
        out += '\\midrule \n'
    out += 'Average\t\t\t\t\t'
    for p_id, p in enumerate(phenomena):
        out += '&\t' + format_number(avgs[p_id], max_phenomena=True, color=map_to_color(avgs[p_id], max=np.max(avgs), min=np.min(avgs))) + '\t'
    out += '\\\\'
    return out