In [1]:
#!/usr/bin/env python3
# Standard-library modules used for path handling and for extending the module search path.
import os, sys
# Star import: every public name exported by normalize_scores_utils lands in the
# notebook namespace. Presumably the helpers and constants used by later cells
# come from here — verify against that module.
from normalize_scores_utils import *

# Add the parent of the current working directory to the module search path.
# NOTE(review): this runs after the normalize_scores_utils import above, so it
# cannot help resolve that module — confirm the ordering is intended.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Main Part: Table and Correlations

In [4]:
# Root folder holding the ACES score files and the mt-metrics-eval-v2 directory.
# It can be overridden with the METRIC_SENSITIVITY_DATA_DIR environment variable
# so the notebook is not tied to a single machine; the default is the original path.
data_folder = os.environ.get(
    'METRIC_SENSITIVITY_DATA_DIR',
    '/mnt/c/Users/user/Desktop/work/metric_sensitivity_analysis',
)

# Load the ACES scores (2022 and 2023)
ACES_scores_2022_path = os.path.join(data_folder, 'aces-scored-2022-all-scores.only.quote_errors_removed.tsv')
ACES_scores_2023_path = os.path.join(data_folder, 'aces-scored-2023-all-scores.quote_errors_removed.tsv')
ACES_scores_2022 = load_ACES_scores(ACES_scores_2022_path, good_token='.-good', bad_token='.-bad', metric_mapping=METRIC_NAMES_MAPPING)
ACES_scores_2023 = load_ACES_scores(ACES_scores_2023_path, good_token='.-good', bad_token='.-bad', metric_mapping=METRIC_NAMES_MAPPING, skip_metrics=[])
metrics_names_2022 = set(ACES_scores_2022.keys())
metrics_names_2023 = set(ACES_scores_2023.keys())

# Load WMT scores; the union of both years' metric names is handed to the loader
WMT_scores_path = os.path.join(data_folder, 'mt-metrics-eval-v2/wmt22/metric-scores')
WMT_scores = load_WMT_scores(WMT_scores_path, metrics_names_2022 | metrics_names_2023)

# calculate sensitivities
# Only metrics present in both ACES and WMT are kept. The names are sorted so
# their order is the same on every run (plain set iteration order is not).
metrics_names_2022 = sorted(set(ACES_scores_2022.keys()).intersection(WMT_scores.keys()))
sensitivities_2022, _, _, phenomena_2022 = calculate_sensitivities(ACES_scores_2022, WMT_scores, mapping=PHENOMENA_MAPPING)
metrics_names_2023 = sorted(set(ACES_scores_2023.keys()).intersection(WMT_scores.keys()))
sensitivities_2023, _, _, phenomena_2023 = calculate_sensitivities(ACES_scores_2023, WMT_scores, mapping=PHENOMENA_MAPPING)

# load the ACES scores from the paper
ACES_summary_2022 = load_ACES_scores_summary_2022()
ACES_summary_2023 = load_ACES_scores_summary_2023(skip_metrics=[])

cs-en


100%|██████████| 57/57 [00:02<00:00, 28.00it/s]


cs-uk


100%|██████████| 57/57 [00:01<00:00, 46.75it/s]


de-en


100%|██████████| 57/57 [00:02<00:00, 27.27it/s]


de-fr


100%|██████████| 57/57 [00:00<00:00, 59.06it/s]


en-cs


100%|██████████| 57/57 [00:02<00:00, 23.72it/s]


en-de


100%|██████████| 57/57 [00:02<00:00, 19.86it/s]


en-hr


100%|██████████| 57/57 [00:01<00:00, 33.18it/s]


en-ja


100%|██████████| 57/57 [00:01<00:00, 40.42it/s]


en-liv


100%|██████████| 57/57 [00:00<00:00, 124.32it/s]


en-ru


100%|██████████| 57/57 [00:01<00:00, 34.43it/s]


en-uk


100%|██████████| 57/57 [00:01<00:00, 51.56it/s]


en-zh


100%|██████████| 57/57 [00:02<00:00, 24.03it/s]


fr-de


100%|██████████| 57/57 [00:01<00:00, 55.52it/s]


ja-en


100%|██████████| 57/57 [00:01<00:00, 42.79it/s]


liv-en


100%|██████████| 57/57 [00:00<00:00, 127.57it/s]


ru-en


100%|██████████| 57/57 [00:01<00:00, 42.39it/s]


ru-sah


100%|██████████| 57/57 [00:00<00:00, 91.80it/s] 


sah-ru


100%|██████████| 57/57 [00:00<00:00, 102.30it/s]


uk-cs


100%|██████████| 57/57 [00:01<00:00, 29.84it/s]


uk-en


100%|██████████| 57/57 [00:01<00:00, 50.23it/s]


zh-en


100%|██████████| 57/57 [00:03<00:00, 18.10it/s]


In [41]:
# LaTeX \colorbox prefixes, ordered from strongest green through white to
# strongest red (9 steps). Raw strings keep the backslash literal: a plain
# '\c' is an invalid escape sequence and triggers a warning on recent Pythons.
# NOTE(review): presumably used to colour table cells — no use is visible in this cell.
COLORS = [r'\colorbox{green1}', r'\colorbox{green2}', r'\colorbox{green4}', r'\colorbox{green5}', r'\colorbox{white}', r'\colorbox{red1}', r'\colorbox{red2}', r'\colorbox{red4}', r'\colorbox{red5}']

# From the ACES 2022 Paper:
# NOTE(review): the table call below passes METRICS_GROUPING_2022, not this
# short grouping — confirm which one is intended.
METRICS_GROUPING_SHORT_2022 = {"baseline": ["BLEU", "BERTScore", "BLEURT-20", "YISI-1"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }

# Subset of phenomena; not used by the calls below, which pass PHENOMENA.
phenomena = ["addition", 'mistranslation', 'untranslated', 'do not translate', 'real-world knowledge']
print(make_header(phenomena=PHENOMENA))
print(generate_summary_table(sensitivities_2022, metrics_groups=METRICS_GROUPING_2022, phenomena=PHENOMENA, k_highest=11))
# print(make_footer(averages=SUMMARY_AVERAGES_2023, phenomena=PHENOMENA))

\begin{table*}[ht] 
 \small 
 \setlength{\tabcolsep}{3.75pt} 
 \centering 
 \begin{tabular}{@{}lccccccccccc@{}} 
 \\\toprule 
 & \hyperref[sec:addition-omission]{\textbf{addition}} & \hyperref[sec:addition-omission]{\textbf{omission}} & \hyperref[sec:source-disambig]{\textbf{mistranslation}} & \hyperref[sec:untranslated]{\textbf{untranslated}} & \hyperref[sec:do-not-translate]{\textbf{do not}} & \hyperref[sec:overtranslation_undertranslation]{\textbf{overtranslation}} & \hyperref[sec:overtranslation_undertranslation]{\textbf{undertranslation}} & \hyperref[sec:real-world-knowledge]{\textbf{real-world}} & \hyperref[sec:wrong_language]{\textbf{wrong}} & \hyperref[sec:punctuation]{\textbf{punctuation}} & \textbf{ACES-} \\
 &  &  &  &  & \hyperref[sec:do-not-translate]{\textbf{translate}} &  &  & \hyperref[sec:real-world-knowledge]{\textbf{knowledge}} & \hyperref[sec:wrong_language]{\textbf{language}} &  & \textbf{Score}\\
\midrule
\textit{\textbf{Examples}}  & \textit{999} & \textit{999} &

# Functions to normalize summary scores and sensitivity scores to 0-1, then calculate the difference
Work in progress.

In [71]:
def scale_to_max_one(scores: Dict[str, Dict[str, float]]):
    """Min-max scale all scores to [0, 1] using the global minimum and maximum.

    Args:
        scores: two-level mapping ``{metric: {phenomenon: score}}``.

    Returns:
        A deep copy of ``scores`` in which every value ``v`` is replaced by
        ``(v - global_min) / (global_max - global_min)``. The input is not
        modified.

    Edge cases:
        * If there are no values at all (empty mapping, or only metrics with
          no phenomena) the copy is returned unchanged.
        * If all values are equal the range is zero; every value is mapped
          to 0.0 instead of dividing by zero.
    """
    out = copy.deepcopy(scores)
    # Flatten first so metrics with an empty phenomenon dict do not break the
    # reduction (np.max / np.min raise on empty input).
    flat = [value for per_phenomenon in scores.values() for value in per_phenomenon.values()]
    if not flat:
        return out
    lowest = np.min(flat)
    highest = np.max(flat)
    span = highest - lowest
    for metric in out:
        for phenomenon in out[metric]:
            if span == 0:
                out[metric][phenomenon] = 0.0
            else:
                out[metric][phenomenon] = (out[metric][phenomenon] - lowest) / span
    return out

def scale_to_one(scores: Dict[str, Dict[str, float]]):
    """Divide every score by the grand total so that all values sum to one.

    Args:
        scores: two-level mapping ``{metric: {phenomenon: score}}``.

    Returns:
        A deep copy of ``scores`` with each value divided by the sum of all
        values across all metrics. The input is not modified.

    Note:
        No special handling is applied when the grand total is zero; the
        division is then performed with a NumPy zero divisor.
    """
    out = copy.deepcopy(scores)
    # Accumulate per metric; named `total` so the builtin `sum` stays usable.
    total = 0.0
    for per_phenomenon in scores.values():
        total += np.sum(list(per_phenomenon.values()))
    for metric in out:
        for phenomenon in out[metric]:
            out[metric][phenomenon] /= total
    return out

def diff(scores1, scores2):
    """Compute ``scores2 - scores1`` per metric and phenomenon.

    Metric names are reconciled between the two inputs:

    * a key of ``scores1`` that appears in ``METRIC_NAMES_MAPPING`` is
      translated through it; the translated name is used as the output key;
    * the partner entry in ``scores2`` is looked up under that name, or,
      failing that, under ``METRIC_MAPPING_BACK[name]``.

    Args:
        scores1: ``{metric: {phenomenon: score}}``, the subtrahend.
        scores2: ``{metric: {phenomenon: score}}``, the minuend.

    Returns:
        ``{metric: {phenomenon: scores2_value - scores1_value}}`` restricted to
        phenomena present on both sides. Metrics with no partner in
        ``scores2`` are printed and left out of the result.
    """
    scores_out = {}
    for metric1 in scores1:
        # Keep the original key for reading scores1; only the lookup/output
        # name is translated.
        metric = METRIC_NAMES_MAPPING[metric1] if metric1 in METRIC_NAMES_MAPPING else metric1
        if metric in scores2:
            metric2 = metric
        elif metric in METRIC_MAPPING_BACK and METRIC_MAPPING_BACK[metric] in scores2:
            metric2 = METRIC_MAPPING_BACK[metric]
        else:
            # No counterpart in scores2 under either name.
            print(metric)
            continue
        scores_out[metric] = {
            p: scores2[metric2][p] - scores1[metric1][p]
            for p in scores1[metric1]
            if p in scores2[metric2]
        }

    return scores_out

# Plots

In [26]:
# Build one score vector per metric, ordered like PHENOMENA so both groups can
# share the same x-axis. `tau` only contains metrics that have an entry in
# ACES_summary_2022.
means = {
    name: [sensitivities_2022[name][phen] for phen in PHENOMENA]
    for name in metrics_names_2022
}
tau = {
    name: [ACES_summary_2022[name][phen] for phen in PHENOMENA]
    for name in metrics_names_2022
    if name in ACES_summary_2022
}

In [21]:
# Metric families that are plotted together.
bleus = [
    "BLEU",
    "f101spBLEU",
    "f200spBLEU",
]
comets = [
    "COMET-20",
    "COMET-22",
    "MS-COMET-22",
    "MS-COMET-QE-22",
]
xl = [
    "metricx_xl_DA_2019",
    "metricx_xl_MQM_2020",
    "metricx_xxl_DA_2019",
    "metricx_xxl_MQM_2020",
]
unite = [
    "UniTE",
    "UniTE-src",
    "UniTE-ref",
]

In [28]:
# Line plot of the sensitivity means next to the tau values for the BLEU family.
# Note: grouped_line_plot is defined in a later cell of this notebook, so that
# cell has to be executed before this one on a fresh kernel.
grouped_line_plot(
    groups=[means, tau],
    metrics_names=bleus,
    group_labels=["Mean(good-bad)", "tau"],
    phenomena=PHENOMENA,
)

In [38]:
def grouped_line_plot(groups: List[Dict[str,list]], metrics_names: List[str], group_labels: List[str], phenomena: List[str]):
    '''
    Draw one line per (group, metric) pair over the phenomena and show the figure.

    Inputs:
        groups: list of score dictionaries (e.g. means and tau scores), each of the form
            {
                metric1: [score for phenomenon 1, score for phenomenon 2, ..]
            }
        metrics_names: the metrics to plot; a metric missing from a group is
            skipped for that group instead of raising a KeyError.
        group_labels: one label per entry of `groups` (mean (good-bad), tau, ...);
            used in the trace names and joined with spaces for the title.
        phenomena: x-axis labels. The order is important because the score
            lists in `groups` are ordered according to the phenomena.

    Each group has its own colour family and each metric a shade within it.
    Colours wrap around when there are more groups or metrics than colours,
    so any number of either can be plotted.

    Returns nothing; the figure is displayed with fig.show().
    '''
    assert len(groups) > 0 and len(groups) == len(group_labels) and len(metrics_names) > 0
    fig = go.Figure()
    colors = [['lightsteelblue',  'aqua', 'aquamarine', 'darkturquoise'],
        ['chocolate', 'coral', 'crimson', 'orange']]
    for i, group in enumerate(groups):
        # Wrap around rather than index past the end of the palette table.
        palette = colors[i % len(colors)]
        for j, metric in enumerate(metrics_names):
            if metric not in group:
                # Groups may cover different metric sets; plot what is available.
                continue
            fig.add_trace(go.Scatter(x=phenomena, y=group[metric], mode='lines', name=group_labels[i]+" - "+metric,
                          line=dict(color=palette[j % len(palette)])))
    fig.update_layout(
        title=" ".join(group_labels)
        )
    fig.show()