In [1]:
#!/usr/bin/env python3
import os, sys
from normalize_scores_utils import *

# Named logger used by the cells below; it handles records at INFO level and above.
import logging
logger = logging.getLogger('logger')
logger.setLevel(logging.INFO)

In [30]:
# Put the parent of the current working directory on the import path so that the
# `aces` package can be imported (presumably it lives one level above the notebook — verify).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from aces.cli.evaluate import comp_corr

In [36]:
def calculate_tau_correlations(ACES_scores: Dict[str, Dict[str, List[List]]], mapping:dict=None, phenomena:List[str]=None) -> "Tuple[Dict[str, Dict[str, float]], List[str]]":
    """
    Compute, for every metric and phenomenon, the correlation returned by `comp_corr`
    between the scores of the good and the bad translations.

    Args:
        ACES_scores: {metric: {phenomenon: [good_scores, bad_scores]}}; index 0 holds the
            scores passed to `comp_corr` as "good", index 1 the ones passed as "bad".
        mapping: optional mapping handed to `map_to_higher` before anything else is done
            (presumably it merges fine-grained phenomena into coarser groups — verify there).
        phenomena: phenomena to evaluate. When None, the phenomena of the first metric
            (in the insertion order of `ACES_scores`) are used.

    Returns:
        A tuple `(sensitivities, phenomena)` where `sensitivities` is
        {metric: {phenomenon: comp_corr(good, bad)}} and `phenomena` is the list that was used.
    """
    if mapping:
        ACES_scores = map_to_higher(ACES_scores, mapping=mapping)

    # Keep the dict's own order. Going through set() made the order (and therefore the
    # metric whose phenomena serve as the default) change from run to run.
    metrics_names = list(ACES_scores.keys())
    if phenomena is None:
        phenomena = list(ACES_scores[metrics_names[0]].keys()) if metrics_names else []

    sensitivities = {metric: {} for metric in metrics_names}
    for p in phenomena:
        for metric in metrics_names:
            good = pd.Series(ACES_scores[metric][p][0])
            bad = pd.Series(ACES_scores[metric][p][1])
            sensitivities[metric][p] = comp_corr(good, bad)
    # Kept from the original: puts the notebook logger back to INFO
    # (presumably because one of the helpers above changes the level — TODO confirm).
    logger.setLevel(logging.INFO)
    return sensitivities, phenomena

In [41]:
# NOTE(review): both `ACES_scores_2022` and `phenomena` are assigned in cells further down
# (the loading cell and the hallucination-number table cell), so this line only works
# after those cells have been executed.
ACES_summary_2022_numbers, phenomena = calculate_tau_correlations(ACES_scores_2022, phenomena=phenomena)

In [42]:
# Display the nested {metric: {phenomenon: correlation}} dictionary returned by calculate_tau_correlations.
ACES_summary_2022_numbers

{'metricx_xxl_DA_2019': {'hallucination-number-level-1': 0.9828326180257511,
  'hallucination-number-level-2': 0.8740818467995802,
  'hallucination-number-level-3': 0.2892209178228389},
 'BLEURT-20': {'hallucination-number-level-1': 0.9763948497854077,
  'hallucination-number-level-2': 0.32214060860440713,
  'hallucination-number-level-3': -0.8932764140875133},
 'COMET-QE': {'hallucination-number-level-1': 0.6716738197424893,
  'hallucination-number-level-2': 0.5173137460650578,
  'hallucination-number-level-3': 0.400213447171825},
 'BLEU': {'hallucination-number-level-1': 0.7381974248927039,
  'hallucination-number-level-2': -0.6411332633788038,
  'hallucination-number-level-3': -0.9893276414087513},
 'Cross-QE': {'hallucination-number-level-1': 0.8969957081545065,
  'hallucination-number-level-2': 0.7355718782791185,
  'hallucination-number-level-3': 0.5090715048025614},
 'HWTSC-TLM': {'hallucination-number-level-1': 0.7618025751072961,
  'hallucination-number-level-2': 0.62014690451

# Main Part: Table and Correlations

In [2]:
# Root folder with the ACES / WMT score files. The ACES_DATA_FOLDER environment variable
# overrides the original machine-specific default, so the notebook can run on other machines.
data_folder = os.environ.get('ACES_DATA_FOLDER', '/mnt/c/Users/user/Desktop/work/metric_sensitivity_analysis')

# Load the ACES scores
ACES_scores_2022_path = os.path.join(data_folder, 'aces-scored-2022-all-scores.only.quote_errors_removed.tsv')
ACES_scores_2023_path = os.path.join(data_folder, 'aces-scored-2023-all-scores.quote_errors_removed.tsv')
# 2023 later
ACES_scores_2022 = load_ACES_scores(ACES_scores_2022_path, good_token='.-good', bad_token='.-bad', metric_mapping=METRIC_NAMES_MAPPING)
# ACES_scores_2023 = load_ACES_scores(ACES_scores_2023_path, good_token='.-good', bad_token='.-bad', metric_mapping=METRIC_NAMES_MAPPING, skip_metrics=[])
# All metrics that have ACES scores; narrowed to the ACES/WMT intersection further down.
metrics_names_2022 = set(ACES_scores_2022.keys())
# metrics_names_2023 = set(ACES_scores_2023.keys())

# Load WMT scores
WMT_scores_path = os.path.join(data_folder, 'mt-metrics-eval-v2/wmt22/metric-scores')
# WMT_scores = load_WMT_scores(WMT_scores_path, set(metrics_names_2022).union(set(metrics_names_2023)))
WMT_scores = load_WMT_scores(WMT_scores_path, set(metrics_names_2022))

# calculate sensitivities
# From here on metrics_names_2022 is a list of the metrics present in both score sets.
metrics_names_2022 = list(set(ACES_scores_2022.keys()).intersection(set(WMT_scores.keys())))
sensitivities_2022, _, _, phenomena_2022, means_good_2022, means_bad_2022 = calculate_sensitivities(ACES_scores_2022, WMT_scores, mapping=PHENOMENA_MAPPING)
# metrics_names_2023 = list(set(ACES_scores_2023.keys()).intersection(set(WMT_scores.keys())))
# sensitivities_2023, _, _, phenomena_2023, means_good_2023, means_bad_2023 = calculate_sensitivities(ACES_scores_2023, WMT_scores, mapping=PHENOMENA_MAPPING)

# load the ACES scores from the paper
ACES_summary_2022 = load_ACES_scores_summary_2022()
# ACES_summary_2023 = load_ACES_scores_summary_2023(skip_metrics=[])

cs-en


100%|██████████| 24/24 [00:09<00:00,  2.63it/s]


cs-uk


100%|██████████| 24/24 [00:05<00:00,  4.68it/s]


de-en


100%|██████████| 24/24 [00:09<00:00,  2.50it/s]


de-fr


100%|██████████| 24/24 [00:04<00:00,  5.94it/s]


en-cs


100%|██████████| 24/24 [00:09<00:00,  2.62it/s]


en-de


100%|██████████| 24/24 [00:10<00:00,  2.36it/s]


en-hr


100%|██████████| 24/24 [00:06<00:00,  3.56it/s]


en-ja


100%|██████████| 24/24 [00:06<00:00,  3.64it/s]


en-liv


100%|██████████| 24/24 [00:01<00:00, 17.38it/s]


en-ru


100%|██████████| 24/24 [00:06<00:00,  3.76it/s]


en-uk


100%|██████████| 24/24 [00:03<00:00,  6.40it/s]


en-zh


100%|██████████| 24/24 [00:08<00:00,  2.77it/s]


fr-de


100%|██████████| 24/24 [00:02<00:00,  9.61it/s]


ja-en


100%|██████████| 24/24 [00:04<00:00,  4.93it/s]


liv-en


100%|██████████| 24/24 [00:01<00:00, 17.27it/s]


ru-en


100%|██████████| 24/24 [00:04<00:00,  5.14it/s]


ru-sah


100%|██████████| 24/24 [00:01<00:00, 13.19it/s]


sah-ru


100%|██████████| 24/24 [00:01<00:00, 14.79it/s]


uk-cs


100%|██████████| 24/24 [00:05<00:00,  4.35it/s]


uk-en


100%|██████████| 24/24 [00:02<00:00,  8.18it/s]


zh-en


100%|██████████| 24/24 [00:09<00:00,  2.41it/s]


In [27]:
def map_to_color(num:float, max:float, min:float) -> str:
    """
    Map a value onto an 11-step red/white/green palette and return the opening
    part of the matching LaTeX colour box, e.g. '\\colorbox{green3}'.

    The scale is symmetric around zero: its half-width is the larger of |max| and |min|,
    so 0 lands on the white middle entry, +half-width on the last (greenest) entry and
    -half-width on the first (reddest) one.

    Args:
        num: value to colour.
        max: largest value of the range the colouring is relative to.
        min: smallest value of that range.

    Returns:
        The string '\\colorbox{<colour>}' (without the braces around the content).
        Values outside the scale are clamped to the nearest end of the palette, and a
        degenerate scale (max and min both zero) yields the neutral white entry.
    """
    colors = ["red5", "red3", "red2", "red1", "white", "white", "green1", "green2", "green3", "green5", "green5"]
    # `max`/`min` are parameter names here (part of the interface, used as keywords by
    # callers), so the built-ins of the same name are not available inside this function.
    if np.abs(max) > np.abs(min):
        half_width = np.abs(max)
    else:
        half_width = np.abs(min)
    if half_width == 0:
        # Nothing to scale against: previously this divided by zero.
        return '\\colorbox{' + colors[len(colors) // 2] + '}'
    index = int(num / (2 * half_width) * 10 + 5)
    # Clamp on both sides; the original only clamped at 0 and raised IndexError above 10.
    index = int(np.clip(index, 0, len(colors) - 1))
    return '\\colorbox{' + colors[index] + '}'

def find_max_on_col(scores:Dict[str, Dict[str, Dict[str, int]]], metrics_names:List[str], phenomena:List[str]=PHENOMENA, k_highest:int=1) -> Dict[str,str]:
    """
    Find, per phenomenon (table column), which metrics hold the k highest scores,
    and compute the column averages.

    Args:
        scores: {metric: {phenomenon: score}}.
        metrics_names: metrics in table-row order. A name without scores is retried
            through METRIC_NAMES_MAPPING, then METRIC_MAPPING_BACK.
        phenomena: phenomena in table-column order.
        k_highest: how many ranks (highest, second highest, ...) to report.

    Returns:
        A pair `(max_metrics, avgs)`:
        - `max_metrics[str(k)][row]` lists the column indices in which the metric at
          position `row` equals the (k+1)-th largest value of that column;
        - `avgs[col]` is the average of the column over the metrics that have scores
          (metrics without scores count as -inf and are excluded).
    """
    def _resolve(name):
        # Only look for an alternative spelling when the given one has no scores.
        if name in scores:
            return name
        if name in METRIC_NAMES_MAPPING:
            return METRIC_NAMES_MAPPING[name]
        if name in METRIC_MAPPING_BACK:
            return METRIC_MAPPING_BACK[name]
        return name

    ranked = {str(rank): [[] for _ in metrics_names] for rank in range(k_highest)}
    averages = []
    for col_idx, phenomenon in enumerate(phenomena):
        values = []
        for name in metrics_names:
            key = _resolve(name)
            values.append(scores[key][phenomenon] if key in scores else -np.inf)
        column = np.array(values)
        for rank in range(k_highest):
            # Value sitting at the (rank+1)-th position from the top of the column.
            threshold = np.partition(column, -rank - 1)[-rank - 1]
            for row_idx in np.where(column == threshold)[0]:
                ranked[str(rank)][row_idx].append(col_idx)
        averages.append(np.average(column[column > -np.inf]))
    return ranked, averages

def make_header(phenomena:List[str]=PHENOMENA, p_header_1:dict=PHENOMENA_HEADER_1, p_header_2:dict=PHENOMENA_HEADER_2, num_samples:dict=NUM_SAMPLES, ACES_column:bool=True) -> str:
    """
    Build the opening of the LaTeX summary table: environment setup, the two header
    rows and the "Examples" row with the number of samples per phenomenon.

    Args:
        phenomena: phenomena in column order.
        p_header_1: {phenomenon: text for the first header row}.
        p_header_2: {phenomenon: text for the second header row}.
        num_samples: {phenomenon: number of examples}.
        ACES_column: when True, an extra "ACES-" / "Score" header column is appended
            (the "Examples" row gets no cell for it).

    Returns:
        The LaTeX source up to and including the \\midrule that follows the "Examples" row.
    """
    # One left-aligned label column plus one centred column per emitted cell. The spec
    # used to be fixed at 11 centred columns, which only fits 10 phenomena + ACES column.
    n_columns = len(phenomena) + (1 if ACES_column else 0)
    column_spec = "l" + "c" * n_columns
    res = ("\\begin{table*}[ht] \n \\small \n \\setlength{\\tabcolsep}{3.75pt} \n \\centering \n"
           " \\begin{tabular}{@{}" + column_spec + "@{}} \n \\\\\\toprule \n")
    # First header row.
    for p in phenomena:
        res += " & " + p_header_1[p]
    if ACES_column:
        res += " & \\textbf{ACES-}"
    res += " \\\\\n"
    # Second header row.
    for p in phenomena:
        res += " & " + p_header_2[p]
    if ACES_column:
        res += " & \\textbf{Score}"
    res += "\\\\\n\\midrule\n\\textit{\\textbf{Examples}} "
    # Number of examples per phenomenon.
    for p in phenomena:
        res += " & \\textit{" + str(num_samples[p]) + "}"
    res += "\\\\ \n \\midrule"
    return res

def make_footer(averages:Dict[str, float], phenomena:List[str]=PHENOMENA) -> str:
    """
    Build the closing "Average (all metrics)" row of the LaTeX summary table.

    Args:
        averages: {phenomenon: average score}; looked up by phenomenon name.
        phenomena: phenomena in column order.

    Returns:
        The LaTeX source of the row, preceded by \\midrule and followed by \\bottomrule.
    """
    res = "\\midrule\nAverage (all metrics)\t"
    # Follow the order of the `phenomena` argument, like make_header and the table rows do.
    # Iterating the global PHENOMENA instead dropped every phenomenon that is not part of
    # it and could put the cells in a different order than the header.
    for p in phenomena:
        res += " & "
        res += format_number(averages[p])
    res += "\\\\ \n \\bottomrule"
    return res

def generate_summary_table(scores:Dict[str, Dict[str, Dict[str, int]]], metrics_groups:Dict[str,list] = METRICS_GROUPING_2022, phenomena:List[str]=PHENOMENA, ACES_column:bool=True, global_colors:bool=True, k_highest:int=1, colors:List[str]=None) -> str:
    """
    Render the body of the LaTeX summary table: one row per metric (grouped, with a
    \\midrule after each group) followed by an "Average" row.

    Args:
        scores: {metric: {phenomenon: score}}.
        metrics_groups: {group name: [metric, ...]}; defines the row order. A metric name
            without scores is retried through METRIC_NAMES_MAPPING, then METRIC_MAPPING_BACK;
            metrics that still have no scores are rendered as "----".
        phenomena: phenomena in column order.
        ACES_column: append a column with `comp_aces_score` of each row (metrics without
            scores contribute 0.0 for every phenomenon).
        global_colors: colour every cell with `map_to_color` relative to the largest and
            smallest displayed score. Forces `k_highest` to 1.
        k_highest: without global colours, how many top ranks per column are highlighted.
        colors: colour-box prefixes, one per rank; must be given (with at least `k_highest`
            entries) when `global_colors` is False and `k_highest` > 1.

    Returns:
        The LaTeX rows as a single string.
    """
    # NOTE: an earlier default that derived `colors` from the middle of the global COLORS
    # palette had been disabled by the author (it used to sit here as a string literal):
    #   odd k:  COLORS[len(COLORS)//2-k_highest//2 : len(COLORS)//2+k_highest//2+1]
    #   even k: COLORS[len(COLORS)//2-k_highest//2 : len(COLORS)//2]
    #           + COLORS[len(COLORS)//2+1 : len(COLORS)//2+k_highest//2+1]
    if global_colors:
        k_highest = 1

    def _resolve(name):
        # Only look for an alternative spelling when the given one has no scores.
        if name in scores:
            return name
        if name in METRIC_NAMES_MAPPING:
            return METRIC_NAMES_MAPPING[name]
        if name in METRIC_MAPPING_BACK:
            return METRIC_MAPPING_BACK[name]
        return name

    # Row order of the table; every helper structure below is indexed by position in this list.
    metrics_names = [_resolve(metric) for group in metrics_groups.values() for metric in group]
    max_in_columns, avgs = find_max_on_col(scores, metrics_names=metrics_names, phenomena=phenomena, k_highest=k_highest)

    if ACES_column:
        aces_scores_col = []
        for name in metrics_names:
            key = _resolve(name)
            if key in scores:
                row = {p: scores[key][p] for p in phenomena}
            else:
                row = {p: 0.0 for p in phenomena}
            aces_scores_col.append(comp_aces_score(row))
        aces_scores_col_colors = {m_id: "" for m_id in range(len(metrics_names))}

    if global_colors:
        # Scale the colours on the displayed phenomena only (as generate_summary_table_double
        # does); scores of phenomena that are not in the table must not stretch the scale.
        shown_values = [v for metric, row in scores.items() if metric in metrics_names
                        for p, v in row.items() if p in phenomena]
        maximum = np.max(shown_values) if shown_values else 0.0
        minimum = np.min(shown_values) if shown_values else 0.0

    if ACES_column and aces_scores_col:
        aces_array = np.array(aces_scores_col)
        if global_colors:
            aces_max, aces_min = np.max(aces_array), np.min(aces_array)
            for i, value in enumerate(aces_scores_col):
                aces_scores_col_colors[i] = map_to_color(value, aces_max, aces_min)
        elif k_highest == 1:
            for i in np.where(aces_array == np.max(aces_array))[0]:
                aces_scores_col_colors[int(i)] = '\\colorbox[HTML]{B2EAB1}'
        else:
            for k in range(k_highest):
                kth_value = np.partition(aces_array, -k-1)[-k-1]
                for i in np.where(aces_array == kth_value)[0]:
                    aces_scores_col_colors[int(i)] = colors[k]

    out = ''
    for group, metrics in metrics_groups.items():
        for metric in metrics:
            metric = _resolve(metric)
            row_idx = metrics_names.index(metric)
            out += format_metric(metric) + '\t\t\t\t\t'
            for p_id, p in enumerate(phenomena):
                if metric not in scores:
                    out += '&\t ---- \t'
                    continue
                value = scores[metric][p]
                if global_colors:
                    cell = format_number(value, max_phenomena=True, color=map_to_color(value, maximum, minimum))
                elif k_highest == 1:
                    cell = format_number(value, max_phenomena=p_id in max_in_columns['0'][row_idx])
                else:
                    # Start from "no colour" for every cell: the colour used to leak from the
                    # previously rendered cell (or be undefined) when this cell was not in the top k.
                    color = None
                    for k in range(k_highest):
                        if p_id in max_in_columns[str(k)][row_idx]:
                            color = colors[k]
                            break
                    if color is None:
                        cell = format_number(value, max_phenomena=False)
                    else:
                        cell = format_number(value, max_phenomena=True, color=color)
                out += '&\t' + cell + '\t'
            if ACES_column:
                tmp_color = aces_scores_col_colors[row_idx]
                out += '&\t' + format_number(aces_scores_col[row_idx], dec='0.00', max_phenomena=tmp_color!="", color=tmp_color)
            out += '\t \\\\ \n'
        out += '\\midrule \n'

    out += 'Average\t\t\t\t\t'
    if avgs:
        avg_max, avg_min = np.max(avgs), np.min(avgs)
    for p_id, p in enumerate(phenomena):
        out += '&\t' + format_number(avgs[p_id], max_phenomena=True, color=map_to_color(avgs[p_id], max=avg_max, min=avg_min)) + '\t'
    out += '\\\\'
    return out

In [21]:
sensitivities_unscaled_2022,  _ = calculate_sensitivities_self_scaled(ACES_scores_2022, mapping=PHENOMENA_MAPPING)
# The 2023 scores are not loaded at the moment (their loading line is commented out in the
# loading cell, "2023 later"), so only compute their sensitivities when they actually exist
# instead of failing with a NameError on a fresh run.
if "ACES_scores_2023" in globals():
    sensitivities_unscaled_2023,  _ = calculate_sensitivities_self_scaled(ACES_scores_2023, mapping=PHENOMENA_MAPPING)

In [43]:
# Colour-box prefixes: greens first, white in the middle, reds last.
# Alternative palette that additionally contains green3 / red3 (it was assigned and then
# immediately overwritten, so it is kept as a comment only):
# COLORS = ['\\colorbox{green1}', '\\colorbox{green2}', '\\colorbox{green3}', '\\colorbox{green4}', '\\colorbox{green5}', '\\colorbox{white}', '\\colorbox{red1}', '\\colorbox{red2}', '\\colorbox{red3}', '\\colorbox{red4}', '\\colorbox{red5}']
COLORS = ['\\colorbox{green1}', '\\colorbox{green2}', '\\colorbox{green4}', '\\colorbox{green5}', '\\colorbox{white}', '\\colorbox{red1}', '\\colorbox{red2}', '\\colorbox{red4}', '\\colorbox{red5}']

# From the ACES 2022 Paper:
METRICS_GROUPING_SHORT_2022 = {"baseline": ["BLEU", "COMET-20", "COMET-QE"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }
METRICS_GROUPING_SHORT_2023 = {"baseline": ["BLEU", "BERTScore", "BLEURT-20", "YISI-1"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }

# Phenomena shown in this table. Alternative selection (previously assigned and overwritten):
# phenomena = ['untranslated', 'real-world knowledge', 'wrong language']
phenomena = ["hallucination-number-level-1", "hallucination-number-level-2", "hallucination-number-level-3"]

# Header texts and sample counts for exactly these phenomena.
PHENOMENA_HEADER_1_NUMBERS = dict(zip(phenomena, ['\\textbf{Level 1}', '\\textbf{Level 2}', '\\textbf{Level 3}']))
PHENOMENA_HEADER_2_NUMBERS = dict(zip(phenomena, ['', '',  '']))
# Number of examples per phenomenon, taken from the length of BLEU's first score list.
NUM_SAMPLES_NUMBERS = {p:len(ACES_scores_2022["BLEU"][p][0]) for p in phenomena}

print(make_header(phenomena=phenomena, ACES_column=False, p_header_1=PHENOMENA_HEADER_1_NUMBERS, p_header_2=PHENOMENA_HEADER_2_NUMBERS, num_samples=NUM_SAMPLES_NUMBERS))
print(generate_summary_table(ACES_summary_2022_numbers, metrics_groups=METRICS_GROUPING_2022, phenomena=phenomena, ACES_column=False))
# print(make_footer(averages=SUMMARY_AVERAGES_2023, phenomena=PHENOMENA))

\begin{table*}[ht] 
 \small 
 \setlength{\tabcolsep}{3.75pt} 
 \centering 
 \begin{tabular}{@{}lccccccccccc@{}} 
 \\\toprule 
 & \textbf{Level 1} & \textbf{Level 2} & \textbf{Level 3} \\
 &  &  & \\
\midrule
\textit{\textbf{Examples}}  & \textit{932} & \textit{953} & \textit{937}\\ 
 \midrule
BLEU					&	\colorbox{green3}{\textbf{\phantom{-}0.738}}	&	\colorbox{red3}{\textbf{-0.641}}	&	\colorbox{red5}{\textbf{-0.989}}		 \\ 
f101spBLEU					&	\colorbox{green3}{\textbf{\phantom{-}0.702}}	&	\colorbox{red3}{\textbf{-0.620}}	&	\colorbox{red5}{\textbf{-1.000}}		 \\ 
f200spBLEU					&	\colorbox{green3}{\textbf{\phantom{-}0.745}}	&	\colorbox{red3}{\textbf{-0.612}}	&	\colorbox{red5}{\textbf{-1.000}}		 \\ 
chrF					&	\colorbox{green5}{\textbf{\phantom{-}0.983}}	&	\colorbox{red3}{\textbf{-0.702}}	&	\colorbox{red5}{\textbf{-1.000}}		 \\ 
BERTScore					&	\colorbox{green5}{\textbf{\phantom{-}0.981}}	&	\colorbox{white}{\textbf{-0.184}}	&	\colorbox{red5}{\textbf{-0.923}}		 \\ 
BLEURT-20					&	\colorbox{gr

# Table for Hallucination Number/Named entities

In [56]:
def generate_summary_table_double(scores1:Dict[str, Dict[str, Dict[str, int]]], scores2:Dict[str, Dict[str, Dict[str, int]]], 
                                  metrics_groups:Dict[str,list] = METRICS_GROUPING_2022, phenomena:List[str]=PHENOMENA, ACES_column:bool=True, global_colors:bool=True, k_highest:int=1, colors:List[str]=None) -> str:
    """
    Render the body of a LaTeX table that shows two score sets side by side: each metric
    row holds one cell per phenomenon for `scores1`, followed by one cell per phenomenon
    for `scores2`. An "Average" row with the same layout closes the table.

    Args:
        scores1: {metric: {phenomenon: score}} for the left half.
        scores2: {metric: {phenomenon: score}} for the right half.
        metrics_groups: {group name: [metric, ...]}; defines the row order. A metric name
            without scores is retried through METRIC_NAMES_MAPPING, then METRIC_MAPPING_BACK
            (separately for each score set); metrics that still have no scores are
            rendered as "----".
        phenomena: phenomena in column order (used for both halves).
        ACES_column: accepted for signature parity with generate_summary_table; not used here.
        global_colors: colour every cell with `map_to_color` relative to the largest and
            smallest displayed score of its own score set. Forces `k_highest` to 1.
        k_highest: without global colours, how many top ranks per column are highlighted.
        colors: colour-box prefixes, one per rank; must be given (with at least `k_highest`
            entries) when `global_colors` is False and `k_highest` > 1.

    Returns:
        The LaTeX rows as a single string.
    """
    # NOTE: an earlier default that derived `colors` from the middle of the global COLORS
    # palette had been disabled by the author (it used to sit here as a string literal):
    #   odd k:  COLORS[len(COLORS)//2-k_highest//2 : len(COLORS)//2+k_highest//2+1]
    #   even k: COLORS[len(COLORS)//2-k_highest//2 : len(COLORS)//2]
    #           + COLORS[len(COLORS)//2+1 : len(COLORS)//2+k_highest//2+1]
    if global_colors:
        k_highest = 1

    def _resolve(name, scores):
        # Only look for an alternative spelling when the given one has no scores.
        if name in scores:
            return name
        if name in METRIC_NAMES_MAPPING:
            return METRIC_NAMES_MAPPING[name]
        if name in METRIC_MAPPING_BACK:
            return METRIC_MAPPING_BACK[name]
        return name

    def _value_range(scores, names):
        # Largest / smallest displayed score of the given metrics.
        values = [v for metric, row in scores.items() if metric in names
                  for p, v in row.items() if p in phenomena]
        if not values:
            return 0.0, 0.0
        return np.max(values), np.min(values)

    def _cells(scores, key, row_idx, max_in_columns, maximum, minimum):
        # Render the cells of one metric for one score set.
        if key not in scores:
            return '&\t ---- \t' * len(phenomena)
        cells = ''
        for p_id, p in enumerate(phenomena):
            value = scores[key][p]
            if global_colors:
                cell = format_number(value, max_phenomena=True, color=map_to_color(value, maximum, minimum))
            elif k_highest == 1:
                cell = format_number(value, max_phenomena=p_id in max_in_columns['0'][row_idx])
            else:
                # Start from "no colour" for every cell: the colour used to leak from the
                # previously rendered cell (or be undefined) when this cell was not in the top k.
                color = None
                for k in range(k_highest):
                    if p_id in max_in_columns[str(k)][row_idx]:
                        color = colors[k]
                        break
                if color is None:
                    cell = format_number(value, max_phenomena=False)
                else:
                    cell = format_number(value, max_phenomena=True, color=color)
            cells += '&\t' + cell + '\t'
        return cells

    # Row order of the table (names resolved against scores1); the structures returned by
    # find_max_on_col are indexed by position in this list.
    metrics_names = [_resolve(metric, scores1) for group in metrics_groups.values() for metric in group]
    max_in_columns1, avgs1 = find_max_on_col(scores1, metrics_names=metrics_names, phenomena=phenomena, k_highest=k_highest)
    max_in_columns2, avgs2 = find_max_on_col(scores2, metrics_names=metrics_names, phenomena=phenomena, k_highest=k_highest)
    # find_max_on_col retries alternative spellings against the score set it is given, so the
    # right half has to resolve names against scores2 as well; otherwise a metric could count
    # towards the averages while its row shows "----".
    metrics_names2 = [_resolve(name, scores2) for name in metrics_names]

    maximum1 = minimum1 = maximum2 = minimum2 = None
    if global_colors:
        maximum1, minimum1 = _value_range(scores1, set(metrics_names))
        maximum2, minimum2 = _value_range(scores2, set(metrics_names2))

    out = ''
    for group, metrics in metrics_groups.items():
        for metric in metrics:
            metric = _resolve(metric, scores1)
            row_idx = metrics_names.index(metric)
            out += format_metric(metric) + '\t\t\t\t\t'
            out += _cells(scores1, metric, row_idx, max_in_columns1, maximum1, minimum1)
            out += _cells(scores2, metrics_names2[row_idx], row_idx, max_in_columns2, maximum2, minimum2)
            out += '\t \\\\ \n'
        out += '\\midrule \n'

    out += 'Average\t\t\t\t\t'
    # Same layout as the metric rows: all scores1 averages first, then all scores2 averages.
    # (They used to be interleaved per phenomenon, which put them under the wrong columns.)
    for avgs in (avgs1, avgs2):
        if not avgs:
            continue
        avg_max, avg_min = np.max(avgs), np.min(avgs)
        for value in avgs:
            out += '&\t' + format_number(value, max_phenomena=True, color=map_to_color(value, max=avg_max, min=avg_min)) + '\t'
    out += '\\\\'
    return out

In [None]:
# Metrics that have both ACES scores and WMT scores.
metrics_names_2022 = list(set(ACES_scores_2022.keys()).intersection(set(WMT_scores.keys())))
# Same call as in the main part, but with mapping=None (presumably this keeps the
# fine-grained phenomena instead of the PHENOMENA_MAPPING groups — verify in calculate_sensitivities).
sensitivities_2022_unmapped, _, _, phenomena_2022_unmapped, _, _ = calculate_sensitivities(ACES_scores_2022, WMT_scores, mapping=None)

In [57]:
# Colour-box prefixes: greens first, white in the middle, reds last.
# Alternative palette that additionally contains green3 / red3 (it was assigned and then
# immediately overwritten, so it is kept as a comment only):
# COLORS = ['\\colorbox{green1}', '\\colorbox{green2}', '\\colorbox{green3}', '\\colorbox{green4}', '\\colorbox{green5}', '\\colorbox{white}', '\\colorbox{red1}', '\\colorbox{red2}', '\\colorbox{red3}', '\\colorbox{red4}', '\\colorbox{red5}']
COLORS = ['\\colorbox{green1}', '\\colorbox{green2}', '\\colorbox{green4}', '\\colorbox{green5}', '\\colorbox{white}', '\\colorbox{red1}', '\\colorbox{red2}', '\\colorbox{red4}', '\\colorbox{red5}']

# From the ACES 2022 Paper:
METRICS_GROUPING_SHORT_2022 = {"baseline": ["BLEU", "COMET-20", "COMET-QE"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }
METRICS_GROUPING_SHORT_2023 = {"baseline": ["BLEU", "BERTScore", "BLEURT-20", "YISI-1"],
                        "reference-based": ["COMET-22", 'metricx_xl_MQM_2020', "UniTE", "UniTE-ref"],
                        "reference-free": ["COMETKiwi", 'KG-BERTScore', "UniTE-src"]
                    }

# Phenomena shown in this table. Alternative selection (previously assigned and overwritten):
# phenomena = ['untranslated', 'real-world knowledge', 'wrong language']
phenomena = ["hallucination-number-level-1", "hallucination-number-level-2", "hallucination-number-level-3"]

# Header texts and sample counts for exactly these phenomena.
PHENOMENA_HEADER_1_NUMBERS = dict(zip(phenomena, ['\\textbf{Level 1}', '\\textbf{Level 2}', '\\textbf{Level 3}']))
PHENOMENA_HEADER_2_NUMBERS = dict(zip(phenomena, ['', '',  '']))
# Number of examples per phenomenon, taken from the length of BLEU's first score list.
NUM_SAMPLES_NUMBERS = {p:len(ACES_scores_2022["BLEU"][p][0]) for p in phenomena}

# print(make_header(phenomena=phenomena, ACES_column=False, p_header_1=PHENOMENA_HEADER_1_NUMBERS, p_header_2=PHENOMENA_HEADER_2_NUMBERS, num_samples=NUM_SAMPLES_NUMBERS))
print(generate_summary_table_double(ACES_summary_2022_numbers, sensitivities_2022_unmapped, metrics_groups=METRICS_GROUPING_2022, phenomena=phenomena, ACES_column=False))
# print(make_footer(averages=SUMMARY_AVERAGES_2023, phenomena=PHENOMENA))

BLEU					&	\colorbox{green3}{\textbf{\phantom{-}0.738}}	&	\colorbox{red3}{\textbf{-0.641}}	&	\colorbox{red5}{\textbf{-0.989}}	&	\colorbox{white}{\textbf{\phantom{-}0.357}}	&	\colorbox{white}{\textbf{-0.453}}	&	\colorbox{red5}{\textbf{-2.407}}		 \\ 
f101spBLEU					&	\colorbox{green3}{\textbf{\phantom{-}0.702}}	&	\colorbox{red3}{\textbf{-0.620}}	&	\colorbox{red5}{\textbf{-1.000}}	&	\colorbox{white}{\textbf{\phantom{-}0.154}}	&	\colorbox{white}{\textbf{-0.289}}	&	\colorbox{red3}{\textbf{-1.584}}		 \\ 
f200spBLEU					&	\colorbox{green3}{\textbf{\phantom{-}0.745}}	&	\colorbox{red3}{\textbf{-0.612}}	&	\colorbox{red5}{\textbf{-1.000}}	&	\colorbox{white}{\textbf{\phantom{-}0.148}}	&	\colorbox{white}{\textbf{-0.279}}	&	\colorbox{red3}{\textbf{-1.519}}		 \\ 
chrF					&	\colorbox{green5}{\textbf{\phantom{-}0.983}}	&	\colorbox{red3}{\textbf{-0.702}}	&	\colorbox{red5}{\textbf{-1.000}}	&	\colorbox{white}{\textbf{\phantom{-}0.115}}	&	\colorbox{white}{\textbf{-0.245}}	&	\colorbox{red2}{\textbf{-1.299}

# Functions to normalize summary scores and sensitivity scores to the 0-1 range, then calculate the difference
Work in progress.

In [71]:
def scale_to_max_one(scores:Dict[str, Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    """
    Min-max scale all scores jointly: the smallest score over all metrics and phenomena
    becomes 0, the largest becomes 1.

    Args:
        scores: {metric: {phenomenon: score}}. The input is not modified.

    Returns:
        A new nested dict with the same keys and the scaled scores. If there are no
        scores at all, a copy of the input is returned; if all scores are equal
        (no range to scale by), every score becomes 0.0.
    """
    values = [value for row in scores.values() for value in row.values()]
    if not values:
        return copy.deepcopy(scores)
    highest = np.max(values)
    lowest = np.min(values)
    span = highest - lowest
    if span == 0:
        # Avoid 0/0 (NaN everywhere) for a constant score table.
        return {metric: {p: 0.0 for p in row} for metric, row in scores.items()}
    return {metric: {p: (value - lowest) / span for p, value in row.items()}
            for metric, row in scores.items()}

def scale_to_one(scores:Dict[str, Dict[str, Dict[str, int]]]):
    """
    Divide every score by the sum of all scores, so that the returned scores add up to one.

    Args:
        scores: {metric: {phenomenon: score}}. The input is not modified.

    Returns:
        A deep copy of `scores` in which each score has been divided by the grand total.
    """
    # Grand total, accumulated metric by metric.
    total = 0.0
    for row in scores.values():
        total += np.sum(list(row.values()))

    normalized = copy.deepcopy(scores)
    for row in normalized.values():
        for phenomenon in row:
            row[phenomenon] /= total
    return normalized

def diff(scores1, scores2):
    """
    Compute `scores2 - scores1` per metric and phenomenon.

    Metric names are matched across the two inputs: a name from `scores1` is first
    translated through METRIC_NAMES_MAPPING (when it has an entry there); the partner in
    `scores2` is then looked up under that translated name, under its METRIC_MAPPING_BACK
    entry, or under the original name, in that order.

    Args:
        scores1: {metric: {phenomenon: score}}, the values that are subtracted.
        scores2: {metric: {phenomenon: score}}, the values that are subtracted from.

    Returns:
        {metric: {phenomenon: scores2 - scores1}} keyed by the translated metric name and
        restricted to the phenomena present in both inputs. Metrics without a partner in
        `scores2` are printed and left out.
    """
    scores_out = {}
    for name1, row1 in scores1.items():
        metric = METRIC_NAMES_MAPPING[name1] if name1 in METRIC_NAMES_MAPPING else name1
        candidates = [metric]
        if metric in METRIC_MAPPING_BACK:
            candidates.append(METRIC_MAPPING_BACK[metric])
        candidates.append(name1)
        metric2 = next((name for name in candidates if name in scores2), None)
        if metric2 is None:
            print(metric)
            continue
        # Always read scores1 under its own key and scores2 under the key found above.
        # The original indexed scores1 with the translated name and tested membership in
        # scores2[metric] inside the branch where `metric` is known to be missing: both KeyErrors.
        row2 = scores2[metric2]
        scores_out[metric] = {p: row2[p] - row1[p] for p in row1 if p in row2}

    return scores_out

# Plots

In [26]:
# create groups here:
# Each dict maps a metric to its list of scores, ordered like PHENOMENA
# (the {metric: [score per phenomenon]} format described in grouped_line_plot's docstring).
means = {metric:[sensitivities_2022[metric][p] for p in PHENOMENA] for metric in metrics_names_2022}
# Only metrics that also appear in ACES_summary_2022 get an entry, so `tau` may have fewer keys than `means`.
tau = {metric:[ACES_summary_2022[metric][p] for p in PHENOMENA] for metric in metrics_names_2022 if metric in ACES_summary_2022}

In [21]:
# Group the metrics:
# Lists of related metric names; `bleus` is the one passed to grouped_line_plot as `metrics_names`.
bleus = ["BLEU", "f101spBLEU", "f200spBLEU"]
comets = ["COMET-20", "COMET-22", "MS-COMET-22", "MS-COMET-QE-22"]
xl = ["metricx_xl_DA_2019", "metricx_xl_MQM_2020", "metricx_xxl_DA_2019", "metricx_xxl_MQM_2020"]
unite = ["UniTE", "UniTE-src", "UniTE-ref"]

In [28]:
# NOTE(review): grouped_line_plot is defined in a cell further down, so on a fresh kernel
# that cell has to be run before this one.
grouped_line_plot([means, tau], bleus, ["Mean(good-bad)", "tau"], PHENOMENA)

In [38]:
def grouped_line_plot(groups: List[Dict[str,list]], metrics_names: List[str], group_labels: List[str], phenomena: List[str]):
    '''
    Draw one line per (group, metric) pair over the phenomena and show the figure.

    Inputs: 
        groups: the score groups to compare (e.g. means and tau scores), each in the format
            {
                metric1: [score for phenomenon 1, score for phenomenon 2, ..]
            }
        metrics_names: the metrics to plot; every one of them is looked up in every group.
        group_labels: one label per group (mean (good-bad), tau, ...), used in the trace
            names and the figure title.
        phenomena: the x-axis labels (the order is important because in the groups the
            scores are ordered acc. to phenomena)
    '''
    assert len(groups) > 0 and len(groups) == len(group_labels) and len(metrics_names) > 0
    fig = go.Figure()
    # One colour family per group, one shade per metric.
    colors = [['lightsteelblue',  'aqua', 'aquamarine', 'darkturquoise'],
        ['chocolate', 'coral', 'crimson', 'orange']]
    for i, group in enumerate(groups):
        # Wrap around instead of raising IndexError when there are more groups than
        # colour families or more metrics than shades.
        palette = colors[i % len(colors)]
        for j, metric in enumerate(metrics_names):
            fig.add_trace(go.Scatter(x=phenomena, y=group[metric], mode='lines', name=group_labels[i]+" - "+metric,
                          line=dict(color=palette[j % len(palette)])))
    fig.update_layout(
        title=" ".join(group_labels)
        )
    fig.show()