In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats, spatial
    
from utils.mbr_decoding_utils import (
    parse_mbrd_prefix,
    load_mbrd_by_candidate,
    load_mbrd_by_pseudo_ref,
)

# Make bold and underline the best and worst values in a Series
def highlight_best_and_worst(s):
    """Build one CSS declaration per entry of ``s`` (for ``Styler.apply``).

    Entries equal to the maximum of ``s`` are rendered bold, entries equal to
    the minimum are underlined, and all other entries get an empty style.
    An entry that is both the maximum and the minimum (all values equal) is
    rendered bold, because the maximum is checked first.
    """
    bold, underline, plain = 'font-weight: bold', 'text-decoration: underline', ''
    at_max = s == s.max()
    at_min = s == s.min()
    return [
        bold if hit_max else (underline if hit_min else plain)
        for hit_max, hit_min in zip(at_max, at_min)
    ]

# 1. Load MBR decoding outputs
First, load the MBR decoding outputs (candidates, pseudo-references, and utility matrices) from the `mbrd_output` directory.

In [2]:
# Set up the paths
# Every run pairs epsilon-sampled candidates (tag `ep002nb100`) with one
# pseudo-reference sampler, for each language pair and each candidate seed.
_lang_pairs = ("de-en", "en-de", "ru-en", "en-ru")
_candidate_seeds = (1, 2, 3)

# (pseudo-reference sampler tag, rotate the pseudo-reference seed?)
# When the seed is rotated, the pseudo-reference seed is the "next" candidate
# seed: 1 -> 2, 2 -> 3, 3 -> 1.
_pseudo_ref_samplers = (
    ("as", False),     # epsilon sampling x ancestral sampling
    ("ep002", False),  # epsilon sampling x epsilon sampling (same seed)
    ("ep002", True),   # epsilon sampling x epsilon sampling (different seed)
    ("tp06", False),   # epsilon sampling x top-p sampling (p=0.6)
    ("tp09", False),   # epsilon sampling x top-p sampling (p=0.9)
    ("bm100", False),  # epsilon sampling x beam search (beam=100)
)

# Ordered by language pair, then pseudo-reference sampler, then candidate seed.
mbrd_prefix_list = [
    "mbrd_output/wmt19.newstest2019."
    f"{lang_pair}.ep002nb100seed{cnd_seed}."
    f"{sampler}nb100seed{(cnd_seed % 3 + 1) if rotate_seed else cnd_seed}"
    "/c100p100.e1000"
    for lang_pair in _lang_pairs
    for sampler, rotate_seed in _pseudo_ref_samplers
    for cnd_seed in _candidate_seeds
]

In [3]:
# Load the mbr decoding results
#
# For every run prefix, load the per-candidate and per-pseudo-reference frames,
# tag them with the run's metadata, and finally stack all runs into two
# MultiIndexed frames (`cnd_df`, `pref_df`).
cnd_df_list = []
pref_df_list = []

for mbrd_prefix in tqdm(mbrd_prefix_list):
    lang, cnd_sampling_method, cnd_nbest, cnd_seed, pref_sampling_method, pref_nbest, pref_seed, seed_combination_suffix \
        = parse_mbrd_prefix(mbrd_prefix)
    candidate_method = f"{cnd_sampling_method}nb{cnd_nbest}"
    pseudo_ref_method = f"{pref_sampling_method}nb{pref_nbest}{seed_combination_suffix}"

    # Identical metadata columns are attached to both frames of a run.
    run_metadata = dict(
        lang=lang,
        candidate_method=candidate_method,
        pseudo_ref_method=pseudo_ref_method,
        candidate_seed=cnd_seed,
        pseudo_ref_seed=pref_seed,
        prefix=mbrd_prefix,
    )

    # `reset_index()` returns a new frame, so the object handed back by the
    # loader is never modified in place (it is loaded with use_cache=True and
    # may be shared -- NOTE(review): confirm how the loader caches).
    cnd_df_list.append(
        load_mbrd_by_candidate(mbrd_prefix=mbrd_prefix, use_cache=True)
        .reset_index()
        .assign(**run_metadata)
    )
    pref_df_list.append(
        load_mbrd_by_pseudo_ref(mbrd_prefix=mbrd_prefix, use_cache=True)
        .reset_index()
        .assign(**run_metadata)
    )

# `example_index`, `candidate_index` and `pseudo_ref_index` are expected to be
# among the columns produced by the loaders after `reset_index()`.
cnd_df = pd.concat(cnd_df_list).set_index(
    ["lang", "candidate_method", "pseudo_ref_method", "candidate_seed", "example_index", "candidate_index"]
).sort_index()
pref_df = pd.concat(pref_df_list).set_index(
    ["lang", "candidate_method", "pseudo_ref_method", "candidate_seed", "example_index", "pseudo_ref_index"]
).sort_index()

100%|██████████| 72/72 [01:14<00:00,  1.03s/it]


# 2. MBR Decoding Performance by Pseudo-reference Sampling Method

In [6]:
# Average score over seeds
#
# Only the rows flagged `is_mbr` (the candidates selected by MBR decoding) are
# scored; the selection is computed once and reused for both aggregations.
mbr_selected = cnd_df.query("is_mbr")

score_by_seed = mbr_selected.groupby(
    level=["lang", "candidate_method", "pseudo_ref_method", "candidate_seed"]
)["comet22_score"].mean()

score_over_seeds = mbr_selected.groupby(
    level=["lang", "candidate_method", "pseudo_ref_method"]
)["comet22_score"].mean()

# Give the seed-pooled scores the same four index levels as `score_by_seed`,
# using "overall" as the placeholder seed.
score_over_seeds.index = pd.MultiIndex.from_tuples(
    [(lang, cnd, pref, "overall") for lang, cnd, pref in score_over_seeds.index],
    names=["lang", "candidate_method", "pseudo_ref_method", "candidate_seed"]
)

# Make a table: human-readable labels, one column per language pair
score_table = score_over_seeds.rename(
    index={
        "de-en": "De -> En", "en-de": "En -> De", "ru-en": "Ru -> En", "en-ru": "En -> Ru"
    },
    level="lang",
).rename(
    index={
        "ep002nb100": "Epsilon (ε=0.02)"
    },
    level="candidate_method",
).rename(
    index={
        "asnb100": "Ancestral", "bm100nb100": "Beam",
        "ep002nb100": "Epsilon (ε=0.02)", "ep002nb100_diff_seeds": "Epsilon (ε=0.02)*",
        "tp06nb100": "Top-p (p=0.6)", "tp09nb100": "Top-p (p=0.9)"
    },
    level="pseudo_ref_method",
).unstack(level="lang").droplevel(level="candidate_seed", axis=0)
score_table.index.names = ["Candidate", "Pseudo-reference"]

# Show scores scaled by 100; per column, bold marks the best and underline the worst.
score_table.map(lambda x: x*100).style.apply(highlight_best_and_worst, axis=0).format("{:.2f}")

Unnamed: 0_level_0,lang,De -> En,En -> De,En -> Ru,Ru -> En
Candidate,Psuedo-reference,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Epsilon (ε=0.02),Ancestral,85.82,87.51,88.41,82.02
Epsilon (ε=0.02),Beam,85.62,87.4,87.78,81.64
Epsilon (ε=0.02),Epsilon (ε=0.02),85.89,87.74,88.46,82.01
Epsilon (ε=0.02),Epsilon (ε=0.02)*,85.87,87.74,88.46,81.98
Epsilon (ε=0.02),Top-p (p=0.6),85.69,87.57,88.26,81.76
Epsilon (ε=0.02),Top-p (p=0.9),86.04,87.82,88.61,82.18


# 3. Rank correlation between features and MBR decoding performance

In [4]:
# Compute the correlation between the scores
def spearmans_test(df, column_pairs):
    """Run Spearman's rank correlation test for several pairs of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to correlate.
    column_pairs : iterable of (column_a, column_b)
        Column labels of ``df`` to test against each other. Labels may be
        tuples when ``df`` has MultiIndex columns.

    Returns
    -------
    pandas.DataFrame
        One row per pair, indexed by ``("column_a", "column_b")``, with the
        columns ``statistic`` and ``pvalue`` as returned by
        ``scipy.stats.spearmanr``. An empty ``column_pairs`` yields an empty
        frame with the same index names and columns.
    """
    results = []
    for column_a, column_b in column_pairs:
        statistic, pvalue = stats.spearmanr(df[column_a], df[column_b])
        results.append({
            "column_a": column_a,
            "column_b": column_b,
            "statistic": statistic,
            "pvalue": pvalue,
        })
    # Passing `columns` explicitly keeps the schema when `results` is empty;
    # otherwise `set_index` would fail on the missing key columns.
    return pd.DataFrame(
        results, columns=["column_a", "column_b", "statistic", "pvalue"]
    ).set_index(["column_a", "column_b"])

## 3.1. Baseline features
As assumed in previous studies, the following features are used as basic features of the pseudo-reference set:
- Average probability
- Cumulative probability
- Similarity to the candidate set
- Similarity to the reference

In [8]:
# Per-example baseline features of the pseudo-reference set, computed from `pref_df`:
#   avg_prob <- mean of `mean_logprobs` (log-probability stands in for probability)
#   cand_sim <- grand mean of the stacked `utilities_T` arrays
#   ref_sim  <- mean of `comet22_score` over the pseudo-references
basic_features = (
    pref_df
    .groupby(["lang", "candidate_method", "pseudo_ref_method", "candidate_seed", "example_index"])
    .agg({
        "mean_logprobs": "mean",
        # Stack one array per pseudo-reference and average over every entry
        "utilities_T": lambda arrays: np.stack(arrays.to_list()).mean(),
        "comet22_score": "mean",
    })
    .rename(columns={
        "mean_logprobs": "avg_prob",
        "utilities_T": "cand_sim",
        "comet22_score": "ref_sim",
    })
)

In [58]:
# Compute cumulative probability from sum_logprobs
#
# For each example, the probability mass covered by the *distinct* sentences in
# its pseudo-reference set is accumulated: duplicated sentences contribute 0.

# Index levels that identify one example of one run
cumprob_group_levels = ["lang", "candidate_method", "pseudo_ref_method", "candidate_seed", "example_index"]

# Convert logprobs to probabilities (scaled by 100)
cumprob_pref_df = pref_df.copy()
cumprob_pref_df["unique_prob"] = cumprob_pref_df["sum_logprobs"].map(lambda x: np.exp(x)*100)

# Remove duplicated samples (fill with 0)
# The positional 0..n-1 index produced inside each group is relabelled as
# `pseudo_ref_index` so the mask aligns with `cumprob_pref_df`.
# NOTE(review): this assumes `pseudo_ref_index` runs 0..n-1 in sorted order
# within every example -- confirm against the loader.
is_sample_duplicated = cumprob_pref_df.groupby(
    level=cumprob_group_levels
).apply(
    lambda x: x["sentence"].duplicated().reset_index(drop=True).to_frame()
)
is_sample_duplicated.rename(columns={"sentence": "duplicated"}, inplace=True)
is_sample_duplicated.index.names = is_sample_duplicated.index.names[:-1] + ["pseudo_ref_index"]
cumprob_pref_df.loc[is_sample_duplicated["duplicated"], "unique_prob"] = 0

# Sort by unique_prob
sorted_probs = cumprob_pref_df.groupby(
    level=cumprob_group_levels
).apply(
    lambda x: x["unique_prob"].sort_values(ascending=False).reset_index(drop=True).to_frame()
)
sorted_probs.index.names = sorted_probs.index.names[:-1] + ['sorted_sample_index']

# Calculate cumulative probability
cum_probs = sorted_probs.groupby(
    level=cumprob_group_levels
).apply(
    lambda x: x["unique_prob"].cumsum().reset_index(drop=True).to_frame()
)
cum_probs.index.names = cum_probs.index.names[:-1] + ['sorted_sample_index']

# Get final cumulative probability: the last cumulative row of every example.
# Taking the last row (rather than the fixed position 99) gives the same value
# for sets of 100 pseudo-references and still works for any other set size.
final_cum_probs = cum_probs.reset_index("sorted_sample_index").groupby(
    level=cumprob_group_levels
).tail(1)
basic_features["cum_prob"] = final_cum_probs["unique_prob"]

In [61]:
# Add the MBR decoding performance: the COMET-22 score of the row flagged
# `is_mbr` for each example, aligned on the shared index levels.
mbr_comet22_score = cnd_df.query("is_mbr")["comet22_score"].droplevel("candidate_index")
basic_features["comet22_score"] = mbr_comet22_score

In [62]:
# Set one language pair
lang = "en-ru"

# Make a table: every feature averaged over examples and seeds,
# one row per pseudo-reference method
basic_features_for_lang = basic_features.loc[[lang]]
basic_features_table = basic_features_for_lang.groupby(["lang", "pseudo_ref_method"]).mean()
basic_features_table

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_prob,cand_sim,ref_sim,cum_prob,comet22_score
lang,pseudo_ref_method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
en-ru,asnb100,-3.846425,0.71247,0.552044,0.369665,0.884066
en-ru,bm100nb100,-0.716911,0.874732,0.847305,0.44446,0.877774
en-ru,ep002nb100,-0.944499,0.866774,0.835692,0.417734,0.884603
en-ru,ep002nb100_diff_seeds,-0.944499,0.865838,0.835692,0.417734,0.884641
en-ru,tp06nb100,-0.798637,0.870553,0.842584,0.368753,0.882555
en-ru,tp09nb100,-1.693568,0.83095,0.776949,0.406266,0.886071


In [64]:
# Spearman's rank correlation test: MBR performance vs. each baseline feature
column_pairs = [
    ("comet22_score", feature)
    for feature in ("avg_prob", "cum_prob", "cand_sim", "ref_sim")
]
basic_features_correlation = spearmans_test(df=basic_features_table, column_pairs=column_pairs)

# Prefix the result with the language pair as the outermost index level
basic_features_sp_test = pd.concat(
    [basic_features_correlation],
    keys=[lang],
    names=["lang"],
)
basic_features_sp_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,statistic,pvalue
lang,column_a,column_b,Unnamed: 3_level_1,Unnamed: 4_level_1
en-ru,comet22_score,avg_prob,-0.637748,0.173071
en-ru,comet22_score,cum_prob,-0.057977,0.913132
en-ru,comet22_score,cand_sim,-0.657143,0.156175
en-ru,comet22_score,ref_sim,-0.637748,0.173071


## 3.2. L2 distance with kNN
We use the L2 distance of the samples in the pseudo-reference set from the reference as a feature.

In [66]:
# For every run (language pair x methods x seed), measure how far the
# pseudo-references lie from the reference in utility space and average the
# k smallest L2 distances per example.
knn_result_list = []
k_list = [5, 25, 50, 75, 100]

# Distinct (lang, candidate_method, pseudo_ref_method, seed) settings in `pref_df`
index_locations = sorted(set(
    [(lang, candidate_method, pseudo_ref_method, seed) 
        for lang, candidate_method, pseudo_ref_method, seed, *_ in pref_df.index]
))

# Rows selected by MBR decoding. This does not depend on the setting or on k,
# so it is computed once instead of once per (setting, k) combination.
mbr_cnd_df = cnd_df.query("is_mbr")

for lang, candidate_method, pseudo_ref_method, seed in tqdm(index_locations):
    setting = (lang, candidate_method, pseudo_ref_method, seed)

    # Per example: array of the `comet22_score` values of all its candidates
    utilities_T_ref = cnd_df.loc[setting, :].groupby("example_index")["comet22_score"].apply(np.array)
    utilities_T_ref.name = "utilities_T_ref"

    # Per pseudo-reference: its `utilities_T` array
    # (presumably one utility per candidate -- verify against the loader)
    utilities_T_pref = pref_df.loc[setting, "utilities_T"]

    # Attach the example-level reference vector to every pseudo-reference row
    utilities_T = utilities_T_pref.to_frame().join(utilities_T_ref, how="left")

    utilities_T["L2_distance"] = utilities_T.apply(
        lambda row: np.linalg.norm(row["utilities_T"] - row["utilities_T_ref"]), axis=1
    )
    utilities_T_L2d_sorted = utilities_T.sort_values("L2_distance")

    # Both values below are independent of k, so they are hoisted out of the k loop.
    mbr_comet22_score = mbr_cnd_df.loc[setting, "comet22_score"].mean()
    l2_by_example = utilities_T_L2d_sorted["L2_distance"].groupby("example_index")

    # Calculate the average L2 distance for each k
    for k in k_list:
        knn_result_list.append({
            "lang": lang,
            "candidate_method": candidate_method,
            "pseudo_ref_method": pseudo_ref_method,
            "seed": seed,
            "comet22_score": mbr_comet22_score,
            "k": k,
            # Rows are sorted by distance, so head(k) keeps the k nearest per example
            "L2_distance": l2_by_example.head(k).mean(),
        })

knn_result_df = pd.DataFrame(knn_result_list).set_index(
    ["lang", "candidate_method", "pseudo_ref_method", "seed", "k"]
)

100%|██████████| 72/72 [11:38<00:00,  9.70s/it]


In [76]:
# Set one language pair
lang = "en-ru"

knn_result_for_lang = knn_result_df.loc[[lang]]

# Make a table: mean L2 distance over seeds, one column per k
knn_table = (
    knn_result_for_lang
    .groupby(level=["lang", "candidate_method", "pseudo_ref_method", "k"])[["L2_distance"]]
    .mean()
    .unstack(level="k")
)

# Mean MBR score per pseudo-reference method, with two-level column labels
# so it can sit next to the (feature, k) columns above
comet22_score_table_for_knn = (
    knn_result_for_lang
    .groupby(["lang", "candidate_method", "pseudo_ref_method"])[["comet22_score"]]
    .mean()
)
comet22_score_table_for_knn.columns = pd.MultiIndex.from_tuples([("comet22_score", "mean")])

knn_table = pd.concat([comet22_score_table_for_knn, knn_table], axis=1)
knn_table.style.apply(highlight_best_and_worst, axis=0).format("{:.4f}")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,comet22_score,L2_distance,L2_distance,L2_distance,L2_distance,L2_distance
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,5,25,50,75,100
lang,candidate_method,pseudo_ref_method,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
en-ru,ep002nb100,asnb100,0.8841,0.2271,0.4202,0.6686,0.9704,1.4039
en-ru,ep002nb100,bm100nb100,0.8778,0.3784,0.4417,0.4813,0.5121,0.5445
en-ru,ep002nb100,ep002nb100,0.8846,0.2814,0.3654,0.4207,0.4636,0.5127
en-ru,ep002nb100,ep002nb100_diff_seeds,0.8846,0.2579,0.3452,0.4021,0.4465,0.4963
en-ru,ep002nb100,tp06nb100,0.8826,0.3002,0.3786,0.4304,0.4702,0.5121
en-ru,ep002nb100,tp09nb100,0.8861,0.1756,0.2626,0.3329,0.4028,0.5496


In [None]:
# Spearman's rank correlation test: MBR performance vs. mean kNN distance, per k
column_pairs = [
    (("comet22_score", "mean"), ("L2_distance", n_nearest))
    for n_nearest in (5, 25, 50, 75, 100)
]
knn_correlation = spearmans_test(df=knn_table, column_pairs=column_pairs)

# Prefix the result with the language pair as the outermost index level
knn_sp_test = pd.concat(
    [knn_correlation],
    keys=[lang],
    names=["lang"],
)
knn_sp_test

## 3.3. Mahalanobis distance
We use the Mahalanobis distance of the samples in the pseudo-reference set from the reference as a feature.

Make sure you have pre-computed the covariance matrices for each run directory: `mbrd_output/*/c100p100.e1000.example_covariance.pkl`

In [8]:
# For every run, compute per example the regularized Mahalanobis distance
# between the reference's score vector and the mean utility vector of the
# pseudo-reference set, using a pre-computed covariance loaded from disk.
mah_result_list = []

# Distinct (lang, candidate_method, pseudo_ref_method, seed) settings in `pref_df`
index_locations = sorted(set(
    [(lang, candidate_method, pseudo_ref_method, seed) 
        for lang, candidate_method, pseudo_ref_method, seed, *_ in pref_df.index]
))
for lang, candidate_method, pseudo_ref_method, seed in tqdm(index_locations):
    cnd_df_ = cnd_df.loc[(lang, candidate_method, pseudo_ref_method, seed), :]
    pref_df_ = pref_df.loc[(lang, candidate_method, pseudo_ref_method, seed), :]

    # All rows of a run share the same prefix, so the first one is enough.
    mbrd_prefix = cnd_df_["prefix"].iloc[0]
    # NOTE: unpickling can execute arbitrary code; only load files you generated yourself.
    example_covariance = pd.read_pickle(f"{mbrd_prefix}.example_covariance.pkl")

    # Presumably one row per example with a `nodup_candidates` column holding a
    # fitted covariance estimator (or None) -- verify against the pre-computation script.
    mah_result = example_covariance.copy()
    
    # COMET-22 score of the candidate flagged `is_mbr`, aligned on the example index
    mah_result["comet22_score"] = cnd_df_.query("is_mbr").droplevel("candidate_index")["comet22_score"]

    # Per example: `candidate_index` values of the first occurrence of each distinct sentence
    nodup_candidates_indices = cnd_df_.groupby("example_index").apply(
        lambda x: x[~x["sentence"].duplicated()].index.get_level_values("candidate_index")
    )

    # Per example: stacked `utilities_T` arrays (one row per pseudo-reference),
    # keeping only the columns of the de-duplicated candidates
    mah_result["utilities_T_pref"] = pref_df_.groupby("example_index").apply(
        lambda x: np.stack(x["utilities_T"].values)[:, nodup_candidates_indices.loc[x.name]],
    )

    # Per example: `comet22_score` of the de-duplicated candidates
    mah_result["utilities_T_ref"] = cnd_df_.groupby("example_index").apply(
        lambda x: x["comet22_score"].values[nodup_candidates_indices.loc[x.name]],
    )

    def compute_regularized_mahalanobis(row: pd.Series, regularizer: float) -> float:
        """Mahalanobis distance between the reference vector and the mean pseudo-reference vector.

        ``row`` must provide ``nodup_candidates`` (an object exposing
        ``covariance_``, or None), ``utilities_T_pref`` and ``utilities_T_ref``.
        ``regularizer`` is added to the diagonal of the covariance before it is
        inverted. Returns NaN when ``nodup_candidates`` is None.
        """
        if row["nodup_candidates"] is None:
            return np.nan
        
        example_util_T_pref = row["utilities_T_pref"]
        example_util_T_ref = row["utilities_T_ref"]
        cov = row["nodup_candidates"].covariance_
        # A 0-dimensional covariance (empty shape) is promoted to a 1x1 matrix
        if not cov.shape:
            cov = cov.reshape(1,1)
        
        distance = spatial.distance.mahalanobis(
            u=example_util_T_ref,
            v=example_util_T_pref.mean(axis=0),
            # The diagonal regularizer keeps the matrix passed to `inv` away from singularity
            VI=np.linalg.inv(cov + np.eye(cov.shape[0]) * regularizer)
        )
        return distance

    mah_result["mahalanobis_distance"] = mah_result.apply(
        compute_regularized_mahalanobis,
        regularizer=1e-5,
        axis=1
    )

    # Tag the rows with the run's setting so all runs can be stacked afterwards
    mah_result = mah_result.assign(
        lang=lang,
        candidate_method=candidate_method,
        pseudo_ref_method=pseudo_ref_method,
        seed=seed,
    )

    mah_result_list.append(mah_result)

# `example_index` is expected to be the index of the loaded covariance frames,
# so `reset_index()` turns it into a column before re-indexing.
mah_result_df = pd.concat(mah_result_list).reset_index().set_index(
    ["lang", "candidate_method", "pseudo_ref_method", "seed", "example_index"]
).sort_index()

100%|██████████| 72/72 [04:00<00:00,  3.35s/it]


In [9]:
# Set one language pair
lang = "en-ru"

# Make a table
# Step 1: per-seed median over examples. The median is used instead of the mean
# to avoid unstable inverse computation of covariance.
mah_median_by_seed = (
    mah_result_df.loc[[lang]]
    .groupby(level=["lang", "candidate_method", "pseudo_ref_method", "seed"])[["mahalanobis_distance"]]
    .median()
)
# Step 2: average the per-seed medians
mah_table = mah_median_by_seed.groupby(
    level=["lang", "candidate_method", "pseudo_ref_method"]
).mean()

# Mean MBR score per pseudo-reference method
comet22_score_table_for_mah = (
    mah_result_df.loc[[lang], ["comet22_score"]]
    .groupby(level=["lang", "candidate_method", "pseudo_ref_method"])
    .mean()
)

mah_table = pd.concat([comet22_score_table_for_mah, mah_table], axis=1)
mah_table.style.apply(highlight_best_and_worst, axis=0).format("{:.3f}")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,comet22_score,mahalanobis_distance
lang,candidate_method,pseudo_ref_method,Unnamed: 3_level_1,Unnamed: 4_level_1
en-ru,ep002nb100,asnb100,0.884,8.911
en-ru,ep002nb100,bm100nb100,0.878,20.905
en-ru,ep002nb100,ep002nb100,0.885,7.272
en-ru,ep002nb100,ep002nb100_diff_seeds,0.885,9.04
en-ru,ep002nb100,tp06nb100,0.883,13.167
en-ru,ep002nb100,tp09nb100,0.886,7.755


In [10]:
# Spearman's rank correlation test: MBR performance vs. Mahalanobis distance
column_pairs = [("comet22_score", "mahalanobis_distance")]
mah_correlation = spearmans_test(df=mah_table, column_pairs=column_pairs)

# Prefix the result with the language pair as the outermost index level
mah_sp_test = pd.concat([mah_correlation], keys=[lang], names=["lang"])
mah_sp_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,statistic,pvalue
lang,column_a,column_b,Unnamed: 3_level_1,Unnamed: 4_level_1
en-ru,comet22_score,mahalanobis_distance,-0.714286,0.110787


## 3.4. Local Outlier Factor
We use the local outlier factor of the samples in the pseudo-reference set as a feature.

Make sure you have pre-computed the LOF models for each run directory: `mbrd_output/*/c100p100.e1000.example_lof.pkl`

In [5]:
# For every run, score the reference's score vector with the pre-fitted
# Local Outlier Factor models of each example, for several neighbourhood sizes.
lof_result_list = []
n_neighbors_list = [5, 25, 50, 75, 100]

# Distinct (lang, candidate_method, pseudo_ref_method, seed) settings in `pref_df`
index_locations = sorted(set(
    [(lang, candidate_method, pseudo_ref_method, seed) 
        for lang, candidate_method, pseudo_ref_method, seed, *_ in pref_df.index]
))

# Rows selected by MBR decoding. This does not depend on the setting, so it is
# computed once instead of once per loop iteration.
mbr_cnd_df = cnd_df.query("is_mbr")

for lang, candidate_method, pseudo_ref_method, seed in tqdm(index_locations):
    setting = (lang, candidate_method, pseudo_ref_method, seed)

    # All rows of a run share the same prefix, so the first one is enough.
    mbrd_prefix = cnd_df.loc[setting, "prefix"].iloc[0]

    # NOTE: unpickling can execute arbitrary code; only load files you generated yourself.
    lof_models = pd.read_pickle(f"{mbrd_prefix}.example_lof.pkl")
    # Keep the requested neighbourhood sizes and move `n_neighbors` from the
    # index into a column, leaving the remaining index level(s) for alignment below.
    lof_result = lof_models.loc[pd.IndexSlice[:, n_neighbors_list], :].reset_index("n_neighbors")

    # COMET-22 score of the candidate flagged `is_mbr`, aligned on the example index
    lof_result["comet22_score"] = mbr_cnd_df.loc[
        setting, "comet22_score"
    ].droplevel("candidate_index")

    # Per example: array of the `comet22_score` values of all its candidates
    lof_result["utilities_T_ref"] = cnd_df.loc[
        setting, "comet22_score"
    ].groupby("example_index").apply(np.array)

    lof_result["ref_lof_score"] = lof_result.apply(
        lambda row: -1 * row["lof_model"].score_samples([row["utilities_T_ref"]])[0], axis=1
        # Here, we multiply by -1 to get the actual LOF score since `score_samples()` returns the "opposite of LOF score":
        # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html
    )
    # Tag the rows with the run's setting so all runs can be stacked afterwards
    lof_result = lof_result.reset_index().assign(
        lang=lang,
        candidate_method=candidate_method,
        pseudo_ref_method=pseudo_ref_method,
        seed=seed,
    )
    lof_result_list.append(lof_result)

lof_result_df = pd.concat(lof_result_list).set_index(
    ["lang", "candidate_method", "pseudo_ref_method", "seed", "example_index", "n_neighbors"]
)

100%|██████████| 72/72 [20:44<00:00, 17.28s/it]


In [6]:
# Set one language pair
lang = "en-ru"

lof_result_for_lang = lof_result_df.loc[[lang]]

# Make a table: median LOF score over examples and seeds, one column per
# n_neighbors. We use median instead of mean to avoid unstable computation of LOF.
lof_table = (
    lof_result_for_lang
    .groupby(level=["lang", "candidate_method", "pseudo_ref_method", "n_neighbors"])[["ref_lof_score"]]
    .median()
    .unstack(level="n_neighbors")
)

# Mean MBR score per pseudo-reference method, with two-level column labels
# so it can sit next to the (feature, n_neighbors) columns above
comet22_score_table_for_lof = (
    lof_result_for_lang
    .groupby(["lang", "candidate_method", "pseudo_ref_method"])[["comet22_score"]]
    .mean()
)
comet22_score_table_for_lof.columns = pd.MultiIndex.from_tuples([("comet22_score", "mean")])

lof_table = pd.concat([comet22_score_table_for_lof, lof_table], axis=1)
lof_table.style.apply(highlight_best_and_worst, axis=0).format("{:.2f}")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,comet22_score,ref_lof_score,ref_lof_score,ref_lof_score,ref_lof_score,ref_lof_score
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,5,25,50,75,100
lang,candidate_method,pseudo_ref_method,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
en-ru,ep002nb100,asnb100,0.88,1.07,1.13,1.13,1.05,1.0
en-ru,ep002nb100,bm100nb100,0.88,2.42,2.06,1.85,1.51,1.02
en-ru,ep002nb100,ep002nb100,0.88,1.1,1.12,1.11,1.03,1.0
en-ru,ep002nb100,ep002nb100_diff_seeds,0.88,1.11,1.14,1.14,1.05,1.0
en-ru,ep002nb100,tp06nb100,0.88,1.35,1.36,1.35,1.17,1.0
en-ru,ep002nb100,tp09nb100,0.89,1.03,1.02,1.01,0.99,1.0


In [7]:
# Spearman's rank correlation test: MBR performance vs. median LOF score, per n_neighbors
column_pairs = [
    (("comet22_score", "mean"), ("ref_lof_score", neighborhood_size))
    for neighborhood_size in (5, 25, 50, 75, 100)
]
lof_correlation = spearmans_test(df=lof_table, column_pairs=column_pairs)

# Prefix the result with the language pair as the outermost index level
lof_sp_test = pd.concat(
    [lof_correlation],
    keys=[lang],
    names=["lang"],
)
lof_sp_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,statistic,pvalue
lang,column_a,column_b,Unnamed: 3_level_1,Unnamed: 4_level_1
en-ru,"(comet22_score, mean)","(ref_lof_score, 5)",-0.771429,0.072397
en-ru,"(comet22_score, mean)","(ref_lof_score, 25)",-0.828571,0.041563
en-ru,"(comet22_score, mean)","(ref_lof_score, 50)",-0.828571,0.041563
en-ru,"(comet22_score, mean)","(ref_lof_score, 75)",-0.828571,0.041563
en-ru,"(comet22_score, mean)","(ref_lof_score, 100)",-0.657143,0.156175
