In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.stats as stats

In [2]:
# Read the tab-separated score table and display the columns it provides
scores = pd.read_csv('mt_ir_scores_all.tsv', sep='\t')
scores.columns

Index(['dataset', 'lang', 'size', 'bpe.tokens', 'model', 'query', 'map',
       'map.gm', 'p.5', 'p.10', 'r.10', 'rbo', 'ndcg', 'bleu.lower',
       'bleu.lower.stem', 'bleu.lower.depunc', 'bleu.lower.stem.depunc',
       'prec1.lower', 'prec1.lower.stem', 'prec1.lower.depunc',
       'prec1.lower.stem.depunc', 'prec2.lower', 'prec2.lower.stem',
       'prec2.lower.depunc', 'prec2.lower.stem.depunc', 'prec3.lower',
       'prec3.lower.stem', 'prec3.lower.depunc', 'prec3.lower.stem.depunc',
       'prec4.lower', 'prec4.lower.stem', 'prec4.lower.depunc',
       'prec4.lower.stem.depunc'],
      dtype='object')

In [3]:
# Compute pairwise metrics correlations
def pairwise_metric_correlations(scores, method='kendall'):
    """Correlate every pair of numeric columns within each (lang, dataset, model) slice.

    Parameters
    ----------
    scores : pandas.DataFrame
        Must provide ``lang``, ``dataset`` and ``model`` columns; every other
        numeric column is treated as a measure to correlate.
    method : str, optional
        Correlation method handed to ``DataFrame.corr`` (default ``'kendall'``).

    Returns
    -------
    pandas.DataFrame
        One row per ordered pair of distinct measures, with the columns
        ``dataset``, ``lang``, ``model``, ``measure1``, ``measure2``, ``corr``.
        The frame is empty (but still has these columns) when there is
        nothing to correlate.
    """
    columns = ['dataset', 'lang', 'model', 'measure1', 'measure2', 'corr']
    records = []
    for lang in scores.lang.unique():
        for dataset in scores.dataset.unique():
            dataset_scores = scores[(scores.lang == lang) & (scores.dataset == dataset)]
            for model in dataset_scores.model.unique():
                model_scores = dataset_scores[dataset_scores.model == model]
                # Drop the string columns explicitly: recent pandas releases no
                # longer discard them silently inside corr() and raise instead.
                numeric_scores = model_scores.select_dtypes(include=['number', 'bool'])
                model_corr = numeric_scores.corr(method)
                for measure1, column in model_corr.items():
                    for measure2, value in column.items():
                        # Skip the diagonal (a measure against itself).
                        if measure1 != measure2:
                            records.append({
                                'dataset': dataset,
                                'lang': lang,
                                'model': model,
                                'measure1': measure1,
                                'measure2': measure2,
                                'corr': value,
                            })
    # Build the frame once from plain records instead of concatenating one
    # single-row DataFrame per correlation value.
    return pd.DataFrame(records, columns=columns)


corrs = pairwise_metric_correlations(scores)

In [4]:
# Save the correlation table as tab-separated text, without the row index
corrs.to_csv(
    'mt_ir_scores_corr.tsv',
    index=False,
    sep='\t',
)

In [5]:
# Compare BPE for each model and dataset
def condition_key_size(row):
    """Return the ``(model, lang, size)`` triple that identifies a row's condition.

    ``size`` is read with item access; ``model`` and ``lang`` are read as
    attributes of ``row``.
    """
    model, lang = row.model, row.lang
    size = row['size']
    return model, lang, size

def mean_diff(seq1, seq2):
    """Return the mean of the element-wise differences ``seq1[i] - seq2[i]``.

    Both sequences must have the same length; this is enforced with an
    assertion.
    """
    count = len(seq1)
    assert count == len(seq2)
    differences = [left - right for left, right in zip(seq1, seq2)]
    return sum(differences) / count

def median_diff(seq1, seq2):
    """Return ``median(seq1) - median(seq2)``.

    Note that this is the difference of the two medians, not the median of
    the paired differences. Both sequences must have the same length; this
    is enforced with an assertion.
    """
    assert len(seq1) == len(seq2)
    first_median, second_median = np.median(seq1), np.median(seq2)
    return first_median - second_median

# For each dataset/model pair, pair up the MAP obtained with 16k and 32k BPE
# tokens under the same (model, lang, size) condition and run a Wilcoxon
# signed-rank test on the paired values.
for dataset in scores.dataset.unique():
    dataset_scores = scores[scores.dataset == dataset]
    for model in dataset_scores.model.unique():
        model_rows = dataset_scores[dataset_scores.model == model]

        # condition key -> {BPE token count -> MAP}
        condition_bpe_scores = {}
        for _, row in model_rows.iterrows():
            by_bpe = condition_bpe_scores.setdefault(condition_key_size(row), {})
            # A condition may hold at most the two BPE settings being compared.
            assert len(by_bpe) < 2
            by_bpe[row['bpe.tokens']] = row['map']

        paired_maps = [
            (by_bpe[16000], by_bpe[32000])
            for by_bpe in condition_bpe_scores.values()
        ]
        bpe16k, bpe32k = zip(*paired_maps)
        test = stats.wilcoxon(bpe32k, bpe16k)

        print(dataset, model, 'hypothesis: BPE 32k != BPE 16k')
        print('Mean 32k - 16k', mean_diff(bpe32k, bpe16k))
        print(test)
        print()

europarl bm25 hypothesis: BPE 32k != BPE 16k
Mean 32k - 16k 0.009574999999999997
WilcoxonResult(statistic=24.0, pvalue=0.02280365600647676)

wiki bm25 hypothesis: BPE 32k != BPE 16k
Mean 32k - 16k -0.056543749999999976
WilcoxonResult(statistic=0.0, pvalue=0.00043680902321148305)

wiki neural hypothesis: BPE 32k != BPE 16k
Mean 32k - 16k -0.010381250000000002
WilcoxonResult(statistic=4.0, pvalue=0.0009350911193154891)



In [6]:
# Sanity check: every value of the first sample exceeds its paired value by 20,
# so the test should report a statistic of 0 and a very small p-value.
shifted_sample = list(range(20, 40))
baseline_sample = list(range(20))
stats.wilcoxon(shifted_sample, baseline_sample)

WilcoxonResult(statistic=0.0, pvalue=7.74421643104407e-06)

In [None]:
# (scratch cell intentionally left without code: the stray text that was here
# referenced an undefined name and would raise NameError on Restart & Run All)