In [1]:
import os
import json
import jsonlines
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr, weightedtau
from collections import defaultdict

In [2]:
# Values shown in the "Fine-Tuned" column, keyed by lower-case task name.
# Each list is paired positionally with the `encoders` list used when the
# result tables are built (six entries, one per candidate encoder).
tuned_results = dict(
    mnli=[84.7, 82.2, 83.9, 81.5, 87.6, 84.0],
    qqp=[89.6, 88.5, 89.2, 87.8, 91.9, 89.4],
    qnli=[91.8, 89.2, 91.0, 88.2, 92.8, 90.8],
    sst2=[92.7, 91.3, 91.7, 90.4, 94.8, 92.5],
    cola=[56.3, 51.3, 58.2, 47.2, 63.6, 59.3],
    mrpc=[88.6, 87.5, 87.8, 85.6, 90.2, 86.6],
    rte=[69.3, 59.9, 66.1, 60.6, 78.7, 67.9],
    wnli=[53.5, 56.3, 46.5, 56.3, 57.7, 52.1],
)

# Per-task size used when composing result-file names (capped at
# `max_data_size` by the code that reads it).
# Presumably the number of training examples per task - TODO confirm.
dataset_sizes = dict(
    mnli=392_702,
    qqp=363_846,
    qnli=104_743,
    sst2=67_349,
    cola=8_551,
    mrpc=3_668,
    rte=2_490,
    wnli=635,
)

In [6]:
# Build one comparison table per task: candidate encoders, their fine-tuned
# score, and the score every selection method assigned to each encoder.
datasets = ['QNLI']
encoders = ["bert-base-uncased",
            "distilbert-base-uncased",
            "bert-base-cased",
            "distilbert-base-cased",
            "roberta-base",
            "distilroberta-base"]

all_results = {}
target_model_lm = "albert-base-v2"
model_sim_methods = ["DSE-MeanEmb", "DSE-MeanSim", "PA", "RSA", "DDS"]


static_methods = ["kNN-1", "kNN-3", "kNN-5", "GBC", "HScore", "HScoreR", "LFC", "MSC", "PARC", "TransRate"] + model_sim_methods
dynamic_methods = ["NLEEP",  "SFDA", "Logistic", "LogME", "EarlyStop-50", "EarlyStop-100", "EarlyStop-500"]

all_methods = static_methods + dynamic_methods
for dataset in datasets:
    for pooling in ['first']:
        dt_results = {
            "Models": encoders,
            # `tuned_results` is keyed by lower-case task name, while this cell
            # spells the task as 'QNLI'; look it up case-insensitively so the
            # cell does not raise KeyError on a fresh kernel.
            "Fine-Tuned": tuned_results[dataset.lower()]
        }
        for method in all_methods:
            # NOTE(review): neither 'EarlyStop-1' nor 'EarlyStop-3' is in
            # `all_methods`, and `get_earlystop_result` is not defined in any
            # visible cell, so these two branches are currently unreachable.
            if method == 'EarlyStop-1':
                method_result_list = get_earlystop_result(dataset, pooling, 1, 'frozen')
            elif method == 'EarlyStop-3':
                method_result_list = get_earlystop_result(dataset, pooling, 3, 'frozen')
            else:
                # NOTE(review): the later summary cell reads files named
                # {method}_{size}_{embedding}_{pooling}.jsonl under lower-case
                # task directories - confirm this older naming still exists.
                method_result_file = f'../../resources/output/glue/{dataset}/model_selection_results/{method}_{pooling}.jsonl'
                with jsonlines.open(method_result_file) as reader:
                    # One JSON record per line; keep its 'avg_score' in file order.
                    method_result_list = [result_dict['avg_score'] for result_dict in reader]

            dt_results[method] = method_result_list

        # Keyed by the task name exactly as written in `datasets`.
        all_results[dataset] = pd.DataFrame(dt_results)

In [7]:
# Display the QNLI comparison table. `all_results` is rebound to a per-method
# metric store by a later cell, so this only works before that cell has run.
all_results['QNLI']

Unnamed: 0,Models,Fine-Tuned,GBC,HScore,HScoreR,kNN-1,kNN-3,kNN-5,Logistic,LogME,NLEEP-5000,NLEEP-10000
0,bert-base-uncased,91.8,-1.087231,0.228444,0.228444,0.554091,0.55885,0.570749,0.71847,-0.612357,0.504292,0.502428
1,distilbert-base-uncased,89.2,-1.307048,0.227498,0.227498,0.535786,0.54622,0.552627,0.711148,-0.613275,0.502602,0.501481
2,bert-base-cased,91.0,-1.360442,0.248023,0.248023,0.546037,0.559949,0.562511,0.722131,-0.599783,0.501917,0.50114
3,distilbert-base-cased,88.2,-1.559806,0.218174,0.218174,0.538532,0.539447,0.543108,0.714809,-0.618342,0.501037,0.500346
4,roberta-base,92.8,-1.300314,0.309771,0.309771,0.545305,0.555556,0.549881,0.736775,-0.559242,0.502529,0.501874
5,distilroberta-base,90.8,-1.337784,0.240281,0.240281,0.528647,0.531759,0.535603,0.719202,-0.607802,0.503619,0.502245


In [15]:
# Configuration for the cross-task summary: tasks, candidate encoders,
# selection methods and the settings that appear in result-file names.
datasets = ['cola', 'sst2', 'mrpc', 'qqp', 'mnli', 'qnli', 'rte', 'wnli']
encoders = ["bert-base-uncased",
            "distilbert-base-uncased",
            "bert-base-cased",
            "distilbert-base-cased",
            "roberta-base",
            "distilroberta-base"]

all_corr_results = {}

# Model-similarity methods have the target LM name appended to their
# result-file names.
target_model_lm = "albert-base-v2"
model_sim_methods = ["DSE-MeanEmb", "DSE-MeanSim", "PA", "RSA", "DDS"]


static_methods = ["kNN-1", "kNN-3", "kNN-5", "GBC", "HScore", "HScoreR", "LFC", "MSC", "PARC", "TransRate"] + model_sim_methods
dynamic_methods = ["NLEEP",  "SFDA", "Logistic", "LogME", "EarlyStop-50", "EarlyStop-100", "EarlyStop-500"]

all_methods = static_methods + dynamic_methods

# Internal method key -> label used in the printed / LaTeX tables.
method_show_name = {
    "EarlyStop-50": "EarlyStop (50 Steps)",
    "EarlyStop-100": "EarlyStop (100 Steps)",
    "EarlyStop-500": "EarlyStop (500 Steps)",
    "kNN-1": "kNN (k=1)",
    "kNN-3": "kNN (k=3)",
    "kNN-5": "kNN (k=5)",
    "Logistic": "Logistic",
    # Raw string: "\m" is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning on current Python versions.
    "NLEEP": r"$\mathcal{N}$LEEP",
    "GBC": "GBC",
    "HScore": "H-score",
    "HScoreR": "Reg. H-score",
    "LogME": "LogME",
    "LFC": "LFC",
    "MSC": "MSC",
    "PARC": "PARC",
    "SFDA": "SFDA",
    "TransRate": "TransRate",
    "DSE-MeanEmb": "DSE (MeanEmb)",
    "DSE-MeanSim": "DSE (MeanSim)",
    "PA": "PA",
    "RSA": "RSA",
    "DDS": "DDS"
}

# These three values are interpolated into the result-file names that the
# metric-collection loop reads.
pooling = 'first'
max_data_size = 10000
embedding_size = 768


def ground_truth_ranking(pred_list, gt_list):
    """Return the 1-based rank the predictions give to the ground-truth winner.

    Candidates are ordered by descending ``pred_list`` score; the winner is
    the index holding the largest value in ``gt_list``. A result of 1 means
    the predictions put the truly best candidate first.
    """
    best_candidate = np.argmax(gt_list)
    # Negating the scores turns argsort's ascending order into descending.
    descending_order = np.argsort(-np.array(pred_list))
    (matches,) = np.nonzero(descending_order == best_candidate)
    return matches.item() + 1


# Per-method metric store: all_results[method][metric] is a list holding one
# entry per task, in the order of `datasets`.
all_results = {}

for method in all_methods:
    method_metrics = all_results[method] = defaultdict(list)
    is_model_sim = method in model_sim_methods
    # Model-similarity results are stored as "<method>-<target LM>_...".
    file_stem = method + '-' + target_model_lm if is_model_sim else method

    for dataset in datasets:
        # The file name carries the task size, capped at `max_data_size`.
        size_tag = min(dataset_sizes[dataset], max_data_size)
        method_result_file = f"../../resources/output/glue/{dataset}/model_selection_results/{file_stem}_{size_tag}_{embedding_size}_{pooling}.jsonl"
        with jsonlines.open(method_result_file) as reader:
            records = list(reader)
        method_result_list = [record['avg_score'] for record in records]
        method_time_list = [record['avg_time'] for record in records]

        # Compare the method's scores with the fine-tuned results of the task.
        reference_scores = tuned_results[dataset]
        rank = ground_truth_ranking(method_result_list, reference_scores)
        spr = spearmanr(method_result_list, reference_scores).statistic
        pear = pearsonr(method_result_list, reference_scores).statistic
        wtau = weightedtau(method_result_list, reference_scores).statistic

        # Running time is kept as a string: "<scoring>" or "<training>+<scoring>".
        sec = str(round(np.sum(method_time_list), 2))
        if is_model_sim:
            target_training_file = f"../../resources/output/glue/{dataset}/encoded_dataset/target-model_{target_model_lm}_{pooling}/target-model-5_first.jsonl"
            with jsonlines.open(target_training_file) as reader:
                # Only the first record of the target-model file is used.
                first_record = next(iter(reader), None)
            if first_record is not None:
                # Divided by 3600 - presumably seconds -> hours, matching the
                # "h" suffix avg_time() prints; confirm the unit of 'avg_time'.
                training_part = str(round(first_record['avg_time'] / (60 * 60), 2))
                sec = training_part + '+' + sec

        method_metrics['acc@1'].append(float(rank == 1))
        method_metrics['mrr'].append(1. / rank)
        method_metrics['spr'].append(spr)
        method_metrics['pear'].append(pear)
        method_metrics['wtau'].append(wtau)
        method_metrics['time'].append(sec)


def avg_time(time_list):
    """Average encoded running times and format them for the results table.

    Args:
        time_list: non-empty sequence of strings, each either ``"<score>"``
            or ``"<train>+<score>"``, where both parts parse as floats.

    Returns:
        ``"<train>h+<score>s"`` when at least one entry carries a training
        part, otherwise ``"<score>s"``. Both averages are taken over
        ``len(time_list)`` and rounded to two decimals.

    Raises:
        ValueError: if ``time_list`` is empty.
    """
    if len(time_list) == 0:
        raise ValueError("avg_time() requires at least one time entry")

    train_time = 0.
    score_time = 0.
    # Tracked explicitly instead of inspecting the last loop iteration, so a
    # mixed list never silently drops the training time it accumulated.
    has_train_time = False
    for t in time_list:
        split_str = t.split('+')
        score_time += float(split_str[-1])
        if len(split_str) == 2:
            train_time += float(split_str[0])
            has_train_time = True

    count = len(time_list)
    score_avg = round(score_time / count, 2)
    if has_train_time:
        return f"{round(train_time / count, 2)}h+{score_avg}s"
    return f"{score_avg}s"
        
# Headline table: one row per method, each metric averaged over all tasks.
# acc@1, Pearson and weighted Kendall tau are collected in `all_results` as
# well but are not reported here.
main_results = pd.DataFrame({
    "Methods": [method_show_name[name] for name in all_methods],
    "MRR": [round(np.mean(all_results[name]['mrr']), 3) for name in all_methods],
    "Spearman": [round(np.mean(all_results[name]['spr']), 3) for name in all_methods],
    "Running Time": [avg_time(all_results[name]['time']) for name in all_methods],
})


def _per_task_rows(metric):
    """Return a string array: method key, one value per task, then the mean.

    Every number is formatted with three decimals.
    """
    return np.array([
        [name]
        + [f"{value:.3f}" for value in metrics[metric]]
        + [f"{np.mean(metrics[metric]):.3f}"]
        for name, metrics in all_results.items()
    ])


# Per-task MRR, with the cross-task mean in the last column.
specific_results = _per_task_rows('mrr')
mrr_results = pd.DataFrame(specific_results, columns=['Methods'] + datasets + ['MRR'])

# Per-task Spearman correlation, with the cross-task mean in the last column.
specific_results = _per_task_rows('spr')
spr_results = pd.DataFrame(specific_results, columns=['Methods'] + datasets + ['Mean SC'])

In [17]:
# Emit every row of the per-task MRR table as a LaTeX tabular line.
for row_cells in mrr_results.itertuples(index=False, name=None):
    print(' & '.join(row_cells) + r' \\')

kNN-1 & 0.333 & 0.200 & 0.200 & 0.200 & 0.500 & 0.333 & 0.167 & 0.250 & 0.273 \\
kNN-3 & 0.500 & 0.200 & 0.333 & 0.200 & 0.500 & 0.500 & 0.250 & 0.333 & 0.352 \\
kNN-5 & 0.250 & 0.200 & 0.200 & 0.200 & 1.000 & 0.500 & 0.250 & 1.000 & 0.450 \\
GBC & 0.500 & 0.167 & 0.333 & 1.000 & 0.500 & 0.333 & 1.000 & 0.200 & 0.504 \\
HScore & 1.000 & 0.200 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.167 & 0.796 \\
HScoreR & 1.000 & 0.200 & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 & 0.167 & 0.796 \\
LFC & 0.200 & 0.167 & 0.200 & 0.200 & 0.200 & 0.200 & 0.200 & 0.167 & 0.192 \\
MSC & 1.000 & 0.167 & 0.250 & 1.000 & 1.000 & 0.250 & 1.000 & 0.333 & 0.625 \\
PARC & 1.000 & 0.167 & 0.333 & 0.500 & 1.000 & 0.250 & 1.000 & 0.200 & 0.556 \\
TransRate & 0.167 & 0.167 & 0.200 & 0.200 & 0.200 & 0.200 & 0.200 & 0.200 & 0.192 \\
DSE-MeanEmb & 0.333 & 0.167 & 0.250 & 0.250 & 0.500 & 0.167 & 0.500 & 0.500 & 0.333 \\
DSE-MeanSim & 0.500 & 0.167 & 0.250 & 0.250 & 0.500 & 0.167 & 0.500 & 0.500 & 0.354 \\
PA & 0.167 & 0.1

In [39]:
# Display the headline table (method label, mean MRR, mean Spearman, running time).
main_results

Unnamed: 0,Methods,MRR,Spearman,Running Time
0,kNN (k=1),0.273,0.067,6.53s
1,kNN (k=3),0.352,0.254,6.57s
2,kNN (k=5),0.45,0.188,6.55s
3,GBC,0.504,0.196,0.29s
4,H-score,0.796,0.596,2.16s
5,Reg. H-score,0.796,0.596,2.11s
6,LFC,0.192,-0.197,9.07s
7,MSC,0.625,0.063,6.33s
8,PARC,0.556,0.281,89.42s
9,TransRate,0.192,-0.183,1.35s


In [40]:
# Emit the headline table as LaTeX rows: the first column verbatim, the
# middle (metric) columns with three decimals, the last column verbatim.
for row_idx in range(len(main_results)):
    method_label, *metric_values, running_time = main_results.iloc[row_idx]
    cells = [method_label] + ['%.3f' % value for value in metric_values] + [running_time]
    print(' & '.join(cells) + r' \\')

kNN (k=1) & 0.273 & 0.067 & 6.53s \\
kNN (k=3) & 0.352 & 0.254 & 6.57s \\
kNN (k=5) & 0.450 & 0.188 & 6.55s \\
GBC & 0.504 & 0.196 & 0.29s \\
H-score & 0.796 & 0.596 & 2.16s \\
Reg. H-score & 0.796 & 0.596 & 2.11s \\
LFC & 0.192 & -0.197 & 9.07s \\
MSC & 0.625 & 0.063 & 6.33s \\
PARC & 0.556 & 0.281 & 89.42s \\
TransRate & 0.192 & -0.183 & 1.35s \\
DSE (MeanEmb) & 0.333 & 0.147 & 2.37h+0.19s \\
DSE (MeanSim) & 0.354 & 0.161 & 2.37h+0.41s \\
PA & 0.217 & -0.161 & 2.37h+2.48s \\
RSA & 0.325 & -0.071 & 2.37h+135.69s \\
DDS & 0.325 & -0.057 & 2.37h+24.48s \\
$\mathcal{N}$LEEP & 0.562 & 0.262 & 79.64s \\
SFDA & 0.521 & 0.558 & 10.95s \\
Logistic & 0.781 & 0.423 & 3.13s \\
LogME & 0.792 & 0.474 & 1.86s \\
EarlyStop (50 Steps) & 0.248 & 0.077 & 172.24s \\
EarlyStop (100 Steps) & 0.838 & 0.624 & 144.47s \\
EarlyStop (500 Steps) & 0.854 & 0.758 & 314.51s \\


In [35]:
# Scratch check of np.cov's orientation: with rowvar=False the 10 columns are
# treated as variables, so a 5 x 10 sample yields a 10 x 10 matrix.
f = np.random.standard_normal([5, 10])
np.cov(f, rowvar=False).shape

(10, 10)

In [71]:
# Print one LaTeX row per method: label, then the time string with unit
# suffixes added ("a+b" -> "ah+bs", "b" -> "bs"). Because r' \\' is joined as
# a cell of its own, every line ends with an extra "& " before the "\\".
# NOTE(review): `time_results` is not defined in any visible cell, and the
# table displayed for it shows numeric averages rather than strings such as
# "2.37+0.19" - confirm which version of `time_results` this cell expects.
for i in range(time_results.shape[0]):
    # Positional access goes through .iloc: plain row[0] / row[-1] on a
    # label-indexed Series relies on a deprecated integer-as-position fallback.
    method_label = time_results.iloc[i, 0]
    time_cell = time_results.iloc[i, -1]
    print(' & '.join([method_label, time_cell.replace(r'+', r'h+') + 's', r' \\']))

EarlyStop (100 Steps) & 144.47s &  \\
EarlyStop (500 Steps) & 314.51s &  \\
kNN (k=1) & 6.53s &  \\
kNN (k=3) & 6.57s &  \\
kNN (k=5) & 6.55s &  \\
Logistic & 3.13s &  \\
$\mathcal{N}$LEEP & 79.64s &  \\
GBC & 0.29s &  \\
H-score & 2.16s &  \\
Reg. H-score & 2.11s &  \\
LogME & 1.86s &  \\
LFC & 9.07s &  \\
MSC & 6.33s &  \\
PARC & 89.42s &  \\
SFDA & 10.95s &  \\
TransRate & 1.35s &  \\
DSE (MeanEmb) & 2.37h+0.19s &  \\
DSE (MeanSim) & 2.37h+0.41s &  \\
PA & 2.37h+2.48s &  \\
RSA & 2.37h+135.69s &  \\
DDS & 2.37h+24.48s &  \\


In [13]:
def all2str(x):
    """Format a number with exactly three decimal places."""
    return "%.3f" % x


# Emit the correlation table as LaTeX rows: the first column verbatim, every
# remaining column formatted through all2str.
for i in range(corr_results.shape[0]):
    method_label, *correlations = corr_results.iloc[i]
    print(' & '.join([method_label] + [all2str(value) for value in correlations]) + r' \\')

EarlyStop (100 Steps) & 0.092 & 0.934 & 0.935 & 0.906 & 0.680 & 0.987 & 0.900 & 0.622 & 0.757 \\
EarlyStop (500 Steps) & 0.892 & 0.916 & 0.930 & 0.956 & 0.936 & 0.855 & 0.908 & 0.635 & 0.878 \\
kNN (k=1) & 0.650 & 0.381 & -0.644 & -0.091 & 0.674 & 0.468 & -0.270 & -0.540 & 0.079 \\
kNN (k=3) & 0.729 & -0.006 & -0.150 & -0.088 & 0.683 & 0.801 & -0.393 & -0.010 & 0.196 \\
kNN (k=5) & 0.571 & 0.071 & -0.504 & -0.118 & 0.716 & 0.682 & 0.142 & 0.076 & 0.204 \\
Logistic & 0.793 & 0.078 & 0.069 & 0.550 & 0.891 & 0.535 & 0.748 & 0.390 & 0.507 \\
$\mathcal{N}$LEEP & 0.491 & -0.843 & -0.286 & 0.666 & 0.613 & 0.726 & 0.711 & 0.680 & 0.345 \\
GBC & 0.674 & -0.928 & -0.067 & 0.367 & 0.492 & 0.690 & 0.826 & -0.277 & 0.222 \\
H-score & 0.950 & -0.055 & 0.663 & 0.892 & 0.934 & 0.769 & 0.977 & 0.231 & 0.670 \\
Reg. H-score & 0.950 & -0.055 & 0.663 & 0.892 & 0.934 & 0.769 & 0.977 & 0.231 & 0.670 \\
LogME & 0.929 & -0.576 & 0.661 & 0.738 & 0.897 & 0.723 & 0.936 & -0.471 & 0.480 \\
LFC & -0.474 & -0.484 &

In [27]:
# Display the per-task correlation table.
# NOTE(review): `corr_results` is not defined in any visible cell - confirm
# where it is built so the notebook survives Restart & Run All.
corr_results

Unnamed: 0,Methods,cola,sst2,mrpc,qqp,mnli,qnli,rte,wnli
0,EarlyStop-100,0.092,0.934,0.935,0.906,0.68,0.987,0.9,0.622
1,EarlyStop-500,0.892,0.916,0.93,0.956,0.936,0.855,0.908,0.635
2,kNN-1,0.65,0.381,-0.644,-0.091,0.674,0.468,-0.27,-0.54
3,kNN-3,0.729,-0.006,-0.15,-0.088,0.683,0.801,-0.393,-0.01
4,kNN-5,0.571,0.071,-0.504,-0.118,0.716,0.682,0.142,0.076
5,Logistic,0.793,0.078,0.069,0.55,0.891,0.535,0.748,0.39
6,NLEEP,0.491,-0.843,-0.286,0.666,0.613,0.726,0.711,0.68
7,GBC,0.674,-0.928,-0.067,0.367,0.492,0.69,0.826,-0.277
8,HScore,0.95,-0.055,0.663,0.892,0.934,0.769,0.977,0.231
9,HScoreR,0.95,-0.055,0.663,0.892,0.934,0.769,0.977,0.231


In [36]:
# Display the per-task running-time table.
# NOTE(review): `time_results` is not defined in any visible cell - confirm
# where it is built so the notebook survives Restart & Run All.
time_results

Unnamed: 0,Methods,cola,sst2,mrpc,qqp,mnli,qnli,rte,wnli,Average Time
0,EarlyStop (100 Steps),69.669,82.948,65.265,450.619,173.595,118.171,94.976,100.55,144.474
1,EarlyStop (500 Steps),233.29,240.946,235.381,597.971,394.733,296.989,359.09,157.654,314.507
2,kNN (k=1),1.108,1.159,0.44,33.781,9.638,5.524,0.38,0.197,6.528
3,kNN (k=3),1.168,1.064,0.328,34.066,9.788,5.766,0.27,0.117,6.571
4,kNN (k=5),1.145,1.061,0.332,34.162,9.757,5.584,0.253,0.126,6.552
5,Logistic,3.623,4.04,1.91,4.619,4.513,4.309,1.539,0.505,3.132
6,$\mathcal{N}$LEEP,113.262,129.352,23.811,148.595,104.781,91.428,17.381,8.477,79.636
7,GBC,0.409,0.408,0.131,0.408,0.403,0.422,0.084,0.027,0.287
8,H-score,2.715,2.506,1.517,2.428,2.761,2.68,1.413,1.226,2.156
9,Reg. H-score,2.656,2.444,1.529,2.468,2.631,2.586,1.39,1.197,2.113
