In [1]:
import os
import json
import jsonlines
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import weightedtau

In [2]:
# Reference scores keyed by task name. Every list holds six numbers; the other
# cells of this notebook place a task's list next to the `encoders` list in a
# DataFrame column called "Fine-Tuned", so the values are assumed to follow the
# `encoders` order -- TODO confirm against the source of these numbers.
tuned_results = dict([
    ("MNLI",  [84.7, 82.2, 83.9, 81.5, 87.6, 84.0]),
    ("QQP",   [89.6, 88.5, 89.2, 87.8, 91.9, 89.4]),
    ("QNLI",  [91.8, 89.2, 91.0, 88.2, 92.8, 90.8]),
    ("SST-2", [92.7, 91.3, 91.7, 90.4, 94.8, 92.5]),
    ("CoLA",  [56.3, 51.3, 58.2, 47.2, 63.6, 59.3]),
    ("MRPC",  [88.6, 87.5, 87.8, 85.6, 90.2, 86.6]),
    ("RTE",   [69.3, 59.9, 66.1, 60.6, 78.7, 67.9]),
    ("WNLI",  [53.5, 56.3, 46.5, 56.3, 57.7, 52.1]),
])

In [6]:
datasets = ['QNLI']
encoders = [
    "bert-base-uncased",
    "distilbert-base-uncased",
    "bert-base-cased",
    "distilbert-base-cased",
    "roberta-base",
    "distilroberta-base",
]

all_results = {}
all_methods = ['GBC', 'HScore', 'HScoreR', 'kNN-1', 'kNN-3', 'kNN-5', 'Logistic', 'LogME', 'NLEEP-5000', 'NLEEP-10000']

# Third positional argument handed to get_earlystop_result for each EarlyStop
# variant. Neither variant is in `all_methods` above, so this path is dormant
# here; get_earlystop_result is not defined in this cell.
earlystop_variants = {'EarlyStop-1': 1, 'EarlyStop-3': 3}

# Build one table per dataset: the encoder names, their reference scores from
# `tuned_results`, and one column of `avg_score` values per selection method.
for dataset in datasets:
    for pooling in ['first']:
        table_columns = {"Models": encoders, "Fine-Tuned": tuned_results[dataset]}

        for method in all_methods:
            if method in earlystop_variants:
                scores = get_earlystop_result(dataset, pooling, earlystop_variants[method], 'frozen')
            else:
                # Each JSONL record contributes its `avg_score`; the records are
                # assumed to be in `encoders` order -- TODO confirm.
                scores_path = f'../../resources/output/glue/{dataset}/model_selection_results/{method}_{pooling}.jsonl'
                with jsonlines.open(scores_path) as reader:
                    scores = [record['avg_score'] for record in reader]

            table_columns[method] = scores

        # With a single pooling this is the only table stored for the dataset;
        # a second pooling would overwrite it.
        all_results[dataset] = pd.DataFrame(table_columns)

In [7]:
# Display the QNLI table: encoder names, the "Fine-Tuned" reference column, and one column per method.
all_results['QNLI']

Unnamed: 0,Models,Fine-Tuned,GBC,HScore,HScoreR,kNN-1,kNN-3,kNN-5,Logistic,LogME,NLEEP-5000,NLEEP-10000
0,bert-base-uncased,91.8,-1.087231,0.228444,0.228444,0.554091,0.55885,0.570749,0.71847,-0.612357,0.504292,0.502428
1,distilbert-base-uncased,89.2,-1.307048,0.227498,0.227498,0.535786,0.54622,0.552627,0.711148,-0.613275,0.502602,0.501481
2,bert-base-cased,91.0,-1.360442,0.248023,0.248023,0.546037,0.559949,0.562511,0.722131,-0.599783,0.501917,0.50114
3,distilbert-base-cased,88.2,-1.559806,0.218174,0.218174,0.538532,0.539447,0.543108,0.714809,-0.618342,0.501037,0.500346
4,roberta-base,92.8,-1.300314,0.309771,0.309771,0.545305,0.555556,0.549881,0.736775,-0.559242,0.502529,0.501874
5,distilroberta-base,90.8,-1.337784,0.240281,0.240281,0.528647,0.531759,0.535603,0.719202,-0.607802,0.503619,0.502245


In [4]:
datasets = ['MNLI']
encoders = ["bert-base-uncased",
            "distilbert-base-uncased",
            "bert-base-cased",
            "distilbert-base-cased",
            "roberta-base",
            "distilroberta-base"]

all_corr_results = {}

all_methods = ['GBC', 'HScore', 'HScoreR', 'kNN-1', 'kNN-3', 'kNN-5', 'Logistic', 'LogME', 'MSC-5000', 'MSC-10000', 'MSC-20000', 'MSC-30000', 'MSC-50000', 'SFDA', 'TransRate']

# For every method, correlate its per-model `avg_score` values with the
# reference scores in `tuned_results[dataset]` (Pearson rho and weighted
# Kendall tau) and total its `avg_time` values. The result is one row per
# method with `<dataset>-rho`, `<dataset>-tau` and `<dataset>-time` columns.
for pooling in ['first']:
    # Collected in a plain dict under its own name so `corr_results` is only
    # ever bound to the final DataFrame. The dict is rebuilt for each pooling,
    # so only the last pooling's numbers reach the DataFrame.
    corr_columns = {
        "Methods": all_methods
    }

    for dataset in datasets:
        reference_scores = tuned_results[dataset]
        corr_columns[dataset+'-rho'] = []
        corr_columns[dataset+'-tau'] = []
        corr_columns[dataset+'-time'] = []
        for method in all_methods:
            # Reset on every iteration: the EarlyStop branches do not produce
            # timings, and without the reset they would silently reuse the
            # previous method's list (or raise NameError if they came first).
            method_time_list = []
            if method == 'EarlyStop-1':
                method_result_list = get_earlystop_result(dataset, pooling, 1, 'frozen')
            elif method == 'EarlyStop-3':
                method_result_list = get_earlystop_result(dataset, pooling, 3, 'frozen')
            else:
                # Records are assumed to be in the same model order as
                # `tuned_results[dataset]` -- TODO confirm.
                method_result_file = f'../../resources/output/glue/{dataset}/model_selection_results/{method}_{pooling}.jsonl'
                method_result_list = []
                with jsonlines.open(method_result_file) as reader:
                    for result_dict in reader:
                        method_result_list.append(result_dict['avg_score'])
                        method_time_list.append(result_dict['avg_time'])

            rho = pearsonr(method_result_list, reference_scores).statistic
            tau = weightedtau(method_result_list, reference_scores).statistic
            # NaN marks "no timing recorded" instead of a misleading number.
            sec = np.sum(method_time_list) if method_time_list else np.nan
            corr_columns[dataset+'-rho'].append(round(rho, 3))
            corr_columns[dataset+'-tau'].append(round(tau, 3))
            corr_columns[dataset+'-time'].append(round(sec, 3))
corr_results = pd.DataFrame(corr_columns)

In [5]:
# Display the correlation summary: one row per method with its rho, tau and time columns.
corr_results

Unnamed: 0,Methods,MNLI-rho,MNLI-tau,MNLI-time
0,GBC,0.559,0.314,33.436
1,HScore,0.922,0.762,92.916
2,HScoreR,0.922,0.762,100.987
3,kNN-1,0.499,0.275,666.716
4,kNN-3,0.374,0.314,730.368
5,kNN-5,0.431,0.248,708.039
6,Logistic,0.817,0.686,479.703
7,LogME,0.918,0.762,172.207
8,MSC-5000,0.522,0.348,32.963
9,MSC-10000,0.285,0.39,63.522
