In [35]:
import numpy as np

In [44]:
# Scratch check: pick out the entries equal to 1 and show them as a plain list.
y = np.asarray([1, 2, 3, 4, 5])
y[y == 1].tolist()

[1]

In [43]:
# Scratch check: `/` is true division, so the displayed result is a float.
5/2

2.5

In [12]:
# Half of the seconds elapsed since `st_time`, plus a constant 20.
# NOTE(review): neither `time` nor `st_time` is imported/defined in any cell visible here,
# so this cell depends on leftover kernel state and will fail after Restart & Run All.
(time.time() - st_time) / 2 + 20

191.49718165397644

In [5]:
# Seeded generator so that re-running the cell reproduces the same labels.
rng = np.random.default_rng(0)
y = rng.binomial(1, 0.5, 2000)  # 2000 random binary labels
# Label similarity matrix: +1 where two samples share a label, -1 otherwise.
lsm = (y[:, None] == y[None, :]).astype(np.float32) * 2 - 1

In [6]:
# Self-alignment of the label similarity matrix: <lsm, lsm>_F / (||lsm||_F * ||lsm||_F).
# The numerator is the element-wise (Frobenius) inner product, so the matching norm is the
# Frobenius norm. `ord=2` on a 2-D array is the spectral norm, which needs a full SVD and
# only gives the same ratio here because lsm is an outer product of a +/-1 vector (rank one).
lsm_fro_norm = np.linalg.norm(lsm)  # default norm for a 2-D array is Frobenius
np.sum(lsm * lsm) / (lsm_fro_norm * lsm_fro_norm)

1.0

In [45]:
import os
import json
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import weightedtau

In [2]:
# Reference downstream scores, indexed as gt_dict[dataset][pooling][train_type].
#   pooling    : "mean" or "first" (the structured-prediction datasets only define "mean")
#   train_type : "frozen" or "tuned"
# Every leaf is a list of 7 scores. Other cells place these lists next to the 7-entry
# `encoders` list in one DataFrame, so position i here must correspond to encoders[i].
# NOTE(review): units are not stated in this file — values look like percentages; confirm.
gt_dict = {
    # Classification
    "agnews": {
        "mean": {
            "frozen": [92.62, 93.30, 92.26, 87.52, 90.05, 92.55, 90.06],
            "tuned": [93.51, 91.70, 93.85, 92.62, 93.16, 93.34, 92.40]
        },
        "first": {
            "frozen": [91.52, 92.71, 91.65, 84.50, 88.84, 93.16, 89.25],
            "tuned": [93.51, 92.57, 93.77, 93.05, 93.19, 93.15, 92.32]
        }
    },
    
    # NOTE(review): key is spelled "arline" (possibly meant "airline"). Dataset keys are also
    # used to build output directory paths, so confirm the on-disk name before renaming.
    "arline": {
        "mean": {
            "frozen": [82.58, 84.10, 81.71, 78.46, 76.67, 84.89, 77.58],
            "tuned": [84.03, 85.43, 83.89, 83.17, 82.55, 86.05, 82.05]
        },
        "first": {
            "frozen": [80.88, 83.29, 79.95, 75.98, 75.50, 84.57, 76.01],
            "tuned": [84.27, 85.19, 83.99, 82.70, 81.62, 85.51, 82.27]
        }
    },
    
    "scierc": {
        "mean": {
            "frozen": [49.56, 51.07, 45.98, 48.64, 56.64, 46.75, 58.83],
            "tuned": [75.84, 78.80, 73.13, 73.61, 81.60, 76.65, 80.12]
        },
        "first": {
            "frozen": [41.94, 40.51, 41.35, 42.94, 41.98, 42.87, 45.35],
            "tuned": [80.20, 67.71, 75.95, 76.57, 83.89, 78.25, 82.93]
        }
    },
    
    "mnli": {
        "mean": {
            "frozen": [59.18, 64.18, 58.13, 56.53, 60.12, 61.77, 59.57],
            "tuned": [81.85, 86.57, 79.64, 79.21, 80.89, 85.41, 80.41]
        },
        "first": {
            "frozen": [59.64, 61.48, 57.13, 57.52, 62.40, 59.23, 61.59],
            "tuned": [82.23, 86.71, 80.54, 79.54, 80.84, 85.32, 80.40]
        }
    },
    
    "qnli": {
        "mean": {
            "frozen": [75.75, 78.09, 74.25, 74.69, 78.21, 77.49, 76.84],
            "tuned": [88.17, 92.17, 86.26, 84.13, 88.19, 91.22, 87.24]
        },
        "first": {
            "frozen": [72.23, 74.42, 71.55, 73.67, 77.25, 73.99, 76.31],
            "tuned": [88.46, 92.23, 86.68, 84.31, 88.57, 91.03, 86.77]
        }
    },
    
    "rte": {
        "mean": {
            "frozen": [56.26, 58.35, 53.96, 58.13, 56.97, 54.02, 55.46],
            "tuned": [62.09, 68.99, 57.63, 58.99, 59.98, 66.63, 64.65]
        },
        "first": {
            "frozen": [58.56, 56.04, 55.40, 55.46, 58.05, 54.10, 59.64],
            "tuned": [60.14, 67.05, 60.07, 57.64, 63.12, 63.21, 64.83]
        }
    },
    # Structured Prediction
    # These entries have no "first" pooling variant, only "mean".
    "en-ewt": {
        "mean": {
            "frozen": [85.04, 86.10, 86.98, 85.05, 85.95, 86.50, 87.54],
            "tuned": [94.16, 94.85, 93.36, 93.10, 93.16, 94.82, 93.29]
        }
    },
    
    "crossner-news": {
        "mean": {
            "frozen": [87.66, 88.08, 88.41, 69.86, 81.48, 88.55, 82.38],
            "tuned": [92.53, 94.59, 91.21, 78.01, 89.63, 94.23, 88.16]
        }
    },
    
    "crossner-sci": {
        "mean": {
            "frozen": [43.22, 47.00, 45.96, 32.89, 43.24, 45.51, 43.98],
            "tuned": [38.68, 62.27, 37.97, 20.96, 47.73, 54.05, 53.44]
        }
    },
    
    "jobstack": {
        "mean": {
            "frozen": [73.64, 74.06, 74.96, 61.13, 68.32, 73.72, 71.66],
            "tuned": [78.49, 81.51, 77.02, 67.07, 74.65, 79.99, 78.72]
        }
    },
}

In [31]:
def get_earlystop_result(dataset, pooling, epochs, train_type,
                         seeds=(4012, 5060, 8823, 8857, 9908),
                         num_models=7,
                         output_root='../../resources/output'):
    """Average the early-stopping validation micro-F1 of every model over all seeds.

    For each model index ``i`` in ``range(num_models)`` and each seed, reads
    ``{output_root}/{dataset}/earlystopping-{epochs}/{train_type}/``
    ``model{i}-{pooling}-mlp-rs{seed}/{dataset}-validation-pred-results.json``
    and takes its ``'micro-F1'`` entry.

    Args:
        dataset: Dataset name used in the directory and file names.
        pooling: Pooling name used in the run directory name.
        epochs: Value inserted into the ``earlystopping-{epochs}`` directory name.
        train_type: Sub-directory under the early-stopping folder (e.g. ``'frozen'``).
        seeds: Random seeds whose runs are averaged. Defaults to the five seeds
            that were previously hard-coded.
        num_models: Number of model indices to read (``model0`` .. ``model{num_models-1}``).
        output_root: Directory that contains the per-dataset output folders.

    Returns:
        A list with ``num_models`` floats: the mean micro-F1 per model over ``seeds``,
        rounded to two decimals.

    Raises:
        ValueError: If ``seeds`` is empty.
        FileNotFoundError: If an expected result file is missing.
        KeyError: If a result file has no ``'micro-F1'`` entry.
    """
    seeds = list(seeds)
    if not seeds:
        raise ValueError("seeds must contain at least one seed")

    method_result_folder = f'{output_root}/{dataset}/earlystopping-{epochs}/{train_type}'
    method_result_list = []
    for i in range(num_models):
        total = 0.
        for seed in seeds:
            result_json = method_result_folder + f'/model{i}-{pooling}-mlp-rs{seed}/{dataset}-validation-pred-results.json'
            with open(result_json, "r", encoding="utf-8") as f:
                total += json.load(f)['micro-F1']
        # Divide by the actual number of seeds instead of a hard-coded 5.
        method_result_list.append(round(total / len(seeds), 2))
    return method_result_list

In [32]:
# Build one table per (dataset, pooling): the reference "Frozen"/"Tuned" scores from gt_dict
# plus one column per method, with one row per encoder.
datasets = ['rte']
encoders = ["bert-base-uncased",
            "roberta-base",
            "distilbert-base-uncased",
            "emilyalsentzer/Bio_ClinicalBERT",
            "dmis-lab/biobert-v1.1",
            "cardiffnlp/twitter-roberta-base",
            "allenai/scibert_scivocab_uncased"]

all_dt_results = {}
all_methods = ['EarlyStop-1', 'EarlyStop-3', 'kNN', 'Logistic', 'MSC', 'LFC', 'HScore', 'HScoreR', 'LogME', 'GBC', 'PARC', 'SFDA', 'TransRate', 'NLEEP']
# Methods served by get_earlystop_result, mapped to the `epochs` argument passed to it.
earlystop_epochs = {'EarlyStop-1': 1, 'EarlyStop-3': 3}
for dataset in datasets:
    if not os.path.exists(f'../../resources/output/{dataset}/'):
        continue
    for pooling in ['first']:
        # Some gt_dict entries only define "mean" pooling; skip them rather than
        # failing with a KeyError.
        if pooling not in gt_dict[dataset]:
            continue
        dt_results = {
            "Language Model": encoders,
            "Frozen": gt_dict[dataset][pooling]["frozen"],
            "Tuned": gt_dict[dataset][pooling]["tuned"]
        }
        for method in all_methods:
            if method in earlystop_epochs:
                method_result_list = get_earlystop_result(dataset, pooling, earlystop_epochs[method], 'frozen')
            else:
                method_result_file = f'../../resources/output/{dataset}/results/{method}_{pooling}.txt'
                with open(method_result_file, 'r', encoding='utf-8') as f:
                    # The score is the last whitespace-separated token of each line;
                    # blank lines (e.g. a trailing newline) are ignored.
                    method_result_list = [float(line.split()[-1]) for line in f if line.strip()]
            dt_results[method] = method_result_list

        all_dt_results[f"{dataset}-{pooling}"] = pd.DataFrame(dt_results)

In [29]:
# List the "<dataset>-<pooling>" keys for which a score table has been built.
all_dt_results.keys()

dict_keys(['rte-first'])

In [33]:
# Show the score table for dataset "rte" with "first" pooling (one row per encoder).
all_dt_results['rte-first']

Unnamed: 0,Language Model,Frozen,Tuned,EarlyStop-1,EarlyStop-3,kNN,Logistic,MSC,LFC,HScore,HScoreR,LogME,GBC,PARC,SFDA,TransRate,NLEEP
0,bert-base-uncased,58.56,60.14,50.36,57.84,0.5054,0.5199,0.0013,0.0075,0.402,0.3429,-0.7176,-1.3267,0.4695,0.5629,94.0101,0.504
1,roberta-base,56.04,67.05,49.64,54.24,0.491,0.5848,0.0014,0.0005,0.4516,0.3465,-0.7015,-1.2817,0.5921,0.5545,15.7446,0.5049
2,distilbert-base-uncased,55.4,60.07,50.36,55.4,0.4874,0.5235,0.0005,0.0028,0.37,0.2991,-0.7251,-1.5517,0.1639,0.5567,81.5544,0.5016
3,emilyalsentzer/Bio_ClinicalBERT,55.46,57.64,50.79,53.81,0.4946,0.5379,0.0014,0.0062,0.3495,0.314,-0.7188,-1.3086,0.3683,0.5546,82.3709,0.5023
4,dmis-lab/biobert-v1.1,58.05,63.12,53.81,58.92,0.5451,0.6173,0.0014,0.0052,0.4052,0.3525,-0.7118,-1.3179,0.4398,0.5659,78.5851,0.5022
5,cardiffnlp/twitter-roberta-base,54.1,63.21,49.93,54.24,0.509,0.4982,0.0009,0.0005,0.3808,0.2812,-0.7197,-1.4424,0.2884,0.5334,21.1391,0.5057
6,allenai/scibert_scivocab_uncased,59.64,64.83,55.04,55.97,0.5199,0.556,0.0013,0.0066,0.4044,0.3463,-0.7149,-1.3222,0.4004,0.5663,94.0239,0.5056


In [40]:
# For every (train_type, pooling) pair, correlate each method's per-encoder scores with the
# reference scores in gt_dict: Pearson correlation ("-rho" columns) and weighted Kendall tau
# ("-tau" columns), one row per method and one column pair per dataset.
datasets = list(gt_dict.keys())
encoders = ["bert-base-uncased",
            "roberta-base",
            "distilbert-base-uncased",
            "emilyalsentzer/Bio_ClinicalBERT",
            "dmis-lab/biobert-v1.1",
            "cardiffnlp/twitter-roberta-base",
            "allenai/scibert_scivocab_uncased"]

all_corr_results = {}

all_methods = ['EarlyStop-1', 'EarlyStop-3', 'kNN', 'Logistic', 'MSC', 'LFC', 'HScore', 'HScoreR', 'LogME', 'GBC', 'PARC', 'SFDA', 'TransRate', 'NLEEP']
# Methods served by get_earlystop_result, mapped to the `epochs` argument passed to it.
earlystop_epochs = {'EarlyStop-1': 1, 'EarlyStop-3': 3}

for pooling in ['first']:
    for train_type in ['frozen', 'tuned']:
        corr_results = {
            "Methods": all_methods
        }

        for dataset in datasets:
            if not os.path.exists(f'../../resources/output/{dataset}/'):
                continue
            # Some gt_dict entries only define "mean" pooling; skip them rather than
            # failing with a KeyError once their output directory exists.
            if pooling not in gt_dict[dataset]:
                continue
            reference_scores = gt_dict[dataset][pooling][train_type]
            corr_results[dataset+'-rho'] = []
            corr_results[dataset+'-tau'] = []
            for method in all_methods:
                if method in earlystop_epochs:
                    # NOTE(review): early-stopping scores are always read from the 'frozen'
                    # runs, also when correlating against 'tuned' references — confirm intended.
                    method_result_list = get_earlystop_result(dataset, pooling, earlystop_epochs[method], 'frozen')
                else:
                    method_result_file = f'../../resources/output/{dataset}/results/{method}_{pooling}.txt'
                    with open(method_result_file, 'r', encoding='utf-8') as f:
                        # The score is the last whitespace-separated token of each line;
                        # blank lines (e.g. a trailing newline) are ignored.
                        method_result_list = [float(line.split()[-1]) for line in f if line.strip()]

                rho = pearsonr(method_result_list, reference_scores).statistic
                tau = weightedtau(method_result_list, reference_scores).statistic
                corr_results[dataset+'-rho'].append(round(rho, 3))
                corr_results[dataset+'-tau'].append(round(tau, 3))
        all_corr_results[f"{train_type}-{pooling}"] = pd.DataFrame(corr_results)

In [35]:
# List the "<train_type>-<pooling>" keys for which a correlation table has been built.
all_corr_results.keys()

dict_keys(['frozen-first', 'tuned-first'])

In [42]:
# Show the correlation table computed against the "tuned" reference scores with "first" pooling.
all_corr_results['tuned-first']

Unnamed: 0,Methods,rte-rho,rte-tau
0,EarlyStop-1,0.218,-0.101
1,EarlyStop-3,-0.002,-0.053
2,kNN,0.245,0.077
3,Logistic,0.465,0.269
4,MSC,0.227,0.313
5,LFC,-0.488,-0.319
6,HScore,0.864,0.753
7,HScoreR,0.388,0.371
8,LogME,0.754,0.661
9,GBC,0.291,0.417
