In [1]:
import pandas as pd
import numpy as np
import warnings

## Get dataframe containing all the experiments

In [2]:
# NOTE: this silences *every* warning for the rest of the notebook, including
# pandas diagnostics such as SettingWithCopyWarning.
warnings.filterwarnings("ignore")

# Cached table of all experiment results.
RESULTS_CSV = "results_dataframe.csv"

try:
    df = pd.read_csv(RESULTS_CSV, index_col=False)
except FileNotFoundError:
    # No CSV yet: build it from the records returned by the project's
    # `results_dataframe` helpers, then cache it on disk for later runs.
    from results_dataframe import scan_multirun, scan_outputs

    records = scan_multirun() + scan_outputs()
    pd.DataFrame(records).sort_values(
        ["dataset", "label_column", "encoder", "nways", "kshots", "readout", "timestamp"]
    ).to_csv(RESULTS_CSV, index=False)
    # Re-read from disk so `df` has the same dtypes whether or not the cache
    # already existed.
    df = pd.read_csv(RESULTS_CSV, index_col=False)

print(df)

100%|██████████| 577/577 [00:02<00:00, 223.58it/s]
100%|██████████| 11/11 [00:00<00:00, 264.82it/s]

      dataset      encoder readout  nways  kshots label_column  f1_micro  \
0    conll-de  bert-german      LR      5       1     ner_tags     29.42   
1    conll-de  bert-german      LR      5       5     ner_tags     65.98   
2    conll-de  bert-german      LR      5      10     ner_tags     71.43   
3    conll-de     gottbert      LR      5       1     ner_tags     26.27   
4    conll-de     gottbert      LR      5       5     ner_tags     58.37   
..        ...          ...     ...    ...     ...          ...       ...   
492    wnut17     spanbert      LR      5       5     ner_tags     42.29   
493    wnut17     spanbert      LR      5      10     ner_tags     48.84   
494    wnut17        xlnet      LR      5       1     ner_tags     24.36   
495    wnut17        xlnet      LR      5       5     ner_tags     42.26   
496    wnut17        xlnet      LR      5      10     ner_tags     49.74   

     f1_micro_pm            timestamp  num_epochs  lr  weight_decay  
0           2.26 




## Reproduce Table 2: English encoders

In [3]:
# Datasets and encoders, listed in the order in which they should appear.
dataset_list = [
    "conll2003",
    "ontonotes",
    "fewnerd",
    "wnut17",
    "wikiann",
    "wikigold",
    "lenovo",
]
encoder_list = [
    "random",
    "bert",
    "bert-cased",
    "albert",
    "roberta",
    "spanbert",
    "xlnet",
]

# Keep only the rows with nways == 5 and readout == "LR" for the listed
# datasets/encoders. `.copy()` makes `new_df` an independent frame, so the
# column assignments below do not write into a slice of `df`.
new_df = df.loc[
    (df["dataset"].isin(dataset_list))
    & (df["encoder"].isin(encoder_list))
    & (df["nways"] == 5)
    & (df["readout"] == "LR")
].copy()
# customize string ordering for some columns: as Categoricals they sort in
# list order instead of alphabetically
new_df["dataset"] = pd.Categorical(new_df["dataset"], dataset_list)
new_df["encoder"] = pd.Categorical(new_df["encoder"], encoder_list)
# "timestamp" is the last sort key, so keep="last" retains the row with the
# greatest timestamp for each (dataset, label_column, kshots, encoder).
new_df = new_df.sort_values(
    ["dataset", "label_column", "kshots", "encoder", "timestamp"]
).drop_duplicates(["dataset", "label_column", "kshots", "encoder"], keep="last")

print(
    new_df[
        [
            "dataset",
            "label_column",
            "kshots",
            "encoder",
            "f1_micro",
            "f1_micro_pm",
            "timestamp",
        ]
    ]
)

       dataset  label_column  kshots     encoder  f1_micro  f1_micro_pm  \
116  conll2003      ner_tags       1      random      9.52         1.13   
25   conll2003      ner_tags       1        bert     21.96         1.78   
35   conll2003      ner_tags       1  bert-cased     22.04         1.77   
12   conll2003      ner_tags       1      albert     33.03         1.90   
120  conll2003      ner_tags       1     roberta     21.71         1.78   
..         ...           ...     ...         ...       ...          ...   
254     lenovo  ner_bio_tags      10  bert-cased     67.09         1.51   
248     lenovo  ner_bio_tags      10      albert     66.61         1.46   
272     lenovo  ner_bio_tags      10     roberta     70.16         1.48   
275     lenovo  ner_bio_tags      10    spanbert     54.80         1.74   
278     lenovo  ner_bio_tags      10       xlnet     63.79         1.68   

               timestamp  
116  2021-09-21 21:16:26  
25   2021-09-20 16:22:11  
35   2021-09-20 14

In [4]:
# Print, per dataset, the F1 scores as a grid with one column per encoder so
# the numbers are easy to check. Rows follow the current row order of new_df.
n_encoders = len(encoder_list)
for dataset in dataset_list:
    scores = new_df.loc[new_df["dataset"] == dataset, "f1_micro"].to_numpy()
    grid = pd.DataFrame(scores.reshape((-1, n_encoders)), columns=encoder_list)
    print(dataset)
    print(grid.to_string(index=False))

conll2003
 random  bert  bert-cased  albert  roberta  spanbert  xlnet
   9.52 21.96       22.04   33.03    21.71     18.39  18.49
  12.53 60.94       62.17   68.33    64.49     43.22  44.82
  13.71 66.11       68.79   72.76    72.09     49.79  52.43
ontonotes
 random  bert  bert-cased  albert  roberta  spanbert  xlnet
  18.66 42.71       45.09   50.45    42.74     34.30  38.40
  19.73 74.68       77.70   77.66    78.70     65.64  72.60
  18.88 80.92       82.70   82.10    83.80     74.14  78.38
fewnerd
 random  bert  bert-cased  albert  roberta  spanbert  xlnet
  21.14 49.74       48.50   54.27    51.27     39.13  47.02
  21.00 80.12       79.26   78.08    81.70     71.93  82.73
  20.62 84.07       83.21   81.17    84.95     78.39  85.73
  12.12 25.99       28.52   35.67    28.12     23.34  25.93
  15.59 53.85       56.04   59.14    58.66     45.50  52.32
  16.04 59.44       63.20   63.30    65.52     52.65  61.94
wnut17
 random  bert  bert-cased  albert  roberta  spanbert  xlnet
  18.

## Reproduce Table 3: German encoders

In [5]:
# German datasets and encoders, listed in the order in which they should appear.
german_dataset_list = ["conll-de", "germeval", "smartdata"]
encoder_list = ["random", "bert-german", "gottbert", "xlm"]

# Keep only rows with nways == 5 and readout == "LR" for the listed datasets
# and encoders. Filtering on the encoder matters: the Categorical cast below
# would otherwise turn any encoder missing from `encoder_list` into NaN.
# `.copy()` makes `new_df` an independent frame instead of a slice of `df`.
new_df = df.loc[
    (df["dataset"].isin(german_dataset_list))
    & (df["encoder"].isin(encoder_list))
    & (df["nways"] == 5)
    & (df["readout"] == "LR")
].copy()
# Categoricals sort in list order instead of alphabetically.
new_df["dataset"] = pd.Categorical(new_df["dataset"], german_dataset_list)
new_df["encoder"] = pd.Categorical(new_df["encoder"], encoder_list)
# "timestamp" is the last sort key, so keep="last" retains the row with the
# greatest timestamp when a configuration appears more than once. This mirrors
# the de-duplication used for the English-encoder table.
new_df = new_df.sort_values(
    ["dataset", "kshots", "encoder", "timestamp"]
).drop_duplicates(["dataset", "label_column", "kshots", "encoder"], keep="last")
print(new_df[["dataset", "kshots", "encoder", "f1_micro", "f1_micro_pm"]])

       dataset  kshots      encoder  f1_micro  f1_micro_pm
6     conll-de       1       random     12.53         1.65
0     conll-de       1  bert-german     29.42         2.26
3     conll-de       1     gottbert     26.27         2.11
9     conll-de       1          xlm     30.65         2.21
7     conll-de       5       random     15.38         1.30
1     conll-de       5  bert-german     65.98         1.81
4     conll-de       5     gottbert     58.37         1.76
10    conll-de       5          xlm     65.22         1.74
8     conll-de      10       random     16.00         1.29
2     conll-de      10  bert-german     71.43         1.59
5     conll-de      10     gottbert     64.77         1.65
11    conll-de      10          xlm     71.18         1.59
240   germeval       1       random     17.52         1.45
234   germeval       1  bert-german     25.89         1.74
237   germeval       1     gottbert     24.08         1.67
243   germeval       1          xlm     27.24         1.

In [6]:
# For every German dataset, arrange the F1 scores into rows of
# len(encoder_list) values (one column per encoder) and print the grid.
for dataset in german_dataset_list:
    is_current = new_df["dataset"] == dataset
    f1_values = new_df.loc[is_current, "f1_micro"].to_numpy()
    f1_grid = f1_values.reshape((-1, len(encoder_list)))
    print(dataset)
    print(pd.DataFrame(f1_grid, columns=encoder_list).to_string(index=False))

conll-de
 random  bert-german  gottbert   xlm
  12.53        29.42     26.27 30.65
  15.38        65.98     58.37 65.22
  16.00        71.43     64.77 71.18
germeval
 random  bert-german  gottbert   xlm
  17.52        25.89     24.08 27.24
  20.70        61.79     54.06 58.51
  18.33        71.18     60.30 65.37
smartdata
 random  bert-german  gottbert   xlm
  26.12        52.12     49.96 53.17
  23.52        82.50     79.30 80.89
  21.55        86.01     83.10 85.66


## Reproduce Table 4(a): non-NER fine-tuned encoders

In [7]:
# Encoders compared in this table, in display order.
encoder_list = ["bert", "bert-pos", "bert-mnli", "bert-squad"]

# Keep only rows with nways == 5 and readout == "LR" for the listed encoders
# and the datasets in `dataset_list` (defined in an earlier cell). Filtering on
# the dataset matters: the Categorical cast below would otherwise turn any
# dataset missing from `dataset_list` into NaN.
# `.copy()` makes `new_df` an independent frame instead of a slice of `df`.
new_df = df.loc[
    (df["dataset"].isin(dataset_list))
    & (df["encoder"].isin(encoder_list))
    & (df["nways"] == 5)
    & (df["readout"] == "LR")
].copy()
# Categoricals sort in list order instead of alphabetically.
new_df["dataset"] = pd.Categorical(new_df["dataset"], dataset_list)
new_df["encoder"] = pd.Categorical(new_df["encoder"], encoder_list)
# "timestamp" is the last sort key, so keep="last" retains the row with the
# greatest timestamp for each (dataset, label_column, kshots, encoder).
new_df = new_df.sort_values(
    ["dataset", "label_column", "kshots", "encoder", "timestamp"]
).drop_duplicates(["dataset", "label_column", "kshots", "encoder"], keep="last")
print(
    new_df[["dataset", "label_column", "kshots", "encoder", "f1_micro", "f1_micro_pm"]]
)

       dataset  label_column  kshots     encoder  f1_micro  f1_micro_pm
25   conll2003      ner_tags       1        bert     21.96         1.78
46   conll2003      ner_tags       1    bert-pos     43.01         1.68
42   conll2003      ner_tags       1   bert-mnli     22.29         1.78
49   conll2003      ner_tags       1  bert-squad     35.05         1.96
28   conll2003      ner_tags       5        bert     60.94         1.81
..         ...           ...     ...         ...       ...          ...
265     lenovo  ner_bio_tags       5  bert-squad     61.01         1.67
251     lenovo  ner_bio_tags      10        bert     67.45         1.54
263     lenovo  ner_bio_tags      10    bert-pos     60.61         1.51
260     lenovo  ner_bio_tags      10   bert-mnli     66.23         1.59
266     lenovo  ner_bio_tags      10  bert-squad     61.95         1.72

[96 rows x 6 columns]


In [9]:
# Show the F1 scores of each dataset as a matrix whose columns are the
# encoders, which makes checking the numbers quicker.
column_count = len(encoder_list)
for dataset in dataset_list:
    dataset_rows = new_df.loc[new_df["dataset"] == dataset]
    f1_matrix = dataset_rows["f1_micro"].to_numpy().reshape((-1, column_count))
    as_text = pd.DataFrame(f1_matrix, columns=encoder_list).to_string(index=False)
    print(dataset)
    print(as_text)

conll2003
 bert  bert-pos  bert-mnli  bert-squad
21.96     43.01      22.29       35.05
60.94     65.72      61.34       65.94
66.11     68.46      64.71       68.50
ontonotes
 bert  bert-pos  bert-mnli  bert-squad
42.71     50.85      42.99       47.83
74.68     66.17      75.29       76.37
80.92     68.02      80.94       79.68
fewnerd
 bert  bert-pos  bert-mnli  bert-squad
49.74     43.97      46.71       51.17
80.12     63.08      77.14       78.58
84.07     66.43      81.26       81.58
25.99     34.70      26.08       35.07
53.85     49.88      52.52       59.77
59.44     52.78      58.17       63.09
wnut17
 bert  bert-pos  bert-mnli  bert-squad
25.71     32.04      25.12       29.04
51.56     44.90      48.50       51.05
58.77     49.11      56.30       54.58
wikiann
 bert  bert-pos  bert-mnli  bert-squad
24.53     32.92      23.35       33.33
48.33     43.54      46.94       55.93
54.84     45.70      53.47       63.37
wikigold
 bert  bert-pos  bert-mnli  bert-squad
18.40     37

## Reproduce Table 4(b): NER fine-tuned encoders

In [10]:
# Datasets in the order used for this table, and the two encoders compared.
reordered_dataset_list = [
    "conll2003",
    "wikigold",
    "wikiann",
    "fewnerd",
    "wnut17",
    "ontonotes",
    "lenovo",
]
encoder_list = ["bert", "bert-conll"]

# Keep only rows with nways == 5 and readout == "LR" for the listed encoders
# and datasets. Filtering on the dataset matters: the Categorical cast below
# would otherwise turn any dataset missing from the list into NaN.
# `.copy()` makes `new_df` an independent frame instead of a slice of `df`.
new_df = df.loc[
    (df["dataset"].isin(reordered_dataset_list))
    & (df["encoder"].isin(encoder_list))
    & (df["nways"] == 5)
    & (df["readout"] == "LR")
].copy()
# Categoricals sort in list order instead of alphabetically.
new_df["dataset"] = pd.Categorical(new_df["dataset"], reordered_dataset_list)
new_df["encoder"] = pd.Categorical(new_df["encoder"], encoder_list)
# "timestamp" is the last sort key, so keep="last" retains the row with the
# greatest timestamp when a configuration appears more than once. This mirrors
# the de-duplication used for the other tables.
new_df = new_df.sort_values(
    ["dataset", "label_column", "kshots", "encoder", "timestamp"]
).drop_duplicates(["dataset", "label_column", "kshots", "encoder"], keep="last")
print(
    new_df[["dataset", "label_column", "kshots", "encoder", "f1_micro", "f1_micro_pm"]]
)

       dataset   label_column  kshots     encoder  f1_micro  f1_micro_pm
25   conll2003       ner_tags       1        bert     21.96         1.78
38   conll2003       ner_tags       1  bert-conll     90.46         1.09
28   conll2003       ner_tags       5        bert     60.94         1.81
39   conll2003       ner_tags       5  bert-conll     94.73         0.75
31   conll2003       ner_tags      10        bert     66.11         1.61
40   conll2003       ner_tags      10  bert-conll     94.40         0.81
402   wikigold       ner_tags       1        bert     18.40         1.60
408   wikigold       ner_tags       1  bert-conll     68.83         1.62
403   wikigold       ner_tags       5        bert     49.19         1.53
409   wikigold       ner_tags       5  bert-conll     81.40         1.24
404   wikigold       ner_tags      10        bert     55.85         1.51
410   wikigold       ner_tags      10  bert-conll     84.68         1.12
369    wikiann       ner_tags       1        bert  

In [11]:
# One printed grid per dataset: F1 scores wrapped into rows of
# len(encoder_list) values, with the encoder names as column headers.
for dataset in reordered_dataset_list:
    f1_column = new_df.loc[new_df["dataset"] == dataset, "f1_micro"]
    wrapped = f1_column.to_numpy().reshape((-1, len(encoder_list)))
    table_text = pd.DataFrame(wrapped, columns=encoder_list).to_string(index=False)
    print(dataset)
    print(table_text)

conll2003
 bert  bert-conll
21.96       90.46
60.94       94.73
66.11       94.40
wikigold
 bert  bert-conll
18.40       68.83
49.19       81.40
55.85       84.68
wikiann
 bert  bert-conll
24.53       55.15
48.33       67.22
54.84       71.34
fewnerd
 bert  bert-conll
49.74       59.36
80.12       79.70
84.07       82.00
25.99       53.25
53.85       70.04
59.44       72.66
wnut17
 bert  bert-conll
25.71       44.96
51.56       63.99
58.77       69.76
ontonotes
 bert  bert-conll
42.71       58.99
74.68       76.21
80.92       77.75
lenovo
 bert  bert-conll
37.39       49.22
63.19       65.40
67.45       66.13


## Reproduce Table 6: readout methods

In [28]:
# Keep the "albert" encoder with nways == 5 on conll2003 and ontonotes, for all
# readout methods. `.copy()` makes `new_df` an independent frame instead of a
# slice of `df`, so the column assignment below is safe.
new_df = df.loc[
    (df["dataset"].isin(["conll2003", "ontonotes"]))
    & (df["encoder"] == "albert")
    & (df["nways"] == 5)
].copy()
# Categorical over `dataset_list` (defined in an earlier cell) so datasets sort
# in list order instead of alphabetically.
new_df["dataset"] = pd.Categorical(new_df["dataset"], dataset_list)
# "timestamp" is the last sort key, so keep="last" retains the row with the
# greatest timestamp for each (dataset, kshots, readout). The score itself is
# deliberately not part of the key: with "f1_micro" in it, repeated runs that
# scored differently would all survive and break the 3-column reshape that
# follows.
new_df = new_df.sort_values(
    ["dataset", "kshots", "readout", "timestamp"]
).drop_duplicates(["dataset", "kshots", "readout"], keep="last")
print(
    new_df[
        ["dataset", "readout", "kshots", "f1_micro", "f1_micro_pm", "timestamp"]
    ].to_string(index=False)
)

  dataset readout  kshots  f1_micro  f1_micro_pm           timestamp
conll2003      LR       1     33.03         1.90 2021-09-20 16:48:07
conll2003      NC       1     35.21         1.87 2021-09-20 17:20:17
conll2003      NN       1     40.76         1.72 2021-09-20 17:20:17
conll2003      LR       5     68.33         1.57 2021-09-20 16:48:07
conll2003      NC       5     61.53         1.65 2021-09-20 17:20:17
conll2003      NN       5     62.24         1.60 2021-09-20 17:20:17
conll2003      LR      10     72.76         1.57 2021-09-20 16:48:07
conll2003      NC      10     62.65         1.65 2021-09-20 17:20:17
conll2003      NN      10     67.79         1.62 2021-09-20 17:20:17
ontonotes      LR       1     50.45         1.74 2021-10-11 14:25:22
ontonotes      NC       1     51.52         1.71 2021-10-11 14:25:22
ontonotes      NN       1     52.72         1.58 2021-10-11 14:25:22
ontonotes      LR       5     77.66         1.32 2021-10-11 14:25:22
ontonotes      NC       5     72.4

In [29]:
# Print the ALBERT F1 scores of each dataset as a grid with three values per
# row, labelled LR / NC / NN, for quicker number checking.
readout_columns = ["LR", "NC", "NN"]
for dataset in ["conll2003", "ontonotes"]:
    f1_scores = new_df.loc[new_df["dataset"] == dataset, "f1_micro"].to_numpy()
    f1_grid = pd.DataFrame(f1_scores.reshape((-1, 3)), columns=readout_columns)
    print("ALBERT on", dataset)
    print(f1_grid.to_string(index=False))

ALBERT on conll2003
   LR    NC    NN
33.03 35.21 40.76
68.33 61.53 62.24
72.76 62.65 67.79
ALBERT on ontonotes
   LR    NC    NN
50.45 51.52 52.72
77.66 72.46 71.04
82.10 73.49 76.11


## Reproduce Table 5: contrastive learning (CL)

In [36]:
# Datasets and contrastive encoders, listed in the order in which they should
# appear.
dataset_list_CL = ["conll2003", "ontonotes", "fewnerd", "wikigold"]
encoder_list_CL = [
    "contrastive_bert",
    "contrastive_albert",
    "contrastive_roberta",
    "contrastive_spanbert",
    "contrastive_xlnet",
]

# Keep the listed datasets/encoders for one training configuration
# (num_epochs == 1, lr == 0.00005, weight_decay == 0). `.copy()` makes `new_df`
# an independent frame instead of a slice of `df`.
new_df = df.loc[
    (df["dataset"].isin(dataset_list_CL))
    & (df["encoder"].isin(encoder_list_CL))
    & (df["num_epochs"] == 1)
    & (df["lr"] == 0.00005)
    & (df["weight_decay"] == 0)
].copy()
# Sort in list order rather than alphabetically. As plain strings,
# "contrastive_albert" sorts before "contrastive_bert", which does not match
# the column order of `encoder_list_CL` used when the scores are reshaped.
new_df["dataset"] = pd.Categorical(new_df["dataset"], dataset_list_CL)
new_df["encoder"] = pd.Categorical(new_df["encoder"], encoder_list_CL)
# "timestamp" is the last sort key, so keep="last" retains the row with the
# greatest timestamp for each (dataset, encoder, kshots).
new_df = new_df.sort_values(
    ["dataset", "encoder", "kshots", "timestamp"]
).drop_duplicates(["dataset", "encoder", "kshots"], keep="last")
print(
    new_df[
        ["dataset", "encoder", "kshots", "f1_micro", "f1_micro_pm", "timestamp"]
    ].to_string(index=False)
)

  dataset              encoder  kshots  f1_micro  f1_micro_pm           timestamp
conll2003     contrastive_bert       1     23.87         1.86 2021-10-17 15:10:49
conll2003     contrastive_bert       5     60.55         1.81 2021-10-17 15:10:49
conll2003     contrastive_bert      10     65.03         1.72 2021-10-17 15:10:49
conll2003   contrastive_albert       1     36.71         2.06 2021-10-17 16:02:46
conll2003   contrastive_albert       5     66.85         1.63 2021-10-17 16:02:46
conll2003   contrastive_albert      10     70.66         1.55 2021-10-17 16:02:46
conll2003  contrastive_roberta       1     22.57         1.83 2021-10-17 16:57:45
conll2003  contrastive_roberta       5     62.45         1.65 2021-10-17 16:57:45
conll2003  contrastive_roberta      10     70.17         1.58 2021-10-17 16:57:45
conll2003 contrastive_spanbert       1     17.61         1.62 2021-10-17 14:20:13
conll2003 contrastive_spanbert       5     44.23         1.85 2021-10-17 14:20:13
conll2003 contra

In [37]:
# reshape the F1-scores in a matrix for more efficient number checking
for dataset in dataset_list_CL:
    # .copy(): work on an independent frame rather than a slice of new_df.
    per_dataset = new_df.loc[new_df["dataset"] == dataset].copy()
    # Order encoders as in encoder_list_CL so each value lands under its own
    # column label. A plain string sort would put "contrastive_albert" before
    # "contrastive_bert" and swap those two columns.
    per_dataset["encoder"] = pd.Categorical(per_dataset["encoder"], encoder_list_CL)
    per_dataset = per_dataset.sort_values(["kshots", "encoder", "timestamp"])
    # After the sort, each consecutive run of len(encoder_list_CL) values is one
    # kshots setting, with encoders in list order.
    scores = per_dataset["f1_micro"].to_numpy().reshape((-1, len(encoder_list_CL)))
    print(dataset)
    print(pd.DataFrame(scores, columns=encoder_list_CL).to_string(index=False))

conll2003
 contrastive_bert  contrastive_albert  contrastive_roberta  contrastive_spanbert  contrastive_xlnet
            23.87               36.71                22.57                 17.61              18.25
            60.55               66.85                62.45                 44.23              45.93
            65.03               70.66                70.17                 49.82              49.25
ontonotes
 contrastive_bert  contrastive_albert  contrastive_roberta  contrastive_spanbert  contrastive_xlnet
            42.89               51.38                41.66                 32.95              38.64
            74.02               76.65                75.29                 64.29              70.66
            80.36               81.47                82.51                 74.72              75.99
fewnerd
 contrastive_bert  contrastive_albert  contrastive_roberta  contrastive_spanbert  contrastive_xlnet
            27.42               38.16                29.10              