In [1]:
import pandas as pd
from tqdm import tqdm
import warnings

from result_dataframe import scan_multirun, scan_outputs

## Get dataframe containing all the experiments

In [2]:
# Suppress every warning category for the remainder of the kernel session;
# this filter is global, so it also applies to all later cells.
warnings.filterwarnings("ignore")

# Gather the records returned by both scanners (multirun first, then outputs)
# and turn them into a single frame, one row per record.
records = scan_multirun() + scan_outputs()

# Order the rows by experiment configuration so related runs sit together.
sort_columns = ["dataset", "label_column", "encoder", "nways", "kshots", "readout"]
df = pd.DataFrame(records).sort_values(sort_columns)
print(df)

100%|██████████| 389/389 [00:01<00:00, 287.39it/s]
100%|██████████| 11/11 [00:00<00:00, 370.67it/s]

      dataset      encoder readout  nways  kshots label_column f1_micro  \
139  conll-de  bert-german      LR      5       1     ner_tags    27.90   
136  conll-de  bert-german      LR      5       5     ner_tags    66.21   
126  conll-de  bert-german      LR      5      10     ner_tags    72.52   
220  conll-de     gottbert      LR      5       1     ner_tags    26.27   
219  conll-de     gottbert      LR      5       5     ner_tags    58.37   
..        ...          ...     ...    ...     ...          ...      ...   
260    wnut17     spanbert      LR      5       5     ner_tags    42.29   
255    wnut17     spanbert      LR      5      10     ner_tags    48.84   
178    wnut17        xlnet      LR      5       1     ner_tags    24.36   
182    wnut17        xlnet      LR      5       5     ner_tags    42.26   
177    wnut17        xlnet      LR      5      10     ner_tags    49.74   

    f1_micro_pm            timestamp  
139        2.19  2021-09-20/18-13-16  
136        1.77  2021




## Reproduce Table 2

In [3]:
# Table 2: rows of `df` restricted to the listed datasets and encoders, with
# nways == 5 and the "LR" readout.
table2_datasets = [
    "conll2003",
    "ontonotes",
    "fewnerd",
    "lenovo",
    "wikiann",
    "wnut17",
    "wikigold",
]
table2_encoders = ["random", "bert", "albert", "roberta", "spanbert", "xlnet"]

# Columns that identify one table row; used both for ordering and de-duplication.
key_columns = ["dataset", "label_column", "kshots", "encoder"]

# Build the table as one chain of copy-returning calls. Using inplace=True on
# the frame returned by df.loc[...] mutates a derived object and raises
# SettingWithCopyWarning on pandas < 3 (previously hidden only by the global
# warnings filter); the chained form yields the same rows in the same order.
new_df = (
    df.loc[
        df["dataset"].isin(table2_datasets)
        & df["encoder"].isin(table2_encoders)
        & (df["nways"] == 5)
        & (df["readout"] == "LR")
    ]
    .sort_values(key_columns)
    # When several rows share the same key, keep only the last one after sorting.
    .drop_duplicates(key_columns, keep="last")
)
print(new_df[key_columns + ["f1_micro", "f1_micro_pm", "timestamp"]])

       dataset label_column  kshots   encoder f1_micro f1_micro_pm  \
211  conll2003     ner_tags       1    albert    33.03        1.90   
267  conll2003     ner_tags       1      bert    21.96        1.78   
288  conll2003     ner_tags       1    random     9.52        1.13   
151  conll2003     ner_tags       1   roberta    21.71        1.78   
172  conll2003     ner_tags       1  spanbert    18.39        1.57   
..         ...          ...     ...       ...      ...         ...   
205     wnut17     ner_tags      10      bert    58.77        1.87   
202     wnut17     ner_tags      10    random    18.52        1.34   
234     wnut17     ner_tags      10   roberta    63.93        1.86   
255     wnut17     ner_tags      10  spanbert    48.84        2.04   
177     wnut17     ner_tags      10     xlnet    49.74        2.04   

               timestamp  
211  2021-09-20/16-48-07  
267  2021-09-20/16-22-11  
288  2021-09-21/21-16-26  
151  2021-09-20/16-55-39  
172  2021-09-20/16-38-31

## Reproduce Table 3

In [4]:
# Table 3: rows of `df` for the three listed encoders, with nways == 5 and the
# "LR" readout, ordered by dataset, then kshots, then encoder.
#
# The sort is chained instead of applied with inplace=True: sorting the frame
# returned by df.loc[...] in place raises SettingWithCopyWarning on pandas < 3.
# NOTE(review): unlike the Table 2 cell, no drop_duplicates step is applied
# here — confirm that repeated runs cannot occur for these encoders.
new_df = df.loc[
    df["encoder"].isin(["bert-german", "gottbert", "xlm"])
    & (df["nways"] == 5)
    & (df["readout"] == "LR")
].sort_values(["dataset", "kshots", "encoder"])
print(new_df[["dataset", "kshots", "encoder", "f1_micro", "f1_micro_pm"]])

       dataset  kshots      encoder f1_micro f1_micro_pm
139   conll-de       1  bert-german    27.90        2.19
220   conll-de       1     gottbert    26.27        2.11
14    conll-de       1          xlm    30.65        2.21
136   conll-de       5  bert-german    66.21        1.77
219   conll-de       5     gottbert    58.37        1.76
13    conll-de       5          xlm    65.22        1.74
126   conll-de      10  bert-german    72.52        1.60
213   conll-de      10     gottbert    64.77        1.65
7     conll-de      10          xlm    71.18        1.59
142   germeval       1  bert-german    24.76        1.72
215   germeval       1     gottbert    24.08        1.67
9     germeval       1          xlm    27.24        1.76
130   germeval       5  bert-german    59.58        1.69
218   germeval       5     gottbert    54.06        1.62
12    germeval       5          xlm    58.51        1.69
140   germeval      10  bert-german    68.26        1.54
214   germeval      10     gott

## Reproduce Table 4

In [5]:
# Table 4: rows of `df` for "bert" and its three listed variants, with
# nways == 5 and the "LR" readout.

# Columns that identify one table row; used both for ordering and de-duplication.
key_columns = ["dataset", "label_column", "kshots", "encoder"]

# Filter, sort and de-duplicate in one copy-returning chain. The previous
# inplace=True calls operated on the frame returned by df.loc[...], which
# raises SettingWithCopyWarning on pandas < 3; the result here is identical.
new_df = (
    df.loc[
        df["encoder"].isin(["bert", "bert-pos", "bert-mnli", "bert-squad"])
        & (df["nways"] == 5)
        & (df["readout"] == "LR")
    ]
    .sort_values(key_columns)
    # When several rows share the same key, keep only the last one after sorting.
    .drop_duplicates(key_columns, keep="last")
)
print(new_df[key_columns + ["f1_micro", "f1_micro_pm"]])

       dataset label_column  kshots     encoder f1_micro f1_micro_pm
267  conll2003     ner_tags       1        bert    21.96        1.78
226  conll2003     ner_tags       1   bert-mnli    22.29        1.78
115  conll2003     ner_tags       1    bert-pos    43.01        1.68
31   conll2003     ner_tags       1  bert-squad    35.05        1.96
266  conll2003     ner_tags       5        bert    60.94        1.81
..         ...          ...     ...         ...      ...         ...
32      wnut17     ner_tags       5  bert-squad    51.05        1.81
205     wnut17     ner_tags      10        bert    58.77        1.87
87      wnut17     ner_tags      10   bert-mnli    56.30        1.92
88      wnut17     ner_tags      10    bert-pos    49.11        1.87
15      wnut17     ner_tags      10  bert-squad    54.58        1.93

[96 rows x 6 columns]


## Reproduce Table 5

In [6]:
# Table 5: rows of `df` for the "bert" and "bert-conll" encoders, with
# nways == 5 and the "LR" readout.

# Columns used for ordering; they are also the leading columns of the printout.
key_columns = ["dataset", "label_column", "kshots", "encoder"]

# sort_values is chained rather than called with inplace=True: sorting the
# frame returned by df.loc[...] in place raises SettingWithCopyWarning on
# pandas < 3.
# NOTE(review): no drop_duplicates step is applied in this cell — confirm that
# each (dataset, label_column, kshots, encoder) combination occurs only once.
new_df = df.loc[
    df["encoder"].isin(["bert", "bert-conll"])
    & (df["nways"] == 5)
    & (df["readout"] == "LR")
].sort_values(key_columns)
print(new_df[key_columns + ["f1_micro", "f1_micro_pm"]])

       dataset   label_column  kshots     encoder f1_micro f1_micro_pm
267  conll2003       ner_tags       1        bert    21.96        1.78
61   conll2003       ner_tags       1  bert-conll    90.46        1.09
266  conll2003       ner_tags       5        bert    60.94        1.81
59   conll2003       ner_tags       5  bert-conll    94.73        0.75
263  conll2003       ner_tags      10        bert    66.11        1.61
47   conll2003       ner_tags      10  bert-conll    94.40        0.81
305    fewnerd  fine_ner_tags       1        bert    49.74        1.64
65     fewnerd  fine_ner_tags       1  bert-conll    59.36        1.61
304    fewnerd  fine_ner_tags       5        bert    80.12        1.30
64     fewnerd  fine_ner_tags       5  bert-conll    79.70        1.28
301    fewnerd  fine_ner_tags      10        bert    84.07        1.18
63     fewnerd  fine_ner_tags      10  bert-conll    82.00        1.18
293    fewnerd       ner_tags       1        bert    25.99        1.55
56    

## Reproduce Table 6

In [7]:
# Table 6: rows of `df` for the "albert" encoder on conll2003 and ontonotes
# with nways == 5. The readout is not filtered here, so every readout present
# in `df` for these rows is listed.

# Columns that identify one table row; used both for ordering and de-duplication.
# NOTE(review): label_column is not part of this key — confirm these two
# datasets only appear with a single label column.
key_columns = ["dataset", "kshots", "readout"]

# Filter, sort and de-duplicate in one copy-returning chain. The previous
# inplace=True calls operated on the frame returned by df.loc[...], which
# raises SettingWithCopyWarning on pandas < 3; the result here is identical.
new_df = (
    df.loc[
        df["dataset"].isin(["conll2003", "ontonotes"])
        & (df["encoder"] == "albert")
        & (df["nways"] == 5)
    ]
    .sort_values(key_columns)
    # When several rows share the same key, keep only the last one after sorting.
    .drop_duplicates(key_columns, keep="last")
)

# Print as plain text without the index column.
report_columns = ["dataset", "readout", "kshots", "f1_micro", "f1_micro_pm", "timestamp"]
print(new_df[report_columns].to_string(index=False))

  dataset readout  kshots f1_micro f1_micro_pm           timestamp
conll2003      LR       1    33.03        1.90 2021-09-20/16-48-07
conll2003      NC       1    35.21        1.87 2021-09-20/17-20-17
conll2003      NN       1    40.76        1.72 2021-09-20/17-20-17
conll2003      LR       5    68.33        1.57 2021-09-20/16-48-07
conll2003      NC       5    61.53        1.65 2021-09-20/17-20-17
conll2003      NN       5    62.24        1.60 2021-09-20/17-20-17
conll2003      LR      10    72.76        1.57 2021-09-20/16-48-07
conll2003      NC      10    62.65        1.65 2021-09-20/17-20-17
conll2003      NN      10    67.79        1.62 2021-09-20/17-20-17
ontonotes      LR       1    50.45        1.74 2021-09-20/23-13-34
ontonotes      NC       1    51.52        1.71 2021-10-11/14-25-22
ontonotes      NN       1    52.72        1.58 2021-10-11/14-25-22
ontonotes      LR       5    77.66        1.32 2021-09-20/23-13-34
ontonotes      NC       5    72.46        1.35 2021-10-11/14-2