In [1]:
import json
from pathlib import Path
import seaborn as sns

In [2]:
# Read the corpus statistics JSON file (UTF-8) and parse it into ``data``.
stats_path = Path("../data/preprocessed/amr30-es_nl-fixed_no_processing_stratified/corpus_statistics.json")
with stats_path.open(encoding="utf-8") as stats_file:
    data = json.load(stats_file)

In [3]:
import pandas as pd

# NOTE: ``pop`` removes the "dataset" entry from ``data``. The later loop over
# ``data.items()`` treats every remaining key as a tokenizer name, so the entry must
# be removed here. As a consequence this cell cannot be re-run without reloading ``data``.
dataset_info = data.pop("dataset")
print("DATASET INFO\n============")

# One record per sample, accumulated across *all* splits. (Previously the accumulator
# was reassigned inside the loop, so only the last split survived.)
length_records = []
for split_type, split_data in dataset_info.items():
    print(f"*** {split_type.upper()} ***")
    print(f"No. samples: {split_data['num_samples']:,}")
    print(f"Longest sequence of white-spaced tokens: {split_data['max_num_ws_tokens']:,}")
    length_records.extend(
        {"split_type": split_type, "num_ws_tokens": num_ws_tokens}
        for num_ws_tokens in split_data["num_ws_tokens"]
    )

# Build the frame once, after the loop; explicit columns keep the schema stable even
# when there are no records at all.
dataset_lengths = pd.DataFrame(length_records, columns=["split_type", "num_ws_tokens"])
# Optional plot of the length distribution per split:
# sns.displot(dataset_lengths, x="num_ws_tokens", hue="split_type")

DATASET INFO
*** TEST ***
No. samples: 1,898
Longest sequence of white-spaced tokens: 175
*** TRAIN ***
No. samples: 55,635
Longest sequence of white-spaced tokens: 246
*** VALIDATION ***
No. samples: 1,722
Longest sequence of white-spaced tokens: 123


In [4]:
# Flatten the nested ``{tokenizer: {"max_length": ..., split: {stats}}}`` mapping into
# one record per (tokenizer, split) pair.
df_data = []
for tokenizer_name, splitsd in data.items():
    # Read ``max_length`` without removing it: popping it mutated ``data`` and made
    # this cell raise KeyError when executed a second time.
    max_length = splitsd["max_length"]
    for split_type, stats in splitsd.items():
        if split_type == "max_length":
            # Not a split; it is already carried on every record below.
            continue
        df_data.append({
            "tokenizer": tokenizer_name,
            "max_length": max_length,
            "split_type": split_type,
            **stats
        })
df = pd.DataFrame(df_data)

In [5]:
# Display the assembled per-tokenizer, per-split statistics table.
df


Unnamed: 0,tokenizer,max_length,split_type,max_subwordtok_len_sents,subwordtok_lens_sents,num_sent_gt_maxlength,max_subwordtok_len_labels,subwordtok_lens_labels,num_lbl_gt_maxlength
0,bigscience/bloomz-560m,2048,test,393,"[1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, ...",0,835,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 8, 8, 9, ...",0
1,bigscience/bloomz-560m,2048,train,494,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,824,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...",0
2,bigscience/bloomz-560m,2048,validation,249,"[1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",0,529,"[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...",0
3,facebook/mbart-large-cc25,1024,test,323,"[3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, ...",0,857,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 9, 9, 11, 11...",0
4,facebook/mbart-large-cc25,1024,train,346,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",0,842,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...",0
5,facebook/mbart-large-cc25,1024,validation,189,"[3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, ...",0,543,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...",0
6,facebook/mbart-large-50-many-to-one-mmt,1024,test,323,"[3, 3, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, ...",0,857,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 9, 9, 11, 11...",0
7,facebook/mbart-large-50-many-to-one-mmt,1024,train,346,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",0,842,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...",0
8,facebook/mbart-large-50-many-to-one-mmt,1024,validation,189,"[3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, ...",0,543,"[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...",0
9,google/mt5-base,1024,test,338,"[2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...",0,846,"[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 9, ...",0


In [6]:
import numpy as np

def print_stats(row):
    """Print a one-line length summary for a single statistics row.

    The printed fields, separated by spaces, are: tokenizer name, split type, then
    the maximum, 99th percentile and 95th percentile of ``subwordtok_lens_sents``,
    followed by the same three values for ``subwordtok_lens_labels``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            ``tokenizer``, ``split_type``, ``subwordtok_lens_sents`` and
            ``subwordtok_lens_labels``.

    Returns:
        None. The summary is written to stdout only.
    """
    def _summarise(lengths):
        # (max, 99th percentile, 95th percentile) of one list of lengths.
        return max(lengths), np.percentile(lengths, 99), np.percentile(lengths, 95)

    sent_summary = _summarise(row["subwordtok_lens_sents"])
    label_summary = _summarise(row["subwordtok_lens_labels"])
    print(row["tokenizer"], row["split_type"], *sent_summary, *label_summary)

# Print one summary line per row of ``df``. ``print_stats`` returns None, so the
# Series that ``apply`` builds carries no information; the trailing semicolon keeps
# the notebook from rendering it as the cell output.
df.apply(print_stats, axis=1);

bigscience/bloomz-560m test 393 108.02999999999997 78.0 835 282.1199999999999 207.0
bigscience/bloomz-560m train 494 99.0 68.0 824 251.0 177.0
bigscience/bloomz-560m validation 249 102.0 74.0 529 266.0 204.0
facebook/mbart-large-cc25 test 323 90.02999999999997 70.0 857 287.03 214.14999999999986
facebook/mbart-large-cc25 train 346 88.0 61.0 842 257.0 182.0
facebook/mbart-large-cc25 validation 189 88.0 65.0 543 272.0 210.0
facebook/mbart-large-50-many-to-one-mmt test 323 90.02999999999997 70.0 857 287.03 214.14999999999986
facebook/mbart-large-50-many-to-one-mmt train 346 88.0 61.0 842 257.0 182.0
facebook/mbart-large-50-many-to-one-mmt validation 189 88.0 65.0 543 272.0 210.0
google/mt5-base test 338 101.02999999999997 78.0 846 289.03 212.0
google/mt5-base train 412 98.0 69.0 833 255.0 181.0
google/mt5-base validation 214 101.78999999999996 75.0 537 272.0 208.0
t5-base test 542 167.05999999999995 124.0 512 297.0 215.14999999999986
t5-base train 688 155.0 109.0 512 257.0 182.0
t5-base va

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
dtype: object