In [2]:
from transformers import GPT2TokenizerFast, GPTNeoForCausalLM, GPTNeoConfig, AutoTokenizer, AlbertTokenizer, GPT2Tokenizer, RobertaTokenizer, RobertaTokenizerFast
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, load_from_disk
from transformers import RobertaForCausalLM

from tokenizers import Tokenizer, pre_tokenizers, decoders, AddedToken, normalizers, trainers
from tokenizers.normalizers import BertNormalizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer

from tokenizers.implementations import SentencePieceUnigramTokenizer

from tokenizers.processors import RobertaProcessing, TemplateProcessing
from tqdm import tqdm

import json
import wandb

In [35]:
def bits_per_token(token, encoding='utf-8'):
    """Return the length of ``token`` as reported by ``len``.

    Despite the name, no bit or byte count is computed here: the result is
    simply ``len(token)``, i.e. the number of characters when ``token`` is a
    ``str``.

    Args:
        token: The token to measure; any object that supports ``len``.
        encoding: Accepted for call compatibility but not used.

    Returns:
        ``len(token)``.
    """
    token_length = len(token)
    return token_length

def average_bits_per_token(vocabulary):
    """Return the mean ``bits_per_token`` value over ``vocabulary``.

    Args:
        vocabulary: A sized collection of tokens (e.g. a list of strings).

    Returns:
        The sum of ``bits_per_token(token)`` over all tokens divided by the
        number of tokens, or ``0.0`` when ``vocabulary`` is empty.
    """
    # Guard the division: an empty vocabulary has no meaningful average and
    # would otherwise raise ZeroDivisionError.
    if len(vocabulary) == 0:
        return 0.0
    return sum(bits_per_token(token) for token in vocabulary) / len(vocabulary)

def clean_special_characters_from_token(token):
    """Delete tokenizer marker characters from ``token``.

    Every occurrence of "Ġ", "▁" and "#" is removed, wherever it appears in
    the token; all remaining characters keep their original order.
    (Presumably these are the BPE, SentencePiece and WordPiece word-boundary
    markers; confirm against the tokenizers in use.)

    Args:
        token: The token string to clean.

    Returns:
        A new string with the marker characters removed.
    """
    marker_characters = ("Ġ", "▁", "#")
    cleaned = token
    for marker in marker_characters:
        cleaned = cleaned.replace(marker, "")
    return cleaned

def clean_special_tokens(vocab, special_tokens=("<pad>", "<s>", "</s>", "<unk>", "<mask>")):
    """Remove special tokens from ``vocab`` in place.

    Tokens that are not present in ``vocab`` are skipped instead of raising
    ``ValueError``, so vocabularies that define only some of the special
    tokens can be cleaned too.

    Args:
        vocab: Mutable list of token strings; modified in place.
        special_tokens: Tokens to remove. Defaults to the five special tokens
            this notebook strips: "<pad>", "<s>", "</s>", "<unk>", "<mask>".

    Returns:
        None. The first occurrence of each special token is removed from
        ``vocab``.
    """
    for special_token in special_tokens:
        # list.remove raises ValueError for a missing element, so check first.
        if special_token in vocab:
            vocab.remove(special_token)

In [36]:
# Load the pretrained tokenizers from disk: one dict per tokenizer family,
# keyed by tokenizer name, each capped at a 512-token model_max_length.

# BPE tokenizers, loaded with RobertaTokenizer.
bpe_tokenizers = {
    tokenizer_name: RobertaTokenizer.from_pretrained(
        f'./tokenizers/bpe/{tokenizer_name}/',
        model_max_length=512,
    )
    for tokenizer_name in ["1k-bpe", "3k-bpe", "6k-bpe", "10k-bpe", "15k-bpe", "20k-bpe"]
}

# WordPiece tokenizers, loaded with RobertaTokenizerFast.
wp_tokenizers = {
    tokenizer_name: RobertaTokenizerFast.from_pretrained(
        f'./tokenizers/wordpiece/{tokenizer_name}/',
        model_max_length=512,
    )
    for tokenizer_name in ["1k-wp", "3k-wp", "6k-wp", "10k-wp", "15k-wp", "20k-wp"]
}

# SentencePiece tokenizers, loaded with AlbertTokenizer.
sp_tokenizers = {
    tokenizer_name: AlbertTokenizer.from_pretrained(
        f'./tokenizers/sentence-piece/{tokenizer_name}/',
        model_max_length=512,
    )
    for tokenizer_name in ["1k-sp", "3k-sp", "6k-sp", "10k-sp", "15k-sp", "20k-sp"]
}


In [37]:
def calculate_vocabs_avg_bits_per_token(vocabs):
    """Compute the average cleaned-token length for each tokenizer.

    For every tokenizer the vocabulary entries are collected, the special
    tokens are dropped via ``clean_special_tokens``, marker characters are
    stripped from each remaining entry via
    ``clean_special_characters_from_token``, and the result is averaged with
    ``average_bits_per_token``.

    Args:
        vocabs: Mapping of name -> tokenizer object exposing ``get_vocab()``.

    Returns:
        Dict mapping each name to ``{"avg_char_per_token": <average>}``.
    """
    summary = {}

    for model_name in vocabs:
        tokens = list(vocabs[model_name].get_vocab().keys())
        # Mutates `tokens` in place.
        clean_special_tokens(tokens)
        stripped_tokens = list(map(clean_special_characters_from_token, tokens))
        summary[model_name] = {
            "avg_char_per_token": average_bits_per_token(stripped_tokens),
        }

    return summary

In [38]:
# Preview: print the "avg_char_per_token" stats computed for the WordPiece tokenizers.
print(calculate_vocabs_avg_bits_per_token(wp_tokenizers))

{'1k-wp': {'avg_char_per_token': 2.02713567839196}, '3k-wp': {'avg_char_per_token': 3.3512520868113524}, '6k-wp': {'avg_char_per_token': 4.074061718098416}, '10k-wp': {'avg_char_per_token': 4.527863931965983}, '15k-wp': {'avg_char_per_token': 4.83407802600867}, '20k-wp': {'avg_char_per_token': 5.0324081020255065}}


In [39]:
def add_to_wandb(result, run_ids_path='./run_ids.json'):
    ''' Log results to wandb. For this you need to map the run name to its id
        in the table, by downloading name, id columns from wandb.
        In hindsight, it's probably easier to save this under the model to avoid duplicates.

        Args:
            result: Dict to log; its 'model' entry is the run name used to
                look up the wandb run id.
            run_ids_path: Path to the JSON file holding the run ids
                (assumed to be an object mapping run name -> run id).

        Raises:
            KeyError: If 'model' is missing from `result`, or the model name
                is not a key in the run-id mapping.
    '''
    # Only the mapping is needed from the file, so close it before doing any
    # network work.
    with open(run_ids_path) as json_file:
        run_ids = json.load(json_file)
    run_id = run_ids[result['model']]

    # resume the wandb run and log the result
    wandb.init(
        entity='tiny-transformers', project='tokenizers', id=run_id, resume='must'
    )
    try:
        print(result)
        wandb.log(result)
    finally:
        # Always close the resumed run, even if logging fails, so the next
        # wandb.init() call does not inherit a dangling run.
        wandb.finish()

In [40]:
# Average cleaned-token length for every tokenizer family, keyed by tokenizer name.
bpe_tok_results = calculate_vocabs_avg_bits_per_token(bpe_tokenizers)
wp_tok_results = calculate_vocabs_avg_bits_per_token(wp_tokenizers)
sp_tok_results = calculate_vocabs_avg_bits_per_token(sp_tokenizers)

all_results = bpe_tok_results | wp_tok_results | sp_tok_results

# Only the mapping is needed from the file, so close it right after loading.
with open('./run_ids.json') as json_file:
    run_ids = json.load(json_file)
model_names = list(run_ids)

# Attach each tokenizer's stats to every model (run) name that contains the
# tokenizer name as a hyphen-delimited segment, e.g. "1k-bpe" matches
# "BERT-1k-bpe-9.0M-2L-4H-780C-1024I". Matching on the delimiters (rather than
# a bare substring) keeps a name like "5k-bpe" from matching "15k-bpe" runs.
model_name_results = {}
for tok_name, results in all_results.items():
    delimited_tok_name = f"-{tok_name}-"
    for name in model_names:
        if delimited_tok_name in f"-{name}-":
            model_name_results[name] = results

# One payload per run: the run name plus a copy of its stats.
results_to_upload = [
    {"model": model_name, **result}
    for model_name, result in model_name_results.items()
]

for result in results_to_upload:
    add_to_wandb(result)

{'model': 'BERT-1k-bpe-9.0M-2L-4H-780C-1024I', 'avg_char_per_token': 2.5346733668341708}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,21.30653
avg_char_per_token,2.53467
base_avg,0.5615
blimp_avg,0.57399
boolq,0.61826
cola,-0.02077
eval/loss,0.46485
eval/runtime,31.5492
eval/samples_per_second,697.006
eval/steps_per_second,43.583


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111302570013019, max=1.0)…

{'model': 'GPT-1k-bpe-9.0M-2L-4H-780C-1024I', 'avg_char_per_token': 2.5346733668341708}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,21.30653
avg_char_per_token,2.53467
base_avg,0.54083
blimp_avg,0.53707
boolq,0.61134
cola,0
eval/loss,1.29524
eval/runtime,29.0791
eval/samples_per_second,756.214
eval/steps_per_second,47.285


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111207182208697, max=1.0)…

{'model': 'BERT-3k-bpe-9.0M-2L-4H-700C-1024I', 'avg_char_per_token': 3.5869782971619366}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,29.12588
avg_char_per_token,3.58698
base_avg,0.6173
blimp_avg,0.59434
boolq,0.61549
cola,0.04265
eval/loss,1.86522
eval/runtime,36.6563
eval/samples_per_second,599.898
eval/steps_per_second,37.511


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112173844594508, max=1.0…

{'model': 'GPT-3k-bpe-9.0M-2L-4H-700C-1024I', 'avg_char_per_token': 3.5869782971619366}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,29.12588
avg_char_per_token,3.58698
base_avg,0.5417
blimp_avg,0.54008
boolq,0.62794
cola,0.02913
eval/loss,1.46783
eval/runtime,31.9157
eval/samples_per_second,689.003
eval/steps_per_second,43.082


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111215588882462, max=1.0)…

{'model': 'BERT-6k-bpe-9.0M-2L-4H-604C-1024I', 'avg_char_per_token': 4.235863219349458}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,34.14045
avg_char_per_token,4.23586
base_avg,0.60855
blimp_avg,0.59375
boolq,0.66113
cola,0.04729
eval/loss,1.55416
eval/runtime,31.9692
eval/samples_per_second,687.849
eval/steps_per_second,43.01


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112242322027063, max=1.0…

{'model': 'GPT-6k-bpe-9.0M-2L-4H-604C-1024I', 'avg_char_per_token': 4.235863219349458}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,34.14045
avg_char_per_token,4.23586
base_avg,0.56356
blimp_avg,0.55363
boolq,0.65975
cola,0.02913
eval/loss,1.55416
eval/runtime,32.0966
eval/samples_per_second,685.12
eval/steps_per_second,42.839


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011137897844633295, max=1.0…

{'model': 'BERT-10k-bpe-9.0M-2L-4H-516C-1024I', 'avg_char_per_token': 4.639419709854927}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,37.32106
avg_char_per_token,4.63942
base_avg,0.61031
blimp_avg,0.58982
boolq,0.639
cola,0
eval/loss,0.88644
eval/runtime,36.2223
eval/samples_per_second,607.085
eval/steps_per_second,37.96


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112509100025312, max=1.0…

{'model': 'GPT-10k-bpe-9.0M-2L-4H-516C-1024I', 'avg_char_per_token': 4.639419709854927}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,37.32106
avg_char_per_token,4.63942
base_avg,0.60214
blimp_avg,0.58842
boolq,0.65284
cola,0.01407
eval/loss,1.62567
eval/runtime,35.2323
eval/samples_per_second,624.143
eval/steps_per_second,39.027


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111236311133123, max=1.0)…

{'model': 'BERT-15k-bpe-9.0M-2L-4H-412C-1024I', 'avg_char_per_token': 4.923707902634211}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,39.58493
avg_char_per_token,4.92371
base_avg,0.62743
blimp_avg,0.60598
boolq,0.61549
cola,-0.02077
eval/loss,1.07712
eval/runtime,34.983
eval/samples_per_second,628.592
eval/steps_per_second,39.305


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112172521987103, max=1.0…

{'model': 'GPT-15k-bpe-9.0M-2L-4H-412C-1024I', 'avg_char_per_token': 4.923707902634211}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,39.58493
avg_char_per_token,4.92371
base_avg,0.58302
blimp_avg,0.56569
boolq,0.64039
cola,0.02656
eval/loss,1.68766
eval/runtime,34.6523
eval/samples_per_second,634.589
eval/steps_per_second,39.68


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112055377776011, max=1.0…

{'model': 'BERT-20k-bpe-9.0M-2L-4H-348C-1024I', 'avg_char_per_token': 5.121830457614403}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,41.17229
avg_char_per_token,5.12183
base_avg,0.64098
blimp_avg,0.61186
boolq,0.66252
cola,0
eval/loss,1.10353
eval/runtime,37.1743
eval/samples_per_second,591.538
eval/steps_per_second,36.988


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112321878317743, max=1.0…

{'model': 'GPT-20k-bpe-9.0M-2L-4H-348C-1024I', 'avg_char_per_token': 5.121830457614403}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,41.17229
avg_char_per_token,5.12183
base_avg,0.58858
blimp_avg,0.57132
boolq,0.65422
cola,-0.02939
eval/loss,1.74626
eval/runtime,36.6888
eval/samples_per_second,599.366
eval/steps_per_second,37.477


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112331422433877, max=1.0…

{'model': 'BERT-1k-wp-9.0M-2L-4H-780C-1024I', 'avg_char_per_token': 2.02713567839196}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,18.04221
avg_char_per_token,2.02714
base_avg,0.54389
blimp_avg,0.5565
boolq,0.60166
cola,-0.02077
eval/loss,0.40675
eval/runtime,31.5237
eval/samples_per_second,697.57
eval/steps_per_second,43.618


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112528933315642, max=1.0…

{'model': 'GPT-1k-wp-9.0M-2L-4H-780C-1024I', 'avg_char_per_token': 2.02713567839196}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,18.04221
avg_char_per_token,2.02714
base_avg,0.54712
blimp_avg,0.53224
boolq,0.60858
cola,0
eval/loss,1.20717
eval/runtime,28.9378
eval/samples_per_second,759.906
eval/steps_per_second,47.516


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111227797745313, max=1.0)…

{'model': 'BERT-3k-wp-9.0M-2L-4H-700C-1024I', 'avg_char_per_token': 3.3512520868113524}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,27.52855
avg_char_per_token,3.35125
base_avg,0.59811
blimp_avg,0.59172
boolq,0.61826
cola,0.04495
eval/loss,0.64476
eval/runtime,33.1595
eval/samples_per_second,663.158
eval/steps_per_second,41.466


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112217299847139, max=1.0…

{'model': 'GPT-3k-wp-9.0M-2L-4H-700C-1024I', 'avg_char_per_token': 3.3512520868113524}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,27.52855
avg_char_per_token,3.35125
base_avg,0.56019
blimp_avg,0.55397
boolq,0.63485
cola,0.00667
eval/loss,1.40371
eval/runtime,31.9897
eval/samples_per_second,687.408
eval/steps_per_second,42.983


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111368380047174, max=1.0)…

{'model': 'BERT-6k-wp-9.0M-2L-4H-604C-1024I', 'avg_char_per_token': 4.074061718098416}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,33.00083
avg_char_per_token,4.07406
base_avg,0.60658
blimp_avg,0.59849
boolq,0.65284
cola,0
eval/loss,0.79087
eval/runtime,32.9745
eval/samples_per_second,666.88
eval/steps_per_second,41.699


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112060066726473, max=1.0…

{'model': 'GPT-6k-wp-9.0M-2L-4H-604C-1024I', 'avg_char_per_token': 4.074061718098416}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,33.00083
avg_char_per_token,4.07406
base_avg,0.55827
blimp_avg,0.54854
boolq,0.66252
cola,0.04391
eval/loss,1.49587
eval/runtime,31.8768
eval/samples_per_second,689.843
eval/steps_per_second,43.135


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111262709988902, max=1.0)…

{'model': 'BERT-10k-wp-9.0M-2L-4H-516C-1024I', 'avg_char_per_token': 4.527863931965983}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,36.52146
avg_char_per_token,4.52786
base_avg,0.61638
blimp_avg,0.59966
boolq,0.66667
cola,0
eval/loss,0.90275
eval/runtime,34.9838
eval/samples_per_second,628.577
eval/steps_per_second,39.304


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112000722075917, max=1.0…

{'model': 'GPT-10k-wp-9.0M-2L-4H-516C-1024I', 'avg_char_per_token': 4.527863931965983}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,36.52146
avg_char_per_token,4.52786
base_avg,0.58556
blimp_avg,0.57561
boolq,0.65422
cola,0.00332
eval/loss,1.558
eval/runtime,34.7721
eval/samples_per_second,632.403
eval/steps_per_second,39.543


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112146977878486, max=1.0…

{'model': 'BERT-15k-wp-9.0M-2L-4H-412C-1024I', 'avg_char_per_token': 4.83407802600867}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,38.92444
avg_char_per_token,4.83408
base_avg,0.60342
blimp_avg,0.59174
boolq,0.63347
cola,0
eval/loss,1.0343
eval/runtime,39.3435
eval/samples_per_second,558.924
eval/steps_per_second,34.949


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112056955850373, max=1.0…

{'model': 'GPT-15k-wp-9.0M-2L-4H-412C-1024I', 'avg_char_per_token': 4.83407802600867}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,38.92444
avg_char_per_token,4.83408
base_avg,0.60633
blimp_avg,0.59115
boolq,0.60166
cola,0
eval/loss,1.61563
eval/runtime,35.0232
eval/samples_per_second,627.869
eval/steps_per_second,39.26


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112231232820907, max=1.0…

{'model': 'BERT-20k-wp-9.0M-2L-4H-348C-1024I', 'avg_char_per_token': 5.0324081020255065}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,40.50293
avg_char_per_token,5.03241
base_avg,0.61898
blimp_avg,0.61037
boolq,0.63624
cola,0
eval/loss,1.05633
eval/runtime,37.5799
eval/samples_per_second,585.153
eval/steps_per_second,36.589


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112539088612216, max=1.0…

{'model': 'GPT-20k-wp-9.0M-2L-4H-348C-1024I', 'avg_char_per_token': 5.0324081020255065}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,40.50293
avg_char_per_token,5.03241
base_avg,0.59485
blimp_avg,0.58217
boolq,0.64869
cola,-0.00753
eval/loss,1.66547
eval/runtime,37.0982
eval/samples_per_second,592.751
eval/steps_per_second,37.064


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0111122815113049, max=1.0))…

{'model': 'BERT-1k-sp-9.0M-2L-4H-780C-1024I', 'avg_char_per_token': 2.9849246231155777}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,24.60302
avg_char_per_token,2.98492
base_avg,0.56167
blimp_avg,0.5656
boolq,0.62517
cola,0
eval/loss,0.52474
eval/runtime,36.4641
eval/samples_per_second,603.058
eval/steps_per_second,37.708


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112159555907258, max=1.0…

{'model': 'GPT-1k-sp-9.0M-2L-4H-780C-1024I', 'avg_char_per_token': 2.9849246231155777}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,24.60302
avg_char_per_token,2.98492
base_avg,0.53017
blimp_avg,0.52523
boolq,0.62241
cola,0
eval/loss,1.52996
eval/runtime,33.8767
eval/samples_per_second,649.119
eval/steps_per_second,40.588


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112050822056416, max=1.0…

{'model': 'BERT-3k-sp-9.0M-2L-4H-700C-1024I', 'avg_char_per_token': 4.256761268781302}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,34.29449
avg_char_per_token,4.25676
base_avg,0.60456
blimp_avg,0.60107
boolq,0.63624
cola,0
eval/loss,0.71292
eval/runtime,38.0416
eval/samples_per_second,578.051
eval/steps_per_second,36.145


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112468700028128, max=1.0…

{'model': 'GPT-3k-sp-9.0M-2L-4H-700C-1024I', 'avg_char_per_token': 4.256761268781302}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,34.29449
avg_char_per_token,4.25676
base_avg,0.55606
blimp_avg,0.54458
boolq,0.60858
cola,0.00819
eval/loss,1.56439
eval/runtime,37.3656
eval/samples_per_second,588.51
eval/steps_per_second,36.799


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112114022641133, max=1.0…

{'model': 'BERT-6k-sp-9.0M-2L-4H-604C-1024I', 'avg_char_per_token': 4.941284403669725}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,39.65438
avg_char_per_token,4.94128
base_avg,0.61159
blimp_avg,0.59909
boolq,0.62932
cola,0.06691
eval/loss,0.84615
eval/runtime,37.9444
eval/samples_per_second,579.533
eval/steps_per_second,36.237


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111217899985301, max=1.0)…

{'model': 'GPT-6k-sp-9.0M-2L-4H-604C-1024I', 'avg_char_per_token': 4.941284403669725}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,39.65438
avg_char_per_token,4.94128
base_avg,0.58527
blimp_avg,0.57423
boolq,0.63762
cola,0.01326
eval/loss,1.55216
eval/runtime,37.3638
eval/samples_per_second,588.538
eval/steps_per_second,36.8


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111245137742824, max=1.0)…

{'model': 'BERT-10k-sp-9.0M-2L-4H-516C-1024I', 'avg_char_per_token': 5.283541770885443}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,42.35318
avg_char_per_token,5.28354
base_avg,0.61665
blimp_avg,0.60439
boolq,0.639
cola,0
eval/loss,0.91136
eval/runtime,40.5176
eval/samples_per_second,542.728
eval/steps_per_second,33.936


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112526178153025, max=1.0…

{'model': 'GPT-10k-sp-9.0M-2L-4H-516C-1024I', 'avg_char_per_token': 5.283541770885443}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,42.35318
avg_char_per_token,5.28354
base_avg,0.59579
blimp_avg,0.58994
boolq,0.63485
cola,-0.02315
eval/loss,1.60965
eval/runtime,46.488
eval/samples_per_second,473.025
eval/steps_per_second,29.578


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112046321957475, max=1.0…

{'model': 'BERT-15k-sp-9.0M-2L-4H-412C-1024I', 'avg_char_per_token': 5.556585528509503}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,44.52257
avg_char_per_token,5.55659
base_avg,0.62293
blimp_avg,0.60264
boolq,0.61411
cola,0
eval/loss,1.04516
eval/runtime,40.6448
eval/samples_per_second,541.028
eval/steps_per_second,33.83


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112254799809308, max=1.0…

{'model': 'GPT-15k-sp-9.0M-2L-4H-412C-1024I', 'avg_char_per_token': 5.556585528509503}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,44.52257
avg_char_per_token,5.55659
base_avg,0.62164
blimp_avg,0.61234
boolq,0.64039
cola,0.04729
eval/loss,1.67756
eval/runtime,39.4255
eval/samples_per_second,557.76
eval/steps_per_second,34.876


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112208322285572, max=1.0…

{'model': 'BERT-20k-sp-9.0M-2L-4H-348C-1024I', 'avg_char_per_token': 5.734233558389597}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,45.93828
avg_char_per_token,5.73423
base_avg,0.62126
blimp_avg,0.60963
boolq,0.62517
cola,0
eval/loss,1.0846
eval/runtime,42.5246
eval/samples_per_second,517.113
eval/steps_per_second,32.334


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111665199536623, max=1.0…

{'model': 'GPT-20k-sp-9.0M-2L-4H-348C-1024I', 'avg_char_per_token': 5.734233558389597}


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_char_per_token,▁

0,1
avg_bits_per_token,45.93828
avg_char_per_token,5.73423
base_avg,0.61221
blimp_avg,0.58578
boolq,0.64454
cola,0.04055
eval/loss,1.71803
eval/runtime,41.6309
eval/samples_per_second,528.214
eval/steps_per_second,33.028
