In [1]:
import os
import numpy as np
import pandas as pd
import torch
import requests

from pathlib import Path
from tempfile import gettempdir

# Report whether CUDA is usable. The device name is only queried when a GPU
# is actually present, because torch.cuda.get_device_name() raises on
# CPU-only machines and would otherwise abort the very first cell.
cuda_available = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_available else None
print('CUDA:', cuda_available, 'GPU:', gpu_name)

CUDA: True GPU: GeForce GTX 1070


# Data

In [2]:
# Remote corpus and the local locations used by the rest of the notebook.
file_url = 'https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt'
file_path = Path(gettempdir()) / 'oscar.eo.txt'
model_path = Path(gettempdir()) / 'EsperBERTo'

if not file_path.exists():
    # Fetch first and only create the file once the download succeeded;
    # opening the file before the request would leave an empty file behind
    # on failure, and the exists() guard would then skip every later retry.
    r = requests.get(file_url, allow_redirects=True)
    r.raise_for_status()  # do not store an HTTP error page as the corpus
    # Write with an explicit encoding so the result does not depend on the
    # platform's default locale encoding.
    file_path.write_text(r.content.decode(encoding='utf-8'), encoding='utf-8')

# Create the model directory if needed; exist_ok makes the cell idempotent.
model_path.mkdir(parents=True, exist_ok=True)

# Tokenizer

```
# vocab.json
{
    "<s>": 0,
    "<pad>": 1,
    "</s>": 2,
    "<unk>": 3,
    "<mask>": 4,
    "!": 5,
    "\"": 6,
    "#": 7,
    "$": 8,
    "%": 9,
    "&": 10,
    "'": 11,
    "(": 12,
    ")": 13,
    #
}
 
# merges.txt
l a
Ġ k
o n
Ġ la
t a
Ġ e
Ġ d
Ġ p
#
```

In [3]:
%%time

from tokenizers import ByteLevelBPETokenizer

vocab_path = Path(gettempdir()) / 'vocab.json'
merge_path = Path(gettempdir()) / 'merges.txt'

if vocab_path.exists() and merge_path.exists():
    tokenizer = ByteLevelBPETokenizer.from_file(vocab_filename=str(vocab_path),
                                                merges_filename=str(merge_path))
else:
    print('TRAIN')
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[str(file_path)],
                    vocab_size=52_000,
                    min_frequency=2,
                    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.save_model(str(model_path))

TRAIN
CPU times: user 5min 1s, sys: 9.37 s, total: 5min 11s
Wall time: 1min


In [4]:
from tokenizers.processors import BertProcessing

# Build the (token, id) pairs for the two special tokens up front, then hand
# them to BertProcessing in the same order as before: "</s>" first, "<s>" second.
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(end_pair, start_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)