In [1]:
from pathlib import Path
import transformers
from tokenizers import ByteLevelBPETokenizer

In [2]:
%%time

paths = [str(x) for x in Path('.').glob('**/*.txt')]

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=paths,vocab_size=52000,min_frequency=2,special_tokens=['<s>','<pad>','</s>','<unk>,<mask>'])

Wall time: 1.46 s


In [3]:
import os
token_dir = './content/KantaiBERT/'
if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model(token_dir)

['./content/KantaiBERT/vocab.json', './content/KantaiBERT/merges.txt']

In [4]:
from tokenizers.processors import BertProcessing

tokenizer =ByteLevelBPETokenizer(vocab='./content/KantaiBERT/vocab.json',merges='./content/KantaiBERT/merges.txt')

In [5]:
tokenizer.encode('The Critique of Pure Reason.').tokens

['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.']

In [6]:
tokenizer.encode('The Critique of Pure Reason.')

Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [7]:
tokenizer._tokenizer.post_processor = BertProcessing(('</s>',tokenizer.token_to_id('</s>')),
                                                    ('<s>',tokenizer.token_to_id('<s>')))
tokenizer.enable_truncation(max_length=512)

In [8]:
tokenizer.encode('The Critique of Pure Reason.')

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [9]:
tokenizer.encode('The Critique of Pure Reason.').tokens

['<s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.', '</s>']

In [10]:
!nvidia-smi

Tue Mar 23 08:54:19 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.79       Driver Version: 460.79       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 166... WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   43C    P8     6W /  N/A |    431MiB /  6144MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
import torch
torch.cuda.is_available()

True

In [12]:
from transformers import RobertaConfig

config = RobertaConfig(vocab_size = 52000,
                      max_position_embeddings = 514,
                      num_attention_heads = 12,
                      num_hidden_layers = 6,
                      type_vocab_size = 1)

In [13]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('./content/KantaiBERT/', max_length = 512)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [14]:
from transformers import RobertaForMaskedLM

In [15]:
model = RobertaForMaskedLM(config = config)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [16]:
print(model.num_parameters())

83504416


In [17]:
LP = list(model.parameters())
lp = len(LP)
print(lp)

106


In [18]:
# for p in range(0,lp):
#     print(LP[p])

In [19]:
%%time

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(tokenizer = tokenizer,
                              file_path = './kent.txt',
                              block_size=128)



Wall time: 19.2 s


In [20]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = True, mlm_probability=0.15)

In [21]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(output_dir = './content/KantaiBERT/',
                                 overwrite_output_dir = True,
                                 num_train_epochs =1,
                                 per_device_train_batch_size = 64,
                                 save_steps = 10000,
                                 save_total_limit = 2,
                                 )

trainer = Trainer(model = model,
                 args = training_args,
                 data_collator = data_collator,
                 train_dataset = dataset)

In [22]:
%%time

trainer.train()

Step,Training Loss
500,5.4629
1000,4.0612
1500,3.7815
2000,3.5556
2500,3.4342


Wall time: 10min 15s


TrainOutput(global_step=2672, training_loss=4.018088997481112, metrics={'train_runtime': 614.6533, 'train_samples_per_second': 4.347, 'total_flos': 1689347110470912.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 601561, 'init_mem_gpu_alloc_delta': 334180352, 'init_mem_cpu_peaked_delta': 18258, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1339522, 'train_mem_gpu_alloc_delta': 1009833984, 'train_mem_cpu_peaked_delta': 6925725, 'train_mem_gpu_peaked_delta': 2583970816})

In [23]:
trainer.save_model('./KantaiBERT')

In [29]:
from transformers import pipeline

fill_mask = pipeline('fill-mask',
                    model = './KantaiBERT',
                    tokenizer = tokenizer)

Some weights of RobertaModel were not initialized from the model checkpoint at ./KantaiBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
fill_mask('Human thinking involves human <mask>.')

[{'sequence': 'Human thinking involves human reason.',
  'score': 0.02658740058541298,
  'token': 394,
  'token_str': ' reason'},
 {'sequence': 'Human thinking involves human understanding.',
  'score': 0.019244344905018806,
  'token': 608,
  'token_str': ' understanding'},
 {'sequence': 'Human thinking involves human conceptions.',
  'score': 0.016122329980134964,
  'token': 615,
  'token_str': ' conceptions'},
 {'sequence': 'Human thinking involves human experience.',
  'score': 0.01592661812901497,
  'token': 538,
  'token_str': ' experience'},
 {'sequence': 'Human thinking involves human conception.',
  'score': 0.014086836948990822,
  'token': 420,
  'token_str': ' conception'}]