In [2]:
!curl -L https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/master/Chapter03/kant.txt --output "kant.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.7M  100 10.7M    0     0  11.1M      0 --:--:-- --:--:-- --:--:-- 11.1M


In [3]:
#@title Step 2:Installing Hugging Face Transformers
# We won't need TensorFlow here
!pip uninstall -y tensorflow
# Install `transformers` from master
!pip install transformers[torch]
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.9.1
# tokenizers version at notebook update --- 0.7.0

tokenizers                       0.15.2
transformers                     4.38.0.dev0


In [4]:
# Training a Tokenizer

%%time
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 7.62 s, sys: 180 ms, total: 7.8 s
Wall time: 5.34 s


In [5]:
import os

token_dir = "/content/KantaiBERT"

if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model("KantaiBERT")

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [6]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./KantaiBERT/vocab.json",
    "./KantaiBERT/merges.txt"
)

In [7]:
tokenizer.encode("The Critique of Pure Reason.").tokens

['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.']

In [8]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [9]:
tokenizer.encode("The Critique of Pure Reason.").tokens

['<s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.', '</s>']

In [10]:
!nvidia-smi

Tue Feb 20 17:17:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              11W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [11]:
import torch
torch.cuda.is_available()

True

In [12]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embedding=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [13]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", max_length=512)


In [14]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [15]:
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [16]:
print(model.num_parameters())

83502880


In [17]:
LP = list(model.parameters())
lp = len(LP)
print(lp)

106


In [18]:
%%time

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./kant.txt",
    block_size=128
)



CPU times: user 31 s, sys: 368 ms, total: 31.4 s
Wall time: 39.8 s


In [19]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [26]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [27]:
%%time

trainer.train()

Step,Training Loss
500,4.6372
1000,4.3615
1500,4.1673
2000,4.101
2500,4.0729
3000,3.9874
3500,3.8917
4000,3.8118
4500,3.7344
5000,3.6723


CPU times: user 28min 37s, sys: 6.31 s, total: 28min 43s
Wall time: 28min 57s


TrainOutput(global_step=8016, training_loss=3.862022317574172, metrics={'train_runtime': 1737.071, 'train_samples_per_second': 295.263, 'train_steps_per_second': 4.615, 'total_flos': 2621616775994112.0, 'train_loss': 3.862022317574172, 'epoch': 3.0})

In [28]:
trainer.save_model("./KantaiBERT")

In [29]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./KantaiBERT",
    tokenizer="./KantaiBERT"
)

In [30]:
fill_mask("Human thinking involves human <mask>")

[{'score': 0.40651190280914307,
  'token': 18,
  'token_str': '.',
  'sequence': 'Human thinking involves human.'},
 {'score': 0.034849613904953,
  'token': 394,
  'token_str': ' reason',
  'sequence': 'Human thinking involves human reason'},
 {'score': 0.016875620931386948,
  'token': 35,
  'token_str': '?',
  'sequence': 'Human thinking involves human?'},
 {'score': 0.01478514727205038,
  'token': 989,
  'token_str': ' mind',
  'sequence': 'Human thinking involves human mind'},
 {'score': 0.011608880013227463,
  'token': 588,
  'token_str': ' nature',
  'sequence': 'Human thinking involves human nature'}]