### **KantaiBERT**  is trained as a `RoBERTa` Transformer with `DistilBERT` architecture.

In [None]:
!pip install transformers datasets

In [None]:
#Download the dataset.
!curl -L https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/master/Chapter03/kant.txt --output "kant.txt"

In [1]:
!pip list | grep -E 'transformers|tokenizers'

tokenizers                            0.12.1
transformers                          4.20.1


In [2]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path('.').glob('**/*.txt')]

#Initialize a tokenizer.
tokenizer = ByteLevelBPETokenizer()

paths

['kant.txt',
 'KantaiBERT/merges.txt',
 'wandb/run-20230208_085425-26qasbzp/files/requirements.txt']

In [3]:
#Customize training.
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "",
    "",
    "",
    ""])






In [4]:
#Save files to disk.
import os
token_dir = 'KantaiBERT'

if not os.path.exists(token_dir):
    os.makedirs(token_dir)
    
tokenizer.save_model('KantaiBERT')

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [5]:
#Load the trained tokenizer files.
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    'KantaiBERT/vocab.json', 
    'KantaiBERT/merges.txt')


tokenizer.encode('The Critique of Pure Reason').tokens

['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason']

In [6]:
tokenizer.encode('The Critique of Pure Reason')

Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [7]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ('', tokenizer.token_to_id('')), 
    ('', tokenizer.token_to_id(''))
)

tokenizer.enable_truncation(max_length = 512)

In [8]:
#Define model config.
from transformers import RobertaConfig

config = RobertaConfig(vocab_size = 52_000, 
                       max_position_embeddings = 514,
                       num_attention_heads = 12, 
                       num_hidden_layers = 6,
                       type_vocab_size = 1
                      )

print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



In [9]:
#Re-create the tokenizer in Transformers.
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('./KantaiBERT', 
                                             max_length = 512)

#Initialize a model from scratch.
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config = config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
print(model.num_parameters())

83504416


In [11]:
#Build the dataset.
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(tokenizer = tokenizer, 
                                file_path = './kant.txt',
                                block_size = 128)



In [12]:
#Define a data collator.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, 
                                                mlm = True, 
                                                mlm_probability = 0.15)

In [13]:
#Initialize the trainer.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir = './KantaiBERT/',
    overwrite_output_dir = True,
    num_train_epochs = 5,
    per_device_train_batch_size = 128, 
    save_steps = 10_000,
    save_total_limit = 2)


trainer = Trainer(
    model = model,
    args = training_args, 
    data_collator = data_collator,
    train_dataset = dataset)

In [14]:
%%time
trainer.train()

***** Running training *****
  Num examples = 170964
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 6680
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mkimata[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
500,6.4015
1000,5.2926
1500,4.7016
2000,4.3841
2500,4.19
3000,4.0151
3500,3.9004
4000,3.795
4500,3.7133
5000,3.6814




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 27min 7s, sys: 5.9 s, total: 27min 13s
Wall time: 27min 30s


TrainOutput(global_step=6680, training_loss=4.201414690760082, metrics={'train_runtime': 1650.6877, 'train_samples_per_second': 517.857, 'train_steps_per_second': 4.047, 'total_flos': 4507986008547840.0, 'train_loss': 4.201414690760082, 'epoch': 5.0})

In [15]:
#Save the model.
trainer.save_model('./KantaiBERT')

#Language modeling with FillMaskPipeline
from transformers import pipeline

fill_mask = pipeline(
    'fill-mask',
    model = './KantaiBERT',
    tokenizer = './KantaiBERT')

Saving model checkpoint to ./KantaiBERT
Configuration saved in ./KantaiBERT/config.json
Model weights saved in ./KantaiBERT/pytorch_model.bin
loading configuration file ./KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "./KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

2023-02-08 09:31:56.212336: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA nod

In [21]:
fill_mask('Human thinking <mask> reason.')

[{'score': 0.2920242249965668,
  'token': 266,
  'token_str': ' of',
  'sequence': 'Human thinking of reason.'},
 {'score': 0.15277273952960968,
  'token': 610,
  'token_str': ' practical',
  'sequence': 'Human thinking practical reason.'},
 {'score': 0.06844190508127213,
  'token': 465,
  'token_str': ' pure',
  'sequence': 'Human thinking pure reason.'},
 {'score': 0.059538062661886215,
  'token': 1005,
  'token_str': ' speculative',
  'sequence': 'Human thinking speculative reason.'},
 {'score': 0.03545363247394562,
  'token': 12,
  'token_str': ',',
  'sequence': 'Human thinking, reason.'}]

In [25]:
!head -10 /kaggle/working/kant.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)

The Project Gutenberg EBook of The Critique of Pure Reason, by Immanuel Kant

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.net


Title: The Critique of Pure Reason


In [26]:
fill_mask('The <mask> of Pure Reason')

[{'score': 0.02685278281569481,
  'token': 1274,
  'token_str': ' Reason',
  'sequence': 'The Reason of Pure Reason'},
 {'score': 0.023974640294909477,
  'token': 2260,
  'token_str': ' Critique',
  'sequence': 'The Critique of Pure Reason'},
 {'score': 0.010829992592334747,
  'token': 899,
  'token_str': ' faculty',
  'sequence': 'The faculty of Pure Reason'},
 {'score': 0.009400766342878342,
  'token': 3092,
  'token_str': ' Conceptions',
  'sequence': 'The Conceptions of Pure Reason'},
 {'score': 0.009031211026012897,
  'token': 415,
  'token_str': ' conception',
  'sequence': 'The conception of Pure Reason'}]

In [27]:
fill_mask('The Critique of Pure <mask>')

[{'score': 0.31438520550727844,
  'token': 1274,
  'token_str': ' Reason',
  'sequence': 'The Critique of Pure Reason'},
 {'score': 0.07076602429151535,
  'token': 14,
  'token_str': '.',
  'sequence': 'The Critique of Pure.'},
 {'score': 0.0475216805934906,
  'token': 1423,
  'token_str': ' Pure',
  'sequence': 'The Critique of Pure Pure'},
 {'score': 0.029480906203389168,
  'token': 2985,
  'token_str': ' Practical',
  'sequence': 'The Critique of Pure Practical'},
 {'score': 0.018943089991807938,
  'token': 2006,
  'token_str': ' Transcendental',
  'sequence': 'The Critique of Pure Transcendental'}]