<a href="https://colab.research.google.com/github/AndromathArcanitus/ML-code-samples/blob/main/1st_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Step 2: APRIL 2023 UPDATE: Installing Hugging Face Transformers
# Legacy installation steps, kept for reference only.
# They are written as real comments (not a bare triple-quoted string) so that
# running this cell does not echo the whole text back as cell output.
#
#   # We won't need TensorFlow here
#   !pip uninstall -y tensorflow
#   # Install `transformers` from master
#   !pip install git+https://github.com/huggingface/transformers
#   !pip list | grep -E 'transformers|tokenizers'
#   # transformers version at notebook update --- 2.9.1
#   # tokenizers version at notebook update --- 0.7.0

"\n# We won't need TensorFlow here\n!pip uninstall -y tensorflow\n# Install `transformers` from master\n!pip install git+https://github.com/huggingface/transformers\n!pip list | grep -E 'transformers|tokenizers'\n# transformers version at notebook update --- 2.9.1\n# tokenizers version at notebook update --- 0.7.0\n"

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m115.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m126.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [3]:
#@title Step 3: Training a Tokenizer
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 4.56 s, sys: 135 ms, total: 4.7 s
Wall time: 2.74 s


In [4]:
#@title Step 4: Saving the files to disk
import os

# One path is used for both creating the folder and saving into it, so the
# directory that is created is always the directory that is written to.
# It is relative, matching the "./KantaiBERT" paths used when the files are
# loaded back in the following steps.
token_dir = 'KantaiBERT'
os.makedirs(token_dir, exist_ok=True)

# Writes the vocabulary and merges files and displays the written paths.
tokenizer.save_model(token_dir)

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [5]:
#@title Step 5: Loading the Trained Tokenizer Files
from tokenizers.processors import BertProcessing
from tokenizers.implementations import ByteLevelBPETokenizer

# Rebuild the byte-level BPE tokenizer from the vocabulary and merges files
# written to the KantaiBERT folder in Step 4.
tokenizer = ByteLevelBPETokenizer("./KantaiBERT/vocab.json", "./KantaiBERT/merges.txt")

In [6]:
# Attach a BertProcessing post-processor built from the "</s>" and "<s>"
# tokens, each paired with the id the loaded vocabulary assigns to it.
# NOTE(review): this assigns to `_tokenizer`, a private attribute of the
# wrapper object — confirm it is still the supported way to set this.
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
# Turn on truncation with a maximum length of 512.
tokenizer.enable_truncation(max_length=512)

In [7]:
#@title Step 6: Checking Resource Constraints: GPU and NVIDIA
# Run the `nvidia-smi` shell command to show the GPU available to this runtime.
!nvidia-smi

Thu Jun  1 01:36:43 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
#@title Checking that PyTorch Sees CUDA
import torch
# Displayed as the cell result: whether PyTorch reports CUDA as available.
torch.cuda.is_available()

True

In [9]:
#@title Step 7: Defining the configuration of the Model
from transformers import RobertaConfig

# Only the values below are set explicitly; everything else keeps the
# RobertaConfig defaults. `vocab_size` is the same 52,000 that the tokenizer
# was trained with in Step 3.
config = RobertaConfig(
    num_hidden_layers=6,
    num_attention_heads=12,
    max_position_embeddings=514,
    type_vocab_size=1,
    vocab_size=52_000,
)

In [10]:
# Print the complete configuration, including the default values that were
# not set explicitly when the config was created.
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.29.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



In [11]:
#@title Step 8: Re-creating the Tokenizer in Transformers
from transformers import RobertaTokenizer

# Load the tokenizer from the files saved in ./KantaiBERT.
# The tokenizer-level length limit is `model_max_length`; a plain `max_length`
# keyword is not a tokenizer setting and would leave the limit unset.
tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", model_max_length=512)

In [12]:
#@title Step 9: Initializing a Model From Scratch
from transformers import RobertaForMaskedLM

In [13]:
# Build the masked-language model from the configuration object alone; no
# pretrained checkpoint is loaded in this call.
model = RobertaForMaskedLM(config=config)

In [14]:
#@title Step 10: Building the Dataset
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./kant.txt",
    block_size=128,
)



CPU times: user 34.6 s, sys: 952 ms, total: 35.5 s
Wall time: 43.4 s


In [15]:
#@title Step 11: Defining a Data Collator
from transformers import DataCollatorForLanguageModeling

# Collator for masked language modelling (mlm=True) with a masking
# probability of 0.15, using the tokenizer created in Step 8.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

In [16]:
#@title Step 11: Defining a Data Collator
# NOTE(review): this cell is a verbatim repeat of the preceding
# "Step 11: Defining a Data Collator" cell. It rebuilds `data_collator` with
# the same arguments, so it can be deleted from the notebook.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [17]:
pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


In [18]:
#@title Step 12: Initializing the Trainer
from transformers import Trainer, TrainingArguments

# Training settings: one epoch, batches of 64 per device, output written to
# ./KantaiBERT (an existing folder may be reused), a save interval of 10,000
# steps and at most two checkpoints kept.
training_args = TrainingArguments(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    per_device_train_batch_size=64,
    num_train_epochs=1,
    save_total_limit=2,
    save_steps=10_000,
)

# Tie together the model, the training settings, the dataset and the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [19]:
#@title Step 13: Pre-training the Model
%%time
# Run training with the Trainer configured in Step 12; the returned
# TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss
500,6.6431
1000,5.7161
1500,5.2541
2000,4.9854
2500,4.8142


CPU times: user 9min 42s, sys: 2.21 s, total: 9min 44s
Wall time: 9min 59s


TrainOutput(global_step=2857, training_loss=5.386596337873534, metrics={'train_runtime': 599.0931, 'train_samples_per_second': 305.173, 'train_steps_per_second': 4.769, 'total_flos': 934691298792576.0, 'train_loss': 5.386596337873534, 'epoch': 1.0})

In [20]:
#@title Step 14: Saving the Final Model (+tokenizer + config) to disk
# The Trainer was created without a tokenizer, so `trainer.save_model` on its
# own writes only the model and its config. Save the tokenizer explicitly so
# the folder really contains everything the title promises.
tokenizer.save_pretrained("./KantaiBERT")
trainer.save_model("./KantaiBERT")

In [21]:
#@title Step 15: Language Modeling with the FillMaskPipeline
from transformers import pipeline

# Model and tokenizer are both loaded from the same ./KantaiBERT folder.
fill_mask = pipeline(task="fill-mask", tokenizer="./KantaiBERT", model="./KantaiBERT")

In [22]:
# Query the pipeline with a sentence containing a single <mask> token; the
# scored candidate completions are displayed as the cell result.
fill_mask("Human thinking involves human <mask>.")

[{'score': 0.04575687274336815,
  'token': 394,
  'token_str': ' reason',
  'sequence': 'Human thinking involves human reason.'},
 {'score': 0.02509239688515663,
  'token': 421,
  'token_str': ' conception',
  'sequence': 'Human thinking involves human conception.'},
 {'score': 0.016969574615359306,
  'token': 444,
  'token_str': ' law',
  'sequence': 'Human thinking involves human law.'},
 {'score': 0.016534477472305298,
  'token': 613,
  'token_str': ' principle',
  'sequence': 'Human thinking involves human principle.'},
 {'score': 0.014782223850488663,
  'token': 477,
  'token_str': ' will',
  'sequence': 'Human thinking involves human will.'}]

In [25]:
# Query the pipeline with two <mask> tokens in one sentence; the result is a
# list of candidate lists, one per masked position.
fill_mask("The critique of pure reason is <mask> <mask>.")

[[{'score': 0.053565531969070435,
   'token': 339,
   'token_str': ' not',
   'sequence': '<s>The critique of pure reason is not<mask>.</s>'},
  {'score': 0.034328099340200424,
   'token': 267,
   'token_str': ' the',
   'sequence': '<s>The critique of pure reason is the<mask>.</s>'},
  {'score': 0.03222261369228363,
   'token': 289,
   'token_str': ' to',
   'sequence': '<s>The critique of pure reason is to<mask>.</s>'},
  {'score': 0.017299940809607506,
   'token': 420,
   'token_str': ' only',
   'sequence': '<s>The critique of pure reason is only<mask>.</s>'},
  {'score': 0.017247438430786133,
   'token': 263,
   'token_str': ' a',
   'sequence': '<s>The critique of pure reason is a<mask>.</s>'}],
 [{'score': 0.022538399323821068,
   'token': 536,
   'token_str': ' experience',
   'sequence': '<s>The critique of pure reason is<mask> experience.</s>'},
  {'score': 0.014826348051428795,
   'token': 506,
   'token_str': ' itself',
   'sequence': '<s>The critique of pure reason is<mask