In [2]:
# GPU
!nvidia-smi

Wed Dec  7 15:13:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.85.02    Driver Version: 510.85.02    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:3B:00.0 Off |                  Off |
| 71%   85C    P2   299W / 300W |  11881MiB / 49140MiB |    100%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000    Off  | 00000000:5E:00.0 Off |                  Off |
| 73%   81C    P2   298W / 300W |   4029MiB / 49140MiB |    100%      Default |
|       

In [3]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = ["dataset/wikitext-2-raw/name_dataset.raw"]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=20_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])




CPU times: user 25.8 s, sys: 1.68 s, total: 27.5 s
Wall time: 692 ms


In [6]:
#!mkdir tokenizer_wiki_2
tokenizer.save_model("./exp_wiki/tokenizer_wiki_1_4")

['./exp_wiki/tokenizer_wiki_1_4/vocab.json',
 './exp_wiki/tokenizer_wiki_1_4/merges.txt']

In [7]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./exp_wiki/tokenizer_wiki_1_4/vocab.json",
    "./exp_wiki/tokenizer_wiki_1_4/merges.txt",
)

In [8]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [9]:
tokenizer.encode("I saw a gril with a telescope.")

Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [10]:
tokenizer.encode("I saw a gril with a telescope.").tokens

['<s>',
 'I',
 'Ġsaw',
 'Ġa',
 'Ġgr',
 'il',
 'Ġwith',
 'Ġa',
 'Ġt',
 'el',
 'esc',
 'ope',
 '.',
 '</s>']

In [11]:
# GPU availability
import torch
torch.cuda.is_available()

True

In [12]:
#random seed
import numpy as np
import random

seed = 42
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [13]:
from transformers import BertConfig

config = BertConfig(
    vocab_size=20_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
)

###Reload tokenizer 

In [14]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("./exp_wiki/tokenizer_wiki_1_4", max_len=512)

###Create the model

In [15]:
from transformers import BertForMaskedLM

model = BertForMaskedLM(config=config)

### Define a data_collator.


In [16]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

2022-12-07 15:16:15.896870: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-07 15:16:16.074019: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-07 15:16:16.115704: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-07 15:16:16.850750: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

### Start training

In [18]:
from transformers import TextDataset
from transformers import Trainer, TrainingArguments

#curriculum: (block_size, batch_size, end_steps)
curriculum = [
    (64, 8, 10_000),
    (128, 8, 20_000),
    (256, 8, 30_000),
    (512, 8, 60_000),
]

last_steps = 0
is_first_phase = True

for block_size, batch_size, end_steps in curriculum:
  print(f"######## Block size = {block_size}, Batch size = {batch_size} ########")
  #Build our training and evaluation datasets
  train_dataset = TextDataset(
      tokenizer=tokenizer,
      file_path="dataset/wikitext-2-raw/wiki.valid_1_4.raw",
      block_size=block_size,
  )
  eval_dataset = TextDataset(
      tokenizer=tokenizer,
      file_path="dataset/wikitext-2-raw/wiki.valid_2.raw",
      block_size=512,
  )
  #Set training arguments
  training_args = TrainingArguments(
      output_dir="log", #mkdir log if necessary
      overwrite_output_dir=True,
      num_train_epochs=40,
      max_steps=end_steps,
      per_gpu_train_batch_size=batch_size,
      per_gpu_eval_batch_size=batch_size,
      save_steps=10_000,
      save_total_limit=1,
      prediction_loss_only=True,
      evaluation_strategy="steps",
      logging_steps=500,
      logging_first_step=True,
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      data_collator=data_collator,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
  )
  #train a language model
  if is_first_phase:
    trainer.train()
    is_first_phase = False
  else:
    trainer.train(f"log/checkpoint-{last_steps}")
  last_steps = end_steps

Creating features from dataset file at dataset/wikitext-2-raw


######## Block size = 64, Batch size = 8 ########


Saving features into cached file dataset/wikitext-2-raw/cached_lm_RobertaTokenizerFast_62_wiki.valid_1_4.raw [took 0.001 s]
Loading features from cached file dataset/wikitext-2-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid_2.raw [took 0.011 s]
using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 936
  Num Epochs = 170
  Instantaneous batch size per device = 8
  Total train batch size (w. paral

Step,Training Loss,Validation Loss
500,7.4685,7.332571
1000,7.1241,7.28658
1500,7.0915,7.28476
2000,7.074,7.254584
2500,7.0508,7.270261
3000,7.045,7.268113
3500,7.0553,7.281592
4000,7.0436,7.265557
4500,7.0509,7.250543
5000,7.0356,7.245481


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 544
  Batch size = 16
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 544
  Batch size = 16
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eva

######## Block size = 128, Batch size = 8 ########


Saving features into cached file dataset/wikitext-2-raw/cached_lm_RobertaTokenizerFast_126_wiki.valid_1_4.raw [took 0.001 s]
Loading features from cached file dataset/wikitext-2-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid_2.raw [took 0.011 s]
using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Loading model from log/checkpoint-10000.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 460
  Num Epochs = 690
  Instantaneous batch size per dev

  0%|          | 0/24 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Step,Training Loss,Validation Loss
10500,7.0341,7.277466
11000,7.0258,7.294957
11500,7.0394,7.269774
12000,7.031,7.282495
12500,7.0229,7.290156
13000,7.0215,7.287715
13500,7.0263,7.282919
14000,7.0269,7.284198
14500,7.0294,7.288043
15000,7.024,7.309116


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 544
  Batch size = 16
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 544
  Batch size = 16
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eva

######## Block size = 256, Batch size = 8 ########


Saving features into cached file dataset/wikitext-2-raw/cached_lm_RobertaTokenizerFast_254_wiki.valid_1_4.raw [took 0.001 s]
Loading features from cached file dataset/wikitext-2-raw/cached_lm_RobertaTokenizerFast_510_wiki.valid_2.raw [took 0.010 s]
using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Loading model from log/checkpoint-20000.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 228
  Num Epochs = 2000
  Instantaneous batch size per de

  0%|          | 0/5 [00:00<?, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Step,Training Loss,Validation Loss
20500,7.0155,7.325359
21000,7.0172,7.319972
21500,7.0269,7.299853
22000,7.0194,7.297007
22500,7.0197,7.296162
23000,7.0197,7.301858
23500,7.0202,7.301112
24000,7.0202,7.320256
24500,7.023,7.299975
25000,7.018,7.312531


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 544
  Batch size = 16
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 544
  Batch size = 16
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eva

In [19]:
#mkdir log if necessary
trainer.save_model("./exp_wiki/name_1234")

Saving model checkpoint to ./exp_wiki/model_wiki_1_4
Configuration saved in ./exp_wiki/model_wiki_1_4/config.json
Model weights saved in ./exp_wiki/model_wiki_1_4/pytorch_model.bin
