# FomulaBEAT

変更点
- 数式データを用いてTransformerを学習させてみる
- Tokenizerはそのまま使う。（どうなるのかもみたい）
- それ以外のパラメータはKantaiBEATと同じ
- 利用したデータはcreate_fomula1.ipynb

In [36]:
model_dir = './FomulaBEATModel/01'


In [38]:
%%time 
#@title Step 3: Training a Tokenizer
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("data/**/*.txt")]
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

print(paths)

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

['data/equations.txt']



CPU times: user 31.3 s, sys: 1.46 s, total: 32.7 s
Wall time: 1.13 s


In [39]:
#@title Step 4: Saving the files to disk
import os
if not os.path.exists(model_dir):
  os.makedirs(model_dir)
tokenizer.save_model(model_dir)

['./FomulaBEATModel/01/vocab.json', './FomulaBEATModel/01/merges.txt']

In [11]:
#@title Step 5 Loading the Trained Tokenizer Files 
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./FomulaBEATModel/token/vocab.json",
    "./FomulaBEATModel/token/merges.txt",
)

In [13]:
tokenizer.encode("(2 - ((5 - 10) * (1 + (((3 * 8) + 2) + 3)))) = 152").tokens

['(',
 '2',
 'Ġ-',
 'Ġ((',
 '5',
 'Ġ-',
 'Ġ10',
 ')',
 'Ġ*',
 'Ġ(',
 '1',
 'Ġ+',
 'Ġ(((',
 '3',
 'Ġ*',
 'Ġ8',
 ')',
 'Ġ+',
 'Ġ2',
 ')',
 'Ġ+',
 'Ġ3',
 '))))',
 'Ġ=',
 'Ġ152']

In [14]:
tokenizer.encode("(2 - ((5 - 10) * (1 + (((3 * 8) + 2) + 3)))) = 152")


Encoding(num_tokens=25, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [16]:
#@title Step 6: Checking Resource Constraints: GPU and NVIDIA 
!nvidia-smi

Sat Sep 28 08:28:26 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro RTX 6000                Off |   00000000:17:00.0 Off |                  Off |
| 33%   41C    P8             21W /  260W |       1MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
#@title Checking that PyTorch Sees CUDAnot
import torch
torch.cuda.is_available()

True

In [18]:
#@title Step 7: Defining the configuration of the Model
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [19]:
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



In [21]:
#@title Step 8: Re-creating the Tokenizer in Transformers
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained(token_dir, max_length=512)



In [22]:
#@title Step 9: Initializing a Model From Scratch
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [23]:
print(model.num_parameters())

83504416


In [24]:
#@title Exploring the Parameters
LP=list(model.parameters())
lp=len(LP)
print(lp)
for p in range(0,lp):
  print(LP[p])

106
Parameter containing:
tensor([[ 3.4568e-02, -3.6238e-03, -1.3313e-02,  ...,  6.6635e-03,
          3.6893e-02, -1.3289e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-2.4095e-02,  1.9057e-02,  2.4376e-02,  ...,  3.3443e-02,
         -3.5509e-02, -4.4411e-02],
        ...,
        [ 8.0807e-03,  4.7282e-03,  4.4271e-02,  ..., -2.5042e-02,
          5.6747e-05, -9.5335e-03],
        [-2.0051e-02, -3.5542e-02, -1.3012e-02,  ..., -5.8444e-03,
          2.8605e-02,  2.4528e-02],
        [ 6.1061e-03,  4.0737e-03, -1.9781e-03,  ...,  1.8624e-02,
          1.1219e-02,  3.4973e-02]], requires_grad=True)
Parameter containing:
tensor([[ 0.0156,  0.0019, -0.0095,  ..., -0.0038,  0.0130, -0.0005],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0239, -0.0102, -0.0104,  ...,  0.0225, -0.0166,  0.0070],
        ...,
        [ 0.0298, -0.0330,  0.0541,  ...,  0.0126,  0.0300, -0.0031],
       

In [25]:
#@title Counting the parameters
np=0
for p in range(0,lp):#number of tensors
  PL2=True
  try:
    L2=len(LP[p][0]) #check if 2D
  except:
    L2=1             #not 2D but 1D
    PL2=False
  L1=len(LP[p])      
  L3=L1*L2
  np+=L3             # number of parameters per tensor
  if PL2==True:
    print(p,L1,L2,L3)  # displaying the sizes of the parameters
  if PL2==False:
    print(p,L1,L3)  # displaying the sizes of the parameters

print(np)              # total number of parameters

0 52000 768 39936000
1 514 768 394752
2 1 768 768
3 768 768
4 768 768
5 768 768 589824
6 768 768
7 768 768 589824
8 768 768
9 768 768 589824
10 768 768
11 768 768 589824
12 768 768
13 768 768
14 768 768
15 3072 768 2359296
16 3072 3072
17 768 3072 2359296
18 768 768
19 768 768
20 768 768
21 768 768 589824
22 768 768
23 768 768 589824
24 768 768
25 768 768 589824
26 768 768
27 768 768 589824
28 768 768
29 768 768
30 768 768
31 3072 768 2359296
32 3072 3072
33 768 3072 2359296
34 768 768
35 768 768
36 768 768
37 768 768 589824
38 768 768
39 768 768 589824
40 768 768
41 768 768 589824
42 768 768
43 768 768 589824
44 768 768
45 768 768
46 768 768
47 3072 768 2359296
48 3072 3072
49 768 3072 2359296
50 768 768
51 768 768
52 768 768
53 768 768 589824
54 768 768
55 768 768 589824
56 768 768
57 768 768 589824
58 768 768
59 768 768 589824
60 768 768
61 768 768
62 768 768
63 3072 768 2359296
64 3072 3072
65 768 3072 2359296
66 768 768
67 768 768
68 768 768
69 768 768 589824
70 768 768
71 768 768

In [26]:
%%time
#@title Step 10: Building the Dataset
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=paths[0],
    block_size=128,
)



CPU times: user 1min 4s, sys: 434 ms, total: 1min 4s
Wall time: 1min 4s


In [27]:
#@title Step 11: Defining a Data Collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [29]:
#@title Step 12: Initializing the Trainer
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=model_dir,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [30]:
%%time
#@title Step 13: Pre-training the Model
trainer.train()

Step,Training Loss
500,2.3276
1000,1.3802
1500,1.264
2000,1.1937
2500,1.1453
3000,1.1026
3500,1.0639
4000,1.0376
4500,1.0147
5000,0.9906


CPU times: user 43min 2s, sys: 4.75 s, total: 43min 7s
Wall time: 43min 2s


TrainOutput(global_step=15625, training_loss=1.0023751328125, metrics={'train_runtime': 2581.9497, 'train_samples_per_second': 387.304, 'train_steps_per_second': 6.052, 'total_flos': 1.2521924780212224e+16, 'train_loss': 1.0023751328125, 'epoch': 1.0})

In [40]:
#@title Step 14: Saving the Final Model(+tokenizer + config) to disk
trainer.save_model(model_dir)

In [41]:
#@title Step 15: Language Modeling with the FillMaskPipeline
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model_dir,
    tokenizer=model_dir
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [46]:
fill_mask("(8 * 10) =<mask>")


[{'score': 0.942259669303894,
  'token': 317,
  'token_str': ' 80',
  'sequence': '(8 * 10) = 80'},
 {'score': 0.009426506236195564,
  'token': 333,
  'token_str': ' 64',
  'sequence': '(8 * 10) = 64'},
 {'score': 0.008407344110310078,
  'token': 340,
  'token_str': ' 100',
  'sequence': '(8 * 10) = 100'},
 {'score': 0.004981613717973232,
  'token': 309,
  'token_str': ' 72',
  'sequence': '(8 * 10) = 72'},
 {'score': 0.004320512991398573,
  'token': 292,
  'token_str': ' 18',
  'sequence': '(8 * 10) = 18'}]

In [48]:

fill_mask("(8 * 10) =80<mask>")

[{'score': 0.8048815131187439,
  'token': 22,
  'token_str': '2',
  'sequence': '(8 * 10) =802'},
 {'score': 0.1815842241048813,
  'token': 21,
  'token_str': '1',
  'sequence': '(8 * 10) =801'},
 {'score': 0.00044562420225702226,
  'token': 28,
  'token_str': '8',
  'sequence': '(8 * 10) =808'},
 {'score': 0.0002526743046473712,
  'token': 307,
  'token_str': '12',
  'sequence': '(8 * 10) =8012'},
 {'score': 0.00021941715385764837,
  'token': 289,
  'token_str': ' 16',
  'sequence': '(8 * 10) =80 16'}]

In [49]:
fill_mask("(4 * (1 - ((((((((6 - 1) * ((3 * (8 * 8)) - 9)) + 2) + 1) * 3) - 3) - (((5 - (((8 * 7) * 9) * (8 * (((7 + 5) + 7) * 6)))) * (10 - ((8 - 2) * 10))) - 3)) + 8))) = <mask>.")

[{'score': 0.10359420627355576,
  'token': 261,
  'token_str': ' -',
  'sequence': '(4 * (1 - ((((((((6 - 1) * ((3 * (8 * 8)) - 9)) + 2) + 1) * 3) - 3) - (((5 - (((8 * 7) * 9) * (8 * (((7 + 5) + 7) * 6)))) * (10 - ((8 - 2) * 10))) - 3)) + 8))) = -.'},
 {'score': 0.08763130009174347,
  'token': 275,
  'token_str': ' 8',
  'sequence': '(4 * (1 - ((((((((6 - 1) * ((3 * (8 * 8)) - 9)) + 2) + 1) * 3) - 3) - (((5 - (((8 * 7) * 9) * (8 * (((7 + 5) + 7) * 6)))) * (10 - ((8 - 2) * 10))) - 3)) + 8))) = 8.'},
 {'score': 0.07922379672527313,
  'token': 268,
  'token_str': ' (',
  'sequence': '(4 * (1 - ((((((((6 - 1) * ((3 * (8 * 8)) - 9)) + 2) + 1) * 3) - 3) - (((5 - (((8 * 7) * 9) * (8 * (((7 + 5) + 7) * 6)))) * (10 - ((8 - 2) * 10))) - 3)) + 8))) = (.'},
 {'score': 0.06986390054225922,
  'token': 272,
  'token_str': ' 5',
  'sequence': '(4 * (1 - ((((((((6 - 1) * ((3 * (8 * 8)) - 9)) + 2) + 1) * 3) - 3) - (((5 - (((8 * 7) * 9) * (8 * (((7 + 5) + 7) * 6)))) * (10 - ((8 - 2) * 10))) - 3)) + 8))) 