In [21]:
# train the tokenizer

from tqdm import tqdm
from transformers import BertTokenizerFast, AutoTokenizer
from datasets import load_dataset
import glob

# Template tokenizer: any BERT SentencePiece tokenizer works as the base, since a
# new vocabulary is trained from it further down in this cell.
base_checkpoint = 'faisalq/bert-base-arabic-senpiece'
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Raw pre-training corpus, read through the `text` dataset builder.
corpus_files = ['pretrain_text.txt']
dataset = load_dataset('text', data_files=corpus_files)

display(dataset)

def batch_iterator(batch_size=10000):
    """Yield the 'text' column of the training split in consecutive slices.

    Reads the notebook-level `dataset` and walks its 'train' split in steps of
    `batch_size` rows, so the last slice may be shorter. Progress is reported
    through tqdm, one tick per yielded slice.

    Args:
        batch_size: Maximum number of rows per yielded slice.

    Yields:
        The 'text' entry of each slice of the training split.
    """
    train_split = dataset['train']
    total_rows = len(train_split)
    for start in tqdm(range(0, total_rows, batch_size)):
        rows = train_split[start: start + batch_size]
        yield rows['text']
# Train a new tokenizer on the corpus, starting from the base tokenizer loaded
# above, then write the resulting tokenizer files to 'PoetBERT/'.
new_vocab_size = 70000
corpus_batches = batch_iterator()
bert_tokenizer = tokenizer.train_new_from_iterator(
    text_iterator=corpus_batches,
    vocab_size=new_vocab_size,
    # special_tokens=['[CLS]', '[PAD]', '[SEP]', '[UNK]', '[MASK]'],
)
bert_tokenizer.save_pretrained('PoetBERT/')

Downloading and preparing dataset text/default to /home/ffq/.cache/huggingface/datasets/text/default-4bb4e3c62e0bcdd7/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /home/ffq/.cache/huggingface/datasets/text/default-4bb4e3c62e0bcdd7/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2090907
    })
})

100%|█████████████████████████████████████████| 210/210 [00:03<00:00, 69.06it/s]






('PoemBERT/tokenizer_config.json',
 'PoemBERT/special_tokens_map.json',
 'PoemBERT/tokenizer.json')

In [3]:
# tokenizing the whole text corpus

import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0" 

from datasets import load_dataset
import glob
import tokenizers
from transformers import Trainer, TrainingArguments, LineByLineTextDataset, BertModel
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import AutoTokenizer

# Every example is padded/truncated to this many tokens (see encode_with_truncation).
max_seq_length = 32

# Tokenizer saved by the tokenizer-training cell.
# Alternative kept for reference:
# tokenizer = AutoTokenizer.from_pretrained('faisalq/bert-base-arapoembert')
tokenizer_dir = 'PoetBERT/'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

# Same raw corpus file that was used to train the tokenizer.
raw_text_files = ['pretrain_text.txt']
dataset = load_dataset('text', data_files=raw_text_files)



def encode_with_truncation(examples):
    """Tokenize a batch of raw text to fixed-length encodings.

    Uses the notebook-level `tokenizer` and `max_seq_length`: each entry of
    `examples["text"]` is truncated and padded to exactly `max_seq_length`
    tokens, and the special-tokens mask is requested alongside the encodings.

    Args:
        examples: Batch mapping with a "text" entry (as passed by
            `Dataset.map(..., batched=True)`).

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=max_seq_length,
        return_special_tokens_mask=True,
    )
    return tokenizer(examples["text"], **tokenizer_options)


# Tokenize the whole training split in batches.
train_split = dataset["train"]
tokenized = train_split.map(encode_with_truncation, batched=True)  # , load_from_cache_file=True

# Return torch tensors for the two listed columns only.
# NOTE(review): this leaves out `special_tokens_mask` even though the encoder
# explicitly requests it -- confirm whether that column is still wanted downstream.
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])

# The raw strings are no longer needed once the ids exist; drop them before saving.
dataset = tokenized.remove_columns(["text"])
dataset.save_to_disk("tokenized_dataset/")

display(dataset)

Resolving data files:   0%|          | 0/71 [00:00<?, ?it/s]

Found cached dataset text (/home/ffq/.cache/huggingface/datasets/text/default-79afbc99aca1de7f/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/132392608 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 132392608
})

In [None]:
#pretraining the model

from datasets import load_from_disk

# import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0" 

from datasets import load_dataset
import glob
import tokenizers
from transformers import Trainer, TrainingArguments, LineByLineTextDataset, BertModel
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import AutoTokenizer

from transformers import set_seed

# Seed Python / NumPy / PyTorch *before* the model is built. Trainer only sets the
# seed inside its own constructor, i.e. after BertForMaskedLM(config) has already
# drawn its random initial weights, so without this every run starts from a
# different initialization and results are not reproducible.
seed = 42  # same value as the TrainingArguments default
set_seed(seed)

# Pre-tokenized corpus written by the tokenization cell.
dataset = load_from_disk("tokenized_dataset/")

max_seq_length = 32
tokenizer = AutoTokenizer.from_pretrained('PoetBERT/')

# BERT-base sized architecture. vocab_size must cover every id the tokenizer can
# emit (it was trained with vocab_size=70000 in the tokenizer cell).
config = BertConfig(
    vocab_size=70000,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=512,
)

# Model is trained from scratch: built from the config, not from pretrained weights.
model = BertForMaskedLM(config)
display(model.num_parameters())


# Masked-language-modelling collator: 15% of tokens are selected for prediction.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

epochs = 1000
save_steps = 50_000  # write a checkpoint every 50,000 steps
batch_size = 512  # per device (earlier values: 256, 64); author's note: a larger
                  # batch probably does not fit on an 80GB GPU

training_args = TrainingArguments(
    output_dir='PoetBERT/',
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    save_steps=save_steps,
    save_total_limit=5,  # only keep the last 5 checkpoints
    fp16=True,
    # tf32 = True,
    learning_rate=5e-5,  # 5e-5 is the default
    logging_steps=10_000,
    seed=seed,  # keep Trainer's own seeding in sync with the set_seed call above
    # gradient_accumulation_steps=2,
    # gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)


# To continue an interrupted run from the latest checkpoint in output_dir, use:
# trainer.train(resume_from_checkpoint=True)
trainer.train()
trainer.save_model('PoetBERT/final/')