In [None]:
# Source: https://towardsdatascience.com/how-to-build-a-wordpiece-tokenizer-for-bert-f505d97dddbb

In [None]:
pip install sentencepiece # datasets transformers==4.11.2 

In [None]:
from datasets import *
from transformers import *
from tokenizers import *
# import os
# import json

In [None]:
# Reserved tokens: BERT's control tokens followed by "<S>" and "<T>".
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + ["<S>", "<T>"]

# Text files the tokenizer is trained on (training split only).
# To train on both splits use: files = ["train.txt", "test.txt"]
files = ["train.txt"]

# Vocabulary size; 30,522 is BERT's default and can be tuned.
vocab_size = 30522

# Maximum sequence length; lowering it speeds up training (and leaves room for a larger batch size).
max_length = 512

# Whether samples longer than max_length are truncated.
truncate_longer_samples = True

In [None]:
# Text corpus file(s) handed to the tokenizer trainer.
paths = [
    "/home/info/MyNotebooks/Datasets/MPT/MPTD/Dataset_MPT/DenunBert.txt",
]

In [None]:
# TRAINING THE TOKENIZER
from tokenizers import BertWordPieceTokenizer

# Cased WordPiece tokenizer: text is cleaned, but case and accents are preserved.
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)

# Train on the corpus files listed in `paths`.
# The padding token must be spelled '[PAD]' (it was '[PAD', missing the closing
# bracket), otherwise the vocabulary gets a bogus '[PAD' entry and no real '[PAD]'.
tokenizer.train(files=paths, vocab_size=30_000, min_frequency=2,
                limit_alphabet=1000, wordpieces_prefix='##',
                special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

# Cap encoded sequences at max_length tokens.
tokenizer.enable_truncation(max_length=max_length)


In [None]:
# SAVING THE TOKENIZER - single .json file
# Creates BertWordPiece.json at the path below.
tokenizer_json_path = "/home/info/MyNotebooks/DenunBert/Tokenizer/BertWordPiece/BertWordPiece.json"
tokenizer.save(tokenizer_json_path)

In [None]:
# SAVING THE TOKENIZER - vocab.txt
# Creates vocab.txt in the target directory.
# During tokenization vocab.txt maps text to tokens; each token's ID is the row
# number of that token in vocab.txt, and those IDs are what gets fed into BERT.
tokenizer_vocab_dir = "/home/info/MyNotebooks/DenunBert/Tokenizer/BertWordPiece/"
tokenizer.save_model(tokenizer_vocab_dir)

In [None]:
# CARREGANDO O TOKENIZADOR

In [None]:
from transformers import BertTokenizer

# Load the trained vocabulary (vocab.txt) as a transformers tokenizer.
# The WordPiece tokenizer in this notebook is trained with lowercase=False and
# strip_accents=False. BertTokenizer defaults to do_lower_case=True (which also
# strips accents), so both options are set explicitly to match the trained vocabulary.
WordPieceTokenizer = BertTokenizer.from_pretrained(
    "/home/info/MyNotebooks/DenunBert/Tokenizer/BertWordPiece/",
    do_lower_case=False,
    strip_accents=False,
)

In [None]:
# Encode one sample sentence and display the resulting encoding.
sample_encoding = WordPieceTokenizer("O Ministério Público do Trabalho resgatou milhares de pessoas")
sample_encoding

In [None]:
# As our vocab.txt file contains the mappings for our tokens and token IDs (e.g., the row numbers) — we can access 
# the tokens by aligning our input_ids token IDs to the rows in vocab.txt:

# The tokenizer keeps accents, so the vocabulary holds non-ASCII tokens: decode the
# file explicitly as UTF-8 instead of relying on the platform's default encoding.
with open("/home/info/MyNotebooks/DenunBert/Tokenizer/BertWordPiece/vocab.txt", 'r', encoding='utf-8') as fp:
    vocab = fp.read().split('\n')

# Tokens stored on a few hand-picked rows (row number == token ID).
vocab[2],vocab[81],vocab[7955],vocab[5098],vocab[254],vocab[323],vocab[14577],vocab[1041],vocab[14781],vocab[219],vocab[900],vocab[3]

In [None]:
# LOADING DATAFRAMES DATASETDICTS FOR TRAINING THE MODEL

In [None]:
import datasets
import pandas as pd

paths = ['/home/info/MyNotebooks/Datasets/MPT/MPTD/Dataset_MPT/DenunBert.txt']

# Load the dataset previously saved to disk under ds_dir.
ds_dir = "/home/info/MyNotebooks/Datasets/MPT/MPTD/"
#ds_dir = "/home/info/.cache/huggingface/datasets/"
ds = datasets.load_from_disk(f"{ds_dir}Dataset_MPT")

# Take the "tip_text" column of each split...
train_texts = ds["train"]["tip_text"]
test_texts = ds["validation"]["tip_text"]

# ...and expose it as a single column named 'text'.
train_df = pd.DataFrame(train_texts, columns=['text'])
test_df = pd.DataFrame(test_texts, columns=['text'])

# Wrap the frames as Dataset objects again.
train_dataset = datasets.Dataset.from_pandas(train_df)
test_dataset = datasets.Dataset.from_pandas(test_df)

# Persist both splits.
train_dir = "/home/info/MyNotebooks/Datasets/MPT/MPTD/Partials/Bert/train_dataset"
test_dir = "/home/info/MyNotebooks/Datasets/MPT/MPTD/Partials/Bert/test_dataset"

train_dataset.save_to_disk(train_dir)
test_dataset.save_to_disk(test_dir)

In [None]:
# Tokenizing the Dataset
# Now that we have the tokenizer ready, the below code is responsible for tokenizing the dataset:

def encode_with_truncation(examples):
  """Tokenize examples["text"], truncating and padding every sample to max_length."""
  tokenizer_options = dict(
      truncation=True,
      padding="max_length",
      max_length=max_length,
      return_special_tokens_mask=True,
  )
  return WordPieceTokenizer(examples["text"], **tokenizer_options)

def encode_without_truncation(examples):
  """Tokenize examples["text"] without requesting truncation or padding."""
  texts = examples["text"]
  return WordPieceTokenizer(texts, return_special_tokens_mask=True)

# Select the mapping function according to truncate_longer_samples.
if truncate_longer_samples:
  encode = encode_with_truncation
else:
  encode = encode_without_truncation

# Tokenize the training and testing splits in batches.
train_tokenized_dataset = train_dataset.map(encode, batched=True)
test_tokenized_dataset = test_dataset.map(encode, batched=True)

if truncate_longer_samples:
  # Expose only input_ids and attention_mask, returned as torch tensors.
  for tokenized_split in (train_tokenized_dataset, test_tokenized_dataset):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
  # No tensor type requested; special_tokens_mask is exposed as well.
  for tokenized_split in (test_tokenized_dataset, train_tokenized_dataset):
    tokenized_split.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

# Bare expression in the middle of the cell (only a cell's last expression is displayed).
train_tokenized_dataset, test_tokenized_dataset

# Save the tokenized datasets so they do not have to be tokenized again if the
# notebook kernel restarts.
train_tkz_dir = "/home/info/MyNotebooks/Datasets/MPT/MPTD/Partials/Bert/train_tokenized_dataset"
test_tkz_dir = "/home/info/MyNotebooks/Datasets/MPT/MPTD/Partials/Bert/test_tokenized_dataset"

train_tokenized_dataset.save_to_disk(train_tkz_dir)
test_tokenized_dataset.save_to_disk(test_tkz_dir)

In [None]:
# Next, in the case of setting truncate_longer_samples to False, we need to join our untruncated samples together and cut them into fixed-size vectors since the model expects a fixed-sized sequence during training:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples, block_size=None):
    """Concatenate every column of a batch and re-split it into fixed-size chunks.

    Args:
        examples: mapping of column name -> list of per-sample lists (the batch
            that `Dataset.map(..., batched=True)` passes in).
        block_size: length of each chunk. Defaults to the notebook-level
            `max_length` when omitted.

    Returns:
        A dict with the same keys, each holding a list of chunks of `block_size`
        items. The trailing remainder is dropped; if the whole batch is shorter
        than `block_size`, it is returned as a single short chunk instead.
        An empty mapping yields an empty dict.
    """
    from itertools import chain

    if block_size is None:
        block_size = max_length

    keys = list(examples.keys())
    if not keys:
        return {}

    # Flatten each column in linear time (sum(lists, []) re-copies the accumulator
    # on every step and is quadratic in the batch size).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in keys}
    total_length = len(concatenated_examples[keys[0]])
    # Drop the small remainder so every chunk is full; padding could be used
    # instead if the model supported it.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

if not truncate_longer_samples:
  # group_texts concatenates lists of token ids, so it must run on the tokenized
  # splits. The raw train_dataset/test_dataset only hold a "text" column of
  # strings, on which the concatenation fails.
  # NOTE(review): assumes map() only passes the columns selected by set_format
  # to group_texts — confirm for the installed datasets version.
  train_tokenized_dataset = train_tokenized_dataset.map(
      group_texts, batched=True, batch_size=2_000,
      desc=f"Grouping texts in chunks of {max_length}")
  test_tokenized_dataset = test_tokenized_dataset.map(
      group_texts, batched=True, batch_size=2_000,
      num_proc=4, desc=f"Grouping texts in chunks of {max_length}")

In [None]:
# DATA COLLATOR
#
# Collator for the Masked Language Modeling (MLM) task: it randomly masks 20% of
# the tokens (the default is 15%).
from transformers import DataCollatorForLanguageModeling

mlm_options = {"mlm": True, "mlm_probability": 0.2}
data_collator = DataCollatorForLanguageModeling(tokenizer=WordPieceTokenizer, **mlm_options)

In [None]:
# For a large custom dataset split across several text files, list every file here.
files = [
    '/home/info/MyNotebooks/Datasets/MPT/MPTD/Dataset_MPT/DenunBert.txt',
]
# Variant that returns the "train" split directly:
# dataset = load_dataset("text", data_files=files, split="train")
dataset = load_dataset(path="text", data_files=files)
dataset
#dataset_temp = dataset['train']
# dataset["train"]["text"][0]

In [None]:
# CONFIGURING THE MODEL
#
# The configuration is created with BertConfig from the vocabulary size and the
# maximum sequence length, then handed to BertForMaskedLM to build the model itself.
from transformers import BertConfig, BertForMaskedLM

# Vocabulary size; 30,522 is BERT's default and can be tuned.
vocab_size = 30522
# Maximum sequence length; lowering it speeds up training (and leaves room for a larger batch size).
max_length = 512
# Whether samples longer than max_length are truncated.
truncate_longer_samples = True

model_config = BertConfig(
    max_position_embeddings=max_length,
    vocab_size=vocab_size,
)

model = BertForMaskedLM(model_config)

In [None]:
# If the NOTEBOOK KERNEL RESTARTS, besides the steps that rebuild the model and
# reload the tokenizer, the tokenized datasets (or the original datasets) must
# be reloaded as well:
import datasets

ds_train_dir = "/home/info/MyNotebooks/Datasets/MPT/MPTD/Partials/Bert/train_tokenized_dataset"
ds_test_dir = "/home/info/MyNotebooks/Datasets/MPT/MPTD/Partials/Bert/test_tokenized_dataset"

# Load the train split first, then the test split.
train_tokenized_dataset, test_tokenized_dataset = (
    datasets.load_from_disk(split_dir) for split_dir in (ds_train_dir, ds_test_dir)
)

train_tokenized_dataset, test_tokenized_dataset

In [None]:
# TRAINING THE MODEL
from transformers import TrainingArguments, Trainer

model_path = "/home/info/MyNotebooks/DenunBert/Tokenizer/BertWordPiece/Model"

training_args = TrainingArguments(
    # --- output ---
    output_dir=model_path,            # directory where model checkpoints are written
    overwrite_output_dir=True,
    # --- schedule and batch sizes ---
    num_train_epochs=1,               # number of training epochs (was 10), feel free to tweak
    per_device_train_batch_size=10,   # training batch size; as high as GPU memory allows
    per_device_eval_batch_size=64,    # evaluation batch size
    gradient_accumulation_steps=1,    # steps accumulated before a weight update (was 8)
    # --- evaluation, logging and checkpoints ---
    evaluation_strategy="steps",      # evaluate on a step interval
    logging_steps=500,                # log every 500 steps
    save_steps=500,                   # save a checkpoint every 500 steps
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # keep only 3 checkpoints on disk when space is limited
)

In [None]:
# Build the trainer from the model, its training arguments, the MLM collator and
# the tokenized train/eval splits.
trainer_inputs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
)
trainer = Trainer(**trainer_inputs)


In [None]:
# Run training; the value returned by train() is displayed as the cell output.
train_output = trainer.train()
train_output

In [None]:
# Save the final model to the trainer's configured output_dir.
# An empty string is not treated as "use the default": it would be taken as the
# target directory itself, which cannot be created. Omitting the argument makes
# save_model fall back to args.output_dir.
trainer.save_model()

In [None]:
# Alternativamente ao tempo de treinamento de um modelo de linguagem base no dominio, podemos tomar algum outro
# generico em portugues (Bertimbau, Dominio Juridico?) como base e usando TransformerAdapter, refina-lo nas tarefas 
# especificas usando nosso dataset do dominio requerido.
# 
# Ou, se nem assim houver tempo, montar a arquitetura da solução com os modelos transformers disponíveis
# E avaliar o resultado de suas aplicações.
# https://huggingface.co/neuralmind/bert-large-portuguese-cased
#
# Podemos tambem usar um AdapterLang em modelo de NLP treinado em dataset juridico em ingles!
#

In [None]:
import os

from transformers import BertForMaskedLM, BertTokenizerFast
from transformers.trainer_utils import get_last_checkpoint

# load the model checkpoint: prefer checkpoint-10000; if that folder does not
# exist, fall back to the most recent checkpoint-* folder under model_path, or to
# model_path itself when it holds no checkpoint folder
checkpoint_dir = os.path.join(model_path, "checkpoint-10000")
if not os.path.isdir(checkpoint_dir) and os.path.isdir(model_path):
    checkpoint_dir = get_last_checkpoint(model_path) or model_path
model = BertForMaskedLM.from_pretrained(checkpoint_dir)

# load the tokenizer from the directory where its vocab.txt is saved (the Trainer
# in this notebook is not given a tokenizer, so nothing tokenizer-related is
# written under model_path); casing/accent options match the cased,
# accent-preserving WordPiece training configuration
tokenizer = BertTokenizerFast.from_pretrained(
    "/home/info/MyNotebooks/DenunBert/Tokenizer/BertWordPiece/",
    do_lower_case=False,
    strip_accents=False,
)

In [None]:
# Fill-mask pipeline built from the loaded model and tokenizer.
fill_mask = pipeline(
    task="fill-mask",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Run the fill-mask pipeline on a couple of sentences containing [MASK] and print
# every candidate completion with its score.
examples = [
  "Today's most trending hashtags on [MASK] is Donald Trump",
  "The [MASK] was cloudy yesterday, but today it's rainy.",
]
separator = "=" * 50
for masked_sentence in examples:
  candidates = fill_mask(masked_sentence)
  for candidate in candidates:
    print(f"{candidate['sequence']}, confidence: {candidate['score']}")
  print(separator)

In [None]:
# %%time
from datasets import load_dataset
from transformers import LineByLineTextDataset

corpus_file = "/home/info/MyNotebooks/Datasets/MPT/MPTD/Dataset_MPT/DenunBert.txt"

# Tokenized line-by-line view of the corpus.
dataset = LineByLineTextDataset(
    tokenizer=WordPieceTokenizer,
    file_path=corpus_file,
    block_size=128,
)

# LineByLineTextDataset is indexed by integer position: it has no "train" split and
# no train_test_split(). The 80/20 split is therefore taken from the corpus loaded
# as a Hugging Face text dataset.
dataset_temp = load_dataset("text", data_files=[corpus_file])["train"]
d = dataset_temp.train_test_split(test_size=0.2)

In [None]:
#from transformers import RobertaTokenizer

# initialize the tokenizer using the tokenizer we initialized and saved to file
#RobToken = RobertaTokenizer.from_pretrained("/home/info/MyNotebooks/DenunBert/Tokenizer/BertWordPiece/vocab.txt", max_len=512)


In [None]:
# if you want to train the tokenizer from scratch (especially if you have custom
# dataset loaded as datasets object), then run this cell to save it as files
# but if you already have your custom data as text files, there is no point using this
def dataset_to_text(dataset, output_filename="data.txt"):
  """Write every entry of dataset["text"] to a text file, one entry per line.

  Useful for producing the text files the tokenizer is trained on
  (the tokenizer accepts files).

  Args:
    dataset: object whose "text" item is an iterable of strings.
    output_filename: path of the file to (over)write.
  """
  # Explicit UTF-8: the texts may contain accented characters, and the platform
  # default encoding is not guaranteed to be able to represent them.
  with open(output_filename, "w", encoding="utf-8") as f:
    for t in dataset["text"]:
      print(t, file=f)

# Export each split to its own text file: train -> train.txt, test -> test.txt.
for split_name in ("train", "test"):
  dataset_to_text(d[split_name], f"{split_name}.txt")
# https://www.thepythoncode.com/article/pretraining-bert-huggingface-transformers-in-python

In [None]:
from datasets import load_dataset

dataset = load_dataset('squad', split='train')

# Reference output that was pasted into this cell (kept as a comment so that it is
# not executed as code and `dataset.features` stays the displayed last expression):
# {'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None),
# 'context': Value(dtype='string', id=None),
# 'id': Value(dtype='string', id=None),
# 'question': Value(dtype='string', id=None),
# 'title': Value(dtype='string', id=None)}
dataset.features
# https://huggingface.co/docs/datasets/process