In [None]:
# Sources: 
# https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/chapter7/section3_pt.ipynb#scrollTo=N2ZdQUBUsuJA
# https://huggingface.co/course/chapter7/3?fw=pt
# https://github.com/huggingface/transformers/tree/main/notebooks
# https://huggingface.co/course/chapter7/2?fw=pt


In [1]:
from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForMaskedLM #, AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
# from transformers import AutoModel  # or BertModel, for BERT without pretraining heads

#model_checkpoint = "neuralmind/bert-large-portuguese-cased"
model_checkpoint = "neuralmind/bert-base-portuguese-cased"
# model = AutoModelForPreTraining.from_pretrained('neuralmind/bert-large-portuguese-cased')
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=False)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import datasets

# Carregamento do Dataset criado
FOLDER_BASE = "/home/info/MyNotebooks/Datasets/SentencasTRT1/"
DS_FOLDER   = FOLDER_BASE + "DsClassAnot/"

import torch
torch.cuda.is_available = lambda : False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [None]:
DsClassAnot = datasets.load_from_disk(DS_FOLDER)
DsClassAnot

In [None]:

import torch

text = "Tinha uma [MASK] no meio do caminho."

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

In [None]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = DsClassAnot.map(
    tokenize_function, batched=True, remove_columns=['text', 'disp', 'label']
)
tokenized_datasets

In [None]:
tokenizer.model_max_length

In [None]:
chunk_size = 128

In [None]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets[:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

In [None]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

In [None]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

In [None]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

In [None]:
tokenizer.decode(lm_datasets[0]["input_ids"])

In [3]:
# Se estiver carregando o Dataset LM novamente após Dead Kernel, execute esta celula.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
samples = [lm_datasets[i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return default_data_collator(features)

In [None]:
samples = [lm_datasets[i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
len(lm_datasets)

In [None]:
train_size = int(len(lm_datasets)*0.8)
test_size  = int(len(lm_datasets)*0.2)

In [None]:
len(lm_datasets) - (train_size + test_size)

In [None]:
#train_size = 10_000
#test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets.train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

In [None]:
# SALVANDO O DATASET PARA TREINAMENTO DO MODELO DE LINGUAGEM:
#df = pd.DataFrame(dados, columns=['text','disp','label'])
# dsAnot = Dataset.from_pandas(df)
downsampled_dataset.save_to_disk(FOLDER_BASE +"LM-ACP-DS/")

In [4]:
# CARREGANDO O DATASET SALVO ANTERIORMENTE:
# Executar previamente as células 1 e 2 e a que instancia o Data Collator, um pouco acima
ds = datasets.load_from_disk(FOLDER_BASE +"LM-ACP-DS/")
ds

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids', 'word_ids'],
        num_rows: 130926
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids', 'word_ids'],
        num_rows: 32731
    })
})

In [5]:
# REDUZ O TAMANHO DO DATASET
#
from datasets import Dataset, DatasetDict
import pandas as pd

ds_train_size = int(len(ds['train'])*0.1)
ds_test_size  = int(len(ds['test'])*0.1)
print(ds_train_size, ds_test_size)


df_train = pd.DataFrame(ds['train'])  #, columns=['text','disp','label']
df_test = pd.DataFrame(ds['test'])
df_train = df_train[0:ds_train_size]
df_test = df_test[0:ds_test_size]
print(len(df_train), len(df_test))

downsampled_dataset = DatasetDict()
downsampled_dataset['train'] = Dataset.from_pandas(df_train)
downsampled_dataset['test'] = Dataset.from_pandas(df_test)
downsampled_dataset

13092 3273
13092 3273


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids', 'word_ids'],
        num_rows: 13092
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids', 'word_ids'],
        num_rows: 3273
    })
})

In [6]:
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids', 'word_ids'],
        num_rows: 13092
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids', 'word_ids'],
        num_rows: 3273
    })
})

In [None]:
type(downsampled_dataset)

In [None]:
type(downsampled_dataset)

In [7]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [8]:
from transformers import TrainingArguments

batch_size = 24 #64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=FOLDER_BASE+"/ACP-Bert-LM/"+f"{model_name}-finetuned-ACP",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    #push_to_hub=True,
    #fp16=True,
    logging_steps=logging_steps,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
training_args.device

device(type='cpu')

In [11]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

In [12]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13092
  Num Epochs = 3
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 1638


Epoch,Training Loss,Validation Loss
1,1.6762,1.386321
2,1.4585,1.302494
3,1.3937,1.270493


Saving model checkpoint to bert-base-portuguese-cased-finetuned-ACP/checkpoint-500
Configuration saved in bert-base-portuguese-cased-finetuned-ACP/checkpoint-500/config.json
Model weights saved in bert-base-portuguese-cased-finetuned-ACP/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3273
  Batch size = 24
Saving model checkpoint to bert-base-portuguese-cased-finetuned-ACP/checkpoint-1000
Configuration saved in bert-base-portuguese-cased-finetuned-ACP/checkpoint-1000/config.json
Model weights saved in bert-base-portuguese-cased-finetuned-ACP/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If w

TrainOutput(global_step=1638, training_loss=1.509173138819917, metrics={'train_runtime': 8770.7022, 'train_samples_per_second': 4.478, 'train_steps_per_second': 0.187, 'total_flos': 2584386097514496.0, 'train_loss': 1.509173138819917, 'epoch': 3.0})

In [None]:
# Batch size de 64 para 24
# Modelo de Large para Base
# Redução do tamanho do Dataset

In [None]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
import pixiedust

In [None]:
%%pixie_debugger
#%debug
train_result = trainer.train()

In [None]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [29]:
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [24]:
model.state_dict()

OrderedDict([('bert.embeddings.position_ids',
              tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                        98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
                       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
                       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
                       1

In [25]:
checkpoint = {'epoch':, 'model_state_dict':model.state_dict(), 'optimizer_state_dict':, 'loss':, 'val_loss':}
PATH = "/home/info/MyNotebooks/Datasets/SentencasTRT1/ACP-Bert-LM/Modelo2.pth"
torch.save(checkpoint, PATH)

In [31]:
from transformers import AutoModel
checkpoint = "/home/info/MyNotebooks/bert-base-portuguese-cased-finetuned-ACP/checkpoint-1500/"
model2 = AutoModel.from_pretrained(checkpoint)
#model2.load_state_dict(torch.load(PATH))

loading configuration file /home/info/MyNotebooks/bert-base-portuguese-cased-finetuned-ACP/checkpoint-1500/config.json
Model config BertConfig {
  "_name_or_path": "/home/info/MyNotebooks/bert-base-portuguese-cased-finetuned-ACP/checkpoint-1500/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "type_vocab_size":

In [33]:
import torch

text = "Tinha uma [MASK] no meio do caminho."

model2.eval()

inputs = tokenizer(text, return_tensors="pt")
token_logits = model2(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

AttributeError: 'BaseModelOutputWithPoolingAndCrossAttentions' object has no attribute 'logits'