## Francisco Teixeira Rocha Aragão 2021031726

In [1]:
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
import torch.nn.functional as F

from tqdm import tqdm
import numpy as np
import logging
import pandas as pd

# Set logging level
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler())

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create a device object
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the BERT tokenizer
bert_tokenizer = BertTokenizerFast.from_pretrained("neuralmind/bert-base-portuguese-cased")

# Initialize the label encoder
label_encoder = LabelEncoder()

# Set max_length
max_length = 512

def preprocess_and_tokenize(data):
    """Preprocesses and tokenizes the input data.
    Returns tokenized inputs, padded POS tags, and attention masks. The
    input_ids and padded_pos_tags are padded to `max_length`. There is a
    one-to-one correspondence between the tokens, POS tags and
    attention masks.\
    Arguments:
        data: list of strings. Each string is a sentence with words and POS
            tags separated by an underscore. For example,
            "Jersei_N atinge_V média_N de_PREP".
    Returns:
        input_ids: torch.tensor of shape (num_sentences, max_length).
            It is a "list" of numerical values (ids) that represent each token
            in the input text. The values are based on the vocabulary of the
            pre-trained BERT model.
        padded_pos_tags: torch.tensor of shape (num_sentences, max_length).
            It is a "list" of numerical values (ids) that represent each POS
            tag in the input text. The values are based on the label encoder.
        attention_masks: torch.tensor of shape (num_sentences, max_length).
            It is a "list" of 0s and 1s. The 1s indicate the position of the
            tokens in the input_ids tensor. The 0s indicate the padding.
    """
    sentences = []
    pos_tags = []
    unique_tags = set()

    # Split data into sentences and POS tags
    # count = 0
    for line in tqdm(data, desc="Splitting data into sentences and POS tags"):
        words = []
        tags = []
        #logging.debug(f"line: {line}")
        for word in line.split():
            split_word = word.split('_')
            #logging.debug(f"split_word: {split_word}")
            words.append(split_word[0])
            tags.append(split_word[1])
            unique_tags.add(split_word[1])
            #logging.debug(f"words: {words}")
            #logging.debug(f"unique_tags: {unique_tags}")
        sentences.append(words)
        pos_tags.append(tags)
        # count += 1
        # if count > 50:
        #     break

    # Count the number of sentences and tags
    num_sentences = sum(len(sentence) for sentence in sentences)
    num_tags = sum(len(tags) for tags in pos_tags)
    print(f"Number of sentences: {num_sentences}")
    #logging.debug(f"Number of tags (classes): {num_tags}")
    print(f"Number of unique tags: {len(unique_tags)}")

    # Tokenize the sentences
    tokenized_inputs = bert_tokenizer(sentences, truncation=True, padding='max_length',
                                      max_length=max_length, is_split_into_words=True)

    # Pad input_ids to max_length
    input_ids = pad_sequence([torch.tensor(i) for i in tokenized_inputs["input_ids"]], batch_first=True)
    input_ids = input_ids.to(device)  # Move to GPU
    # FIXME: Do this next to avoid unnecessary conversions
    # input_ids = pad_sequence(tokenized_inputs["input_ids"], batch_first=True).to(device)
    #logging.debug(f"input_ids: {input_ids}")

    # Handle the POS tags
    new_pos_tags = []
    for sent_tags, input_id in zip(pos_tags, tokenized_inputs["input_ids"]):
        new_tags = []
        for tag in sent_tags:
            new_tags.extend([tag] * len(input_id))
        new_pos_tags.append(new_tags[:len(input_id)])

    # Fit the label encoder with the unique tags
    label_encoder.fit(list(unique_tags))
    # Encode the POS tags
    # encoded_pos_tags = [label_encoder.fit_transform(tags) for tags in new_pos_tags]
    encoded_pos_tags = [label_encoder.transform(tags) for tags in pos_tags]
    #logging.debug(f"encoded_pos_tags: {encoded_pos_tags}")

    # Pad the POS tags
    # padded_pos_tags = pad_sequence([torch.tensor(tags) for tags in encoded_pos_tags], batch_first=True)
    # padded_pos_tags = padded_pos_tags.to(device)  # Move to GPU
    padded_pos_tags = [F.pad(torch.tensor(tags), pad=(0, max_length - len(tags))) for tags in encoded_pos_tags]
    padded_pos_tags = torch.stack(padded_pos_tags).to(device)  # Stack the list of tensors into a single tensor and move to GPU
    #logging.debug(f"padded_pos_tags: {padded_pos_tags}")

    # Create attention masks
    attention_masks = [[float(i != 0.0) for i in seq] for seq in input_ids]
    attention_masks = torch.tensor(attention_masks).to(device)  # Move to GPU
    #logging.debug(f"attention_masks: {attention_masks}")

    print(f"Number of sequences (dataset lines): {input_ids.shape[0]}")
    print()

    return input_ids, padded_pos_tags, attention_masks

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /neuralmind/bert-base-portuguese-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /neuralmind/bert-base-portuguese-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


In [3]:

with open('data/macmorpho-train.txt', 'r') as f:
    data = f.readlines()
    input_ids, tags, masks = preprocess_and_tokenize(data)

with open('data/macmorpho-dev.txt', 'r') as f:
    data = f.readlines()
    val_input_ids, val_tags, val_masks = preprocess_and_tokenize(data)

UNIQUE_TAGS = 26
     


Splitting data into sentences and POS tags: 100%|██████████| 37948/37948 [00:00<00:00, 137034.98it/s]


Number of sentences: 728497
Number of unique tags: 26
Number of sequences (dataset lines): 37948



Splitting data into sentences and POS tags: 100%|██████████| 1997/1997 [00:00<00:00, 210062.32it/s]

Number of sentences: 38881
Number of unique tags: 26





Number of sequences (dataset lines): 1997



In [4]:
# Define a custom PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, input_ids, masks, tags):
        self.input_ids = input_ids
        self.masks = masks
        self.tags = tags.to('cpu')

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.masks[idx],
            'labels': self.tags[idx]
        }

# Move the tensors to the CPU
input_ids_cpu = input_ids.to('cpu')
masks_cpu = masks.to('cpu')
tags_cpu = tags.to('cpu')
val_input_ids_cpu = val_input_ids.to('cpu')
val_masks_cpu = val_masks.to('cpu')
val_tags_cpu = val_tags.to('cpu')

# Convert training data into PyTorch Dataset
# train_dataset = CustomDataset(input_ids, masks, tags)
train_dataset = CustomDataset(input_ids_cpu, masks_cpu, tags_cpu)

# Convert validation data into PyTorch Dataset
# eval_dataset = CustomDataset(val_input_ids, val_masks, val_tags)
eval_dataset = CustomDataset(val_input_ids_cpu, val_masks_cpu, val_tags_cpu)

# Initialize the BERT model
model = BertForTokenClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=UNIQUE_TAGS,
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=16, # 4, 16
    per_device_eval_batch_size=32,  # 4, 32, 64
    weight_decay=0.01,
    num_train_epochs=3,
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=1000,  # Set logging_steps to see the validation loss and metrics
    # bf16=True, # or fp16, to reduce memory/computer usage/requirements
    save_strategy="epoch",
    dataloader_num_workers=3,
    load_best_model_at_end=True,
)


# Define the evaluation metrics
def compute_metrics(eval_pred):
    labels = np.ravel(eval_pred.label_ids)
    preds = np.ravel(eval_pred.predictions.argmax(-1))
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-multilingual-cased/resolve/main/config.json HTTP/1.1" 200 0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
classification_layer = model.classifier
print(f"classification_layer: {classification_layer}")

# Freeze all the layers of the BERT model
for param in model.bert.parameters():
    param.requires_grad = False

# Train only the classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True

# for name, param in model.named_parameters():
#     print(name, param.requires_grad)

# Check that only the classifier layer is trainable
for name, param in model.named_parameters():
    if 'classifier' in name:  # Check if the parameter belongs to the classifier layer
        assert param.requires_grad == True  # Assert that the classifier layer parameters are trainable
    else:
        assert param.requires_grad == False  # Assert that all other parameters are not trainable

classification_layer: Linear(in_features=768, out_features=26, bias=True)


In [6]:
# Train the model
train_output = trainer.train()

# Print the training loss
print(train_output.training_loss)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:

with open('data/macmorpho-test.txt', 'r') as f:
    data = f.readlines()
    test_input_ids, test_tags, test_masks = preprocess_and_tokenize(data)

test_input_ids_cpu = test_input_ids.to('cpu')
test_masks_cpu = test_masks.to('cpu')
test_tags_cpu = test_tags.to('cpu')

# Convert test data into PyTorch Dataset
test_dataset = CustomDataset(test_input_ids_cpu, test_masks_cpu, test_tags_cpu)

In [None]:

# Evaluate the model on the test set
trainer.args.per_device_eval_batch_size = 4  # reduce evaluation batch size
results = trainer.evaluate(test_dataset)

# Print the evaluation results
print(results)

In [None]:


# Make predictions
predictions, labels, _ = trainer.predict(test_dataset)
     

def get_words_from_input_ids(input_ids):
    words = []
    for ids in input_ids:
        words.append(bert_tokenizer.convert_ids_to_tokens(ids))
    return words

# Obtém as palavras a partir dos tokens
words = get_words_from_input_ids(test_input_ids)

# Apply softmax to get probabilities and then get the most probable tags
predictions = np.argmax(F.softmax(torch.from_numpy(predictions), dim=-1), axis=-1)

# Flatten predictions and labels
predictions = predictions.flatten()
labels = labels.flatten()

# Create a DataFrame with columns 'true', 'predicted', 'word'
df = pd.DataFrame({
    'true': [tags[val] for val in labels],
    'predicted': [tags[val] for val in predictions],
    'word': [words[val] for val in input_ids_cpu.flatten()]
})

# Find the words where the true label is equal to the predicted label
correct_predictions = df[df['true'] == df['predicted']]

# Find the words where the true label is not equal to the predicted label
incorrect_predictions = df[df['true'] != df['predicted']]

print("## Palavras que tiveram a classe gramatical corretamente acertadas")
print(correct_predictions['word'].value_counts().head())

print()
print("## Palavras onde a predição da classe gramatical foi errada, por pelo menos uma vez")
print(incorrect_predictions['word'].value_counts().head())
     
