# Francisco Teixeira Rocha Aragão 2021031726

Nesse arquivo contem a implementação da solução para um problema de 'pos tagging', que é a tarefa de atribuir uma tag a cada palavra de uma sentença, em que no caso deste trabalho, envolve a atribuição de tags de classes gramáticais para frases em português.

Os dados utilizados são do corpus MacMorpho, que contém textos em português com as respectivas tags de classes gramaticais. O corpus já está dividido em 3 partes: treino, validação e teste. É possível baixar o corpus no [link referenciado](http://nilc.icmc.usp.br/macmorpho/macmorpho-v3.tgz)

Desse modo, o trabalho desenvolvido envolve a utilização de modelos para realizar a tarefa de pos tagging, com o desempenho sendo medido e retornando em cada caso testado. Vale destacar inicialmente que algumas estratégias foram testadas, porém não foram bem sucessidas, como o treinamento de muitas camadas do modelo BERT em português, ou a utilização de LLMs para a tarefa de pos tagging. A falta de sucesso deve-se a necessidade de maior poder de processamento, GPU e de memória para trabalhar com tarefas tão complexas e modelos tão grandes, o que não foi possível de ser feito localmente.

Com isso, a estratégia adotada foi de utilizar modelos já especializados na tarefa de pós taggins (fine tuning). Assim, os modelos escolhidos foram treinandos utilizando o próprio dataset MacMorpho, além de terem como base modelos BERT treinados em língua portuguesa. Fazer o uso desses modelos foi de grande ajuda pois não foi necessário o treinamento, apenas carregar em memória e utilizar.

Os resultados da utilização dos modelos estão descritos abaixo.

In [1]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import json
import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

tokenizer = AutoTokenizer.from_pretrained("pucpr-br/postagger-bio-portuguese")

model = AutoModelForTokenClassification.from_pretrained("pucpr-br/postagger-bio-portuguese")

nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, grouped_entities=True)




Device set to use cpu


In [3]:
with open('data/macmorpho-train.txt', 'r') as file:
    # store file content in a list
    lines = file.readlines()
    

In [15]:
print(lines[0:4])

['Salto_N sete_ADJ\n', 'O_ART grande_ADJ assunto_N da_PREP+ART semana_N em_PREP Nova_NPROP York_NPROP é_V a_ART edição_N da_PREP+ART revista_N "_PU New_NPROP Yorker_NPROP "_PU que_PRO-KS está_V nas_PREP+ART bancas_N ._PU\n', 'Número_N duplo_ADJ especial_ADJ ,_PU é_V inteirinho_ADJ dedicado_PCP a_PREP ensaios_N sobre_PREP moda_N ._PU\n', 'A_ART endiabrada_PCP editora_N Tina_NPROP Brown_NPROP ex_N da_PREP+ART "_PU Vanity_NPROP Fair_NPROP -_PU convocou_V até_PDEN John_NPROP Updike_NPROP e_KC Salman_NPROP Rushdie_NPROP para_PREP discorrer_V sobre_PREP o_ART tema_N ._PU\n']


In [4]:
# organizando os dados, separando as palavras das tags
words = []
tags = []
for line in lines:
    # separate each word
    words_tags = line.split()
    for word_tag in words_tags:
        # separate the word from the tag
        word, tag = word_tag.split('_')
        words.append(word)
        tags.append(tag.strip())

In [21]:
print(words[0:4])
print(tags[0:4])

['Salto', 'sete', 'O', 'grande']
['N', 'ADJ', 'ART', 'ADJ']


In [40]:


# realizando a inferência das tags
total_words = len(words)
results = []

for word, tag in tqdm(zip(words, tags), total=total_words, desc="Processing words"):
    prediction = nlp_token_class(word)
    result = {
        'word': word,
        'tag': tag,
        'prediction': prediction[0]['entity_group']
    }
    if prediction[0]['entity_group'] == tag:
        result['correct'] = True
    else:
        result['correct'] = False
    
    results.append(result)

Processing words:   3%|▎         | 23144/728497 [06:21<3:29:27, 56.12it/s]

In [41]:
# salvando resultados em um arquivo para análise posterior

with open('results/results_postagger-bio-portuguese.json', 'w') as file:
    json.dump(results, file, indent=4)

In [None]:
# abrino o arquivo gerado
with open('results/results_postagger-bio-portuguese.json', 'r') as file:
    results = json.load(file)

In [43]:
# calculando acurácia do modelo
correct_predictions = 0
for result in results:
    if result['correct']:
        correct_predictions += 1

accuracy = correct_predictions / total_words
print(f'Acurácia geral: {accuracy:.2f}')

Accuracy: 0.68


In [19]:
# observando resultados mais detalhados

# agora é calculado a acurácia para cada tag
correct_tags = {}
total_tags = {}

for result in results:
    if result['expected_tag'] not in total_tags:
        total_tags[result['expected_tag']] = 1
    else:
        total_tags[result['expected_tag']] += 1
    
    if result['correct']:
        if result['expected_tag'] not in correct_tags:
            correct_tags[result['expected_tag']] = 1
        else:
            correct_tags[result['expected_tag']] += 1

accuracy_tags = {}
for tag in total_tags:
    try:
        accuracy_tags[tag] = correct_tags[tag] / total_tags[tag]
    except KeyError:
        accuracy_tags[tag] = 0

# print in descending order
for tag, accuracy in sorted(accuracy_tags.items(), key=lambda x: x[1], reverse=True):
    print(f'TAG: {tag} - ACURÁCIA: {accuracy:.2f}')

TAG: PU - ACURÁCIA: 1.00
TAG: PREP+PROPESS - ACURÁCIA: 0.98
TAG: N - ACURÁCIA: 0.97
TAG: KC - ACURÁCIA: 0.96
TAG: PROPESS - ACURÁCIA: 0.93
TAG: ADV - ACURÁCIA: 0.92
TAG: V - ACURÁCIA: 0.92
TAG: PROSUB - ACURÁCIA: 0.88
TAG: PREP - ACURÁCIA: 0.85
TAG: PREP+ADV - ACURÁCIA: 0.84
TAG: PCP - ACURÁCIA: 0.82
TAG: PREP+ART - ACURÁCIA: 0.75
TAG: PDEN - ACURÁCIA: 0.55
TAG: ADJ - ACURÁCIA: 0.50
TAG: NPROP - ACURÁCIA: 0.48
TAG: PREP+PROADJ - ACURÁCIA: 0.47
TAG: PREP+PROSUB - ACURÁCIA: 0.41
TAG: IN - ACURÁCIA: 0.40
TAG: ART - ACURÁCIA: 0.08
TAG: PROADJ - ACURÁCIA: 0.05
TAG: NUM - ACURÁCIA: 0.04
TAG: KS - ACURÁCIA: 0.00
TAG: CUR - ACURÁCIA: 0.00
TAG: PRO-KS - ACURÁCIA: 0.00
TAG: ADV-KS - ACURÁCIA: 0.00
TAG: PREP+PRO-KS - ACURÁCIA: 0.00


In [None]:
# imprimindo o número de ocorrências de cada tag
for tag, total in total_tags.items():
    print(f'TAG: {tag} - OCORRÊNCIAS: {total}')

## Novo teste outro modelo com fine tunning

In [21]:
# lendo dados de entrada novamente e separado melhor o arquivo
with open('data/macmorpho-train.txt', 'r') as file:
    # store file content in a list
    lines = file.readlines()

dataT = []
# usando apenas metade dos dados para não sobrecarregar o kernel
for line in lines[0:int(len(lines)*0.5)]:
    words_tags = line.split()
    for word_tag in words_tags:
        word, tag = word_tag.split('_')
        dataT.append((word, tag.strip()))


Exception ignored in: <function _ConnectionBase.__del__ at 0x7fdec6172660>
Traceback (most recent call last):
  File "/home/francisco/Downloads/miniconda3/lib/python3.12/multiprocessing/connection.py", line 133, in __del__
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/home/francisco/Downloads/miniconda3/lib/python3.12/multiprocessing/queues.py", line 259, in _feed
    reader_close()
  File "/home/francisco/Downloads/miniconda3/lib/python3.12/multiprocessing/connection.py", line 178, in close
    self._close()
  File "/home/francisco/Downloads/miniconda3/lib/python3.12/multiprocessing/connection.py", line 377, in _close
    self._close()
  File "/home/francisco/Downloads/miniconda3/lib/python3.12/multiprocessing/connection.py", line 377, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/francisco/Downloads/min

In [3]:
dataT[0:5]

[('Jersei', 'N'),
 ('atinge', 'V'),
 ('média', 'N'),
 ('de', 'PREP'),
 ('Cr$', 'CUR')]

In [4]:
# pegando todas as tags
tags = set([tag for _, tag in dataT])
tags

{'ADJ',
 'ADV',
 'ADV-KS',
 'ART',
 'CUR',
 'IN',
 'KC',
 'KS',
 'N',
 'NPROP',
 'NUM',
 'PCP',
 'PDEN',
 'PREP',
 'PREP+ADV',
 'PREP+ART',
 'PREP+PRO-KS',
 'PREP+PROADJ',
 'PREP+PROPESS',
 'PREP+PROSUB',
 'PRO-KS',
 'PROADJ',
 'PROPESS',
 'PROSUB',
 'PU',
 'V'}

In [19]:
# carregando o modelo
pipe = pipeline("token-classification", model="lisaterumi/postagger-portuguese", tokenizer="lisaterumi/postagger-portuguese", aggregation_strategy="simple")

Device set to use cpu


In [36]:

# raelizando predição das tags
results = []

for word, expected_tag in tqdm(dataT, desc="Processing words"):
    prediction = pipe(word)
    
    # pegando a tag prevista
    predicted_tag = prediction[0]['entity_group']
    
    result_data = {
        'word': word,
        'expected_tag': expected_tag,
        'predicted_tag': predicted_tag,
        'correct': predicted_tag == expected_tag,
    }
    
    # Append result to the results list
    results.append(result_data)
        
# salvando resultados em um arquivo json
with open('results/results_postagger-portuguese.json', 'w') as file:
    json.dump(results, file, indent=4)
        

Processing words: 100%|██████████| 387877/387877 [1:34:09<00:00, 68.66it/s]


In [None]:
# abrindo arquivo para analise dos resultados -> acuracia geral

with open('results/results_postagger-portuguese.json', 'r') as file:
    results = json.load(file)

In [4]:

correct_predictions = 0


total_words = len(results)
for result in results:
    if result['correct']:
        correct_predictions += 1

accuracy = correct_predictions / total_words
print(f'Acurácia geral: {accuracy:.2f}')

Accuracy: 0.74


In [20]:
# calculando novamente os resultados de acurácia para cada tag
correct_tags = {}
total_tags = {}

for result in results:
    if result['expected_tag'] not in total_tags:
        total_tags[result['expected_tag']] = 1
    else:
        total_tags[result['expected_tag']] += 1
    
    if result['correct']:
        if result['expected_tag'] not in correct_tags:
            correct_tags[result['expected_tag']] = 1
        else:
            correct_tags[result['expected_tag']] += 1

accuracy_tags = {}
for tag in total_tags:
    try:
        accuracy_tags[tag] = correct_tags[tag] / total_tags[tag]
    except KeyError:
        accuracy_tags[tag] = 0

# print in descending order
for tag, accuracy in sorted(accuracy_tags.items(), key=lambda x: x[1], reverse=True):
    print(f'TAG: {tag} - ACURÁCIA: {accuracy:.2f}')

TAG: PU - ACURÁCIA: 1.00
TAG: PREP+PROPESS - ACURÁCIA: 0.98
TAG: N - ACURÁCIA: 0.97
TAG: KC - ACURÁCIA: 0.96
TAG: ADV - ACURÁCIA: 0.92
TAG: V - ACURÁCIA: 0.92
TAG: PREP+ADV - ACURÁCIA: 0.89
TAG: PROSUB - ACURÁCIA: 0.88
TAG: PREP - ACURÁCIA: 0.85
TAG: PCP - ACURÁCIA: 0.82
TAG: PROPESS - ACURÁCIA: 0.79
TAG: PREP+ART - ACURÁCIA: 0.75
TAG: PDEN - ACURÁCIA: 0.56
TAG: ADJ - ACURÁCIA: 0.50
TAG: NPROP - ACURÁCIA: 0.49
TAG: PREP+PROADJ - ACURÁCIA: 0.47
TAG: PREP+PROSUB - ACURÁCIA: 0.41
TAG: IN - ACURÁCIA: 0.40
TAG: NUM - ACURÁCIA: 0.12
TAG: ART - ACURÁCIA: 0.08
TAG: PROADJ - ACURÁCIA: 0.05
TAG: KS - ACURÁCIA: 0.00
TAG: CUR - ACURÁCIA: 0.00
TAG: PRO-KS - ACURÁCIA: 0.00
TAG: ADV-KS - ACURÁCIA: 0.00
TAG: PREP+PRO-KS - ACURÁCIA: 0.00


In [1]:
import os

# Função para carregar os dados e separar tokens e tags
def load_macmorpho_data(filepath):
    sentences, labels = [], []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            tokens, tags = [], []
            for pair in line.split():
                token, tag = pair.rsplit("_", maxsplit=1)  # Separar o token e a tag
                tokens.append(token)
                tags.append(tag)
            sentences.append(tokens)
            labels.append(tags)
    return sentences, labels

In [3]:
train_sentences, train_labels = load_macmorpho_data("./data/macmorpho-train.txt")
dev_sentences, dev_labels = load_macmorpho_data("./data/macmorpho-dev.txt")
test_sentences, test_labels = load_macmorpho_data("./data/macmorpho-test.txt")

In [4]:
all_labels = set(tag for tags in train_labels for tag in tags)
tag2id = {tag: idx for idx, tag in enumerate(sorted(all_labels))}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [5]:
from transformers import AutoTokenizer

# Carregar o tokenizer
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

# Função para tokenizar as sentenças e alinhar os rótulos
def tokenize_and_align_labels(sentences, labels, tokenizer):
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )
    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Mapear tokens originais para IDs de palavras
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignorar tokens especiais
            else:
                label_ids.append(tag2id[label[word_idx]])  # Mapear rótulo
        aligned_labels.append(label_ids)
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# Tokenizar e alinhar rótulos para os datasets
train_data = tokenize_and_align_labels(train_sentences, train_labels, tokenizer)
dev_data = tokenize_and_align_labels(dev_sentences, dev_labels, tokenizer)
test_data = tokenize_and_align_labels(test_sentences, test_labels, tokenizer)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from transformers import AutoModelForTokenClassification

# Carregar o modelo com uma cabeça de classificação de tokens
model = AutoModelForTokenClassification.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    num_labels=len(tag2id)
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Congelar todas as camadas do encoder, exceto as últimas
for name, param in model.bert.named_parameters():
    if not name.startswith("encoder.layer.11"):  # Ajustar para treinar somente a última camada
        param.requires_grad = False


In [10]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset

# Converter os dados para o formato Hugging Face Dataset
train_dataset = Dataset.from_dict(train_data)
dev_dataset = Dataset.from_dict(dev_data)

# Configuração dos argumentos de treinamento
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="epoch",
    dataloader_num_workers=4,  # Increase this based on your CPU core count
)

# Configurar o Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [11]:
trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss
1,0.1723,0.101586
2,0.1011,0.089282
3,0.09,0.086317


	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/to

TrainOutput(global_step=14232, training_loss=0.12114190653242912, metrics={'train_runtime': 9829.4963, 'train_samples_per_second': 11.582, 'train_steps_per_second': 1.448, 'total_flos': 7438380641998848.0, 'train_loss': 0.12114190653242912, 'epoch': 3.0})

In [12]:
# salvar o novo modelo que foi treinado agora
save_directory = "./trained_model"

trainer.save_model(save_directory)


In [24]:
# Load the fine-tuned model and tokenizer
model_path = "./trained_model"  # Adjust based on your saved model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Initialize the token classification pipeline
pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, grouped_entities=True)

# Assuming dataT contains a list of tuples [(word, expected_tag), ...]
# If your dataset isn't in this format, preprocess it accordingly

# Initialize the results list
results = []

# Process each word in the dataset
for word, expected_tag in tqdm(dataT, desc="Processing words"):
    # Make prediction using the pipeline
    prediction = pipe(word)
    
    # Get the predicted tag (entity_group gives the label name)
    predicted_tag = prediction[0]['entity_group'] if prediction else None
    
    # Prepare result data
    result_data = {
        'word': word,
        'expected_tag': expected_tag,
        'predicted_tag': predicted_tag,
        'correct': predicted_tag == expected_tag,
    }
    
    # Append result to the results list
    results.append(result_data)
    
    if len(results) == 10:
        break

# Optionally, save results to a file
import json
with open("./results/results_bert_trained.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

Device set to use cpu
Processing words:   0%|          | 0/387877 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing words:   0%|          | 9/387877 [00:00<2:09:37, 49.87it/s]
