In [2]:
from transformers import pipeline

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer and weights of the Portuguese POS-tagging checkpoint under evaluation.
tokenizer = AutoTokenizer.from_pretrained("pucpr-br/postagger-bio-portuguese")

model = AutoModelForTokenClassification.from_pretrained("pucpr-br/postagger-bio-portuguese")

# `grouped_entities=True` is the deprecated way of requesting the "simple"
# aggregation strategy; the explicit argument yields the same `entity_group`
# output without relying on the deprecated flag.
nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")




In [9]:
# Read the tagged test corpus, one sentence per line.
# The encoding is given explicitly so the accented Portuguese characters decode
# the same way regardless of the platform's default locale.
with open('data/macmorpho-test.txt', 'r', encoding='utf-8') as file:
    # store file content in a list
    lines = file.readlines()
    

In [15]:
# Peek at the first four raw sentences to confirm the word_TAG layout.
print(lines[:4])

['Salto_N sete_ADJ\n', 'O_ART grande_ADJ assunto_N da_PREP+ART semana_N em_PREP Nova_NPROP York_NPROP é_V a_ART edição_N da_PREP+ART revista_N "_PU New_NPROP Yorker_NPROP "_PU que_PRO-KS está_V nas_PREP+ART bancas_N ._PU\n', 'Número_N duplo_ADJ especial_ADJ ,_PU é_V inteirinho_ADJ dedicado_PCP a_PREP ensaios_N sobre_PREP moda_N ._PU\n', 'A_ART endiabrada_PCP editora_N Tina_NPROP Brown_NPROP ex_N da_PREP+ART "_PU Vanity_NPROP Fair_NPROP -_PU convocou_V até_PDEN John_NPROP Updike_NPROP e_KC Salman_NPROP Rushdie_NPROP para_PREP discorrer_V sobre_PREP o_ART tema_N ._PU\n']


In [58]:
# Flatten every sentence into two parallel lists: the tokens and their gold tags.
words = []
tags = []
for line in lines:
    # Tokens are whitespace-separated; split() also discards the trailing newline,
    # so no further stripping of the tag is needed.
    words_tags = line.split()
    for word_tag in words_tags:
        # The tag is whatever follows the LAST underscore. rsplit keeps tokens that
        # themselves contain "_" intact instead of failing the two-way unpack.
        word, tag = word_tag.rsplit('_', 1)
        words.append(word)
        tags.append(tag)

In [21]:
# Show the first four tokens next to their gold tags as a sanity check.
print(words[:4])
print(tags[:4])

['Salto', 'sete', 'O', 'grande']
['N', 'ADJ', 'ART', 'ADJ']


In [40]:
from tqdm import tqdm

total_words = len(words)
results = []

# Each token is sent to the tagger on its own (one pipeline call per word) and the
# first returned group is compared with the gold tag.
for word, tag in tqdm(zip(words, tags), total=total_words, desc="Processing words"):
    prediction = nlp_token_class(word)
    # Guard against an empty result: indexing [0] on it would abort the whole
    # (long-running) loop. Such a word is recorded as a miss with no predicted tag.
    predicted_tag = prediction[0]['entity_group'] if prediction else None
    results.append({
        'word': word,
        'tag': tag,
        'prediction': predicted_tag,
        'correct': predicted_tag == tag,
    })

Processing words: 100%|██████████| 178373/178373 [46:46<00:00, 63.56it/s] 


In [41]:
# store results in a json file
import json
import os

# Create the output folder first so the write does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs('results', exist_ok=True)

with open('results/results_postagger-bio-portuguese.json', 'w') as file:
    json.dump(results, file, indent=4)

In [43]:
# Accuracy = number of entries flagged correct, divided by the number of words evaluated.
correct_predictions = sum(1 for entry in results if entry['correct'])

accuracy = correct_predictions / total_words
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.68


## Novo teste modelo com zero shot

In [4]:
# Read the tagged training corpus; explicit UTF-8 keeps accented characters
# independent of the platform's default encoding.
with open('data/macmorpho-train.txt', 'r', encoding='utf-8') as file:
    # store file content in a list
    lines = file.readlines()

# separate the words from the tags
dataT = []
# use only the first 50% of the sentences (the slice below stops at len(lines) * 0.5)
for line in lines[0:int(len(lines)*0.5)]:
    words_tags = line.split()
    for word_tag in words_tags:
        # Split on the LAST underscore so tokens containing "_" do not break the unpack.
        word, tag = word_tag.rsplit('_', 1)
        dataT.append((word, tag))

In [32]:
# Display the first five (word, tag) pairs.
dataT[:5]

[('Jersei', 'N'),
 ('atinge', 'V'),
 ('média', 'N'),
 ('de', 'PREP'),
 ('Cr$', 'CUR')]

In [5]:
# Distinct tags occurring in the loaded training pairs.
tags = {tag for _, tag in dataT}
tags

{'ADJ',
 'ADV',
 'ADV-KS',
 'ART',
 'CUR',
 'IN',
 'KC',
 'KS',
 'N',
 'NPROP',
 'NUM',
 'PCP',
 'PDEN',
 'PREP',
 'PREP+ADV',
 'PREP+ART',
 'PREP+PRO-KS',
 'PREP+PROADJ',
 'PREP+PROPESS',
 'PREP+PROSUB',
 'PRO-KS',
 'PROADJ',
 'PROPESS',
 'PROSUB',
 'PU',
 'V'}

In [19]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Token-classification pipeline; model and tokenizer come from the same Hub checkpoint.
pipe = pipeline(
    "token-classification",
    model="lisaterumi/postagger-portuguese",
    tokenizer="lisaterumi/postagger-portuguese",
    aggregation_strategy="simple",
)

Device set to use cpu


In [36]:
from tqdm import tqdm  # Import for the progress bar

# Initialize results list to store comparison results
results = []

# Each word is passed to the pipeline on its own and the first returned group is
# compared with the expected tag.
for word, expected_tag in tqdm(dataT, desc="Processing words"):
    # Generate the model's prediction for the current word
    prediction = pipe(word)

    # Guard against an empty result: indexing [0] on it would abort the whole
    # (long-running) loop. Such a word is recorded as a miss with no predicted tag.
    predicted_tag = prediction[0]['entity_group'] if prediction else None

    # Store evaluation data
    results.append({
        'word': word,
        'expected_tag': expected_tag,
        'predicted_tag': predicted_tag,
        'correct': predicted_tag == expected_tag,
    })

# store results in a json file
import json
import os

# Create the output folder first so the write cannot fail after the long run above.
os.makedirs('results', exist_ok=True)

with open('results/results_postagger-portuguese.json', 'w') as file:
    json.dump(results, file, indent=4)
        

Processing words: 100%|██████████| 387877/387877 [1:34:09<00:00, 68.66it/s]


In [4]:
# Reload the saved evaluation records and report the share marked correct.
import json

with open('results/results_postagger-portuguese.json', 'r') as file:
    results = json.load(file)

total_words = len(results)
correct_predictions = sum(1 for entry in results if entry['correct'])

accuracy = correct_predictions / total_words
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.74


######################################


In [3]:
# Read the tagged training corpus; explicit UTF-8 keeps accented characters
# independent of the platform's default encoding.
with open('data/macmorpho-train.txt', 'r', encoding='utf-8') as file:
    # store file content in a list
    lines = file.readlines()

# separate the words from the tags
dataT = []
# use only the first 50% of the sentences (the slice below stops at len(lines) * 0.5)
for line in lines[0:int(len(lines)*0.5)]:
    words_tags = line.split()
    for word_tag in words_tags:
        # Split on the LAST underscore so tokens containing "_" do not break the unpack.
        word, tag = word_tag.rsplit('_', 1)
        dataT.append((word, tag))

In [4]:
# Read the tagged dev corpus; explicit UTF-8 keeps accented characters
# independent of the platform's default encoding.
with open('data/macmorpho-dev.txt', 'r', encoding='utf-8') as file:
    # store file content in a list
    lines = file.readlines()

# separate the words from the tags
dataD = []
# keep only the first 60% of the sentences
for line in lines[0:int(len(lines)*0.6)]:
    words_tags = line.split()
    for word_tag in words_tags:
        # Split on the LAST underscore so tokens containing "_" do not break the unpack.
        word, tag = word_tag.rsplit('_', 1)
        dataD.append((word, tag))


In [5]:
# Read the tagged test corpus; explicit UTF-8 keeps accented characters
# independent of the platform's default encoding.
with open('data/macmorpho-test.txt', 'r', encoding='utf-8') as file:
    # store file content in a list
    lines = file.readlines()


dataTest = []
# keep only the first 50% of the sentences
for line in lines[0:int(len(lines)*0.5)]:
    words_tags = line.split()
    for word_tag in words_tags:
        # Split on the LAST underscore so tokens containing "_" do not break the unpack.
        word, tag = word_tag.rsplit('_', 1)
        dataTest.append((word, tag))

def get_label_list(data):
  """
  Collect the distinct tags found in a sequence of (word, tag) pairs.

  Args:
    data: Iterable of 2-tuples; the second element of each pair is the tag.

  Returns:
    The distinct tags as a list in ascending sorted order.
  """
  # A set comprehension removes duplicates; sorted() returns the ordered list.
  return sorted({tag for _, tag in data})

# Build the label inventory from every loaded split, not only the test subset:
# a tag occurring only in the train or dev pairs would otherwise be missing from
# the tag-to-id mapping derived from this list, and encoding that split would fail.
label_list = get_label_list(dataT + dataD + dataTest)

In [6]:
import torch
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline

# Checkpoint identifier of the pre-trained Portuguese BERT model; it is used here
# to load the matching tokenizer.
model_name = "neuralmind/bert-large-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# You can use Hugging Face's pipeline for token classification



  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Forward lookup: POS tag -> integer class id (its position in label_list).
tag2id = dict(zip(label_list, range(len(label_list))))

# Reverse lookup: integer class id -> POS tag.
id2tag = dict(enumerate(label_list))

In [8]:

# Token-classification model with one output class per entry of tag2id; both
# label mappings are handed to the model configuration.
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(tag2id),
    label2id=tag2id,
    id2label=id2tag,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from datasets import Dataset
def prepare_dataset(data):
    """
    Encode (word, tag) pairs as a token-classification `Dataset`.

    Every word becomes one example, padded or truncated to 128 positions. Only
    the positions holding the word's own sub-tokens carry the tag id; special
    and padding positions are labelled -100 so the loss skips them instead of
    being dominated by padding.

    Args:
        data: Iterable of (word, tag) tuples. Each tag must be a key of the
            module-level `tag2id` mapping.

    Returns:
        A `Dataset` with the columns "input_ids", "attention_mask" and "labels",
        one row per input pair.

    Raises:
        KeyError: If a tag is not present in `tag2id`.
    """
    ignore_index = -100  # label value ignored by the token-classification loss

    tokens = []
    labels = []
    attention_masks = []
    for word, tag in data:
        # Plain Python lists are requested (no return_tensors): the values are
        # stored as lists anyway, so a tensor round-trip per word is wasted work.
        tokenized = tokenizer(
            word,
            padding='max_length',
            truncation=True,
            is_split_into_words=True,
            max_length=128,
            return_special_tokens_mask=True,
        )

        input_ids = tokenized['input_ids']
        attention_mask = tokenized['attention_mask']

        # special_tokens_mask is 1 on special/padding positions and 0 on the
        # word's sub-tokens; only the latter receive the real tag id.
        tag_id = tag2id[tag]
        label = [
            ignore_index if is_special else tag_id
            for is_special in tokenized['special_tokens_mask']
        ]

        tokens.append(input_ids)
        attention_masks.append(attention_mask)
        labels.append(label)

    return Dataset.from_dict({
        "input_ids": tokens,
        "attention_mask": attention_masks,
        "labels": labels
    })

# Encode the training pairs (dataT); only this split is prepared in this cell.
train_dataset = prepare_dataset(dataT)

# Encoding of the dev pairs (dataD) is currently disabled:
#dev_dataset = prepare_dataset(dataD)

KeyboardInterrupt: 

In [8]:
# Encode the test pairs (dataTest) with prepare_dataset.
test_dataset = prepare_dataset(dataTest)

In [10]:
from transformers import  Trainer,  TrainingArguments


# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",          # where checkpoints are written
    num_train_epochs=3,              # passes over the training data
    per_device_train_batch_size=16,  # examples per device per training step
    learning_rate=2e-5,              # optimizer learning rate
    weight_decay=0.01,               # weight-decay coefficient
    evaluation_strategy="epoch",     # run evaluation after every epoch
)

# Trainer wiring: trains on train_dataset and evaluates on test_dataset
# (the encoded test subset, not a separate dev split).
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Launch fine-tuning.
trainer.train()




Epoch,Training Loss,Validation Loss


: 