In [2]:
from transformers import pipeline

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hub identifier shared by the tokenizer and the model so the two cannot drift apart.
MODEL_NAME = "pucpr-br/postagger-bio-portuguese"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# `grouped_entities=True` is deprecated in transformers; `aggregation_strategy="simple"`
# is its documented replacement and keeps the `entity_group` key in each returned item.
nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")




In [9]:
# Read the whole file into a list with one raw line (newline included) per entry.
# The encoding is pinned to UTF-8 so accented characters decode identically on
# every platform instead of depending on the locale's default codec.
with open('data/macmorpho-test.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
    

In [15]:
# Peek at the first four raw lines to check the `word_TAG` layout.
print(lines[:4])

['Salto_N sete_ADJ\n', 'O_ART grande_ADJ assunto_N da_PREP+ART semana_N em_PREP Nova_NPROP York_NPROP é_V a_ART edição_N da_PREP+ART revista_N "_PU New_NPROP Yorker_NPROP "_PU que_PRO-KS está_V nas_PREP+ART bancas_N ._PU\n', 'Número_N duplo_ADJ especial_ADJ ,_PU é_V inteirinho_ADJ dedicado_PCP a_PREP ensaios_N sobre_PREP moda_N ._PU\n', 'A_ART endiabrada_PCP editora_N Tina_NPROP Brown_NPROP ex_N da_PREP+ART "_PU Vanity_NPROP Fair_NPROP -_PU convocou_V até_PDEN John_NPROP Updike_NPROP e_KC Salman_NPROP Rushdie_NPROP para_PREP discorrer_V sobre_PREP o_ART tema_N ._PU\n']


In [58]:
# Flatten the corpus into two parallel lists: the tokens and their gold tags.
words = []
tags = []
for line in lines:
    # split() without arguments splits on any whitespace and discards the
    # trailing newline, so no extra stripping is needed afterwards.
    for word_tag in line.split():
        # Split on the LAST underscore only: a token that itself contains '_'
        # would otherwise yield more than two parts and break the unpacking.
        word, tag = word_tag.rsplit('_', 1)
        words.append(word)
        tags.append(tag)

In [21]:
# Show the first four tokens and, below them, their matching gold tags.
print(words[:4])
print(tags[:4])

['Salto', 'sete', 'O', 'grande']
['N', 'ADJ', 'ART', 'ADJ']


In [40]:
from tqdm import tqdm

total_words = len(words)
results = []

# NOTE(review): each token is sent to the model on its own, so the tagger never
# sees sentence context — this likely depresses accuracy for ambiguous tokens;
# confirm whether sentence-level evaluation was intended.
for word, tag in tqdm(zip(words, tags), total=total_words, desc="Processing words"):
    prediction = nlp_token_class(word)

    # Only the first returned group is compared against the gold tag. If the
    # pipeline yields no group at all, record None (counted as a miss) instead
    # of raising IndexError and losing the whole long-running evaluation.
    predicted_tag = prediction[0]['entity_group'] if prediction else None

    results.append({
        'word': word,
        'tag': tag,
        'prediction': predicted_tag,
        'correct': predicted_tag == tag,
    })

Processing words: 100%|██████████| 178373/178373 [46:46<00:00, 63.56it/s] 


In [15]:
import json


In [41]:
# Persist the per-token evaluation records as JSON indented by four spaces.

with open('results/results_postagger-bio-portuguese.json', 'w') as file:
    serialized = json.dumps(results, indent=4)
    file.write(serialized)

In [43]:
# Accuracy = share of evaluation records whose 'correct' flag is truthy.
correct_predictions = sum(1 for result in results if result['correct'])

# Take the denominator from `results` itself rather than from a variable set in
# a different cell, so the ratio stays right whatever `results` currently holds.
evaluated_count = len(results)

# With no records there is nothing to measure: report NaN instead of raising
# ZeroDivisionError.
accuracy = correct_predictions / evaluated_count if evaluated_count else float('nan')
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.68


In [17]:
from collections import Counter

# Reload the saved evaluation records and count, per gold tag, how many tokens
# were predicted incorrectly.
with open('results/results_postagger-bio-portuguese.json', 'r', encoding='utf-8') as file:
    results = json.load(file)

# Counter is a dict subclass, so existing dict-style lookups keep working.
incorrect_tags = Counter(
    result['tag'] for result in results if not result['correct']
)

# most_common() yields (tag, count) pairs from the highest count to the lowest;
# tags with equal counts keep the order in which they were first encountered.
for tag, count in incorrect_tags.most_common():
    print(f'{tag}: {count}')

PREP: 13273
ART: 10103
NPROP: 8091
ADJ: 5540
NUM: 2388
KS: 2272
PRO-KS: 2195
V: 2101
PROADJ: 1869
PROPESS: 1268
PU: 1178
N: 1177
PREP+ART: 1121
PCP: 842
ADV: 638
PROSUB: 584
PDEN: 498
CUR: 296
ADV-KS: 230
PREP+PROADJ: 207
KC: 173
PREP+PROSUB: 88
IN: 76
PREP+PRO-KS: 58
PREP+PROPESS: 12
PREP+ADV: 3


## Novo teste modelo com zero shot

In [2]:
# Read the whole file into a list with one raw line per entry; UTF-8 is pinned
# so accented characters decode the same way on every platform.
with open('data/macmorpho-train.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Build (word, tag) pairs from the first half (50%) of the lines only.
dataT = []
for line in lines[:len(lines) // 2]:
    # split() without arguments drops all surrounding whitespace, newline included.
    for word_tag in line.split():
        # Split on the LAST underscore so a token containing '_' still unpacks
        # into exactly (word, tag).
        word, tag = word_tag.rsplit('_', 1)
        dataT.append((word, tag))


In [3]:
# Display the first five (word, tag) pairs.
dataT[:5]

[('Jersei', 'N'),
 ('atinge', 'V'),
 ('média', 'N'),
 ('de', 'PREP'),
 ('Cr$', 'CUR')]

In [4]:
# Collect every distinct tag that occurs in dataT and display the resulting set.
tags = {pair_tag for _, pair_tag in dataT}
tags

{'ADJ',
 'ADV',
 'ADV-KS',
 'ART',
 'CUR',
 'IN',
 'KC',
 'KS',
 'N',
 'NPROP',
 'NUM',
 'PCP',
 'PDEN',
 'PREP',
 'PREP+ADV',
 'PREP+ART',
 'PREP+PRO-KS',
 'PREP+PROADJ',
 'PREP+PROPESS',
 'PREP+PROSUB',
 'PRO-KS',
 'PROADJ',
 'PROPESS',
 'PROSUB',
 'PU',
 'V'}

In [19]:
# Build a token-classification pipeline straight from the Hub checkpoint name;
# the same identifier is used for both the model and its tokenizer.
from transformers import pipeline

pipe = pipeline(
    "token-classification",
    model="lisaterumi/postagger-portuguese",
    tokenizer="lisaterumi/postagger-portuguese",
    aggregation_strategy="simple",
)

Device set to use cpu


In [36]:
import json

from tqdm import tqdm  # progress bar for the long-running loop

# One record per token: the word, its gold tag, the predicted tag and a match flag.
results = []

# NOTE(review): each token is sent to the model on its own, without sentence
# context — confirm this is the intended evaluation setup.
for word, expected_tag in tqdm(dataT, desc="Processing words"):
    prediction = pipe(word)

    # Only the first returned group is used. If the pipeline yields no group,
    # store None (counted as a miss) rather than raising IndexError and
    # discarding the progress of the whole run.
    predicted_tag = prediction[0]['entity_group'] if prediction else None

    results.append({
        'word': word,
        'expected_tag': expected_tag,
        'predicted_tag': predicted_tag,
        'correct': predicted_tag == expected_tag,
    })

# Persist the records so later cells can analyse them without re-running the model.
with open('results/results_postagger-portuguese.json', 'w') as file:
    json.dump(results, file, indent=4)
        

Processing words: 100%|██████████| 387877/387877 [1:34:09<00:00, 68.66it/s]


In [4]:
# Compute overall accuracy from the saved evaluation records.
import json

with open('results/results_postagger-portuguese.json', 'r', encoding='utf-8') as file:
    results = json.load(file)

total_words = len(results)
# Count the records whose 'correct' flag is truthy.
correct_predictions = sum(1 for result in results if result['correct'])

# An empty results file has no meaningful accuracy: report NaN instead of
# raising ZeroDivisionError.
accuracy = correct_predictions / total_words if total_words else float('nan')
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.74


In [19]:
from collections import Counter

# Reload the saved evaluation records and count, per expected tag, how many
# tokens were predicted incorrectly.
with open('results/results_postagger-portuguese.json', 'r', encoding='utf-8') as file:
    results = json.load(file)

# Counter is a dict subclass, so existing dict-style lookups keep working.
incorrect_tags = Counter(
    result['expected_tag'] for result in results if not result['correct']
)

# most_common() yields (tag, count) pairs from the highest count to the lowest;
# tags with equal counts keep the order in which they were first encountered.
for tag, count in incorrect_tags.most_common():
    print(f'{tag}: {count}')

ART: 26804
NPROP: 19359
ADJ: 8135
NUM: 7420
PREP+ART: 6311
PREP: 5806
PROADJ: 4934
KS: 4699
PRO-KS: 3961
V: 3192
N: 2356
PCP: 1556
CUR: 1485
PDEN: 936
PROPESS: 707
ADV: 621
ADV-KS: 354
KC: 349
PREP+PROADJ: 345
PROSUB: 231
PREP+PROSUB: 127
PREP+PRO-KS: 66
IN: 12
PREP+PROPESS: 3
PREP+ADV: 2


######################################


In [3]:
# Read the whole file into a list with one raw line per entry; UTF-8 is pinned
# so accented characters decode the same way on every platform.
with open('data/macmorpho-train.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Build (word, tag) pairs from the first half (50%) of the lines only.
dataT = []
for line in lines[:len(lines) // 2]:
    # split() without arguments drops all surrounding whitespace, newline included.
    for word_tag in line.split():
        # Split on the LAST underscore so a token containing '_' still unpacks
        # into exactly (word, tag).
        word, tag = word_tag.rsplit('_', 1)
        dataT.append((word, tag))

In [4]:
# Read the whole file into a list with one raw line per entry; UTF-8 is pinned
# so accented characters decode the same way on every platform.
with open('data/macmorpho-dev.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Build (word, tag) pairs from the first 60% of the lines only.
dataD = []
for line in lines[0:int(len(lines)*0.6)]:
    # split() without arguments drops all surrounding whitespace, newline included.
    for word_tag in line.split():
        # Split on the LAST underscore so a token containing '_' still unpacks
        # into exactly (word, tag).
        word, tag = word_tag.rsplit('_', 1)
        dataD.append((word, tag))


In [5]:
# Read the whole file into a list with one raw line per entry; UTF-8 is pinned
# so accented characters decode the same way on every platform.
with open('data/macmorpho-test.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()


# Build (word, tag) pairs from the first half (50%) of the lines only.
dataTest = []
for line in lines[:len(lines) // 2]:
    # split() without arguments drops all surrounding whitespace, newline included.
    for word_tag in line.split():
        # Split on the LAST underscore so a token containing '_' still unpacks
        # into exactly (word, tag).
        word, tag = word_tag.rsplit('_', 1)
        dataTest.append((word, tag))

def get_label_list(data):
  """
  Return the distinct tags found in `data`, sorted in ascending order.

  Args:
    data: An iterable of two-item (word, tag) pairs; only the tag is used.

  Returns:
    A list holding each tag exactly once, in sorted order.
  """
  distinct_tags = {pair_tag for _, pair_tag in data}
  return sorted(distinct_tags)

# Sorted list of the distinct tags present in dataTest.
label_list = get_label_list(dataTest)