In [2]:
from transformers import pipeline

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the tokenizer and the token-classification model from the same
# Hugging Face Hub repository id.
tokenizer = AutoTokenizer.from_pretrained("pucpr-br/postagger-bio-portuguese")
model = AutoModelForTokenClassification.from_pretrained("pucpr-br/postagger-bio-portuguese")

# `grouped_entities=True` is a deprecated flag; `aggregation_strategy="simple"`
# is its documented replacement and still produces result dicts that carry an
# 'entity_group' key, which the evaluation loop below reads.
nlp_token_class = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")




In [9]:
# Read the test split into memory, one string per line of the file.
# The encoding is pinned so decoding of the accented Portuguese text does not
# depend on the platform's default codec.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with open('data/macmorpho-test.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
    

In [15]:
# Show the first four raw lines to check the "token_TAG" layout.
print(lines[:4])

['Salto_N sete_ADJ\n', 'O_ART grande_ADJ assunto_N da_PREP+ART semana_N em_PREP Nova_NPROP York_NPROP é_V a_ART edição_N da_PREP+ART revista_N "_PU New_NPROP Yorker_NPROP "_PU que_PRO-KS está_V nas_PREP+ART bancas_N ._PU\n', 'Número_N duplo_ADJ especial_ADJ ,_PU é_V inteirinho_ADJ dedicado_PCP a_PREP ensaios_N sobre_PREP moda_N ._PU\n', 'A_ART endiabrada_PCP editora_N Tina_NPROP Brown_NPROP ex_N da_PREP+ART "_PU Vanity_NPROP Fair_NPROP -_PU convocou_V até_PDEN John_NPROP Updike_NPROP e_KC Salman_NPROP Rushdie_NPROP para_PREP discorrer_V sobre_PREP o_ART tema_N ._PU\n']


In [58]:
# Flatten the corpus into two parallel lists: words[i] is a token and tags[i]
# is the tag that was attached to it in the "token_TAG" notation.
words = []
tags = []
for line in lines:
    # str.split() with no arguments splits on whitespace runs and discards the
    # trailing newline, so the pieces need no further strip().
    for word_tag in line.split():
        # Split on the LAST underscore only: a token that itself contains '_'
        # would otherwise yield more than two parts and make the unpacking
        # raise ValueError. An item with no '_' at all still raises ValueError.
        word, tag = word_tag.rsplit('_', 1)
        words.append(word)
        tags.append(tag)

In [21]:
# Peek at the first four entries of each parallel list (words, then tags).
for preview in (words, tags):
    print(preview[0:4])

['Salto', 'sete', 'O', 'grande']
['N', 'ADJ', 'ART', 'ADJ']


In [40]:
from tqdm import tqdm

total_words = len(words)
results = []

# Classify every word on its own and record whether the predicted tag equals
# the gold tag from the corpus.
for word, tag in tqdm(zip(words, tags), total=total_words, desc="Processing words"):
    prediction = nlp_token_class(word)

    # The pipeline returns a list of group dicts. If that list is empty,
    # prediction[0] would raise IndexError; record None instead, which never
    # equals a gold tag and is serialised as JSON null.
    predicted_tag = prediction[0]['entity_group'] if prediction else None

    results.append({
        'word': word,
        # Key names match what the per-tag accuracy cell reads back from this
        # model's JSON file ('expected_tag') and what the second evaluation
        # loop in this notebook writes.
        'expected_tag': tag,
        'predicted_tag': predicted_tag,
        'correct': predicted_tag == tag,
    })

Processing words: 100%|██████████| 178373/178373 [46:46<00:00, 63.56it/s] 


In [15]:
import json


In [41]:
# Persist the per-word evaluation records as indented JSON.
results_path = 'results/results_postagger-bio-portuguese.json'
with open(results_path, 'w') as results_file:
    json.dump(results, results_file, indent=4)

In [43]:
# Overall accuracy: number of records flagged correct divided by the number
# of evaluated words (total_words comes from the inference cell).
correct_predictions = sum(1 for record in results if record['correct'])

accuracy = correct_predictions / total_words
print(f'Acurácia geral: {accuracy:.2f}')

Accuracy: 0.68


In [19]:
# Reload the stored per-word records for the first model.
with open('results/results_postagger-bio-portuguese.json', 'r') as results_file:
    results = json.load(results_file)

# Tally, per gold tag, how many words carry it and how many of those were
# predicted correctly.
correct_tags = {}
total_tags = {}
for record in results:
    gold = record['expected_tag']
    total_tags[gold] = total_tags.get(gold, 0) + 1
    if record['correct']:
        correct_tags[gold] = correct_tags.get(gold, 0) + 1

# A tag with no correct prediction is absent from correct_tags and scores 0.
accuracy_tags = {
    tag: (correct_tags[tag] / seen if tag in correct_tags else 0)
    for tag, seen in total_tags.items()
}

# Report tags from highest to lowest accuracy.
ranked = sorted(accuracy_tags.items(), key=lambda pair: pair[1], reverse=True)
for tag, accuracy in ranked:
    print(f'TAG: {tag} - ACURÁCIA: {accuracy:.2f}')

TAG: PU - ACURÁCIA: 1.00
TAG: PREP+PROPESS - ACURÁCIA: 0.98
TAG: N - ACURÁCIA: 0.97
TAG: KC - ACURÁCIA: 0.96
TAG: PROPESS - ACURÁCIA: 0.93
TAG: ADV - ACURÁCIA: 0.92
TAG: V - ACURÁCIA: 0.92
TAG: PROSUB - ACURÁCIA: 0.88
TAG: PREP - ACURÁCIA: 0.85
TAG: PREP+ADV - ACURÁCIA: 0.84
TAG: PCP - ACURÁCIA: 0.82
TAG: PREP+ART - ACURÁCIA: 0.75
TAG: PDEN - ACURÁCIA: 0.55
TAG: ADJ - ACURÁCIA: 0.50
TAG: NPROP - ACURÁCIA: 0.48
TAG: PREP+PROADJ - ACURÁCIA: 0.47
TAG: PREP+PROSUB - ACURÁCIA: 0.41
TAG: IN - ACURÁCIA: 0.40
TAG: ART - ACURÁCIA: 0.08
TAG: PROADJ - ACURÁCIA: 0.05
TAG: NUM - ACURÁCIA: 0.04
TAG: KS - ACURÁCIA: 0.00
TAG: CUR - ACURÁCIA: 0.00
TAG: PRO-KS - ACURÁCIA: 0.00
TAG: ADV-KS - ACURÁCIA: 0.00
TAG: PREP+PRO-KS - ACURÁCIA: 0.00


## Novo teste: outro modelo com fine-tuning

In [2]:
# Load the training split, one string per line of the file.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with open('data/macmorpho-train.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Fraction of the training lines kept for this experiment: the first half.
TRAIN_FRACTION = 0.5

# Build (word, tag) pairs from the retained "token_TAG" items.
dataT = []
for line in lines[:int(len(lines) * TRAIN_FRACTION)]:
    # split() without arguments drops surrounding whitespace and the newline.
    for word_tag in line.split():
        # Split on the last underscore only, so a token that itself contains
        # '_' does not break the two-name unpacking with a ValueError.
        word, tag = word_tag.rsplit('_', 1)
        dataT.append((word, tag))


In [3]:
# Display the first five (word, tag) pairs.
dataT[:5]

[('Jersei', 'N'),
 ('atinge', 'V'),
 ('média', 'N'),
 ('de', 'PREP'),
 ('Cr$', 'CUR')]

In [4]:
# Collect the distinct tags that occur in the training pairs.
tags = {label for _, label in dataT}
tags

{'ADJ',
 'ADV',
 'ADV-KS',
 'ART',
 'CUR',
 'IN',
 'KC',
 'KS',
 'N',
 'NPROP',
 'NUM',
 'PCP',
 'PDEN',
 'PREP',
 'PREP+ADV',
 'PREP+ART',
 'PREP+PRO-KS',
 'PREP+PROADJ',
 'PREP+PROPESS',
 'PREP+PROSUB',
 'PRO-KS',
 'PROADJ',
 'PROPESS',
 'PROSUB',
 'PU',
 'V'}

In [19]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Token-classification pipeline for the second model; the model weights and
# the tokenizer are both loaded from the same Hub repository id.
pipe = pipeline(
    task="token-classification",
    model="lisaterumi/postagger-portuguese",
    tokenizer="lisaterumi/postagger-portuguese",
    aggregation_strategy="simple",
)

Device set to use cpu


In [36]:
from tqdm import tqdm  # Import for the progress bar

# Per-word evaluation records for the second model.
results = []

# Classify each training word on its own and compare with its gold tag.
for word, expected_tag in tqdm(dataT, desc="Processing words"):
    prediction = pipe(word)

    # The pipeline yields a list of group dicts; the first group's label is
    # taken as the prediction. If the list is empty, prediction[0] would raise
    # IndexError and abort the whole run, so record None instead (it never
    # equals a gold tag and is serialised as JSON null).
    predicted_tag = prediction[0]['entity_group'] if prediction else None

    results.append({
        'word': word,
        'expected_tag': expected_tag,
        'predicted_tag': predicted_tag,
        'correct': predicted_tag == expected_tag,
    })

# Persist the records so the accuracy cells can reload them from disk.
import json

with open('results/results_postagger-portuguese.json', 'w') as file:
    json.dump(results, file, indent=4)
        

Processing words: 100%|██████████| 387877/387877 [1:34:09<00:00, 68.66it/s]


In [4]:
# Reload the stored records for the second model and compute overall accuracy.
import json

with open('results/results_postagger-portuguese.json', 'r') as results_file:
    results = json.load(results_file)

total_words = len(results)
correct_predictions = sum(1 for record in results if record['correct'])

accuracy = correct_predictions / total_words
print(f'Acurácia geral: {accuracy:.2f}')

Accuracy: 0.74


In [20]:
# Reload the stored per-word records for the second model.
with open('results/results_postagger-portuguese.json', 'r') as handle:
    results = json.load(handle)

# Count occurrences and correct predictions for every gold tag.
correct_tags = {}
total_tags = {}
for entry in results:
    label = entry['expected_tag']
    total_tags.setdefault(label, 0)
    total_tags[label] += 1
    if not entry['correct']:
        continue
    correct_tags.setdefault(label, 0)
    correct_tags[label] += 1

# Per-tag accuracy; a tag that was never predicted correctly gets 0.
accuracy_tags = {}
for tag, n_total in total_tags.items():
    n_correct = correct_tags.get(tag)
    accuracy_tags[tag] = 0 if n_correct is None else n_correct / n_total

# Print the tags ordered from highest to lowest accuracy.
by_accuracy = sorted(accuracy_tags.items(), key=lambda kv: kv[1], reverse=True)
for tag, accuracy in by_accuracy:
    print(f'TAG: {tag} - ACURÁCIA: {accuracy:.2f}')

TAG: PU - ACURÁCIA: 1.00
TAG: PREP+PROPESS - ACURÁCIA: 0.98
TAG: N - ACURÁCIA: 0.97
TAG: KC - ACURÁCIA: 0.96
TAG: ADV - ACURÁCIA: 0.92
TAG: V - ACURÁCIA: 0.92
TAG: PREP+ADV - ACURÁCIA: 0.89
TAG: PROSUB - ACURÁCIA: 0.88
TAG: PREP - ACURÁCIA: 0.85
TAG: PCP - ACURÁCIA: 0.82
TAG: PROPESS - ACURÁCIA: 0.79
TAG: PREP+ART - ACURÁCIA: 0.75
TAG: PDEN - ACURÁCIA: 0.56
TAG: ADJ - ACURÁCIA: 0.50
TAG: NPROP - ACURÁCIA: 0.49
TAG: PREP+PROADJ - ACURÁCIA: 0.47
TAG: PREP+PROSUB - ACURÁCIA: 0.41
TAG: IN - ACURÁCIA: 0.40
TAG: NUM - ACURÁCIA: 0.12
TAG: ART - ACURÁCIA: 0.08
TAG: PROADJ - ACURÁCIA: 0.05
TAG: KS - ACURÁCIA: 0.00
TAG: CUR - ACURÁCIA: 0.00
TAG: PRO-KS - ACURÁCIA: 0.00
TAG: ADV-KS - ACURÁCIA: 0.00
TAG: PREP+PRO-KS - ACURÁCIA: 0.00
