In [1]:
import numpy as np
import csv
import os
import viterbi
import baseline

### Inizializzazione delle variabili

In [2]:
# Module-level containers for the model data; they are overwritten once the
# probabilities file has been read.
tags, words = [], []
emission_P, transition_P = [], []

Una funzione per la lettura dei file .csv con le probabilità

In [3]:
def readProb(reader):
    """Parse a probabilities CSV into tag/word lists and two numpy matrices.

    Rows are scanned in order and empty rows are skipped. A row whose first
    cell is one of the markers below changes the parser state:

    * ``tags`` / ``words``: the *next* row is taken as the list of tags / words.
    * ``emissione`` / ``transizione``: the following rows are parsed as rows of
      floats of the emission / transition matrix, until another marker is met.

    Rows that appear before any matrix marker and are not markers themselves
    are ignored.

    Args:
        reader: an iterator of rows (lists of strings), e.g. a ``csv.reader``.
            It must be a real iterator, because the row following a
            ``tags``/``words`` marker is consumed with ``next``.

    Returns:
        ``(tags, words, transition_P, emission_P)``: two lists of strings and
        two numpy arrays. Anything missing from the input comes back empty.

    Raises:
        ValueError: if a cell of a matrix row cannot be converted to float.
    """
    # Every name below is local to this function (it is assigned here), so it
    # must be initialised up front: otherwise a file that misses a section, or
    # that has a data row before the first marker, raises UnboundLocalError.
    tags, words = [], []
    emission_P, transition_P = [], []
    section = None

    for row in reader:
        if not row:
            continue
        marker = row[0]
        if marker == 'tags':
            # default avoids leaking StopIteration when the marker is the last row
            tags = next(reader, [])
        elif marker == 'words':
            words = next(reader, [])
        elif marker == 'emissione':
            section = 'emissione'
            emission_P = []
        elif marker == 'transizione':
            section = 'transizione'
            transition_P = []
        elif section == 'emissione':
            emission_P.append([float(x) for x in row])
        elif section == 'transizione':
            transition_P.append([float(x) for x in row])

    # convert the nested python lists into numpy matrices
    return tags, words, np.array(transition_P), np.array(emission_P)

### Lettura dei file .csv
Una versione per ogni dataset wikineural

wikineural_en

In [4]:
# Dataset folders are resolved relative to the notebook's working directory.
current_dir = os.getcwd()

# --- probabilities file -------------------------------------------------
file_path = os.path.join(current_dir, 'wikineural_en', 'probabilities.csv')
with open(file_path, encoding='utf-8') as prob:
    reader = csv.reader(prob)
    # NOTE: readProb assigns its own local `section`, so this global is not
    # read by it; it is kept only so the notebook namespace stays unchanged.
    section = None
    tags, words, transition_P, emission_P = readProb(reader)

# --- test file ----------------------------------------------------------
file_path = os.path.join(current_dir, 'wikineural_en', 'test.conllu')
with open(file_path, encoding='utf-8') as test:
    # one string per line, line terminators included
    righe = list(test)

wikineural_es

wikineural_it

## Decoding
Eseguibile con Viterbi, con la baseline semplice o con quella basata su MEMM.

In [5]:
# words of the sentence currently being read
sequence = []

# Walk through every line of the test file. A blank line closes a sentence.
# The extra '' appended at the end acts as a closing blank line, so the last
# sentence is decoded even when the file does not end with an empty line.
for riga in [*righe, '']:
    riga = riga.strip()
    if riga:
        riga = riga.split()
        sequence.append(riga[1])   # column 1 holds the word
    elif sequence:     # blank line with pending words => end of sentence
        # (repeated blank lines are skipped instead of decoding an empty sentence)
        final_sequence = viterbi.viterbi(sequence, emission_P, transition_P, tags, words)
        # alternative decoder:
        #final_sequence = baseline.easy_baseline(sequence, emission_P, tags, words)
        print("frase: ")
        print(sequence)
        print("tags: ")
        print(final_sequence)
        sequence = []
        final_sequence = []

frase: 
['On', 'this', 'occasion', 'he', 'failed', 'to', 'gain', 'the', 'support', 'of', 'the', 'South', 'Wales', 'Miners', "'", 'Federation', 'and', 'had', 'to', 'stand', 'down', '.']
tags: 
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O']
frase: 
['On', 'both', 'these', 'occasions', 'he', 'was', 'backed', 'by', 'the', 'South', 'Wales', 'Miners', "'", 'Federation', ',', 'but', 'he', 'was', 'not', 'successful', '.']
tags: 
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
frase: 
['He', 'also', 'appeared', 'as', 'himself', 'in', 'the', '1996', 'film', '"', 'Eddie', '"', '.']
tags: 
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O']
frase: 
['The', 'Colorado', 'Rockies', 'were', 'created', 'as', 'an', 'expansion', 'franchise', 'in', '1993', 'and', 'Coors', 'Field', 'opened', 'in', '1995', '.']
tags: 
['O', 'B-ORG

## Valutazione

In [6]:
def evaluate(test_sentences, emission_P, transition_P, tags, words, decoder=None):
    """Score a decoder on gold-annotated sentences with token-level metrics.

    Args:
        test_sentences: iterable of sentences, each a sequence of
            ``(word, true_tag)`` pairs.
        emission_P, transition_P, tags, words: model data, passed through
            unchanged to the decoder.
        decoder: callable invoked as
            ``decoder(sequence, emission_P, transition_P, tags, words)`` that
            returns one predicted tag per word. Defaults to
            ``viterbi.viterbi``.

    Returns:
        ``(accuracy, precision, recall, f1_score)``.

        * accuracy is computed over all tokens;
        * precision / recall / F1 only consider entity tokens, i.e. tags that
          start with ``B-`` or ``I-``. A predicted entity tag is a true
          positive only if it equals the gold tag.

        Every metric whose denominator is zero (e.g. empty test set) is 0.0.
    """
    if decoder is None:
        # `viterbi` is the imported module; the decoding function lives inside it.
        decoder = viterbi.viterbi

    correct = 0
    total = 0
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    for sentence in test_sentences:
        sequence = [word for word, true_tag in sentence]
        true_tags = [true_tag for word, true_tag in sentence]
        predicted_tags = decoder(sequence, emission_P, transition_P, tags, words)

        for true_tag, predicted_tag in zip(true_tags, predicted_tags):
            if true_tag == predicted_tag:
                correct += 1
            total += 1

            # predicted entity token: right tag => TP, anything else => FP
            if predicted_tag.startswith(("B-", "I-")):
                if predicted_tag == true_tag:
                    true_positives += 1
                else:
                    false_positives += 1
            # gold entity token that was not predicted exactly => FN
            if true_tag.startswith(("B-", "I-")):
                if true_tag != predicted_tag:
                    false_negatives += 1

    # guard every ratio against an empty denominator
    accuracy = correct / total if total > 0 else 0.0
    predicted_entities = true_positives + false_positives
    gold_entities = true_positives + false_negatives
    precision = true_positives / predicted_entities if predicted_entities > 0 else 0.0
    recall = true_positives / gold_entities if gold_entities > 0 else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return accuracy, precision, recall, f1_score

In [None]:
# `test_sentences` is not defined by any earlier cell, so build it here from
# the raw test lines (`righe`): one list of (word, gold_tag) pairs per sentence.
test_sentences = []
current_sentence = []
for riga in [*righe, '']:   # trailing '' closes a last sentence not followed by a blank line
    campi = riga.split()
    if campi:
        # the word is column 1, as in the decoding cell; assumes the gold tag
        # is column 2 of each test.conllu row — TODO confirm against the file
        current_sentence.append((campi[1], campi[2]))
    elif current_sentence:
        test_sentences.append(current_sentence)
        current_sentence = []

accuracy, precision, recall, f1_score = evaluate(test_sentences, emission_P, transition_P, tags, words)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")