# NER Tagging con Viterbi

## Setup

### Import

In [1]:
import math

### Funzioni accessorie

In [2]:
def log(number):
    if number == 0:
        number += 0.0000000001
    return math.log(number)

### Retrieval del Corpus

In [3]:
def get_corpus(path):
    with open(path, 'r', encoding='utf8') as f:
        return parse_conllu(f.readlines())

def parse_conllu(lines):
    result = []
    sentence = []
    for line in lines:
        if len(line) <= 1:
            result.append(sentence.copy())
            sentence = []
        else:
            token = parse_conllu_token(line)
            sentence.append(token)
    return result

def parse_conllu_token(line):
    line = line.strip('\n').split('\t')
    tag = line[2]
    if tag != 'O':
        tag = tag[2:]
    return (line[1], tag)

lang = 'en'
corpus = get_corpus(f'data/{lang}/train.conllu')
tagset = ['O', 'ORG', 'LOC', 'PER', 'MISC']

## Implementazione

In [4]:
class Tagger():
    def __init__(self, model, algorithm):
        self.model = model
        self.algorithm = algorithm
        self.tagset = model.tagset

    def tag_sentence(self, sentence):
        if sentence.__class__ == str:
            sentence = sentence.split()
        return self.algorithm(self.model, sentence)

### Hidden Markov Model

In [5]:
class HiddenMarkovModel:
    def __init__(self, corpus, tagset):
        self.tagset = tagset
        self.punctuation_signs = [',', '.', '-', ':', '\'', '(', ')', '"']
        self.vocabulary, self.tag_occurrences = self._build_vocabulary_and_tag_count(corpus)
        self.start_prob = self.compute_start_prob(corpus)
        self.end_prob = self.compute_end_prob(corpus)
        self.transition_prob = self.compute_transition_prob(corpus)
        self.emission_prob = self.compute_emission_prob(corpus)
        self.smoothing_strategy = 'always_o'
    
    def _build_vocabulary_and_tag_count(self, corpus):
        tag_occurrences = dict((tag,0) for tag in self.tagset)
        vocabulary = set()
        for sent in corpus:
            for token in sent:
                vocabulary.add(token[0].lower())
                tag_occurrences[token[1]] += 1
        return list(vocabulary), tag_occurrences

    def compute_start_prob(self, corpus):
        count_tag = dict((tag,0) for tag in self.tagset)
        for sent in corpus:
            tag = sent[0][1]
            count_tag[tag] += 1
        n = len(corpus)
        return dict((tag,count/n) for tag,count in count_tag.items())

    def compute_transition_prob(self, corpus):
        # Inizializzazione dei dizionari
        count_tag = dict(((tag1, tag2),0) for tag1 in self.tagset for tag2 in self.tagset)
        total = dict((tag, 0) for tag in self.tagset)
    
        # Conta delle coppie di tag
        for sent in corpus:
            for i in range(len(sent)-1):
                tag1 = sent[i][1]
                total[tag1] += 1
                tag2 = sent[i+1][1]
                key = (tag1, tag2)
                count_tag[key] += 1

        # Calcolo delle frequenze
        for tag1 in self.tagset:
            for tag2 in self.tagset:
                count_tag[(tag1,tag2)] = count_tag[(tag1,tag2)]/total[tag1]
        return count_tag

    def compute_end_prob(self, corpus):
        count_tag = dict((tag,0) for tag in self.tagset)
        for sent in corpus:
            tag = self._get_last_tag(sent)
            count_tag[tag] += 1
        n = len(corpus)
        return dict((tag, count/n) for tag,count in count_tag.items())

    def _get_last_tag(self, sent):
        for i, token in enumerate(reversed(sent)):
            if token[0] not in self.punctuation_signs:
                return token[1]

    def compute_emission_prob(self, corpus):
        # Inizializzazione
        count_dict = dict()
        for word in self.vocabulary:
            for tag in self.tagset:
                key = (word,tag)
                count_dict[key] = 0

        # Conteggio
        for sent in corpus:
            for token in sent:
                word, tag = token
                key = (word.lower(), tag)
                count_dict[key] += 1

        # Calcolo delle frequenze
        for word, tag in count_dict:
            n = self.tag_occurrences[tag]
            count_dict[(word,tag)] = count_dict[(word,tag)]/n
        return count_dict

    def get_start_prob(self, tag):
        return self.start_prob[tag]

    def get_transition_prob(self, tag1, tag2):
        return self.transition_prob[(tag1, tag2)]

    def get_end_prob(self, tag):
        return self.end_prob[tag]

    def get_emission_prob(self, word, tag):
        if word in self.vocabulary:
            return self.emission_prob[(word.lower(),tag)]
        if self.smoothing_strategy == 'always_o':
            return 1.0 if tag == 'O' else 0.0
        if self.smoothing_strategy == 'misc_or_o':
            return 0.5 if tag in ['MISC', 'O'] else 0.0
        if self.smoothing_strategy == 'uniform':
            return 1/len(self.tagset)

    def set_smoothing_strategy(self, strategy):
        if strategy.lower() in ['always_o', 'misc_or_o', 'uniform']:
            self.smoothing_strategy = strategy.lower()
        else:
            raise ValueError("Incorrect smoothing strategy")

model = HiddenMarkovModel(corpus, tagset)

### Viterbi

In [6]:
def viterbi(model, sentence):
    viterbi_matrix, backpointer = inizialitazion_step(model, sentence)
    for t in range(1,len(sentence)):
        for i in range(len(model.tagset)):
            viterbi_matrix[i][t], backpointer[i][t] = compute_max(model, viterbi_matrix, sentence[t], t, i)
    viterbi_matrix[-1][-1], backpointer[-1][-1] = termination_step(model, viterbi_matrix)
    result = backtrace(backpointer, model.tagset)
    return result

def inizialitazion_step(model, sentence):
    tagset = model.tagset
    viterbi_matrix = [[0 for _ in range(len(sentence))] for _ in range(len(tagset)+1)]
    backpointer = [[-1 for _ in range(len(sentence))] for _ in range(len(tagset)+1)]
    for i, tag in enumerate(tagset):
        start_prob = model.get_start_prob(tag)
        emission_prob = model.get_emission_prob(sentence[0], tag)
        viterbi_matrix[i][0] = log(start_prob * emission_prob)
        backpointer[i][0] = 0
    return viterbi_matrix, backpointer

def compute_max(model, viterbi_matrix, word, t, i):
    tagset = model.tagset
    max_prob = {'log_prob': -math.inf, 'tag':'-'}
    for j in range(len(tagset)):
        transition_prob = model.get_transition_prob(tagset[j], tagset[i])
        emission_prob = model.get_emission_prob(word, tagset[i])
        log_prob = viterbi_matrix[j][t-1] + log(transition_prob * emission_prob)
        if log_prob > max_prob['log_prob']:
            max_prob = {'log_prob': log_prob, 'tag': tagset[j]}
    return max_prob['log_prob'], max_prob['tag']

def termination_step(model, viterbi_matrix):
    tagset = model.tagset
    max_prob = {'log_prob': -math.inf, 'tag':'-'}
    for j in range(len(tagset)):
        end_prob = model.get_end_prob(tagset[j])
        log_prob = viterbi_matrix[j][-1] + log(end_prob)
        if log_prob > max_prob['log_prob']:
            max_prob = {'log_prob': log_prob, 'tag': tagset[j]}
    return max_prob['log_prob'], max_prob['tag']

def backtrace(backpointer, tagset):
    tags = [backpointer[-1][-1]]
    for t in range(len(backpointer[0])-1, 0, -1):
        tag = tags[-1]
        tag_idx = tagset.index(tag)
        tags.append(backpointer[tag_idx][t])
    return list(reversed(tags))

hmm_viterbi = Tagger(model, viterbi)

### Baseline

In [7]:
class BaselineModel():
    def __init__(self, corpus, tagset):
        self.tagset = tagset
        self.vocabulary = self.build_vocabulary(corpus)
        self.frequencies = self.compute_frequencies(corpus)

    def build_vocabulary(self, corpus):
        vocabulary = set()
        for sent in corpus:
            for token in sent:
                vocabulary.add(token[0].lower())
        return list(vocabulary)
    
    def compute_frequencies(self, corpus):
        word_dict = dict((tag, 0) for tag in self.tagset)
        freq_dict = dict((word, word_dict.copy()) for word in self.vocabulary)
        for sent in corpus:
            for token in sent:
                freq_dict[token[0].lower()][token[1]] += 1
        return freq_dict
    
    def assign_tag(self, word):
        if word.lower() not in self.vocabulary:
            return 'MISC'
        tag_freq = self.frequencies[word.lower()]
        tag_freq = list(sorted(tag_freq.items(), key=lambda x: x[1], reverse=True))
        return tag_freq[0][0]

def baseline_tag_sentence(model, sentence):
    tags = []
    for word in sentence:
        tags.append(model.assign_tag(word))
    return tags

baseline_model = BaselineModel(corpus, tagset)
baseline = Tagger(baseline_model, baseline_tag_sentence)

## Valutazione

### Processing del test set

In [8]:
def process_test_set(tagger, test_set, verbose=False):
    model_outputs, target_outputs = [], []
    for i, sentence in enumerate(test_set):
        sent, target_output = get_sent_and_tags(sentence)
        target_outputs.append(target_output)
        model_output = tagger.tag_sentence(sent)
        model_outputs.append(model_output)
        if verbose and i%1000 == 0:
            print(f'Progress {i}/{len(test_set)}')
    return model_outputs, target_outputs

def get_sent_and_tags(sentence):
    sent, tags = '', []
    for token in sentence:
        if token[0] not in [',', '.', '-', ':', '\'', '(', ')']:
            sent += token[0] + " "
            tags.append(token[1])
    return sent[:-1], tags

### Accuracy

In [9]:
def compute_accuracy(model_outputs, target_outputs):
    score = 0
    total = 0
    for i in range(len(model_outputs)):
        for j in range(len(model_outputs[i])):
            if target_outputs[i][j] == 'O':
                continue
            total += 1
            if model_outputs[i][j] == target_outputs[i][j]:
                score += 1
    return score/total

### Precision e recall

In [10]:
def compute_statistics(tagset, model_outputs, target_outputs):
    true_positives = dict((tag,0) for tag in tagset)
    false_positives = dict((tag,0) for tag in tagset)
    false_negatives = dict((tag,0) for tag in tagset)
    for i in range(len(model_outputs)):
        for j in range(len(model_outputs[i])):
            for tag in tagset:
                if model_outputs[i][j] != tag and target_outputs[i][j] != tag:
                    continue
                if model_outputs[i][j] == target_outputs[i][j]:
                    true_positives[tag] += 1
                elif model_outputs[i][j] == tag:
                    false_positives[tag] += 1
                else:
                    false_negatives[tag] += 1
    return true_positives, false_positives, false_negatives

In [11]:
def compute_precision(true_positives, false_positives):
    precisions = dict()
    for tag in true_positives:
        if true_positives[tag] == 0:
            precisions[tag] = 0
            continue
        precisions[tag] = true_positives[tag] / (true_positives[tag] + false_positives[tag])
    return precisions

In [12]:
def compute_recall(true_positives, false_negatives):
    recalls = dict()
    for tag in true_positives:
        if true_positives[tag] == 0:
            recalls[tag] = 0
            continue
        recalls[tag] = true_positives[tag] / (true_positives[tag] + false_negatives[tag])
    return recalls

### Confronto con Baseline

In [13]:
def evaluate_tagger(tagger, test_set):
    model_outputs, target_outputs = process_test_set(tagger, test_set)
    accuracy = compute_accuracy(model_outputs, target_outputs)
    true_positives, false_positives, false_negatives = compute_statistics(tagger.tagset, model_outputs, target_outputs)
    precision = compute_precision(true_positives, false_positives)
    recall = compute_recall(true_positives, false_negatives)
    print(f'---- Evaluating model {str(tagger.model.__class__.__name__)} ----')
    print(f'Model accuracy on entities: {accuracy}')
    print(f'Model precision per tag: {precision}')
    print(f'Model recall per tag: {recall}\n')

In [14]:
test_set = get_corpus(f'data/{lang}/test.conllu')
N = len(test_set)
# N = 10

for tagger in [hmm_viterbi, baseline]:
    evaluate_tagger(tagger, test_set[:N])

### Test

In [None]:
sentences = {
    'it': [
        'La vera casa di Harry Potter è il Castello di Hogwarts .',
        'Harry le raccontò del loro incontro a Diagon Alley .',
        'Mr Dursley era direttore di una ditta di nome Grunnings , che fabbricava trapani .'
    ], 'en': [
        'Harry Potter \'s true home is Hogwarts Castle .',
        'Harry told her about their meeting at Diagon Alley .',
        'Mr. Dursley was director of a company named Grunnings that manufactured drills .'
    ]
}

correct_tags = {
    'it': [
        ['O', 'O', 'O', 'O', 'PER', 'PER', 'O', 'O', 'LOC', 'LOC', 'LOC', 'O'],
        ['PER', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'LOC', 'O'],
        ['PER', 'PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'O', 'O', 'O', 'O', 'O'],
    ], 'en': [
        ['PER', 'PER', 'O', 'O', 'O', 'O', 'LOC', 'LOC', 'O'],
        ['PER', 'O', 'O', 'O', 'O', 'O', 'O', 'LOC', 'LOC', 'O'],
        ['PER', 'PER', 'O', 'O', 'O', 'O', 'O', 'O', 'ORG', 'O', 'O', 'O', 'O'],
    ]
}

for i, sentence in enumerate(sentences[lang]):
    print('-'*225)
    header = sentence.split()
    row1 = hmm_viterbi.tag_sentence(sentence)
    row2 = baseline.tag_sentence(sentence)
    row3 = correct_tags[lang][i]
    rows = [header, row1, row2, row3]
    print('\n'.join([''.join(['{:16}'.format(x) for x in r]) for r in rows]))

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Harry           Potter          's              true            home            is              Hogwarts        Castle          .               
O               O               O               O               O               O               O               O               O               
PER             MISC            O               O               O               O               MISC            LOC             O               
PER             PER             O               O               O               O               LOC             LOC             O               
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------