In [3]:
import os
import subprocess
import numpy as np

In [4]:
tags = [t.strip().split(' ')[-1] for t in list(filter(lambda x:x != '-DOCSTART- -X- -X- -X- O\n',open("../data/conll03/deu.train", encoding="ISO-8859-1").readlines())) if len(t.strip().split(' '))>1]
tags = sorted(list(set(tags)))
tags

['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']

In [8]:
def hmm_viterbi(W, words):
    y_pred = []
    for num in range(len(words)):
        if num == 0:
            best_score = [W[('*', t)] if ('*',t) in W else 0 for t in tags]
            best_score = [best_score[s]+W[(words[num], tags[s])] if (words[num], tags[s]) in W else best_score[s] for s in range(len(best_score))]
            y_pred.append(tags[best_score.index(max(best_score))])
        else:
            sum_score = [np.array(best_score) for t in tags]
            for t_next in range(len(sum_score)):
                sum_score[t_next] = [W[(tags[t_prev],tags[t_next])]+sum_score[t_next][t_prev] if (tags[t_prev],tags[t_next]) in W 
                                     else sum_score[t_next][t_prev]
                                     for t_prev in range(len(sum_score[t_next]))]
            sum_score = [np.array(sum_score[s])+W[(words[num], tags[s])] if (words[num], tags[s]) in W else np.array(sum_score[s]) for s in range(len(sum_score))]
            best_score = [max(s) for s in sum_score]
            y_pred.append(tags[best_score.index(max(best_score))])
    return y_pred

def create_features(words, tags):
    temp_dict = {}
    for t in range(len(tags)+1):
        if t == 0: prev_tag = '*'
        else: prev_tag = tags[t-1]
        if t == len(tags): next_tag = '*'
        else: next_tag = tags[t]
        
        if (prev_tag, next_tag) in temp_dict:
            temp_dict[(prev_tag, next_tag)] += 0.01
        else: temp_dict[(prev_tag, next_tag)] = 0.01
        
    for t in range(len(tags)):
        if (words[t], tags[t]) in temp_dict:
            temp_dict[(words[t], tags[t])] += 1
        else: temp_dict[(words[t], tags[t])] = 1
    return temp_dict

In [14]:
text = [t.strip().split(' ') for t in list(filter(lambda x:x != '-DOCSTART- -X- -X- -X- O\n',open("../data/conll03/deu.train", encoding="ISO-8859-1").readlines()))]
sen_locs = [t for t in range(len(text)) if len(text[t])==1]
text_train = [[text[s] for s in range(sen_locs[loc]+1,sen_locs[loc+1])] for loc in range(len(sen_locs)-1)]
text_train[:1]

[[['Ereignis', 'Ereignis', 'NN', 'I-NC', 'O'],
  ['und', 'und', 'KON', 'O', 'O'],
  ['Erzählung', 'Erzählung', 'NN', 'I-NC', 'O'],
  ['oder', 'oder', 'KON', 'I-NC', 'O'],
  [':', ':', '$.', 'O', 'O']]]

In [15]:
text = [t.strip().split(' ') for t in list(filter(lambda x:x != '-DOCSTART- -X- -X- -X- O\n',open("../data/conll03/deu.testa", encoding="ISO-8859-1").readlines()))]
sen_locs = [t for t in range(len(text)) if len(text[t])==1]
text_pred = [[text[s] for s in range(sen_locs[loc]+1,sen_locs[loc+1])] for loc in range(len(sen_locs)-1)]
text_pred[:1]

[[['Großer', 'Große', 'NN', 'I-NC', 'O'],
  ['Foto-Wettbeweb', '<unknown>', 'NN', 'I-NC', 'O'],
  ['"', '"', '$(', 'O', 'O'],
  ['Nordendler', '<unknown>', 'NN', 'I-NC', 'I-ORG'],
  ['"', '"', '$(', 'O', 'O'],
  ['laden', 'laden', 'VVFIN', 'I-VC', 'O'],
  ['die', 'd', 'ART', 'I-NC', 'O'],
  ['Nordendler', '<unknown>', 'NN', 'I-NC', 'I-MISC'],
  ['ein', 'ein', 'ART', 'B-NC', 'O']]]

In [16]:
W = {}
epoch = 10
for e in range(epoch):
    print('epoch = '+ str(e+1))
    
    for sentence in text_train:

        words = [s[0] for s in sentence]
        word_tags = [s[-1] for s in sentence]

        y_pred = hmm_viterbi(W, words)
        w_true = create_features(words, word_tags)
        w_pred = create_features(words, y_pred)

        for wt in w_true:
            if wt in W: W[wt] += w_true[wt]
            else: W[wt] = w_true[wt]

        for wp in w_pred:
            if wp in W: W[wp] -= w_pred[wp]
            else: W[wp] = -w_pred[wp]
                
    filename = 'result_SPViterbi_deu.txt'
    if os.path.exists(filename):
        os.remove(filename)

    save_wt = '\n'
    for sentence in text_pred:
        words = [s[0] for s in sentence]
        y_pred = hmm_viterbi(W, words)

        for wt in range(len(sentence)):
            save_wt += sentence[wt][0]+' '+sentence[wt][1]+' '+sentence[wt][2]+' '+sentence[wt][3]+' '+sentence[wt][4]+' '+y_pred[wt]+'\n'

        save_wt += '\n'

    with open('result_SPViterbi_deu.txt', 'w+') as txt_file:
        txt_file.write(save_wt)

    output = subprocess.run("/usr/bin/perl -w conlleval < result_SPViterbi_deu.txt", shell=True, stdout=subprocess.PIPE, universal_newlines=True)
    print('\n'+output.stdout)

epoch = 1

processed 51444 tokens with 4833 phrases; found: 2810 phrases; correct: 1318.
accuracy:  90.44%; precision:  46.90%; recall:  27.27%; FB1:  34.49
              LOC: precision:  52.73%; recall:  41.74%; FB1:  46.60
             MISC: precision:  33.40%; recall:  15.45%; FB1:  21.12
              ORG: precision:  33.53%; recall:  23.21%; FB1:  27.43
              PER: precision:  69.40%; recall:  27.19%; FB1:  39.08

epoch = 2

processed 51444 tokens with 4833 phrases; found: 2477 phrases; correct: 1325.
accuracy:  90.91%; precision:  53.49%; recall:  27.42%; FB1:  36.25
              LOC: precision:  58.65%; recall:  36.75%; FB1:  45.18
             MISC: precision:  36.98%; recall:  16.73%; FB1:  23.04
              ORG: precision:  43.40%; recall:  27.56%; FB1:  33.71
              PER: precision:  77.24%; recall:  27.12%; FB1:  40.15

epoch = 3

processed 51444 tokens with 4833 phrases; found: 2768 phrases; correct: 1271.
accuracy:  90.05%; precision:  45.92%; recall:  26.