In [8]:
import os
import subprocess
import numpy as np

In [9]:
tags = [t.strip().split(' ')[-1] for t in list(filter(lambda x:x != '-DOCSTART- -X- -X- O\n',open("../data/conll03/eng.train").readlines())) if len(t.strip().split(' '))>1]
tags = sorted(list(set(tags)))
tags

['B-LOC', 'B-MISC', 'B-ORG', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'MO', 'O']

In [15]:
def hmm_viterbi(W, words):
    y_pred = []
    for num in range(len(words)):
        if num == 0:
            best_score = [W[('*', t)] if ('*',t) in W else 0 for t in tags]
            best_score = [best_score[s]+W[(words[num], tags[s])] if (words[num], tags[s]) in W else best_score[s] for s in range(len(best_score))]
            y_pred.append(tags[best_score.index(max(best_score))])
        else:
            sum_score = [np.array(best_score) for t in tags]
            for t_next in range(len(sum_score)):
                sum_score[t_next] = [W[(tags[t_prev],tags[t_next])]+sum_score[t_next][t_prev] if (tags[t_prev],tags[t_next]) in W 
                                     else sum_score[t_next][t_prev]
                                     for t_prev in range(len(sum_score[t_next]))]
            sum_score = [np.array(sum_score[s])+W[(words[num], tags[s])] if (words[num], tags[s]) in W else np.array(sum_score[s]) for s in range(len(sum_score))]
            best_score = [max(s) for s in sum_score]
            y_pred.append(tags[best_score.index(max(best_score))])
    return y_pred

def create_features(words, tags):
    temp_dict = {}
    for t in range(len(tags)+1):
        if t == 0: prev_tag = '*'
        else: prev_tag = tags[t-1]
        if t == len(tags): next_tag = '*'
        else: next_tag = tags[t]
        
        if (prev_tag, next_tag) in temp_dict:
            temp_dict[(prev_tag, next_tag)] += 0.01
        else: temp_dict[(prev_tag, next_tag)] = 0.01
        
    for t in range(len(tags)):
        if (words[t], tags[t]) in temp_dict:
            temp_dict[(words[t], tags[t])] += 1
        else: temp_dict[(words[t], tags[t])] = 1
    return temp_dict

In [11]:
text = [t.strip().split(' ') for t in list(filter(lambda x:x != '-DOCSTART- -X- -X- O\n',open("../data/conll03/eng.train").readlines()))]
sen_locs = [t for t in range(len(text)) if len(text[t])==1]
text_train = [[text[s] for s in range(sen_locs[loc]+1,sen_locs[loc+1])] for loc in range(len(sen_locs)-1)]
text_train[:1]

[[['EU', 'NNP', 'I-NP', 'I-ORG'],
  ['rejects', 'VBZ', 'I-VP', 'O'],
  ['German', 'JJ', 'I-NP', 'I-MISC'],
  ['call', 'NN', 'I-NP', 'O'],
  ['to', 'TO', 'I-VP', 'O'],
  ['boycott', 'VB', 'I-VP', 'O'],
  ['British', 'JJ', 'I-NP', 'I-MISC'],
  ['lamb', 'NN', 'I-NP', 'O'],
  ['.', '.', 'O', 'O']]]

In [12]:
text = [t.strip().split(' ') for t in list(filter(lambda x:x != '-DOCSTART- -X- -X- O\n',open("../data/conll03/eng.testa").readlines()))]
sen_locs = [t for t in range(len(text)) if len(text[t])==1]
text_pred = [[text[s] for s in range(sen_locs[loc]+1,sen_locs[loc+1])] for loc in range(len(sen_locs)-1)]
text_pred[:1]

[[['CRICKET', 'NNP', 'I-NP', 'O'],
  ['-', ':', 'O', 'O'],
  ['LEICESTERSHIRE', 'NNP', 'I-NP', 'I-ORG'],
  ['TAKE', 'NNP', 'I-NP', 'O'],
  ['OVER', 'IN', 'I-PP', 'O'],
  ['AT', 'NNP', 'I-NP', 'O'],
  ['TOP', 'NNP', 'I-NP', 'O'],
  ['AFTER', 'NNP', 'I-NP', 'O'],
  ['INNINGS', 'NNP', 'I-NP', 'O'],
  ['VICTORY', 'NN', 'I-NP', 'O'],
  ['.', '.', 'O', 'O']]]

In [16]:
W = {}
epoch = 10
for e in range(epoch):
    print('epoch = '+ str(e+1))
    
    for sentence in text_train:

        words = [s[0] for s in sentence]
        word_tags = [s[-1] for s in sentence]

        y_pred = hmm_viterbi(W, words)
        w_true = create_features(words, word_tags)
        w_pred = create_features(words, y_pred)

        for wt in w_true:
            if wt in W: W[wt] += w_true[wt]
            else: W[wt] = w_true[wt]

        for wp in w_pred:
            if wp in W: W[wp] -= w_pred[wp]
            else: W[wp] = -w_pred[wp]
                
    filename = 'result_SPViterbi_eng.txt'
    if os.path.exists(filename):
        os.remove(filename)

    save_wt = '\n'
    for sentence in text_pred:
        words = [s[0] for s in sentence]
        y_pred = hmm_viterbi(W, words)

        for wt in range(len(sentence)):
            save_wt += sentence[wt][0]+' '+sentence[wt][1]+' '+sentence[wt][2]+' '+sentence[wt][3]+' '+y_pred[wt]+'\n'

        save_wt += '\n'

    with open('result_SPViterbi_eng.txt', 'w+') as txt_file:
        txt_file.write(save_wt)

    output = subprocess.run("/usr/bin/perl -w conlleval < result_SPViterbi_eng.txt", shell=True, stdout=subprocess.PIPE, universal_newlines=True)
    print('\n'+output.stdout)

epoch = 1

processed 51362 tokens with 5942 phrases; found: 5626 phrases; correct: 3718.
accuracy:  92.98%; precision:  66.09%; recall:  62.57%; FB1:  64.28
              LOC: precision:  81.54%; recall:  80.57%; FB1:  81.05
             MISC: precision:  52.02%; recall:  53.04%; FB1:  52.52
              ORG: precision:  48.09%; recall:  45.04%; FB1:  46.52
              PER: precision:  70.90%; recall:  62.16%; FB1:  66.24

epoch = 2

processed 51362 tokens with 5942 phrases; found: 5344 phrases; correct: 3571.
accuracy:  92.72%; precision:  66.82%; recall:  60.10%; FB1:  63.28
              LOC: precision:  79.89%; recall:  77.41%; FB1:  78.63
             MISC: precision:  62.62%; recall:  56.51%; FB1:  59.41
              ORG: precision:  42.26%; recall:  42.36%; FB1:  42.31
              PER: precision:  76.37%; recall:  57.55%; FB1:  65.63

epoch = 3

processed 51362 tokens with 5942 phrases; found: 5382 phrases; correct: 3365.
accuracy:  92.53%; precision:  62.52%; recall:  56.