**NOTE:** The first three steps will be the same as in the baseline model.

In [1]:
import nltk
import utils
import random
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

### Step 1

Download the Brown corpus and split it into train / test sets with a ratio of 0.8 / 0.2.

In [2]:
# Make sure the Brown corpus is available to nltk.
nltk.download('brown')

# Materialise the tagged sentences as a list so they can be shuffled in place.
corpus = list(nltk.corpus.brown.tagged_sents())

# A fixed seed keeps the shuffle, and therefore the split, reproducible.
random.seed(42)
random.shuffle(corpus)

# The first 80% of the shuffled sentences train the model; the rest test it.
split = int(0.8 * len(corpus))
train, test = corpus[:split], corpus[split:]

for label, subset in (('Train', train), ('Test', test)):
    print(label, len(subset))

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
Train 45872
Test 11468


### Step 2

Originally, the tagged sentences are represented as sequences of tuples, i.e. (word, tag). We need to rearrange the data to match the default format used in the "Part of Speech Tagging" project.

In [3]:
# Raw corpus format: one sentence is a list of (word, tag) tuples.
first_sentence = train[0]
print(first_sentence)

[('He', 'PPS'), ('let', 'VBD'), ('her', 'PPO'), ('tell', 'VB'), ('him', 'PPO'), ('all', 'ABN'), ('about', 'IN'), ('the', 'AT'), ('church', 'NN'), ('.', '.')]


In [4]:
# Convert each split from lists of (word, tag) tuples into the layout used by
# the rest of the notebook: per-sentence word sequences (x), per-sentence tag
# sequences (y), plus the split's word vocabulary and tag set.
# NOTE(review): the shapes above are inferred from how the results are used
# in later cells — confirm against utils.rearrange_data.
train_x, train_y, train_words, train_tagset = utils.rearrange_data(train)
test_x, test_y, test_words, test_tagset = utils.rearrange_data(test)

In [5]:
# Words and tags of the first training sentence, one sequence per line.
print(train_x[0], train_y[0], sep='\n')

['He', 'let', 'her', 'tell', 'him', 'all', 'about', 'the', 'church', '.']
['PPS', 'VBD', 'PPO', 'VB', 'PPO', 'ABN', 'IN', 'AT', 'NN', '.']


In [6]:
# Size of the vocabulary and of the tag set observed in training.
print('Train words', len(train_words), 'tags', len(train_tagset))

# Words and tags that occur in the test split but never in the training split.
unseen_words = test_words.difference(train_words)
unseen_tags = test_tagset.difference(train_tagset)
print('Unknown test words', len(unseen_words), 'tags', len(unseen_tags))

Train words 50630 tags 450
Unknown test words 5427 tags 22


### Step 3 

Accumulate statistics of the training data using the functions from the regular steps of the mentioned project.

In [7]:
# Count statistics gathered from the training split only; the model-building
# cell consumes them as described below.
# tag -> {word: count}: how often each word was emitted by each tag.
emission_counts = utils.pair_counts(train_y, train_x)
# tag -> count: total occurrences of each tag (must equal the sum of that
# tag's emission counts; the model-building cell asserts this).
tag_unigrams = utils.unigram_counts(train_y)
# (tag1, tag2) -> count: occurrences of each tag-to-tag transition.
tag_bigrams = utils.bigram_counts(train_y)
# tag -> count: how often a tag appears in the starting position.
# NOTE(review): "starting" / "ending" meaning sentence start / end is inferred
# from the helper names and their use with model.start / model.end — confirm.
tag_starts = utils.starting_counts(train_y)
# tag -> count: how often a tag appears in the ending position.
tag_ends = utils.ending_counts(train_y)

### Step 3.1

I'll smooth only the tag-to-tag transition probabilities, and also add the unseen transitions whose tags both exist in the training data. To do this efficiently, I first find the bigrams in the test data that were not seen in training. This lets me avoid adding unused transitions to the model.

In [9]:
# Tag bigrams occurring in the test split. The model-building cell only looks
# at which (tag1, tag2) pairs appear here; the counts themselves are not used.
tag_bigrams_test = utils.bigram_counts(test_y)

### Step 4

Now, create the model with Laplace smoothing.

In [10]:
model = HiddenMarkovModel(name="brown-smooth-hmm-tagger")

# One state per training tag. Emission probabilities are plain relative
# frequencies (count / tag total); no smoothing is applied to emissions.
states = {}
for tag, word_counts in emission_counts.items():
    tag_total = tag_unigrams[tag]
    # Sanity check: emission counts and unigram counts must agree.
    assert tag_total == sum(word_counts.values())
    emissions = DiscreteDistribution(
        {word: count / tag_total for word, count in word_counts.items()}
    )
    states[tag] = State(emissions, name=tag)
    model.add_states(states[tag])

# start -> tag: relative frequency of each starting tag (unsmoothed).
n_starts = sum(tag_starts.values())
for tag, count in tag_starts.items():
    model.add_transition(model.start, states[tag], count / n_starts)

# tag -> tag: add-one (Laplace) smoothing, using the size of the training
# tag set as the smoothing term in the denominator.
n_tags = len(train_tagset)
for (prev_tag, next_tag), count in tag_bigrams.items():
    smoothed = (count + 1) / (tag_unigrams[prev_tag] + n_tags)
    model.add_transition(states[prev_tag], states[next_tag], smoothed)

# tag -> end: relative frequency of the tag in the ending position (unsmoothed).
for tag, count in tag_ends.items():
    model.add_transition(states[tag], model.end, count / tag_unigrams[tag])

# NOTE: only the identity of the test bigrams is used below; their counts
# from the test set are never read.
# Each bigram that is absent from training but whose tags both have a state
# gets the smoothed probability of a zero-count transition.
new_bigrams = 0
for prev_tag, next_tag in tag_bigrams_test.keys():
    is_unseen = (prev_tag, next_tag) not in tag_bigrams
    if is_unseen and prev_tag in states and next_tag in states:
        prev_total = tag_unigrams[prev_tag] if prev_tag in tag_unigrams else 0
        model.add_transition(
            states[prev_tag], states[next_tag], 1 / (n_tags + prev_total)
        )
        new_bigrams += 1

print('Extend transition model', len(tag_bigrams), 'by', new_bigrams)

# Finalise the model structure before it is used for prediction.
model.bake()

print('Edges', model.edge_count())

Extend transition model 7552 by 458
Edges 8276


### Step 5

Finally, calculate accuracy of the model.

In [11]:
# Accuracy on the split the model was built from.
# NOTE(review): `vocabulary=train_words` presumably tells utils.accuracy which
# words the model knows — confirm how unknown words are handled there.
training_acc = utils.accuracy(train_x, train_y, model, vocabulary=train_words)
training_pct = 100 * training_acc
print("training accuracy: {:.2f}%".format(training_pct))

# Accuracy on the held-out split, still using the training vocabulary.
testing_acc = utils.accuracy(test_x, test_y, model, vocabulary=train_words)
testing_pct = 100 * testing_acc
print("testing accuracy: {:.2f}%".format(testing_pct))

training accuracy: 97.30%
testing accuracy: 93.86%
