# Hidden Markov Model tagger project

### Imports and settings

In [230]:
%load_ext autoreload
%autoreload 1
%run auxillary_functions.ipynb

import matplotlib.pyplot as plt
import numpy as np

from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
from collections import defaultdict, namedtuple


print_examples = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dataset and print the main parameters

In [21]:
# Build the corpus from the tag list and the tagged sentences file.
# train_test_split=0.8: the recorded run put 45,872 of 57,340 sentences in the training set.
data = Dataset(
    'tags-universal.txt',
    'brown-universal.txt',
    train_test_split=0.8,
)

# Report the number of sentences in the corpus and in each split.
for _template, _collection in (
    ("There are {} sentences in the corpus.", data),
    ("There are {} sentences in the training set.", data.training_set),
    ("There are {} sentences in the testing set.", data.testing_set),
):
    print(_template.format(len(_collection)))

# The two splits together must account for every sentence.
assert len(data.training_set) + len(data.testing_set) == len(data), \
       "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

There are 57340 sentences in the corpus.
There are 45872 sentences in the training set.
There are 11468 sentences in the testing set.


In [92]:
# Show one sentence, looked up by its identifier: its words and the matching tags.
if print_examples:
    key = 'b100-38532'
    print("Sentence: {}".format(key))
    _sentence = data.sentences[key]
    print("words:\n\t{!s}".format(_sentence.words))
    print("tags:\n\t{!s}".format(_sentence.tags))

Sentence: b100-38532
words:
	('Perhaps', 'it', 'was', 'right', ';', ';')
tags:
	('ADV', 'PRON', 'VERB', 'ADJ', '.', '.')


In [91]:
# Count samples (data.N) and unique words (data.vocab) in the corpus and in each split.
if print_examples:
    print("There are a total of {} samples of {} unique words in the corpus."
          .format(data.N, len(data.vocab)))
    print("There are {} samples of {} unique words in the training set."
          .format(data.training_set.N, len(data.training_set.vocab)))
    print("There are {} samples of {} unique words in the testing set."
          .format(data.testing_set.N, len(data.testing_set.vocab)))
    # Set difference: words seen only in the testing set.
    print("There are {} words in the test set that are missing in the training set."
          .format(len(data.testing_set.vocab - data.training_set.vocab)))

# Sanity check on the split. Kept outside the display guard so that switching
# print_examples off does not silently disable the validation.
assert data.N == data.training_set.N + data.testing_set.N, \
       "The number of training + test samples should sum to the total number of samples"

There are a total of 1161192 samples of 56057 unique words in the corpus.
There are 928458 samples of 50536 unique words in the training set.
There are 232734 samples of 25112 unique words in the testing set.
There are 5521 words in the test set that are missing in the training set.


In [87]:
# Print the first two entries of data.X ("Sentence") and data.Y ("Labels"),
# each followed by a blank line.
if print_examples:
    for i in range(2):
        print("Sentence {}:".format(i + 1), data.X[i], end="\n\n")
        print("Labels {}:".format(i + 1), data.Y[i], end="\n\n")

In [189]:
# Preview the (word, tag) pairs produced by data.stream().
if print_examples:
    print("\nStream (word, tag) pairs:\n")
    for i, pair in enumerate(data.stream()):
        print("\t", pair)
        # Stop once seven pairs (indices 0..6) have been shown.
        if i >= 6:
            break


Stream (word, tag) pairs:

	 ('Mr.', 'NOUN')
	 ('Podger', 'NOUN')
	 ('had', 'VERB')
	 ('thanked', 'VERB')
	 ('him', 'PRON')
	 ('gravely', 'ADV')
	 (',', '.')


### Pair counts implementation

#### Implementation 1 - suboptimal. Dictionary and conditional statements.

In [197]:
# def pair_counts(sequences_A, sequences_B):
#     pair_counts = {}

#     for i, pair in enumerate(data.stream()):
#       if pair[1] not in pair_counts.keys():
#         pair_counts[pair[1]] = {}
#         if pair[0] not in pair_counts[pair[1]].keys():
#           pair_counts[pair[1]][pair[0]] = 1
#         else:
#           pair_counts[pair[1]][pair[0]] += 1

#       else:
#         if pair[0] not in pair_counts[pair[1]].keys():
#           pair_counts[pair[1]][pair[0]] = 1
#         else:
#           pair_counts[pair[1]][pair[0]] += 1
    
#     return pair_counts


# emission_counts = pair_counts(data.tagset, data.vocab)

#### Implementation 2 - mediocre. dict and defaultdict combination.

In [198]:
# def pair_counts(sequences_A, sequences_B):
#     pair_counts = {}

#     for i, pair in enumerate(data.stream()):
#       if pair[1] not in pair_counts.keys():
#         pair_counts[pair[1]] = defaultdict(lambda: 0)
#         pair_counts[pair[1]][pair[0]] += 1
#       else:
#         pair_counts[pair[1]][pair[0]] += 1
    
#     return pair_counts


# emission_counts = pair_counts(data.tagset, data.vocab)

#### Implementation 3 - optimal. Using defaultdict class.

In [200]:
# def pair_counts(sequences_A, sequences_B):
#     pair_counts = defaultdict(lambda: defaultdict(lambda: 0))
    
#     for i, pair in enumerate(data.stream()):
#       pair_counts[pair[1]][pair[0]] += 1
    
#     return pair_counts


# emission_counts = pair_counts(data.tagset, data.vocab)

#### Implementation 4 - expected (manual). Using sequences of arbitrary lengths and a nested for loop.

In [225]:
# def pair_counts(sequences_A, sequences_B):
  
#     pair_counts = defaultdict(lambda: defaultdict(lambda: 0))

#     for i in range(len(data.X)):
#       for pair in zip(data.X[i], data.Y[i]):
#         pair_counts[pair[1]][pair[0]] += 1
    
#     return pair_counts


# emission_counts = pair_counts(data.tagset, data.vocab)

#### Implementation 5 - expected (itertools chain). Using sequences of arbitrary lengths and itertools

In [231]:
def pair_counts(sequences_A, sequences_B):
    """Count co-occurrences of aligned items from two parallel collections of sequences.

    Both arguments are iterables of sequences (e.g. one tuple per sentence).
    They are flattened and walked in lockstep, so the n-th item of the
    flattened ``sequences_A`` is paired with the n-th item of the flattened
    ``sequences_B``.

    Parameters
    ----------
    sequences_A : iterable of iterables
        Items that become the outer keys of the result.
    sequences_B : iterable of iterables
        Items that become the inner keys of the result.

    Returns
    -------
    defaultdict of defaultdict of int
        ``result[a][b]`` is the number of times ``a`` was paired with ``b``.
        Pairs that never occurred read as 0 (and are inserted on lookup,
        as with any defaultdict).
    """
    counts = defaultdict(lambda: defaultdict(int))

    # chain.from_iterable flattens lazily, so the corpus is never copied
    # into intermediate tuples.
    flat_a = chain.from_iterable(sequences_A)
    flat_b = chain.from_iterable(sequences_B)
    for item_a, item_b in zip(flat_a, flat_b):
        counts[item_a][item_b] += 1

    return counts


# emission_counts[tag][word]: how often each tag is paired with each word
# across the whole corpus (data.Y holds the tag sequences, data.X the words).
emission_counts = pair_counts(data.Y, data.X)

### Most frequent class tagger (baseline)

In [233]:
# Minimal stand-in for a model state: only the ``name`` attribute is needed.
FakeState = namedtuple("FakeState", "name")


class MFCTagger:
    """Most-frequent-class baseline: tags each word via a fixed word -> tag lookup table."""

    # Returned for any word that has no entry in the lookup table.
    missing = FakeState(name="<MISSING>")

    def __init__(self, table):
        """Store ``table`` (a mapping of word -> tag name) as word -> FakeState.

        The table is a defaultdict, so looking up an unseen word yields
        ``MFCTagger.missing`` (and caches that entry in the table).
        """
        self.table = defaultdict(lambda: MFCTagger.missing)
        self.table.update({word: FakeState(name=tag) for word, tag in table.items()})

    def viterbi(self, seq):
        """Look up every word of ``seq`` and return ``(0.0, path)``.

        ``path`` is a list of ``(index, state)`` pairs whose first and last
        states are the strings "<start>" and "<end>"; the entries in between
        are the FakeState objects for the words of ``seq``, in order. This is
        the shape ``simplify_decoding`` expects from a model's ``viterbi``.
        """
        return 0., list(enumerate(["<start>"] + [self.table[w] for w in seq] + ["<end>"]))


# word_counts[word][tag]: how often each training-set word carries each tag.
# The table must be keyed by word (MFCTagger looks words up, not tags), and it
# is built from the training split only so the baseline never sees test data.
word_counts = defaultdict(Counter)
for word, tag in zip(chain.from_iterable(data.training_set.X),
                     chain.from_iterable(data.training_set.Y)):
    word_counts[word][tag] += 1

# For every word keep the tag it was seen with most often.
mfc_table = {word: max(tag_counts, key=tag_counts.get) for word, tag_counts in word_counts.items()}

mfc_model = MFCTagger(mfc_table)

In [236]:
# Helper functions for the missing value functionality

def replace_unknown(sequence):
    """Return a list copy of ``sequence`` with out-of-vocabulary words replaced by 'nan'.

    A word is kept only if it appears in ``data.training_set.vocab``.
    """
    cleaned = []
    for token in sequence:
        if token in data.training_set.vocab:
            cleaned.append(token)
        else:
            cleaned.append('nan')
    return cleaned


def simplify_decoding(X, model):
    """Decode ``X`` with ``model.viterbi`` and return the predicted state names.

    Unknown words are masked with ``replace_unknown`` first. The first and
    last entries of the returned path are dropped; for each remaining entry
    the ``name`` of its second element is collected.
    """
    _score, path = model.viterbi(replace_unknown(X))
    names = []
    for step in path[1:-1]:
        names.append(step[1].name)
    return names