# Hidden Markov Model tagger project

### Imports and settings

In [103]:
%load_ext autoreload
%autoreload 1
%run auxillary_functions.ipynb

import matplotlib.pyplot as plt
import numpy as np

from IPython.core.display import HTML
from itertools import chain
from collections import Counter, defaultdict
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
from collections import defaultdict

print_examples = True

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load dataset and print the main parameters

In [21]:
data = Dataset('tags-universal.txt', 'brown-universal.txt', train_test_split = 0.8)

print("There are {} sentences in the corpus.".format(len(data)))
print("There are {} sentences in the training set.".format(len(data.training_set)))
print("There are {} sentences in the testing set.".format(len(data.testing_set)))

assert len(data) == len(data.training_set) + len(data.testing_set), \
       "The number of sentences in the training set + testing set should sum to the number of sentences in the corpus"

There are 57340 sentences in the corpus.
There are 45872 sentences in the training set.
There are 11468 sentences in the testing set.


In [92]:
# Sentence example
if print_examples:
  key = 'b100-38532'
  print("Sentence: {}".format(key))
  print("words:\n\t{!s}".format(data.sentences[key].words))
  print("tags:\n\t{!s}".format(data.sentences[key].tags))

Sentence: b100-38532
words:
	('Perhaps', 'it', 'was', 'right', ';', ';')
tags:
	('ADV', 'PRON', 'VERB', 'ADJ', '.', '.')


In [91]:
# Counting unique Elements in the dataset
if print_examples:
  print("There are a total of {} samples of {} unique words in the corpus."
        .format(data.N, len(data.vocab)))
  print("There are {} samples of {} unique words in the training set."
        .format(data.training_set.N, len(data.training_set.vocab)))
  print("There are {} samples of {} unique words in the testing set."
        .format(data.testing_set.N, len(data.testing_set.vocab)))
  print("There are {} words in the test set that are missing in the training set."
        .format(len(data.testing_set.vocab - data.training_set.vocab)))

  assert data.N == data.training_set.N + data.testing_set.N, \
         "The number of training + test samples should sum to the total number of samples"

There are a total of 1161192 samples of 56057 unique words in the corpus.
There are 928458 samples of 50536 unique words in the training set.
There are 232734 samples of 25112 unique words in the testing set.
There are 5521 words in the test set that are missing in the training set.


In [87]:
# Accessing word and tag sequences
if print_examples:
  for i in range(2):    
      print("Sentence {}:".format(i + 1), data.X[i])
      print()
      print("Labels {}:".format(i + 1), data.Y[i])
      print()

In [189]:
# Accessing (word, tag) samples

if print_examples:
  print("\nStream (word, tag) pairs:\n")
  for i, pair in enumerate(data.stream()):
      print("\t", pair)
      if i > 5: break


Stream (word, tag) pairs:

	 ('Mr.', 'NOUN')
	 ('Podger', 'NOUN')
	 ('had', 'VERB')
	 ('thanked', 'VERB')
	 ('him', 'PRON')
	 ('gravely', 'ADV')
	 (',', '.')


### Pair counts implementation

In [None]:
#### Implementation 1 - suboptimal. Dictionary and con

In [None]:
def pair_counts(sequences_A, sequences_B):
    pair_counts = {}

    for i, pair in enumerate(data.stream()):
      if pair[1] not in pair_counts.keys():
        pair_counts[pair[1]] = {}
        if pair[0] not in pair_counts[pair[1]].keys():
          pair_counts[pair[1]][pair[0]] = 1
        else:
          pair_counts[pair[1]][pair[0]] += 1

      else:
        if pair[0] not in pair_counts[pair[1]].keys():
          pair_counts[pair[1]][pair[0]] = 1
        else:
          pair_counts[pair[1]][pair[0]] += 1
    
    return pair_counts


emission_counts = pair_counts(data.tagset, data.vocab)

assert len(emission_counts) == 12, \
       "Uh oh. There should be 12 tags in your dictionary."
assert max(emission_counts["NOUN"], key=emission_counts["NOUN"].get) == 'time', \
       "Hmmm...'time' is expected to be the most common NOUN."
HTML('<div class="alert alert-block alert-success">Your emission counts look good!</div>')

#### Implementation 2 - mediocre. dict and defaultdict combination.

In [191]:
def pair_counts(sequences_A, sequences_B):
    pair_counts = {}

    for i, pair in enumerate(data.stream()):
      if pair[1] not in pair_counts.keys():
        pair_counts[pair[1]] = defaultdict(lambda: 0)
        pair_counts[pair[1]][pair[0]] += 1
      else:
        pair_counts[pair[1]][pair[0]] += 1
    
    return pair_counts


emission_counts = pair_counts(data.tagset, data.vocab)

assert len(emission_counts) == 12, \
       "Uh oh. There should be 12 tags in your dictionary."
assert max(emission_counts["NOUN"], key=emission_counts["NOUN"].get) == 'time', \
       "Hmmm...'time' is expected to be the most common NOUN."
HTML('<div class="alert alert-block alert-success">Your emission counts look good!</div>')

#### Implementation 3 - optimal. Using defaultdict class.

In [192]:
def pair_counts(sequences_A, sequences_B):
    pair_counts = defaultdict(lambda: defaultdict(lambda: 0))
    
    for i, pair in enumerate(data.stream()):
      pair_counts[pair[1]][pair[0]] += 1
    
    return pair_counts


emission_counts = pair_counts(data.tagset, data.vocab)

assert len(emission_counts) == 12, \
       "Uh oh. There should be 12 tags in your dictionary."
assert max(emission_counts["NOUN"], key=emission_counts["NOUN"].get) == 'time', \
       "Hmmm...'time' is expected to be the most common NOUN."
HTML('<div class="alert alert-block alert-success">Your emission counts look good!</div>')

In [186]:
def pair_counts(sequences_A, sequences_B):
    pair_counts = {}

    for i, pair in enumerate(data.stream()):
      if pair[1] not in pair_counts.keys():
        pair_counts[pair[1]] = {}
        if pair[0] not in pair_counts[pair[1]].keys():
          pair_counts[pair[1]][pair[0]] = 1
        else:
          pair_counts[pair[1]][pair[0]] += 1

      else:
        if pair[0] not in pair_counts[pair[1]].keys():
          pair_counts[pair[1]][pair[0]] = 1
        else:
          pair_counts[pair[1]][pair[0]] += 1
    
    return pair_counts


emission_counts = pair_counts(data.tagset, data.vocab)

assert len(emission_counts) == 12, \
       "Uh oh. There should be 12 tags in your dictionary."
assert max(emission_counts["NOUN"], key=emission_counts["NOUN"].get) == 'time', \
       "Hmmm...'time' is expected to be the most common NOUN."
HTML('<div class="alert alert-block alert-success">Your emission counts look good!</div>')

In [178]:
pair_counts = {}

for i, pair in enumerate(data.stream()):
  if pair[1] not in pair_counts.keys():
    pair_counts[pair[1]] = {}
    if pair[0] not in pair_counts[pair[1]].keys():
      pair_counts[pair[1]][pair[0]] = 1
    else:
      pair_counts[pair[1]][pair[0]] += 1
      
  else:
    if pair[0] not in pair_counts[pair[1]].keys():
      pair_counts[pair[1]][pair[0]] = 1
    else:
      pair_counts[pair[1]][pair[0]] += 1

In [168]:
pair_counts

{'NOUN': {'Mr.': 1,
  'Podger': 1,
  'use': 1,
  'advice': 1,
  'difference': 1,
  'opinion': 1,
  'board': 1},
 'VERB': {'had': 1,
  'thanked': 1,
  'made': 1,
  'seemed': 1,
  'be': 1,
  'should': 1},
 'PRON': {'him': 1, 'he': 1},
 'ADV': {'gravely': 1, 'now': 1, 'how': 1, 'far': 1},
 '.': {',': 1, '.': 1},
 'CONJ': {'and': 1, 'But': 1},
 'ADP': {'of': 2, 'as': 1, 'to': 1},
 'DET': {'the': 2, 'some': 1},
 'PRT': {'there': 1, 'to': 1}}

In [169]:
pair_counts = {}

for i, pair in enumerate(data.stream()):
  print("\t", pair)
  if pair[1] not in pair_counts.keys():
    pair_counts[pair[1]] = defaultdict(lambda: 0)
    pair_counts[pair[1]][pair[0]] += 1
  else:
    pair_counts[pair[1]][pair[0]] += 1
  if i > 5: 
    break

	 ('Mr.', 'NOUN')
	 ('Podger', 'NOUN')
	 ('had', 'VERB')
	 ('thanked', 'VERB')
	 ('him', 'PRON')
	 ('gravely', 'ADV')
	 (',', '.')


In [170]:
pair_counts

{'NOUN': defaultdict(<function __main__.<lambda>()>, {'Mr.': 1, 'Podger': 1}),
 'VERB': defaultdict(<function __main__.<lambda>()>, {'had': 1, 'thanked': 1}),
 'PRON': defaultdict(<function __main__.<lambda>()>, {'him': 1}),
 'ADV': defaultdict(<function __main__.<lambda>()>, {'gravely': 1}),
 '.': defaultdict(<function __main__.<lambda>()>, {',': 1})}

In [173]:
pair_counts = defaultdict(lambda: defaultdict(lambda: 0))

for i, pair in enumerate(data.stream()):
  pair_counts[pair[1]][pair[0]] += 1
  if i > 30: 
    break

In [174]:
pair_counts

defaultdict(<function __main__.<lambda>()>,
            {'NOUN': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'Mr.': 1,
                          'Podger': 1,
                          'use': 1,
                          'advice': 1,
                          'difference': 1,
                          'opinion': 1,
                          'board': 1}),
             'VERB': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'had': 1,
                          'thanked': 1,
                          'made': 1,
                          'seemed': 1,
                          'be': 1,
                          'should': 1}),
             'PRON': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'him': 1, 'he': 1}),
             'ADV': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'gravely': 1, 'now': 1, 'how': 1, 'far': 1}),
     

In [177]:
data.training_set.X

(('Whenever',
  'artists',
  ',',
  'indeed',
  ',',
  'turned',
  'to',
  'actual',
  'representations',
  'or',
  'molded',
  'three-dimensional',
  'figures',
  ',',
  'which',
  'were',
  'rare',
  'down',
  'to',
  '800',
  'B.C.',
  ',',
  'they',
  'tended',
  'to',
  'reflect',
  'reality',
  '(',
  'see',
  'Plate',
  '6a',
  ',',
  '9b',
  ')',
  ';',
  ';'),
 ('For',
  'almost',
  'two',
  'months',
  ',',
  'the',
  'defendant',
  'and',
  'the',
  'world',
  'heard',
  'from',
  'individuals',
  'escaped',
  'from',
  'the',
  'grave',
  'about',
  'fathers',
  'and',
  'mothers',
  ',',
  'graybeards',
  ',',
  'adolescents',
  ',',
  'babies',
  ',',
  'starved',
  ',',
  'beaten',
  'to',
  'death',
  ',',
  'strangled',
  ',',
  'machine-gunned',
  ',',
  'gassed',
  ',',
  'burned',
  '.'),
 ('Clearer', 'meaning'),
 ('Yes',
  ',',
  'gentlemen',
  ',',
  'I',
  'am',
  'getting',
  'to',
  'the',
  'point',
  ',',
  'to',
  'my',
  'point',
  '.'),
 ('About',
  'the',

In [126]:
pair_counts = defaultdict(lambda: 1)

In [127]:
pair_counts['word'] = 1

In [128]:
pair_counts

defaultdict(<function __main__.<lambda>()>, {'word': 1})