In [1]:
# This file illustrates how you might experiment with the HMM interface at the prompt.
# You can also run it directly.

import logging, math, os
from pathlib import Path
from typing import Callable

from corpus import TaggedCorpus, sentence_str
from eval import model_cross_entropy, tagger_write_output
from hmm_test import HiddenMarkovModel
from lexicon import build_lexicon
import torch
import pdb


In [2]:
# Configure logging for this session.
logging.basicConfig(
    format="%(levelname)s : %(message)s",
    level=logging.INFO,  # use logging.DEBUG instead for more detail
)
# torch.autograd.set_detect_anomaly(True)    # uncomment for better error messages from .backward() (slower)

# Read the supervised ice cream corpus and report its vocabulary and tagset.
icsup = TaggedCorpus(Path("../nlp6-data/icsup"), add_oov=False)
logging.info(f"Ice cream vocabulary: {list(icsup.vocab)}")
logging.info(f"Ice cream tagset: {list(icsup.tagset)}")

# Build an HMM with randomly initialized parameters.
# The one-hot lexicon gives every word its own separate parameters.
lexicon = build_lexicon(icsup, one_hot=True)
hmm = HiddenMarkovModel(icsup.tagset, icsup.vocab, lexicon)

logging.info("*** Current A, B matrices (computed by softmax from small random parameters)")
# Compute the A and B matrices from the initial parameters; normally this
# happens during training.
hmm.updateAB()
# Alternatively, assign the matrices directly to spreadsheet values you want to try:
# hmm.A = torch.Tensor([[0.8, 0.1,0.1,0],[0.1,0.8,0.1,0],[0,0,0,0],[0.5, 0.5,0,0]])
# hmm.B = torch.Tensor([[0.7, 0.2,0.1],[0.1,0.2,0.7],[0,0,0],[0,0,0]])
hmm.printAB()




INFO : Read 40 tokens from icsup
INFO : Created 4 tag types
INFO : Created 5 word types
INFO : Ice cream vocabulary: ['1', '2', '3', '_EOS_WORD_', '_BOS_WORD_']
INFO : Ice cream tagset: ['C', 'H', '_EOS_TAG_', '_BOS_TAG_']
INFO : *** Current A, B matrices (computed by softmax from small random parameters)


Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.334	0.332	0.334	0.000
H	0.332	0.334	0.334	0.000
_EOS_TAG_	0.333	0.335	0.332	0.000
_BOS_TAG_	0.333	0.333	0.334	0.000

Emission matrix B:
	1	2	3
C	0.332	0.334	0.334
H	0.332	0.334	0.334
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




In [None]:
# Supervised training on the ice cream data. The loss is the cross-entropy
# on the training corpus itself (icsup), because the point here is to watch
# that number improve as training proceeds.
logging.info("*** Supervised training on icsup")


def cross_entropy_loss(model):
    """Return the cross-entropy of `model` measured on the icsup corpus."""
    return model_cross_entropy(model, icsup)


hmm.train(
    corpus=icsup,
    loss=cross_entropy_loss,
    minibatch_size=10,
    evalbatch_size=500,
    lr=0.01,
    tolerance=0.0001,
)

logging.info("*** A, B matrices after training on icsup (should approximately match initial params on spreadsheet [transposed])")
hmm.printAB()


In [3]:
# Load a previously trained model from disk.
# NOTE(review): a .pkl file is presumably unpickled by load(); only load model files you trust.
# hmm_test is not visible from this notebook, so both possible contracts of load()
# are handled: if it returns the loaded model, rebind hmm to it; otherwise it is
# assumed to have updated hmm in place and the existing object is kept.
loaded_model = hmm.load('my_hmm.pkl')
if isinstance(loaded_model, HiddenMarkovModel):
    hmm = loaded_model

logging.info("*** Viterbi results on icraw")
icraw = TaggedCorpus(Path("../nlp6-data/icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
viterbi_output_path = Path("icraw.output")
tagger_write_output(hmm, icraw, viterbi_output_path)  # calls hmm.viterbi_tagging on each sentence

# Show the file we just wrote. Read back the same path that was handed to
# tagger_write_output, from Python rather than by shelling out to `cat`, so the
# cell does not depend on a Unix shell. The file is left on disk for inspection.
if viterbi_output_path.exists():
    print(viterbi_output_path.read_text(), end="")
else:
    logging.warning(f"Expected tagger output at {viterbi_output_path}, but no file was found")

# Now let's use the forward algorithm to see what the model thinks about
# the probability of the spreadsheet "sentence."
logging.info("*** Forward algorithm on icraw (should approximately match iteration 0 "
             "on spreadsheet)")
for sentence in icraw:
    # log_prob returns a log-probability; exponentiate to report a plain probability.
    prob = math.exp(hmm.log_prob(sentence, icraw))
    logging.info(f"{prob} = p({sentence_str(sentence)})")
    




INFO : Loading model from my_hmm.pkl
INFO : Loaded model from my_hmm.pkl
INFO : *** Viterbi results on icraw
0it [00:00, ?it/s]


{1: tensor([0, 0, 0, 0]), 2: tensor([0, 0, 0, 0]), 3: tensor([0, 0, 0, 0]), 4: tensor([0, 0, 0, 0]), 5: tensor([0, 0, 0, 0]), 6: tensor([0, 0, 0, 0]), 7: tensor([0, 0, 0, 0]), 8: tensor([0, 0, 0, 0]), 9: tensor([0, 0, 0, 0]), 10: tensor([0, 0, 0, 0]), 11: tensor([0, 0, 0, 0]), 12: tensor([0, 0, 0, 0]), 13: tensor([0, 0, 0, 0]), 14: tensor([0, 0, 0, 0]), 15: tensor([0, 0, 0, 0]), 16: tensor([0, 0, 0, 0]), 17: tensor([0, 0, 0, 0]), 18: tensor([0, 0, 0, 0]), 19: tensor([0, 0, 0, 0]), 20: tensor([0, 0, 0, 0]), 21: tensor([0, 0, 0, 0]), 22: tensor([0, 0, 0, 0]), 23: tensor([0, 0, 0, 0]), 24: tensor([0, 0, 0, 0]), 25: tensor([0, 0, 0, 0]), 26: tensor([0, 0, 0, 0]), 27: tensor([0, 0, 0, 0]), 28: tensor([0, 0, 0, 0]), 29: tensor([0, 0, 0, 0]), 30: tensor([0, 0, 0, 0]), 31: tensor([0, 0, 0, 0]), 32: tensor([0, 0, 0, 0]), 33: tensor([0, 0, 0, 0]), 34: tensor([0, 0, 0, 0])}
_EOS_TAG_
tensor(0)
C
tensor(0)
C
tensor(0)
C
tensor(0)
C
tensor(0)
C
tensor(0)
C
tensor(0)
C
tensor(0)
C
tensor(0)
C
tensor

KeyError: 0

In [None]:
# Last step: reestimate the parameters on the icraw data, as the spreadsheet does.
logging.info("*** Reestimating on icraw (perplexity should improve on every iteration)")


def negative_log_likelihood(model):
    """Return the cross-entropy of `model` evaluated on icraw itself."""
    return model_cross_entropy(model, icraw)


hmm.train(
    corpus=icraw,
    loss=negative_log_likelihood,
    minibatch_size=10,
    evalbatch_size=500,
    lr=0.001,
    tolerance=0.0001,
)

logging.info("*** A, B matrices after reestimation on icraw (SGD, not EM, but still "
             "should approximately match final params on spreadsheet [transposed])")
hmm.printAB()