In [31]:
# This file illustrates how you might experiment with the HMM interface at the prompt.
# You can also run it directly.

import logging, math, os
from pathlib import Path
from corpus import TaggedCorpus, desupervise, sentence_str
from typing import Callable
from corpus import TaggedCorpus, sentence_str


from eval import model_cross_entropy, tagger_write_output
from hmm import HiddenMarkovModel
from crf import CRFModel
from lexicon import build_lexicon
import torch

import pdb
from eval import eval_tagging, model_cross_entropy, model_error_rate

In [None]:
# Set up logging
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)  # could change INFO to DEBUG
# torch.autograd.set_detect_anomaly(True)    # uncomment to improve error messages from .backward(), but slows down

# Make an HMM with randomly initialized parameters.
icsup = TaggedCorpus(Path("../nlp6-data/icsup"), add_oov=False)
logging.info(f"Ice cream vocabulary: {list(icsup.vocab)}")
logging.info(f"Ice cream tagset: {list(icsup.tagset)}")
lexicon = build_lexicon(icsup, one_hot=True)   # one-hot lexicon: separate parameters for each word
hmm = HiddenMarkovModel(icsup.tagset, icsup.vocab, lexicon)
hmm = hmm.load('my_hmm.pkl')
logging.info("*** Current A, B matrices (computed by softmax from small random parameters)")
#hmm.updateAB()   # compute the matrices from the initial parameters (this would normally happen during training).
                 # An alternative is to set them directly to some spreadsheet values you'd like to try.
# hmm.A = torch.Tensor([[0.8, 0.1,0.1,0],[0.1,0.8,0.1,0],[0,0,0,0],[0.5, 0.5,0,0]])
# hmm.B = torch.Tensor([[0.7, 0.2,0.1],[0.1,0.2,0.7],[0,0,0],[0,0,0]])
hmm.printAB()




INFO : Read 40 tokens from icsup
INFO : Created 4 tag types
INFO : Created 5 word types
INFO : Ice cream vocabulary: ['1', '2', '3', '_EOS_WORD_', '_BOS_WORD_']
INFO : Ice cream tagset: ['C', 'H', '_EOS_TAG_', '_BOS_TAG_']


TypeError: __init__() missing 1 required positional argument: 'lexicon'

In [None]:
# While training on ice cream, we will just evaluate the cross-entropy
# on the training data itself (icsup), since we are interested in watching it improve.
logging.info("*** Supervised training on icsup")
cross_entropy_loss = lambda model: model_cross_entropy(model, icsup)
hmm.train(corpus=icsup, loss=cross_entropy_loss, 
          minibatch_size=10, evalbatch_size=500, lr=0.01, tolerance=0.0001)

logging.info("*** A, B matrices after training on icsup (should approximately match initial params on spreadsheet [transposed])")
hmm.printAB()


In [None]:
# load the model
hmm = hmm.load('my_hmm.pkl')
logging.info("*** Viterbi results on icraw")
icraw = TaggedCorpus(Path("../nlp6-data/icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
tagger_write_output(hmm, icraw, Path("icraw.output"))  # calls hmm.viterbi_tagging on each sentence
os.system("cat ../nlp6-data/icraw.output")   # print the file we just created, and remove it

# Now let's use the forward algorithm to see what the model thinks about 
# the probability of the spreadsheet "sentence."
logging.info("*** Forward algorithm on icraw (should approximately match iteration 0 "
             "on spreadsheet)")
for sentence in icraw:
    prob = math.exp(hmm.log_prob(sentence, icraw))
    logging.info(f"{prob} = p({sentence_str(sentence)})")
    





INFO : Loading model from my_hmm.pkl
INFO : Loaded model from my_hmm.pkl
INFO : *** Viterbi results on icraw
1it [00:00, 31.56it/s]
INFO : *** Forward algorithm on icraw (should approximately match iteration 0 on spreadsheet)
INFO : 1.4301312227198852e-58 = p(2 3 3 2 3 2 3 2 2 3 1 3 3 1 1 1 2 1 1 1 3 1 2 1 1 1 2 3 3 2 3 2 2)


{0: tensor([-inf, -inf, -inf, 0.]), 1: tensor([3, 3, 3, 3]), 2: tensor([1, 1, 1, 1]), 3: tensor([1, 1, 1, 1]), 4: tensor([1, 1, 1, 1]), 5: tensor([1, 1, 1, 1]), 6: tensor([1, 1, 1, 1]), 7: tensor([1, 1, 1, 1]), 8: tensor([1, 1, 1, 1]), 9: tensor([1, 1, 1, 1]), 10: tensor([1, 1, 1, 1]), 11: tensor([0, 1, 0, 0]), 12: tensor([0, 1, 0, 0]), 13: tensor([0, 1, 1, 1]), 14: tensor([0, 0, 0, 0]), 15: tensor([0, 0, 0, 0]), 16: tensor([0, 0, 0, 0]), 17: tensor([0, 1, 0, 0]), 18: tensor([0, 0, 0, 0]), 19: tensor([0, 0, 0, 0]), 20: tensor([0, 0, 0, 0]), 21: tensor([0, 1, 0, 0]), 22: tensor([0, 0, 0, 0]), 23: tensor([0, 1, 0, 0]), 24: tensor([0, 0, 0, 0]), 25: tensor([0, 0, 0, 0]), 26: tensor([0, 0, 0, 0]), 27: tensor([0, 1, 0, 0]), 28: tensor([0, 1, 0, 0]), 29: tensor([0, 1, 1, 1]), 30: tensor([0, 1, 1, 1]), 31: tensor([1, 1, 1, 1]), 32: tensor([1, 1, 1, 1]), 33: tensor([1, 1, 1, 1]), 34: tensor([0, 1, 1, 1])}
_EOS_TAG_
1
H
1
H
1
H
1
H
1
H
1
H
1
H
1
H
0
C
0
C
0
C
0
C
0
C
0
C
0
C
0
C
0
C
0
C
0
C
0
C

In [None]:
# Finally, let's reestimate on the icraw data, as the spreadsheet does.
logging.info("*** Reestimating on icraw (perplexity should improve on every iteration)")
negative_log_likelihood = lambda model: model_cross_entropy(model, icraw)  # evaluate on icraw itself
hmm.train(corpus=icraw, loss=negative_log_likelihood,
          minibatch_size=10, evalbatch_size=500, lr=0.001, tolerance=0.0001)

logging.info("*** A, B matrices after reestimation on icraw (SGD, not EM, but still "
             "should approximately match final params on spreadsheet [transposed])")
hmm.printAB()

INFO : *** Reestimating on icraw (perplexity should improve on every iteration)
1it [00:00, 77.69it/s]
INFO : Cross-entropy: 3.9497 nats (= perplexity 51.921)
1it [00:00, 139.47it/s]
INFO : Cross-entropy: 3.9494 nats (= perplexity 51.905)
INFO : Saved model to my_hmm.pkl
500it [00:14, 35.62it/s]
INFO : *** A, B matrices after reestimation on icraw (SGD, not EM, but still should approximately match final params on spreadsheet [transposed])


Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.930	0.069	0.001	0.000
H	0.084	0.915	0.001	0.000
_EOS_TAG_	0.333	0.334	0.333	0.000
_BOS_TAG_	0.072	0.924	0.003	0.000

Emission matrix B:
	1	2	3
C	0.650	0.153	0.197
H	0.014	0.484	0.502
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




In [38]:
ic_crf = CRFModel.load('ic_crf.pkl')
model_error_rate(ic_crf, eval_corpus=icsup)

INFO:root:Loading model from ic_crf.pkl
INFO:root:Loaded model from ic_crf.pkl
0it [00:00, ?it/s]


TypeError: The corpus that this sentence came from uses a different tagset or vocab

In [None]:
# load hmm on En data
# Get the corpora
entrain = TaggedCorpus(Path("../nlp6-data/ensup"), Path("../nlp6-data/enraw"))                               # all training
ensup =   TaggedCorpus(Path("../nlp6-data/ensup"), tagset=entrain.tagset, vocab=entrain.vocab)  # supervised training
endev =   TaggedCorpus(Path("../nlp6-data/endev"), tagset=entrain.tagset, vocab=entrain.vocab)  # evaluation
logging.info(f"Tagset: f{list(entrain.tagset)}")
known_vocab = TaggedCorpus(Path("../nlp6-data/ensup")).vocab    # words seen with supervised tags; used in evaluation

In [None]:
# Initialize an HMM
#lexicon = build_lexicon(entrain, embeddings_file=Path('../lexicons/words-50.txt'))  # works better with more attributes!
hmm = HiddenMarkovModel.load('en_hmm.pkl')

#hmm = hmm.load('en_hmm.pkl')
#hmm = hmm.load('divsup_train.pkl')


In [None]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
model_error_rate(hmm, eval_corpus=endev, known_vocab=known_vocab)

996it [00:05, 181.28it/s]
INFO:root:Cross-entropy: 8.0400 nats (= perplexity 3102.636)
996it [00:06, 148.22it/s]
INFO:root:Tagging accuracy: all: 81.739%, known: 83.175%, seen: 72.391%, novel: 62.814%


0.18260668439921357

In [None]:
hmm_aw = HiddenMarkovModel.load('en_hmm_awesome.pkl')
model_error_rate(hmm_aw, eval_corpus=endev, known_vocab=known_vocab)

INFO:root:Loading model from en_hmm_awesome.pkl
INFO:root:Loaded model from en_hmm_awesome.pkl
996it [00:05, 179.65it/s]
INFO:root:Cross-entropy: 6.9604 nats (= perplexity 1054.084)
996it [00:04, 242.16it/s]
INFO:root:Tagging accuracy: all: 93.524%, known: 96.094%, seen: 67.340%, novel: 63.342%


0.06476234532207703

In [None]:
# awesome tag

# load hmm on En data
# Get the corpora
entrain = TaggedCorpus(Path("../nlp6-data/ensup"), Path("../nlp6-data/enraw"), , log_counts=True)                # all training
ensup =   TaggedCorpus(Path("../nlp6-data/ensup"), tagset=entrain.tagset, vocab=entrain.vocab, log_counts=True)  # supervised training
endev =   TaggedCorpus(Path("../nlp6-data/endev"), tagset=entrain.tagset, vocab=entrain.vocab, log_counts=True)  # evaluation
logging.info(f"Tagset: f{list(entrain.tagset)}")
known_vocab = TaggedCorpus(Path("../nlp6-data/ensup")).vocab    # words seen with supervised tags; used in evaluation

In [None]:
# train on unsupervised
logger=logging.getLogger() 

#Now we are going to Set the threshold of logger to DEBUG 
logger.setLevel(logging.INFO) 

loss_dev = lambda model: model_error_rate(model, eval_corpus=endev, known_vocab=known_vocab)
hmm.train(corpus=entrain, loss=loss_dev, minibatch_size=30, evalbatch_size=10000, lr=0.0001, reg=0,save_path='unsup_train.pkl')

996it [00:04, 199.85it/s]
996it [00:03, 278.55it/s]
996it [00:05, 195.63it/s]
996it [00:03, 279.85it/s]
10000it [05:17, 31.47it/s]


In [None]:
logger=logging.getLogger() 

#Now we are going to Set the threshold of logger to DEBUG 
logger.setLevel(logging.DEBUG) 
acc = 0
c =0
for m, sentence in enumerate(endev):
    if m >= 1: break
    viterbi = hmm.viterbi_tagging(desupervise(sentence), endev)
    counts = eval_tagging(predicted=viterbi, gold=sentence, 
                          known_vocab=known_vocab)
    num = counts['NUM', 'ALL']
    denom = counts['DENOM', 'ALL']
  
    logging.info(f"Gold:    {sentence_str(sentence)}")
    logging.info(f"Viterbi: {sentence_str(viterbi)}")
    acc_all = (denom - num)/denom
    print(num/denom)
   # logging.info(f"acc:    {num}/{denom}")
    acc += acc_all 
    
    c+=1
print(acc/c)
    # logging.info(f"Prob:    {math.exp(hmm.log_prob(sentence, endev))}")

INFO:root:Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO:root:Viterbi: ``/P We/V 're/R strongly/- _OOV_/W that/U anyone/W who/V has/V eaten/I in/D the/N cafeteria/D this/N month/V have/D the/N shot/, ,/' ''/' Mr./C Mattausch/N added/, ,/C ``/C and/W that/V means/R virtually/U everyone/W who/S works/R here/. ./'


0.1388888888888889
0.8611111111111112


In [None]:


counts


Counter({('DENOM', 'KNOWN'): 33,
         ('NUM', 'KNOWN'): 4,
         ('DENOM', 'ALL'): 36,
         ('NUM', 'ALL'): 5,
         ('DENOM', 'NOVEL'): 1,
         ('DENOM', 'SEEN'): 2,
         ('NUM', 'SEEN'): 1})

In [None]:
# load hmm on ic data
# Get the corpora
ictrain = TaggedCorpus(Path("../nlp6-data/icsup"), Path("../nlp6-data/icraw"))                               # all training
icsup =   TaggedCorpus(Path("../nlp6-data/icsup"), tagset=ictrain.tagset, vocab=ictrain.vocab)  # supervised training
icdev =   TaggedCorpus(Path("../nlp6-data/icdev"), tagset=ictrain.tagset, vocab=ictrain.vocab)  # evaluation
logging.info(f"Tagset: f{list(ictrain.tagset)}")

known_vocab = TaggedCorpus(Path("../nlp6-data/icsup")).vocab    # words seen with supervised tags; used in evaluation

INFO:root:Read 73 tokens from icsup, icraw
INFO:root:Created 4 tag types
INFO:root:Created 6 word types
INFO:root:Tagset: f['C', 'H', '_EOS_TAG_', '_BOS_TAG_']
INFO:root:Read 40 tokens from icsup
INFO:root:Created 4 tag types
INFO:root:Created 6 word types


In [None]:
logger=logging.getLogger() 

#Now we are going to Set the threshold of logger to DEBUG 
logger.setLevel(logging.INFO) 
lexicon = build_lexicon(ictrain, one_hot=True)   # one-hot lexicon: separate parameters for each word
hmm = HiddenMarkovModel(ictrain.tagset, ictrain.vocab, lexicon)

logging.info("*** Current A, B matrices (computed by softmax from small random parameters)")
hmm.updateAB()   # compute the matrices from the initial parameters (this would normally happen during training).

# While training on ice cream, we will just evaluate the cross-entropy
# on the training data itself (icsup), since we are interested in watching it improve.
logging.info("*** Supervised training on icsup")
loss_sup = lambda model: model_cross_entropy(model, icsup)
hmm.train(corpus=icsup, loss=loss_sup, 
          minibatch_size=10, evalbatch_size=500, lr=0.01, tolerance=0.0001, save_path='icsup_train.pkl')

INFO:root:*** Current A, B matrices (computed by softmax from small random parameters)
INFO:root:*** Supervised training on icsup
4it [00:00, 232.36it/s]
INFO:root:Cross-entropy: 2.2305 nats (= perplexity 9.304)
4it [00:00, 518.70it/s]
INFO:root:Cross-entropy: 1.1334 nats (= perplexity 3.106)
4it [00:00, 525.57it/s]
INFO:root:Cross-entropy: 1.1079 nats (= perplexity 3.028)
4it [00:00, 530.35it/s]
INFO:root:Cross-entropy: 1.0982 nats (= perplexity 2.999)
4it [00:00, 511.55it/s]
INFO:root:Cross-entropy: 1.0931 nats (= perplexity 2.984)
4it [00:00, 502.97it/s]
INFO:root:Cross-entropy: 1.0900 nats (= perplexity 2.974)
4it [00:00, 500.38it/s]
INFO:root:Cross-entropy: 1.0879 nats (= perplexity 2.968)
4it [00:00, 520.21it/s]
INFO:root:Cross-entropy: 1.0864 nats (= perplexity 2.964)
4it [00:00, 509.00it/s]
INFO:root:Cross-entropy: 1.0853 nats (= perplexity 2.960)
4it [00:00, 508.23it/s]
INFO:root:Cross-entropy: 1.0844 nats (= perplexity 2.958)
4it [00:00, 520.72it/s]
INFO:root:Cross-entropy: 1

In [None]:
# evaluation
model_error_rate(hmm, eval_corpus=icdev, known_vocab=ictrain.vocab)


NameError: name 'icdev' is not defined

In [None]:
# unsupervised 
logging.info("*** Unsupervised training on icsup")
loss_dev = lambda model: model_error_rate(model, eval_corpus=icdev, known_vocab=ictrain.vocab)
hmm.train(corpus=ictrain, loss=loss_dev, 
          minibatch_size=10, evalbatch_size=500, lr=0.01, tolerance=0.0001, save_path='icentire_train.pkl')

INFO:root:*** Unsupervised training on icsup
1it [00:00, 79.75it/s]
INFO:root:Cross-entropy: 1.1792 nats (= perplexity 3.252)
1it [00:00, 115.50it/s]
INFO:root:Tagging accuracy: all: 93.939%, known: 93.939%, seen: nan%, novel: nan%
1it [00:00, 164.12it/s]
INFO:root:Cross-entropy: 1.0874 nats (= perplexity 2.967)
1it [00:00, 222.32it/s]
INFO:root:Tagging accuracy: all: 90.909%, known: 90.909%, seen: nan%, novel: nan%
INFO:root:Saved model to icentire_train.pkl
500it [00:05, 92.73it/s]


In [None]:
hmm.printAB()

Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.895	0.104	0.001	0.000
H	0.100	0.899	0.001	0.000
_EOS_TAG_	0.333	0.333	0.334	0.000
_BOS_TAG_	0.409	0.586	0.005	0.000

Emission matrix B:
	1	2	3	_OOV_
C	0.699	0.194	0.106	0.001
H	0.078	0.300	0.620	0.001
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000	0.000




In [None]:
# invastigate the en data

lexicon = build_lexicon(entrain, embeddings_file=Path('../lexicons/words-50.txt'))  # works better with more attributes!
hmm1 = HiddenMarkovModel(entrain.tagset, entrain.vocab, lexicon)


hmm1 = hmm1.load('en_hmm.pkl')