In [1]:
import inspect, sys, hashlib

# Hack around a warning message deep inside scikit learn, loaded by nltk :-(
#  Modelled on https://stackoverflow.com/a/25067818
import warnings
with warnings.catch_warnings(record=True) as w:
    save_filters=warnings.filters
    warnings.resetwarnings()
    warnings.simplefilter('ignore')
    import nltk
    warnings.filters=save_filters
try:
    nltk
except NameError:
    # didn't load, produce the warning
    import nltk

from nltk.corpus import brown
from nltk.tag import map_tag, tagset_mapping

if map_tag('brown', 'universal', 'NR-TL') != 'NOUN':
    # Out-of-date tagset, we add a few that we need
    tm=tagset_mapping('en-brown','universal')
    tm['NR-TL']=tm['NR-TL-HL']='NOUN'

In [3]:
tagged_sentences = brown.tagged_sents(categories='news')

# set up the training data
train = tagged_sentences[0:len(tagged_sentences)-500]

# set up the test data
test = tagged_sentences[-500:]

In [205]:
from nltk.probability import ConditionalFreqDist, ConditionalProbDist
from nltk.probability import LidstoneProbDist
from nltk.probability import FreqDist
import math

In [261]:
class HMM:
    def __init__(self, train_data, test_data):
        """
        Initialise a new instance of the HMM.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :param test_data: the test/evaluation dataset, a list of sentence with tags
        :type test_data: list(list(tuple(str,str)))
        """
        self.train_data = train_data
        self.test_data = test_data

        # Emission and transition probability distributions
        self.emission_PD = None
        self.transition_PD = None
        self.states = []

        self.viterbi = []
        self.backpointer = []

    # Compute emission model using ConditionalProbDist with a LidstoneProbDist estimator.
    #   To achieve the latter, pass a function
    #    as the probdist_factory argument to ConditionalProbDist.
    #   This function should take 3 arguments
    #    and return a LidstoneProbDist initialised with +0.01 as gamma and an extra bin.
    #   See the documentation/help for ConditionalProbDist to see what arguments the
    #    probdist_factory function is called with.
    def emission_model(self, train_data):
        """
        Compute an emission model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The emission probability distribution and a list of the states
        :rtype: Tuple[ConditionalProbDist, list(str)]
        """
        # TODO prepare data
        data = []
        fd = []
        # Don't forget to lowercase the observation otherwise it mismatches the test data
        # Do NOT add <s> or </s> to the input sentences
        for i in train_data:
            data_p = list(map(lambda a:(a[1],a[0].lower()),i))
            data.extend(data_p)
        emission_FD = ConditionalFreqDist(data)
        lidstone = lambda fdist: LidstoneProbDist(fdist, 0.01, fdist.B() + 1)
        self.emission_PD = ConditionalProbDist(emission_FD, probdist_factory = lidstone)
        self.states = list(set([ tag for (tag, word) in data]))
        self.states.sort()

        return self.emission_PD, self.states

    # Access function for testing the emission model
    # For example model.elprob('VERB','is') might be -1.4
    def elprob(self,state,word):
        """
        The log of the estimated probability of emitting a word from a state

        :param state: the state name
        :type state: str
        :param word: the word
        :type word: str
        :return: log base 2 of the estimated emission probability
        :rtype: float
        """
                
        return math.log2(self.emission_PD[state].prob(word.lower()))
    
    
    # Compute transition model using ConditionalProbDist with a LidstonelprobDist estimator.
    # See comments for emission_model above for details on the estimator.
    def transition_model(self, train_data):
        """
        Compute an transition model using a ConditionalProbDist.

        :param train_data: The training dataset, a list of sentences with tags
        :type train_data: list(list(tuple(str,str)))
        :return: The transition probability distribution
        :rtype: ConditionalProbDist
        """
        
        # TODO: prepare the data
        data = []
        data_start = [[("<s>","<s>")] + w +[("</s>","</s>")] for w in train_data]
        words = [[word for (word,tag) in wordlist]for wordlist in data_start]
        for word in words:
            data.append(list(zip(word[:-1], word[1:])))
            
        
        

        # The data object should be an array of tuples of conditions and observations,
        # in our case the tuples will be of the form (tag_(i),tag_(i+1)).
        # DON'T FORGET TO ADD THE START SYMBOL </s> and the END SYMBOL </s>
        for s in train_data:
            pass  # TODO

        # TODO compute the transition model

        transition_FD = 'fixme'
        self.transition_PD = 'fixme'

        return self.transition_PD

In [262]:
a = HMM(train, test)
emm, states = a.transition_model(train)


[('<s>', 'The'), ('The', 'Fulton'), ('Fulton', 'County'), ('County', 'Grand'), ('Grand', 'Jury'), ('Jury', 'said'), ('said', 'Friday'), ('Friday', 'an'), ('an', 'investigation'), ('investigation', 'of'), ('of', "Atlanta's"), ("Atlanta's", 'recent'), ('recent', 'primary'), ('primary', 'election'), ('election', 'produced'), ('produced', '``'), ('``', 'no'), ('no', 'evidence'), ('evidence', "''"), ("''", 'that'), ('that', 'any'), ('any', 'irregularities'), ('irregularities', 'took'), ('took', 'place'), ('place', '.'), ('.', '</s>')]
[('<s>', 'The'), ('The', 'jury'), ('jury', 'further'), ('further', 'said'), ('said', 'in'), ('in', 'term-end'), ('term-end', 'presentments'), ('presentments', 'that'), ('that', 'the'), ('the', 'City'), ('City', 'Executive'), ('Executive', 'Committee'), ('Committee', ','), (',', 'which'), ('which', 'had'), ('had', 'over-all'), ('over-all', 'charge'), ('charge', 'of'), ('of', 'the'), ('the', 'election'), ('election', ','), (',', '``'), ('``', 'deserves'), ('dese

[('<s>', '``'), ('``', 'I'), ('I', 'can'), ('can', 'remove'), ('remove', 'the'), ('the', 'ball'), ('ball', ','), (',', "can't"), ("can't", 'I'), ('I', "''"), ("''", '?'), ('?', '?'), ('?', '</s>')]
[('<s>', 'Asked'), ('Asked', 'Palmer'), ('Palmer', 'of'), ('of', 'an'), ('an', 'official'), ('official', '.'), ('.', '</s>')]
[('<s>', '``'), ('``', 'No'), ('No', "''"), ("''", ','), (',', 'said'), ('said', 'the'), ('the', 'official'), ('official', '.'), ('.', '</s>')]
[('<s>', '``'), ('``', 'You'), ('You', 'must'), ('must', 'play'), ('play', 'it'), ('it', 'where'), ('where', 'it'), ('it', 'lies'), ('lies', "''"), ("''", '.'), ('.', '</s>')]
[('<s>', '``'), ('``', "You're"), ("You're", 'wrong'), ('wrong', "''"), ("''", ','), (',', 'said'), ('said', 'Arnold'), ('Arnold', ','), (',', 'a'), ('a', 'man'), ('man', 'who'), ('who', 'knows'), ('knows', 'the'), ('the', 'rules'), ('rules', '.'), ('.', '</s>')]
[('<s>', '``'), ('``', "I'll"), ("I'll", 'do'), ('do', 'as'), ('as', 'you'), ('you', 'say'),

[('<s>', 'Not'), ('Not', 'all'), ('all', 'sections'), ('sections', 'are'), ('are', 'showing'), ('showing', 'an'), ('an', 'upswing'), ('upswing', ','), (',', 'however'), ('however', ';'), (';', ';'), (';', '</s>')]
[('<s>', 'the'), ('the', 'drought-seared'), ('drought-seared', 'North'), ('North', 'Central'), ('Central', 'states'), ('states', 'are'), ('are', 'the'), ('the', 'most'), ('most', 'notable'), ('notable', 'exceptions'), ('exceptions', 'to'), ('to', 'the'), ('the', 'uptrend'), ('uptrend', '.'), ('.', '</s>')]
[('<s>', 'The'), ('The', 'significance'), ('significance', 'of'), ('of', 'the'), ('the', 'pickup'), ('pickup', 'in'), ('in', 'farm'), ('farm', 'machinery'), ('machinery', 'sales'), ('sales', 'extends'), ('extends', 'beyond'), ('beyond', 'the'), ('the', 'farm'), ('farm', 'equipment'), ('equipment', 'industry'), ('industry', '.'), ('.', '</s>')]
[('<s>', 'The'), ('The', 'demand'), ('demand', 'for'), ('for', 'farm'), ('farm', 'machinery'), ('machinery', 'is'), ('is', 'regarded

ValueError: too many values to unpack (expected 2)

In [173]:
help(nltk.ConditionalProbDist)

Help on class ConditionalProbDist in module nltk.probability:

class ConditionalProbDist(ConditionalProbDistI)
 |  A conditional probability distribution modeling the experiments
 |  that were used to generate a conditional frequency distribution.
 |  A ConditionalProbDist is constructed from a
 |  ``ConditionalFreqDist`` and a ``ProbDist`` factory:
 |  
 |  - The ``ConditionalFreqDist`` specifies the frequency
 |    distribution for each condition.
 |  - The ``ProbDist`` factory is a function that takes a
 |    condition's frequency distribution, and returns its
 |    probability distribution.  A ``ProbDist`` class's name (such as
 |    ``MLEProbDist`` or ``HeldoutProbDist``) can be used to specify
 |    that class's constructor.
 |  
 |  The first argument to the ``ProbDist`` factory is the frequency
 |  distribution that it should model; and the remaining arguments are
 |  specified by the ``factory_args`` parameter to the
 |  ``ConditionalProbDist`` constructor.  For example, the f