In [93]:
import nltk
import sys
from nltk.corpus import brown

In [94]:
# Modify the POS tags by using only the first two letters of a tag,
# which represent the broad class of POS tags in the Brown corpus.
# Also switch the order of word tag pairs to be tag word to conform
# with transition and emission probabilities. Add a leading period
# to give the first sentence in the corpus a starting transition
# probability.
tagged_corpus = [(".", ".")]
for sentence in brown.tagged_sents():
    tagged_corpus.extend([(tag[:2], word.lower()) for (word, tag) in sentence])

In [95]:
### Create the transition probability matrix 
## Transition probability: P(xt | x{t-1})
pos_tags = [tag for (tag, word) in tagged_corpus]
transition_cond_freq_dist = nltk.ConditionalFreqDist(nltk.bigrams(pos_tags))
transition_matrix = nltk.ConditionalProbDist(transition_cond_freq_dist, nltk.MLEProbDist)

In [96]:
### Create the emission probability matrix
## Emission probability: P(yt | xt)
emission_cond_freq_dist = nltk.ConditionalFreqDist(tagged_corpus)
emission_matrix = nltk.ConditionalProbDist(emission_cond_freq_dist, nltk.MLEProbDist)

In [103]:
### Implement the Viterbi algorithm.
# Algorithm set up
distinct_tags = set(pos_tags)
sentence = ["I", "want", "to", "race"]
tag_probs = [{}]
actual_tags = [{}]

for tag in distinct_tags:
    tag_probs[0][tag] = emission_matrix["."].prob(tag) * emission_matrix[tag].prob(sentence[0])
    actual_tags[0][tag] = None
    
for index in range(1, len(sentence)):
    this_tag_prob = {}
    this_actual_tag = {}
    prev_tag_prob = tag_probs[-1]
    
    for tag in distinct_tags:
        best_prev = max(prev_tag_prob.keys(), key = lambda prev_tag: \
                        prev_tag_prob[prev_tag] * transition_matrix[prev_tag].prob(tag) \
                       * emission_matrix[tag].prob(sentence[index]))
        this_actual_tag[tag] = best_prev
        this_tag_prob[tag] = prev_tag_prob[best_prev] * transition_matrix[best_prev].prob(tag) \
                             * emission_matrix[tag].prob(sentence[index])
    
    tag_probs.append(this_tag_prob)
    actual_tags.append(this_actual_tag)

prev_tag_prob = tag_probs[-1]
best_prev = max(prev_tag_prob.keys(), key = lambda prev_tag: \
                prev_tag_prob[prev_tag] * transition_matrix[prev_tag].prob("."))
best_prob = prev_tag_prob[best_prev] * transition_matrix[best_prev].prob(".")
best_tags = [".", best_prev]
actual_tags.reverse()

best_tag = best_prev
for tag in actual_tags:
    best_tags.append(tag[best_tag])
    best_tag = tag[best_tag]
    
best_tags.reverse()

print("Sentence: ")
for word in sentence: 
    print(word + " ")
print("\n")
print("POS tags: ")
for tag in best_tags:
    if tag:
        print(tag + " ")
print("\n")
print("Probability: " + str(best_prob))
    


        
    


Sentence: 
I 
want 
to 
race 


POS tags: 
AB 
AB 
AB 
AB 
. 


Probability: 0.0


In [None]:
2) Input sentence
1) Forward viterbi
3) range to include from brown corpus
4) choose number of top pos tags per observation

In [85]:
tags = []
for word in brown.tagged_words():
    tags.append(word[1][:2])

In [91]:
emission_matrix["AT"].prob("the")

0.7061982094734398

In [86]:
tagged_corpus[:100]

[('.', '.'),
 ('AT', 'the'),
 ('NP', 'fulton'),
 ('NN', 'county'),
 ('JJ', 'grand'),
 ('NN', 'jury'),
 ('VB', 'said'),
 ('NR', 'friday'),
 ('AT', 'an'),
 ('NN', 'investigation'),
 ('IN', 'of'),
 ('NP', "atlanta's"),
 ('JJ', 'recent'),
 ('NN', 'primary'),
 ('NN', 'election'),
 ('VB', 'produced'),
 ('``', '``'),
 ('AT', 'no'),
 ('NN', 'evidence'),
 ("''", "''"),
 ('CS', 'that'),
 ('DT', 'any'),
 ('NN', 'irregularities'),
 ('VB', 'took'),
 ('NN', 'place'),
 ('.', '.'),
 ('AT', 'the'),
 ('NN', 'jury'),
 ('RB', 'further'),
 ('VB', 'said'),
 ('IN', 'in'),
 ('NN', 'term-end'),
 ('NN', 'presentments'),
 ('CS', 'that'),
 ('AT', 'the'),
 ('NN', 'city'),
 ('JJ', 'executive'),
 ('NN', 'committee'),
 (',', ','),
 ('WD', 'which'),
 ('HV', 'had'),
 ('JJ', 'over-all'),
 ('NN', 'charge'),
 ('IN', 'of'),
 ('AT', 'the'),
 ('NN', 'election'),
 (',', ','),
 ('``', '``'),
 ('VB', 'deserves'),
 ('AT', 'the'),
 ('NN', 'praise'),
 ('CC', 'and'),
 ('NN', 'thanks'),
 ('IN', 'of'),
 ('AT', 'the'),
 ('NN', 'city')