In [None]:
# Fetch the corpus and resources the rest of the notebook relies on.
import nltk
nltk.download('brown')             # the tagged Brown corpus read below
nltk.download('punkt')             # tokenizer models for nltk.word_tokenize
nltk.download('universal_tagset')  # needed for tagset='universal'
# NOTE(review): newer NLTK releases may look for 'punkt_tab' rather than 'punkt'
# when tokenizing -- confirm against the installed version.
# Flat sequence of (word, tag) pairs from the 'news' category, tags mapped to the universal tagset.
brownwords = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


In [None]:
# number of tagged tokens loaded from the 'news' category
len(brownwords)

100554

In [None]:
# peek at a short run of (word, tag) pairs from the middle of the data
brownwords[5425:5440]

[('workers', 'NOUN'),
 ('would', 'VERB'),
 ('be', 'VERB'),
 ('raised', 'VERB'),
 ('to', 'PRT'),
 ('pay', 'VERB'),
 ('the', 'DET'),
 ('hospital', 'NOUN'),
 ('and', 'CONJ'),
 ('some', 'DET'),
 ('other', 'ADJ'),
 ('medical', 'ADJ'),
 ('bills', 'NOUN'),
 ('of', 'ADP'),
 ('14.2', 'NUM')]

In [None]:
# Tag frequencies over every token; the word half of each pair is ignored.
fd = nltk.FreqDist(tag for _, tag in brownwords)

In [None]:
# Look both counts up first, then report them.
noun_total, adj_total = fd['NOUN'], fd['ADJ']
print('Number of nouns:', noun_total)
print('Number of adjectives:', adj_total)

Number of nouns: 30654
Number of adjectives: 6706


In [None]:
# Baseline #1: a tagger that labels every token NOUN, whatever the word is.
text = nltk.word_tokenize('It could be that it rained, or is raining heavily.')
bad_tagger = nltk.DefaultTagger('NOUN')
bad_tagger.tag(text)

[('It', 'NOUN'),
 ('could', 'NOUN'),
 ('be', 'NOUN'),
 ('that', 'NOUN'),
 ('it', 'NOUN'),
 ('rained', 'NOUN'),
 (',', 'NOUN'),
 ('or', 'NOUN'),
 ('is', 'NOUN'),
 ('raining', 'NOUN'),
 ('heavily', 'NOUN'),
 ('.', 'NOUN')]

In [None]:
# Evaluation needs the gold data grouped into tagged sentences rather than a flat word list.
brownsentences = nltk.corpus.brown.tagged_sents(
    categories='news',
    tagset='universal',
)
bad_tagger.accuracy(brownsentences)

0.30485112476878096

In [None]:
# Rule-based tagger: patterns are tried in order and the first match decides the tag.
# The emitted tags must come from the same universal tagset as the gold data.
# That tagset labels punctuation '.', and has no 'PUNCT' tag, so the punctuation
# rules emit '.'; with 'PUNCT' they could never agree with the corpus.
patterns = [
    (r'.*ly$', 'ADV'),
    (r'.*ing$', 'VERB'),
    (r'.*ed$', 'VERB'),
    (r'.*ould$', 'VERB'),
    (r'^[.]$', '.'),
    (r'^[,]$', '.'),
    (r'.*$', 'NOUN'),   # catch-all: anything not matched above is called a noun
]
rule_tagger = nltk.RegexpTagger(patterns)
rule_tagger.tag(text)

[('It', 'NOUN'),
 ('could', 'VERB'),
 ('be', 'NOUN'),
 ('that', 'NOUN'),
 ('it', 'NOUN'),
 ('rained', 'VERB'),
 (',', 'PUNCT'),
 ('or', 'NOUN'),
 ('is', 'NOUN'),
 ('raining', 'VERB'),
 ('heavily', 'ADV'),
 ('.', 'PUNCT')]

In [None]:
# score the regex rules against the gold tags of all the news sentences
rule_tagger.accuracy(brownsentences)

0.3580464228175905

In [None]:
# Built from (word, tag) pairs, so cfd[word][tag] is how often `word` carries `tag`.
cfd = nltk.ConditionalFreqDist(brownwords)

In [None]:
# count of 'the' tagged as DET
print(cfd['the']['DET'])

5580


In [None]:
# every tag that 'pay' was observed with
print(cfd['pay'].keys())

dict_keys(['NOUN', 'VERB'])


In [None]:
# per-tag counts for the ambiguous word 'pay'
print(cfd['pay']['NOUN'])
print(cfd['pay']['VERB'])

5
28


In [None]:
# Distinct word types in the news category, then the most frequent tag for each one.
news_tokens = nltk.corpus.brown.words(categories='news')
justwords = nltk.FreqDist(news_tokens).keys()
best_tags = {word: cfd[word].max() for word in justwords}

In [None]:
# spot-check the lookup table on an unambiguous word and an ambiguous one
print('Most frequent tag for "the":', best_tags['the'])
print('Most frequent tag for "pay":', best_tags['pay'])

Most frequent tag for "the": DET
Most frequent tag for "pay": VERB


In [None]:
# Train on the first 90% of the sentences and hold out the remainder for evaluation.
split = int(len(brownsentences)*0.9)
train, test = brownsentences[:split], brownsentences[split:]
unigram_tagger = nltk.UnigramTagger(train)
unigram_tagger.accuracy(test)

0.8451111332602412

In [None]:
# Same unigram tagger, but with the regex rule tagger as its backoff.
better_tagger = nltk.UnigramTagger(train, backoff=rule_tagger)
better_tagger.accuracy(test)

0.9304295823781521

In [None]:
# Swap every training pair to (tag, word) so that the tag becomes the condition.
flipped = [
    (tag, word)
    for sent in train
    for (word, tag) in sent
]
wordgiventag = nltk.ConditionalFreqDist(flipped)
# same lookups as the earlier per-word counts, but restricted to the training split:
print(wordgiventag['DET']['the'])
print(wordgiventag['VERB']['pay'])

5030
26


In [None]:
# this is P(w|t), unsmoothed!
def P(w,t):
    """Relative-frequency estimate of P(word w | tag t) from `wordgiventag`.

    Returns count(t, w) / count(t). A word never seen with `t` gets 0.0
    because no smoothing is applied. A tag with no observations at all also
    gets 0.0, instead of the ZeroDivisionError the bare division would raise.
    """
    dist = wordgiventag[t]
    total = dist.N()
    if total == 0:
        return 0.0
    return dist[w] / total

# spot-check a few word-given-tag probabilities estimated from the training split
print('P(the|DET) =', P('the','DET'))
print('P(is|VERB) =', P('is','VERB'))
print('P(pay|VERB) =', P('pay','VERB'))

P(the|DET) = 0.4911629723659799
P(is|VERB) = 0.05013518733101584
P(pay|VERB) = 0.0020084974893781384


In [None]:
# Adjacent tag pairs, collected sentence by sentence so no pair spans a sentence boundary.
tag_bigrams = [
    pair
    for sent in train
    for pair in nltk.bigrams([tag for _, tag in sent])
]

In [None]:
# first few (previous tag, next tag) pairs
tag_bigrams[:10]

[('DET', 'NOUN'),
 ('NOUN', 'NOUN'),
 ('NOUN', 'ADJ'),
 ('ADJ', 'NOUN'),
 ('NOUN', 'VERB'),
 ('VERB', 'NOUN'),
 ('NOUN', 'DET'),
 ('DET', 'NOUN'),
 ('NOUN', 'ADP'),
 ('ADP', 'NOUN')]

In [None]:
# Conditioned on the first tag of each pair: tag_bigram_counts[t1][t2] counts t2 right after t1.
tag_bigram_counts = nltk.ConditionalFreqDist(tag_bigrams)
# this is count of noun tags following adjective tags (normal order in English)
print(tag_bigram_counts['ADJ']['NOUN'])
# this is count of adjective tags following noun tags
print(tag_bigram_counts['NOUN']['ADJ'])

4250
482


In [None]:
# this is P(t2|t1), unsmoothed again!
def tagP(t2,t1):
    """Relative-frequency estimate of P(tag t2 | previous tag t1) from `tag_bigram_counts`.

    Returns count(t1, t2) / count(t1, *). An unseen transition gets 0.0
    because no smoothing is applied. A `t1` that was never followed by any
    tag also gets 0.0, instead of raising ZeroDivisionError.
    """
    following = tag_bigram_counts[t1]
    total = following.N()
    if total == 0:
        return 0.0
    return following[t2] / total
# spot-check two transitions into NOUN
print('P(NOUN|ADJ) =',tagP('NOUN','ADJ'))
print('P(NOUN|DET) =',tagP('NOUN','DET'))

P(NOUN|ADJ) = 0.7123700972175662
P(NOUN|DET) = 0.6479781207267045


In [None]:
# How often each tag labels the first token of a training sentence.
sentence_start = nltk.FreqDist(sent[0][1] for sent in train)
def initP(t):
    """Relative-frequency estimate of P(tag t labels the first token of a sentence).

    Reads the `sentence_start` counts. A tag that never starts a sentence
    gets 0.0 because no smoothing is applied. If no sentence starts were
    counted at all, returns 0.0 instead of raising ZeroDivisionError.
    """
    total = sentence_start.N()
    if total == 0:
        return 0.0
    return sentence_start[t] / total
# spot-check the sentence-initial distribution for a few tags
print('initP(DET) =', initP('DET'))
print('initP(PRON) =', initP('PRON'))  # he, she, it, etc.
print('initP(NOUN) =', initP('NOUN'))
print('initP(VERB) =', initP('VERB'))

initP(DET) = 0.23918269230769232
initP(PRON) = 0.096875
initP(NOUN) = 0.27115384615384613
initP(VERB) = 0.03774038461538461


In [None]:
def argmax(V,tag_list,t,i):
    """Find the best predecessor tag for tag `t` at position `i`.

    Each candidate `s` in `tag_list` is scored as V[(s, i-1)] * tagP(t, s).
    Returns (best_tag, best_score). On a tie the earliest candidate in
    `tag_list` wins. If no candidate scores above -1 (including an empty
    `tag_list`), returns (None, -1).
    """
    scored = [(prev, V[(prev, i - 1)] * tagP(t, prev)) for prev in tag_list]
    # max() keeps the first of several equal maxima, matching a strict '>' scan.
    top = max(scored, key=lambda pair: pair[1], default=(None, -1))
    if top[1] > -1:
        return top
    return (None, -1)

In [None]:
def printV(sentence,tag_list,V,B):
    """Print the non-zero cells of the trellis, one sentence position at a time.

    For each position i a header line 'i=<i> [<word>]' is printed, followed by
    one line per tag in `tag_list` whose V[(tag, i)] is non-zero. After the
    first position, each line also names the backpointer B[(tag, i)].
    """
    for pos, word in enumerate(sentence):
        print(''.join(['i=', str(pos), ' [', word, ']']))
        for tag in tag_list:
            score = V[(tag, pos)]
            if score == 0:
                continue   # zero-probability cells are not shown
            pieces = ['  ', tag, '=', str(score)]
            if pos > 0:
                pieces += [' (from ', B[(tag, pos)], ')']
            print(''.join(pieces))
    

In [None]:
def viterbi(sentence):
    """Fill in the Viterbi trellis for `sentence` and print it with printV.

    `sentence` is a sequence of word strings. The candidate tags are the keys
    of `sentence_start`. Column 0 is initP(t) * P(word, t). Each later cell is
    the best predecessor score found by argmax, times P(word, t). Nothing is
    returned; the trellis is only printed.

    An empty sentence prints nothing and returns, rather than failing on
    sentence[0].
    """
    if len(sentence) == 0:
        return
    V = dict()    # keys are (t,i) where t is a tag (row label) and i is position in sentence (column label)
    B = dict()    # same keys as V; this stores the "backpointers" to remember best tag sequence
    tag_list = sentence_start.keys()
    for t in tag_list:
        V[(t,0)] = initP(t)*P(sentence[0],t)
    for i in range(1,len(sentence)):
        for t in tag_list:
            prev_tag, prev_score = argmax(V,tag_list,t,i)
            B[(t,i)] = prev_tag
            V[(t,i)] = prev_score*P(sentence[i],t)
    printV(sentence,tag_list,V,B)

In [None]:
# A longer sentence with several ambiguous words.
# note how best tag for past changes to (correct) ADP when we see "the" at i=8
viterbi('like one little flat near work well past the last right turn'.split())

In [None]:
# short example; the last word 'flat' keeps more than one candidate tag alive
viterbi('the beer was a little flat'.split())

i=0 [the]
  DET=0.11747768209234376
i=1 [beer]
  NOUN=5.4488363100480395e-06 (from DET)
i=2 [was]
  VERB=3.7844556130187584e-08 (from NOUN)
i=3 [a]
  DET=1.228566861006549e-09 (from VERB)
  X=4.193581415959796e-14 (from VERB)
i=4 [little]
  ADV=4.882420196152905e-14 (from DET)
  ADJ=1.066779665474013e-12 (from DET)
i=5 [flat]
  NOUN=8.159428087793336e-17 (from ADJ)
  ADJ=2.2489550067219224e-17 (from ADJ)


In [None]:
# Differs from the 'banks ...' sentence only in its first word, so compare the two trellises.
viterbi('difficulties like high interest rates'.split())

In [None]:
# Differs from the 'difficulties ...' sentence only in its first word, so compare the two trellises.
viterbi('banks like high interest rates'.split())