In [1]:
import nltk
from nltk.corpus import brown

In [2]:
tags = [tag for (word, tag) in brown.tagged_words(categories = "news")]

In [3]:
nltk.FreqDist(tags).max()

'NN'

In [4]:
raw = "I do not like green eggs and ham, I do not like them Same I am"

In [5]:
tokens = nltk.word_tokenize(raw)

In [6]:
default_tagger = nltk.DefaultTagger('NN')

In [7]:
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Same', 'NN'),
 ('I', 'NN'),
 ('am', 'NN')]

In [8]:
brown_tagged_sents = brown.tagged_sents(categories = "news")

In [9]:
brown_sents = brown.sents(categories = "news")

In [10]:
default_tagger.accuracy(brown_tagged_sents) # u can also use .evaluate(brown_tagged_sents)

0.13089484257215028

In [11]:
patterns = [
    ('.*ing$', 'VBG'), 
    ('.*ed', 'VBD'),
    ('.*es$', "VBZ"),
    ('.*ould$', "MD"),
    ('.*\'s$', 'NN$'),
    ('.*s$', 'NNS'), 
    ('^-?[0-9]+(\.[0-9]+)?', "CD"),
    (".*", "NN")
    
]

In [12]:
regexp_tagger = nltk.RegexpTagger(patterns)

In [13]:
regexp_tagger.evaluate(brown_tagged_sents)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  regexp_tagger.evaluate(brown_tagged_sents)


0.20144400023867773

In [14]:
regexp_tagger.tag(brown_sents[3])

[('``', 'NN'),
 ('Only', 'NN'),
 ('a', 'NN'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'NN'),
 ('such', 'NN'),
 ('reports', 'NNS'),
 ('was', 'NNS'),
 ('received', 'VBD'),
 ("''", 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('jury', 'NN'),
 ('said', 'NN'),
 (',', 'NN'),
 ('``', 'NN'),
 ('considering', 'VBG'),
 ('the', 'NN'),
 ('widespread', 'NN'),
 ('interest', 'NN'),
 ('in', 'NN'),
 ('the', 'NN'),
 ('election', 'NN'),
 (',', 'NN'),
 ('the', 'NN'),
 ('number', 'NN'),
 ('of', 'NN'),
 ('voters', 'NNS'),
 ('and', 'NN'),
 ('the', 'NN'),
 ('size', 'NN'),
 ('of', 'NN'),
 ('this', 'NNS'),
 ('city', 'NN'),
 ("''", 'NN'),
 ('.', 'NN')]

In [15]:
fd = nltk.FreqDist(brown.words(categories= "news"))

In [16]:
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories= "news"))

In [17]:
most_freq_words = fd.most_common(100)

In [18]:
likely_tags = dict((word, cfd[word].max()) for (word,_) in most_freq_words)

In [19]:
likely_tags

{'the': 'AT',
 ',': ',',
 '.': '.',
 'of': 'IN',
 'and': 'CC',
 'to': 'TO',
 'a': 'AT',
 'in': 'IN',
 'for': 'IN',
 'The': 'AT',
 'that': 'CS',
 '``': '``',
 'is': 'BEZ',
 'was': 'BEDZ',
 "''": "''",
 'on': 'IN',
 'at': 'IN',
 'with': 'IN',
 'be': 'BE',
 'by': 'IN',
 'as': 'CS',
 'he': 'PPS',
 'said': 'VBD',
 'his': 'PP$',
 'will': 'MD',
 'it': 'PPS',
 'from': 'IN',
 'are': 'BER',
 ';': '.',
 'an': 'AT',
 'has': 'HVZ',
 '--': '--',
 'had': 'HVD',
 'who': 'WPS',
 'have': 'HV',
 'not': '*',
 'Mrs.': 'NP',
 'were': 'BED',
 'this': 'DT',
 'which': 'WDT',
 'would': 'MD',
 'their': 'PP$',
 'been': 'BEN',
 'they': 'PPSS',
 'He': 'PPS',
 'one': 'CD',
 'I': 'PPSS',
 'but': 'CC',
 'its': 'PP$',
 'or': 'CC',
 ')': ')',
 'more': 'AP',
 'Mr.': 'NP',
 '(': '(',
 'up': 'RP',
 'all': 'ABN',
 'out': 'RP',
 'last': 'AP',
 'two': 'CD',
 'other': 'AP',
 ':': ':',
 'new': 'JJ',
 'first': 'OD',
 'than': 'IN',
 'year': 'NN',
 'A': 'AT',
 'about': 'IN',
 'there': 'EX',
 'when': 'WRB',
 'home': 'NN',
 'after':

In [20]:
baseline_tagger = nltk.UnigramTagger(model = likely_tags)

In [21]:
baseline_tagger.accuracy(brown_tagged_sents)

0.45578495136941344

In [25]:
sent = brown.sents(categories= "news")[3]

In [26]:
baseline_tagger.tag(sent)

[('``', '``'),
 ('Only', None),
 ('a', 'AT'),
 ('relative', None),
 ('handful', None),
 ('of', 'IN'),
 ('such', None),
 ('reports', None),
 ('was', 'BEDZ'),
 ('received', None),
 ("''", "''"),
 (',', ','),
 ('the', 'AT'),
 ('jury', None),
 ('said', 'VBD'),
 (',', ','),
 ('``', '``'),
 ('considering', None),
 ('the', 'AT'),
 ('widespread', None),
 ('interest', None),
 ('in', 'IN'),
 ('the', 'AT'),
 ('election', None),
 (',', ','),
 ('the', 'AT'),
 ('number', None),
 ('of', 'IN'),
 ('voters', None),
 ('and', 'CC'),
 ('the', 'AT'),
 ('size', None),
 ('of', 'IN'),
 ('this', 'DT'),
 ('city', None),
 ("''", "''"),
 ('.', '.')]

In [27]:
baseline_tagger = nltk.UnigramTagger(model = likely_tags, backoff= nltk.DefaultTagger("NN"))

In [28]:
baseline_tagger.accuracy(brown_tagged_sents)

0.5817769556656125

In [29]:
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

In [30]:
unigram_tagger.accuracy(brown_tagged_sents)

0.9349006503968017

In [31]:
size = int(len(brown_tagged_sents) * .9)

In [32]:
size

4160

In [33]:
train_sents = brown_tagged_sents[:size]

In [34]:
test_sents = brown_tagged_sents[size:]

In [35]:
unigram_tagger = nltk.UnigramTagger(train_sents)

In [36]:
unigram_tagger.accuracy(test_sents)

0.8121200039868434

In [38]:
bigram_tagger = nltk.BigramTagger(train_sents)

In [39]:
bigram_tagger.accuracy(test_sents)

0.10206319146815508

In [40]:
bigram_tagger.tag(brown_sents[4203])

[('The', 'AT'),
 ('population', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('Congo', 'NP'),
 ('is', 'BEZ'),
 ('13.5', None),
 ('million', None),
 (',', None),
 ('divided', None),
 ('into', None),
 ('at', None),
 ('least', None),
 ('seven', None),
 ('major', None),
 ('``', None),
 ('culture', None),
 ('clusters', None),
 ("''", None),
 ('and', None),
 ('innumerable', None),
 ('tribes', None),
 ('speaking', None),
 ('400', None),
 ('separate', None),
 ('dialects', None),
 ('.', None)]

In [41]:
t0 = nltk.DefaultTagger("NN")
t1 = nltk.UnigramTagger(train_sents, backoff= t0)
t2 = nltk.BigramTagger(train_sents, backoff= t1)

In [42]:
t2.accuracy(test_sents)

0.8452108043456593

In [44]:
from pickle import dump

In [47]:
output = open("t2.pk1", "wb")

In [48]:
dump(t2, output, -1)

In [49]:
from pickle import load

In [50]:
input = open("t2.pk1", "rb")

In [51]:
tagger = load(input)

In [52]:
input.close()

In [57]:
text = "Chapman University was founded in 1861 in California."

In [58]:
tokens = nltk.word_tokenize(text)

In [59]:
tagger.tag(tokens)

[('Chapman', 'NP'),
 ('University', 'NN-TL'),
 ('was', 'BEDZ'),
 ('founded', 'VBN'),
 ('in', 'IN'),
 ('1861', 'NN'),
 ('in', 'IN'),
 ('California', 'NP'),
 ('.', '.')]

In [61]:
test_tags = [tag for sent in brown.sents(categories= "humor")
             for (word,tag) in t2.tag(sent)]

In [62]:
gold_tags = [tag for (word, tag) in brown.tagged_words(categories= "humor")]

In [63]:
f = open("confusionmatrix.txt", "w")
f.write((str(nltk.ConfusionMatrix(gold_tags, test_tags))))
f.close()