# Performance
## Import libraries

In [1]:
import nltk                                                                                #import nltk library

In [2]:
from nltk.corpus import brown                                                              #import corpus

## 1. Editorial

In [3]:
editorial_tagged_sents = brown.tagged_sents(categories='editorial')  # editorial sentences with their tags; the reference the tagger is scored against below

In [4]:
editorial_sents = brown.sents(categories='editorial')  # the editorial sentences without tags

In [5]:
editorial_fd = nltk.FreqDist(brown.words(categories='editorial'))  # frequency distribution over the words of the editorial category

In [6]:
editorial_cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='editorial'))  # conditional frequency distribution: one tag distribution per word

In [8]:
most_freq_words = editorial_fd.most_common(100)  # the 100 most frequent editorial words, as (word, count) pairs

In [10]:
# For each of the most frequent words, keep the single tag it is seen with most often.
likely_tags = {word: editorial_cfd[word].max() for word, _count in most_freq_words}

In [11]:
baseline_tagger = nltk.UnigramTagger(model=likely_tags)  # unigram tagger driven by the word -> tag lookup table, with no backoff

In [12]:
baseline_tagger.evaluate(editorial_tagged_sents)  # score the tagger on the editorial sentences the lookup table was built from

0.4754236737874164

In [13]:
sent = editorial_sents[6]  # the sentence at index 6 of the editorial category

In [14]:
baseline_tagger.tag(sent)  # tag the sentence; words missing from the lookup table come back as None

[('This', 'DT'),
 ('session', None),
 (',', ','),
 ('for', 'IN'),
 ('instance', None),
 (',', ','),
 ('may', 'MD'),
 ('have', 'HV'),
 ('insured', None),
 ('a', 'AT'),
 ('financial', None),
 ('crisis', None),
 ('two', None),
 ('years', 'NNS'),
 ('from', 'IN'),
 ('now', 'RB'),
 ('.', '.')]

In [15]:
# Same lookup table as before, but words outside it now back off to
# nltk.DefaultTagger('NN') instead of being tagged None.
baseline_tagger = nltk.UnigramTagger(
    model=likely_tags,
    backoff=nltk.DefaultTagger('NN'),
)

In [16]:
def performance(cfd, wordlist, category='editorial'):
    """Score a lookup tagger built from ``wordlist`` against a Brown category.

    Each word in ``wordlist`` is mapped to its most frequent tag according to
    ``cfd``; any other word backs off to ``nltk.DefaultTagger('NN')``.

    Args:
        cfd: conditional frequency distribution indexed by word, as built with
            ``nltk.ConditionalFreqDist`` over tagged words.
        wordlist: words to include in the lookup model.
        category: Brown corpus category whose tagged sentences are used as the
            reference. Defaults to ``'editorial'``.

    Returns:
        The score reported by the tagger for the tagged sentences of
        ``category``.
    """
    lookup = {word: cfd[word].max() for word in wordlist}
    tagger = nltk.UnigramTagger(model=lookup, backoff=nltk.DefaultTagger('NN'))
    gold = brown.tagged_sents(categories=category)
    # Newer NLTK releases name this metric `accuracy` and deprecate `evaluate`;
    # older releases only provide `evaluate`.
    score = getattr(tagger, 'accuracy', None)
    if score is None:
        score = tagger.evaluate
    return score(gold)


def display(category='editorial'):
    """Plot lookup-tagger performance against model size for a Brown category.

    Models are built from the 1, 2, 4, ... most frequent words of ``category``
    and each one is scored with ``performance`` on that same category.

    Args:
        category: Brown corpus category to build and score the models on.
            Defaults to ``'editorial'``.
    """
    import matplotlib.pyplot as plt  # local import, as the original did with pylab

    ranked = nltk.FreqDist(brown.words(categories=category)).most_common()
    words_by_freq = [word for word, _count in ranked]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories=category))

    # Powers of two up to 2**14, capped at the vocabulary size so the x-axis
    # reports the number of words actually in each model. The set removes the
    # duplicate sizes that capping can produce.
    vocab_size = len(words_by_freq)
    sizes = sorted({min(2 ** exponent, vocab_size) for exponent in range(15)})
    perfs = [performance(cfd, words_by_freq[:size], category=category) for size in sizes]

    plt.plot(sizes, perfs, '-bo')
    plt.title('Lookup Tagger Performance with Varying Model Size for ' + category)
    plt.xlabel('Model Size')
    plt.ylabel('Performance')
    plt.show()

In [17]:
display()  # plot tagger performance against model size for the editorial category

## 2. News

In [5]:
# Corpus views for the Brown 'news' category: sentences with and without tags.
news_tagged_sents = brown.tagged_sents(categories='news')
news_sents = brown.sents(categories='news')
# Word frequencies, and a conditional frequency distribution (one tag distribution per word).
news_fd = nltk.FreqDist(brown.words(categories='news'))
news_cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# Lookup table: each of the 100 most frequent words -> the tag it is seen with most often.
most_freq_words = news_fd.most_common(100)
likely_tags = {word: news_cfd[word].max() for word, _count in most_freq_words}
# Unigram tagger driven by that table only; no backoff is configured here.
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

In [7]:
baseline_tagger.evaluate(news_tagged_sents)  # score the tagger on the news sentences the lookup table was built from

0.45578495136941344

In [8]:
sent = news_sents[4]  # the sentence at index 4 of the news category

In [9]:
baseline_tagger.tag(sent)  # tag the sentence; words missing from the lookup table come back as None

[('The', 'AT'),
 ('jury', None),
 ('said', 'VBD'),
 ('it', 'PPS'),
 ('did', None),
 ('find', None),
 ('that', 'CS'),
 ('many', None),
 ('of', 'IN'),
 ("Georgia's", None),
 ('registration', None),
 ('and', 'CC'),
 ('election', None),
 ('laws', None),
 ('``', '``'),
 ('are', 'BER'),
 ('outmoded', None),
 ('or', 'CC'),
 ('inadequate', None),
 ('and', 'CC'),
 ('often', None),
 ('ambiguous', None),
 ("''", "''"),
 ('.', '.')]

In [10]:
# Same lookup table as before, but words outside it now back off to
# nltk.DefaultTagger('NN') instead of being tagged None.
baseline_tagger = nltk.UnigramTagger(
    model=likely_tags,
    backoff=nltk.DefaultTagger('NN'),
)

In [11]:
def performance(cfd, wordlist, category='news'):
    """Score a lookup tagger built from ``wordlist`` against a Brown category.

    Each word in ``wordlist`` is mapped to its most frequent tag according to
    ``cfd``; any other word backs off to ``nltk.DefaultTagger('NN')``.

    Args:
        cfd: conditional frequency distribution indexed by word, as built with
            ``nltk.ConditionalFreqDist`` over tagged words.
        wordlist: words to include in the lookup model.
        category: Brown corpus category whose tagged sentences are used as the
            reference. Defaults to ``'news'``.

    Returns:
        The score reported by the tagger for the tagged sentences of
        ``category``.
    """
    lookup = {word: cfd[word].max() for word in wordlist}
    tagger = nltk.UnigramTagger(model=lookup, backoff=nltk.DefaultTagger('NN'))
    gold = brown.tagged_sents(categories=category)
    # Newer NLTK releases name this metric `accuracy` and deprecate `evaluate`;
    # older releases only provide `evaluate`.
    score = getattr(tagger, 'accuracy', None)
    if score is None:
        score = tagger.evaluate
    return score(gold)


def display(category='news'):
    """Plot lookup-tagger performance against model size for a Brown category.

    Models are built from the 1, 2, 4, ... most frequent words of ``category``
    and each one is scored with ``performance`` on that same category.

    Args:
        category: Brown corpus category to build and score the models on.
            Defaults to ``'news'``.
    """
    import matplotlib.pyplot as plt  # local import, as the original did with pylab

    ranked = nltk.FreqDist(brown.words(categories=category)).most_common()
    words_by_freq = [word for word, _count in ranked]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories=category))

    # Powers of two up to 2**14, capped at the vocabulary size so the x-axis
    # reports the number of words actually in each model. The set removes the
    # duplicate sizes that capping can produce.
    vocab_size = len(words_by_freq)
    sizes = sorted({min(2 ** exponent, vocab_size) for exponent in range(15)})
    perfs = [performance(cfd, words_by_freq[:size], category=category) for size in sizes]

    plt.plot(sizes, perfs, '-bo')
    plt.title('Lookup Tagger Performance with Varying Model Size for ' + category)
    plt.xlabel('Model Size')
    plt.ylabel('Performance')
    plt.show()

In [12]:
display()  # plot tagger performance against model size for the news category

## 3. Reviews

In [31]:
# Corpus views for the Brown 'reviews' category: sentences with and without tags.
reviews_tagged_sents = brown.tagged_sents(categories='reviews')
reviews_sents = brown.sents(categories='reviews')
# Word frequencies, and a conditional frequency distribution (one tag distribution per word).
reviews_fd = nltk.FreqDist(brown.words(categories='reviews'))
reviews_cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='reviews'))
# Lookup table: each of the 100 most frequent words -> the tag it is seen with most often.
most_freq_words = reviews_fd.most_common(100)
likely_tags = {word: reviews_cfd[word].max() for word, _count in most_freq_words}
# Unigram tagger driven by that table only; no backoff is configured here.
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

In [32]:
baseline_tagger.evaluate(reviews_tagged_sents)  # score the tagger on the reviews sentences the lookup table was built from

0.47494103773584906

In [33]:
sent = reviews_sents[4]  # the sentence at index 4 of the reviews category

In [34]:
baseline_tagger.tag(sent)  # tag the sentence; words missing from the lookup table come back as None

[('Those', None),
 ('would', 'MD'),
 ('be', 'BE'),
 ('reserved', None),
 ('for', 'IN'),
 ('the', 'AT'),
 ("orchestra's", None),
 ('great', 'JJ'),
 ('nights', None),
 ('when', 'WRB'),
 ('the', 'AT'),
 ('soloist', None),
 ('can', 'MD'),
 ('surpass', None),
 ('himself', None),
 ('.', '.')]

In [35]:
# Same lookup table as before, but words outside it now back off to
# nltk.DefaultTagger('NN') instead of being tagged None.
baseline_tagger = nltk.UnigramTagger(
    model=likely_tags,
    backoff=nltk.DefaultTagger('NN'),
)

In [36]:
def performance(cfd, wordlist, category='reviews'):
    """Score a lookup tagger built from ``wordlist`` against a Brown category.

    Each word in ``wordlist`` is mapped to its most frequent tag according to
    ``cfd``; any other word backs off to ``nltk.DefaultTagger('NN')``.

    Args:
        cfd: conditional frequency distribution indexed by word, as built with
            ``nltk.ConditionalFreqDist`` over tagged words.
        wordlist: words to include in the lookup model.
        category: Brown corpus category whose tagged sentences are used as the
            reference. Defaults to ``'reviews'``.

    Returns:
        The score reported by the tagger for the tagged sentences of
        ``category``.
    """
    lookup = {word: cfd[word].max() for word in wordlist}
    tagger = nltk.UnigramTagger(model=lookup, backoff=nltk.DefaultTagger('NN'))
    gold = brown.tagged_sents(categories=category)
    # Newer NLTK releases name this metric `accuracy` and deprecate `evaluate`;
    # older releases only provide `evaluate`.
    score = getattr(tagger, 'accuracy', None)
    if score is None:
        score = tagger.evaluate
    return score(gold)


def display(category='reviews'):
    """Plot lookup-tagger performance against model size for a Brown category.

    Models are built from the 1, 2, 4, ... most frequent words of ``category``
    and each one is scored with ``performance`` on that same category.

    Args:
        category: Brown corpus category to build and score the models on.
            Defaults to ``'reviews'``.
    """
    import matplotlib.pyplot as plt  # local import, as the original did with pylab

    ranked = nltk.FreqDist(brown.words(categories=category)).most_common()
    words_by_freq = [word for word, _count in ranked]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories=category))

    # Powers of two up to 2**14, capped at the vocabulary size so the x-axis
    # reports the number of words actually in each model. The set removes the
    # duplicate sizes that capping can produce.
    vocab_size = len(words_by_freq)
    sizes = sorted({min(2 ** exponent, vocab_size) for exponent in range(15)})
    perfs = [performance(cfd, words_by_freq[:size], category=category) for size in sizes]

    plt.plot(sizes, perfs, '-bo')
    plt.title('Lookup Tagger Performance with Varying Model Size for ' + category)
    plt.xlabel('Model Size')
    plt.ylabel('Performance')
    plt.show()

In [37]:
display()  # plot tagger performance against model size for the reviews category

## 4. Government

In [38]:
# Corpus views for the Brown 'government' category: sentences with and without tags.
government_tagged_sents = brown.tagged_sents(categories='government')
government_sents = brown.sents(categories='government')
# Word frequencies, and a conditional frequency distribution (one tag distribution per word).
government_fd = nltk.FreqDist(brown.words(categories='government'))
government_cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='government'))
# Lookup table: each of the 100 most frequent words -> the tag it is seen with most often.
most_freq_words = government_fd.most_common(100)
likely_tags = {word: government_cfd[word].max() for word, _count in most_freq_words}
# Unigram tagger driven by that table only; no backoff is configured here.
baseline_tagger = nltk.UnigramTagger(model=likely_tags)

In [39]:
baseline_tagger.evaluate(government_tagged_sents)  # score the tagger on the government sentences the lookup table was built from

0.4654791277436285

In [40]:
sent = government_sents[4]  # the sentence at index 4 of the government category

In [41]:
baseline_tagger.tag(sent)  # tag the sentence; words missing from the lookup table come back as None

[('For', None),
 ('further', None),
 ('information', None),
 ('contact', None),
 ('Director', None),
 (',', ','),
 ('Office', None),
 ('of', 'IN'),
 ('Business', None),
 ('Economics', None),
 (',', ','),
 ('U.S.', None),
 ('Department', 'NN-TL'),
 ('of', 'IN'),
 ('Commerce', None),
 (',', ','),
 ('Washington', None),
 ('25', None),
 (',', ','),
 ('D.C.', None),
 ('.', '.')]

In [42]:
# Same lookup table as before, but words outside it now back off to
# nltk.DefaultTagger('NN') instead of being tagged None.
baseline_tagger = nltk.UnigramTagger(
    model=likely_tags,
    backoff=nltk.DefaultTagger('NN'),
)

In [43]:
def performance(cfd, wordlist, category='government'):
    """Score a lookup tagger built from ``wordlist`` against a Brown category.

    Each word in ``wordlist`` is mapped to its most frequent tag according to
    ``cfd``; any other word backs off to ``nltk.DefaultTagger('NN')``.

    Args:
        cfd: conditional frequency distribution indexed by word, as built with
            ``nltk.ConditionalFreqDist`` over tagged words.
        wordlist: words to include in the lookup model.
        category: Brown corpus category whose tagged sentences are used as the
            reference. Defaults to ``'government'``.

    Returns:
        The score reported by the tagger for the tagged sentences of
        ``category``.
    """
    lookup = {word: cfd[word].max() for word in wordlist}
    tagger = nltk.UnigramTagger(model=lookup, backoff=nltk.DefaultTagger('NN'))
    gold = brown.tagged_sents(categories=category)
    # Newer NLTK releases name this metric `accuracy` and deprecate `evaluate`;
    # older releases only provide `evaluate`.
    score = getattr(tagger, 'accuracy', None)
    if score is None:
        score = tagger.evaluate
    return score(gold)


def display(category='government'):
    """Plot lookup-tagger performance against model size for a Brown category.

    Models are built from the 1, 2, 4, ... most frequent words of ``category``
    and each one is scored with ``performance`` on that same category.

    Args:
        category: Brown corpus category to build and score the models on.
            Defaults to ``'government'``.
    """
    import matplotlib.pyplot as plt  # local import, as the original did with pylab

    ranked = nltk.FreqDist(brown.words(categories=category)).most_common()
    words_by_freq = [word for word, _count in ranked]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories=category))

    # Powers of two up to 2**14, capped at the vocabulary size so the x-axis
    # reports the number of words actually in each model. The set removes the
    # duplicate sizes that capping can produce.
    vocab_size = len(words_by_freq)
    sizes = sorted({min(2 ** exponent, vocab_size) for exponent in range(15)})
    perfs = [performance(cfd, words_by_freq[:size], category=category) for size in sizes]

    plt.plot(sizes, perfs, '-bo')
    plt.title('Lookup Tagger Performance with Varying Model Size for ' + category)
    plt.xlabel('Model Size')
    plt.ylabel('Performance')
    plt.show()

In [44]:
display()  # plot tagger performance against model size for the government category

# End
## Reference:
### Bird, S., Klein, E. and Loper, E. (2009). Natural Language Processing with Python, O'Reilly Media, Inc.