In [245]:
import nltk
from nltk import word_tokenize
from nltk import FreqDist # need this to access "most_common" method

import urllib2

from IPython.display import Image

import re

import random

import numpy as np

from __future__  import division

### 2

In [246]:
pattern = 'NNS: {<JJ|CD|DT>+<NNS>+}'
cp = nltk.RegexpParser(pattern)

In [247]:
sentence = '''
Three men are caught drunk driving. Two of them have four beers.

'''
words = word_tokenize(sentence)
words

['Three',
 'men',
 'are',
 'caught',
 'drunk',
 'driving',
 '.',
 'Two',
 'of',
 'them',
 'have',
 'four',
 'beers',
 '.']

In [248]:
word_tags = nltk.pos_tag(words)
word_tags

[('Three', 'CD'),
 ('men', 'NNS'),
 ('are', 'VBP'),
 ('caught', 'VBN'),
 ('drunk', 'JJ'),
 ('driving', 'NN'),
 ('.', '.'),
 ('Two', 'CD'),
 ('of', 'IN'),
 ('them', 'PRP'),
 ('have', 'VBP'),
 ('four', 'CD'),
 ('beers', 'NNS'),
 ('.', '.')]

In [249]:
result = cp.parse(word_tags)
print(result)

(S
  (NNS Three/CD men/NNS)
  are/VBP
  caught/VBN
  drunk/JJ
  driving/NN
  ./.
  Two/CD
  of/IN
  them/PRP
  have/VBP
  (NNS four/CD beers/NNS)
  ./.)


### 4

In [250]:
pattern = '''
NP: 
   {<.*>+}    # chunk everything
   }<DT|RP>+{ # chink sequences of DT or RP

'''

Let's use one of brown corpus' sentence to test:

In [251]:
sent = nltk.corpus.brown.sents()
sent[0]

[u'The',
 u'Fulton',
 u'County',
 u'Grand',
 u'Jury',
 u'said',
 u'Friday',
 u'an',
 u'investigation',
 u'of',
 u"Atlanta's",
 u'recent',
 u'primary',
 u'election',
 u'produced',
 u'``',
 u'no',
 u'evidence',
 u"''",
 u'that',
 u'any',
 u'irregularities',
 u'took',
 u'place',
 u'.']

In [252]:
word_tags = nltk.pos_tag(sent[0])
word_tags

[(u'The', 'DT'),
 (u'Fulton', 'NNP'),
 (u'County', 'NNP'),
 (u'Grand', 'NNP'),
 (u'Jury', 'NNP'),
 (u'said', 'VBD'),
 (u'Friday', 'NNP'),
 (u'an', 'DT'),
 (u'investigation', 'NN'),
 (u'of', 'IN'),
 (u"Atlanta's", 'NNP'),
 (u'recent', 'JJ'),
 (u'primary', 'JJ'),
 (u'election', 'NN'),
 (u'produced', 'VBD'),
 (u'``', '``'),
 (u'no', 'DT'),
 (u'evidence', 'NN'),
 (u"''", "''"),
 (u'that', 'IN'),
 (u'any', 'DT'),
 (u'irregularities', 'NNS'),
 (u'took', 'VBD'),
 (u'place', 'NN'),
 (u'.', '.')]

In [253]:
cp = nltk.RegexpParser(pattern)
print (cp.parse(word_tags))

(S
  The/DT
  (NP Fulton/NNP County/NNP Grand/NNP Jury/NNP said/VBD Friday/NNP)
  an/DT
  (NP
    investigation/NN
    of/IN
    Atlanta's/NNP
    recent/JJ
    primary/JJ
    election/NN
    produced/VBD
    ``/``)
  no/DT
  (NP evidence/NN ''/'' that/IN)
  any/DT
  (NP irregularities/NNS took/VBD place/NN ./.))


The "an/DT", "no/DT", "any/DT" are chinked out using the pattern we assigned.

### 5

In [254]:
gerund_patterns = '''
GERUNDS: 
       {<DT>*<VB.?>+<DT>?<NNS>+}    # chunk everything
       {<NN>*<VB.?>+<DT>?<NNS>+} # chink sequences of DT or RP

'''

In [255]:
sent = nltk.corpus.brown.sents()
# word_tags = nltk.pos_tag(sent)
word_tags = nltk.pos_tag(sent[6])
cp = nltk.RegexpParser(gerund_patterns)
print (cp.parse(word_tags))

(S
  The/DT
  grand/JJ
  jury/NN
  commented/VBD
  on/IN
  a/DT
  number/NN
  of/IN
  other/JJ
  topics/NNS
  ,/,
  among/IN
  them/PRP
  the/DT
  Atlanta/NNP
  and/CC
  Fulton/NNP
  County/NNP
  purchasing/NN
  departments/NNS
  which/WDT
  it/PRP
  said/VBD
  ``/``
  are/VBP
  well/RB
  operated/VBN
  and/CC
  follow/VB
  generally/RB
  (GERUNDS accepted/VBN practices/NNS)
  which/WDT
  inure/VBP
  to/TO
  the/DT
  best/JJS
  interest/NN
  of/IN
  both/DT
  governments/NNS
  ''/''
  ./.)


In [256]:
sentence_1 = 'Can you help fixing the issues?'
word_tags = nltk.pos_tag(word_tokenize(sentence_1))
cp = nltk.RegexpParser(gerund_patterns)
print (cp.parse(word_tags))

(S Can/MD you/PRP (GERUNDS help/VB fixing/VBG the/DT issues/NNS) ?/.)


### 6

In [257]:
# the following patterns handle some coordinated noun phrases:

cnp = '''
SENARIO_1: {<NN.*>+<CC>*<NN.*>+}
SENARIO_2: {<DT><PRP$><NNS><CC><NNS>}
SENARIO_3: {<NN.*>*<CC>*<NN.*>+}
'''

In [258]:
cp = nltk.RegexpParser(cnp)

In [259]:
sentence = 'July and August. All your managers and supervisors. Company courts and adjudicators.'

In [260]:
word_tags = nltk.pos_tag(word_tokenize(sentence))
print (cp.parse(word_tags))

(S
  (SENARIO_1 July/NNP and/CC August/NNP)
  ./.
  All/DT
  your/PRP$
  (SENARIO_1 managers/NNS and/CC supervisors/NNS)
  ./.
  (SENARIO_1 Company/NN courts/NNS and/CC adjudicators/NNS)
  ./.)


### 7

You can get the chunked corpus here:

test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

#### 7.a

In [261]:
from nltk.corpus import conll2000
pattern = 'NNS: {<[NPJ].*>+}'
cp = nltk.RegexpParser(pattern)
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print (cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  42.1%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


**Let's try UnigramChunker**

In [262]:
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents): 
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence): 
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [263]:
unigram_chunker = UnigramChunker(train_sents)
print (unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


#### BigramChunker

In [264]:
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents): 
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence): 
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [265]:
bigram_chunker = BigramChunker(test_sents)
print (bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.4%%
    Precision:     82.6%%
    Recall:        87.5%%
    F-Measure:     85.0%%


#### TrigramChunker

In [266]:
class TrigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents): 
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.TrigramTagger(train_data)

    def parse(self, sentence): 
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [267]:
trigram_chunker = TrigramChunker(train_sents)
print (trigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.5%%
    Recall:        86.8%%
    F-Measure:     84.6%%


### 7.b

In [268]:
from nltk import ChunkScore

In [269]:
chunkscore = nltk.chunk.ChunkScore()

In [270]:
for correct in test_sents:   
    guess = trigram_chunker.parse(correct.leaves())   
    chunkscore.score(correct, guess)   
print('F Measure:', chunkscore.f_measure()) 
print('Chunkscore:', chunkscore) 

('F Measure:', 0.8458218909376227)
('Chunkscore:', <ChunkScoring of 12422 chunks>)


In [271]:
chunkscore.missed()[:5]

[ImmutableTree('NP', [(u'quite', u'RB'), (u'a', u'DT'), (u'bit', u'NN')]),
 ImmutableTree('NP', [(u'so-called', u'JJ'), (u'analog', u'NN'), (u'integrated', u'VBN'), (u'circuits', u'NNS')]),
 ImmutableTree('NP', [(u'certain', u'JJ'), (u'Santa', u'NNP'), (u'Monica', u'NNP'), (u'Mountain', u'NNP'), (u'trails', u'NNS')]),
 ImmutableTree('NP', [(u'next', u'JJ'), (u'May', u'NNP')]),
 ImmutableTree('NP', [(u'``', u'``'), (u'chinless', u'JJ'), (u"''", u"''"), (u'Dan', u'NNP'), (u'Shaughnessy', u'NNP')])]

In [272]:
chunkscore.incorrect()[:5]

[ImmutableTree('NP', [(u'Business', u'NNP'), (u'Channel', u'NNP'), (u'cable', u'NN'), (u'network', u'NN')]),
 ImmutableTree('NP', [(u'different', u'JJ'), (u'kinds', u'NNS')]),
 ImmutableTree('NP', [(u'comfortable', u'JJ')]),
 ImmutableTree('NP', [(u'unclassified', u'JJ'), (u'message', u'NN')]),
 ImmutableTree('NP', [(u'Michael', u'NNP'), (u'``', u'``'), (u'Pee', u'NNP'), (u'Wee', u'NNP')])]

### 8