In [8]:
import nltk

from nltk.corpus import conll2000


##########################################################################
# Exercise 1: Chunking
##########################################################################

# Show one example sentence so the shape of a chunk tree is visible.
print(conll2000.chunked_sents('train.txt')[99])

sents = conll2000.chunked_sents('train.txt')

# Corpus-wide totals.
length = 0  # total number of tokens over all sentences
s_num = 0   # number of sentences

# Per chunk type: *_label counts the chunks, *_len sums their child counts
# (the two are divided further down to get the average chunk length).
np_label = 0
vp_label = 0
pp_label = 0

np_len = 0
vp_len = 0
pp_len = 0

for sent in sents:
    length += len(sent.leaves())  # one leaf per (token, tag) pair
    s_num += 1
    for subtree in sent.subtrees():
        label = subtree.label()
        # Compare labels exactly. A suffix test such as endswith("VP") also
        # matches "ADVP", so it would miscount as soon as the reader is asked
        # for more chunk types than NP/VP/PP.
        if label == "NP":
            np_label += 1
            np_len += len(subtree)
        elif label == "VP":
            vp_label += 1
            vp_len += len(subtree)
        elif label == "PP":
            pp_label += 1
            pp_len += len(subtree)


print(np_label, vp_label, pp_label)
""" Let's do some data exploration.
1. First, how many sentences are there? 

8936

2. How many NP chunks?
    55081
3. How many VP chunks?
    21467
4. How many PP chunks?
    21281
5. What is the average length of each?
    np : 2.15
    vp : 1.56
    pp : 1.01
    
"""

# Average chunk length per type = summed child counts / number of chunks.
np_av, vp_av, pp_av = (
    total / count
    for total, count in (
        (np_len, np_label),
        (vp_len, vp_label),
        (pp_len, pp_label),
    )
)

# Report in the fixed order NP, VP, PP, rounded to two decimals.
for average in (np_av, vp_av, pp_av):
    print(round(average, 2))

(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)
55081 21467 21281
2.15
1.56
1.01


In [4]:
import nltk
import numpy as np

from nltk.corpus import conll2000
from nltk import FreqDist
from collections import defaultdict


# labels:  subtree label -> number of subtrees carrying that label
# lenghts: subtree label -> list with the child count of every such subtree
labels = FreqDist()
lenghts = defaultdict(list)

sents = conll2000.chunked_sents('train.txt')

for sent in sents:
    # subtrees() yields every labelled node, so the sentence root is recorded
    # as well; it is simply never reported below.
    for subtree in sent.subtrees():
        label = subtree.label()
        labels[label] += 1
        lenghts[label].append(len(subtree))

print("# sents: {}".format(len(sents)))

# Only the three chunk types the exercise asks about are reported.
for label in ["NP", "VP", "PP"]:
    print(label)
    print("# chunks: {}".format(labels[label]))
    print("avg.length: {0:.1f}".format(np.mean(lenghts[label])))
    

# sents: 8936
NP
# chunks: 55081
avg.length: 2.1
VP
# chunks: 21467
avg.length: 1.6
PP
# chunks: 21281
avg.length: 1.0


In [None]:
##########################################################################
# Exercise 2: Unigram chunker
##########################################################################


"""
Now, let's concentrate only on NP chunking
1. Create a unigram chunker using the UnigramChunker class below.
Train on the train sentences and evaluate on the test sentences using
the evaluate method, i.e., my_model.evaluate(test_sents).


2. What is the F1 score?


"""


class UnigramChunker(nltk.ChunkParserI):
    """Chunker that picks a chunk tag for each token from its POS tag alone.

    A unigram tagger is trained on (POS tag, chunk tag) pairs read off the
    training trees; the words themselves are ignored.
    """

    def __init__(self, train_sents):
        """Train the underlying unigram tagger.

        train_sents: iterable of chunk trees accepted by
        nltk.chunk.tree2conlltags.
        """
        train_data = []
        for sent in train_sents:
            triples = nltk.chunk.tree2conlltags(sent)
            # Drop the word: the tagger learns POS tag -> chunk tag.
            train_data.append([(pos, chunk) for _word, pos, chunk in triples])
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk one POS-tagged sentence.

        sentence: list of (word, pos) pairs.
        Returns the tree built by nltk.chunk.conlltags2tree from the
        predicted chunk tags.
        """
        pos_sequence = [pos for _word, pos in sentence]
        # The tagger treats POS tags as its "words" and chunk tags as its "tags".
        predicted = self.tagger.tag(pos_sequence)
        conlltags = [
            (word, pos, chunktag)
            for (word, pos), (_pos, chunktag) in zip(sentence, predicted)
        ]
        return nltk.chunk.conlltags2tree(conlltags)


# Load both splits with NP as the only requested chunk type.
train_sents, test_sents = (
    conll2000.chunked_sents(fileid, chunk_types=['NP'])
    for fileid in ('train.txt', 'test.txt')
)

# Train on the training split, then print the evaluation on the test split.
unigram_chunker = UnigramChunker(train_sents)
chunk_score = unigram_chunker.evaluate(test_sents)
print(chunk_score)