# Importing Misc. Corpora

In [3]:
import nltk
# Fetch the NLTK data packages used in this notebook: the CoNLL-2000 chunking
# corpus, plus 'punkt' and 'averaged_perceptron_tagger' (presumably needed by
# the word_tokenize / pos_tag calls in the Testing section).
# The log output shows that already-installed packages are reported as up to date.
nltk.download('conll2000')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\William\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\William\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\William\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Observing a Chunked Corpus

In [5]:
from nltk.corpus import conll2000

# Take one pre-chunked sentence (index 3) from the corpus and print its tree,
# to show what the gold-standard chunk annotation looks like.
chunked_sentence = conll2000.chunked_sents()[3]
print(chunked_sentence)

(S
  (NP This/DT)
  (VP has/VBZ increased/VBN)
  (NP the/DT risk/NN)
  (PP of/IN)
  (NP the/DT government/NN)
  (VP being/VBG forced/VBN to/TO increase/VB)
  (NP base/NN rates/NNS)
  (PP to/TO)
  (NP 16/CD %/NN)
  (PP from/IN)
  (NP their/PRP$ current/JJ 15/CD %/NN level/NN)
  (VP to/TO defend/VB)
  (NP the/DT pound/NN)
  ,/,
  (NP economists/NNS)
  and/CC
  (NP foreign/JJ exchange/NN market/NN analysts/NNS)
  (VP say/VBP)
  ./.)


# Training A Chunker
## POS Tagging Based

In [6]:
import random
from nltk.chunk import conlltags2tree, tree2conlltags
 
# Shuffle the chunked sentences and carve out a 90/10 train/test split.
# A dedicated, seeded RNG keeps the split (and therefore the scores reported
# below) reproducible across kernel restarts without touching the global
# `random` state.
SPLIT_SEED = 42
shuffled_conll_sents = list(conll2000.chunked_sents())
random.Random(SPLIT_SEED).shuffle(shuffled_conll_sents)

# Compute the boundary once and use it for both slices, so every sentence ends
# up in exactly one of the two sets and none is skipped at the boundary.
split_index = int(len(shuffled_conll_sents) * 0.9)
train_sents = shuffled_conll_sents[:split_index]
test_sents = shuffled_conll_sents[split_index:]

In [7]:
from nltk import ChunkParserI, TrigramTagger
 
 
class TrigramChunkParser(ChunkParserI):
    """Chunker that predicts IOB chunk tags from the POS-tag sequence alone.

    A ``TrigramTagger`` is trained on (POS tag, IOB chunk tag) pairs, so the
    words themselves play no part in the prediction.
    """

    def __init__(self, train_sents):
        """
        `train_sents` = chunk trees; each one is flattened with `tree2conlltags`
        """
        train_data = []
        for tree in train_sents:
            # Drop the word and keep only the (POS-TAG, IOB-CHUNK-TAG) pairs
            train_data.append(
                [(pos, iob) for _word, pos, iob in tree2conlltags(tree)])

        self.tagger = TrigramTagger(train_data)

    def parse(self, sentence):
        """
        `sentence` = a POS-tagged sentence [(w1, t1), ...]

        Returns the chunk tree built by `conlltags2tree`.
        """
        # Predict one chunk tag per POS tag
        predicted = self.tagger.tag([pos for _word, pos in sentence])

        # Re-attach the words to get (word, pos, chunk) triplets
        triplets = []
        for (word, _), (pos, iob) in zip(sentence, predicted):
            triplets.append((word, pos, iob))

        return conlltags2tree(triplets)
 
 
# Train on the training split, then print the chunking scores on the held-out
# split (IOB accuracy, precision, recall, F-measure — see the output below).
# NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour of
# accuracy() — confirm against the installed NLTK version.
trigram_chunker = TrigramChunkParser(train_sents)
print(trigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.6%%
    Precision:     80.9%%
    Recall:        84.1%%
    F-Measure:     82.5%%


## Classifier Based

In [8]:
import pickle
from collections.abc import Iterable
from nltk import ChunkParserI, ClassifierBasedTagger
from nltk.stem.snowball import SnowballStemmer
 
# Stemmer instances keyed by language, filled lazily by `_get_stemmer`.
_STEMMERS = {}


def _get_stemmer(language='english'):
    """Return a shared SnowballStemmer for `language`, creating it on first use."""
    if language not in _STEMMERS:
        _STEMMERS[language] = SnowballStemmer(language)
    return _STEMMERS[language]


def features(tokens, index, history):
    """
    Build the feature dict for one token of a POS-tagged sentence.

    `tokens`  = a POS-tagged sentence [(w1, t1), ...]
    `index`   = the index of the token we want to extract features for
    `history` = the previous predicted IOB tags (accepted because the tagger
                interface passes it, but not used by any feature here)

    Returns a dict with the word, its stem and POS tag, plus the word/POS of
    the two tokens on either side (placeholders are used past the sentence
    boundaries).
    """
    # Reuse one stemmer: this function runs once per token, and building a new
    # stemmer on every call is pure overhead.
    stemmer = _get_stemmer('english')

    # Pad the sequence with placeholders so the +/-2 context lookups below
    # never fall outside the list
    padded = ([('__START2__', '__START2__'), ('__START1__', '__START1__')]
              + list(tokens)
              + [('__END1__', '__END1__'), ('__END2__', '__END2__')])

    # Shift the index by 2 to account for the two leading placeholders
    i = index + 2

    word, pos = padded[i]
    prevword, prevpos = padded[i - 1]
    prevprevword, prevprevpos = padded[i - 2]
    nextword, nextpos = padded[i + 1]
    nextnextword, nextnextpos = padded[i + 2]

    return {
        'word': word,
        # The key says 'lemma' but the value is a Snowball stem
        'lemma': stemmer.stem(word),
        'pos': pos,

        'next-word': nextword,
        'next-pos': nextpos,

        'next-next-word': nextnextword,
        # Key spelled without hyphens, unlike its siblings; kept as-is so the
        # feature names stay the same for already-trained models
        'nextnextpos': nextnextpos,

        'prev-word': prevword,
        'prev-pos': prevpos,

        'prev-prev-word': prevprevword,
        'prev-prev-pos': prevprevpos,
    }
 
 
class ClassifierChunkParser(ChunkParserI):
    """Chunker backed by a `ClassifierBasedTagger` that uses `features`."""

    def __init__(self, chunked_sents, **kwargs):
        """
        `chunked_sents` = iterable of chunk trees used as training data
        `kwargs`        = passed through unchanged to `ClassifierBasedTagger`
        """
        assert isinstance(chunked_sents, Iterable)

        # Tree -> IOB triplets [(word, pos, chunk), ...] -> pairs in the shape
        # the tagger interface expects [((word, pos), chunk), ...]
        training_pairs = [
            [((word, pos), chunk) for word, pos, chunk in tree2conlltags(tree)]
            for tree in chunked_sents
        ]

        self.feature_detector = features
        self.tagger = ClassifierBasedTagger(
            train=training_pairs,
            feature_detector=features,
            **kwargs)

    def parse(self, tagged_sent):
        """
        `tagged_sent` = a POS-tagged sentence [(w1, t1), ...]

        Returns the chunk tree built by `conlltags2tree`.
        """
        # The tagger answers with [((w1, t1), iob1), ...]
        tagged = self.tagger.tag(tagged_sent)

        # Flatten into the triplets format [(w1, t1, iob1), ...]
        iob_triplets = []
        for (word, pos), iob in tagged:
            iob_triplets.append((word, pos, iob))

        # Convert the triplets into an nltk.Tree
        return conlltags2tree(iob_triplets)
 
# Train the classifier-based chunker on the same training split and print its
# scores on the held-out split, for comparison with the trigram chunker above.
# NOTE(review): newer NLTK releases appear to deprecate evaluate() in favour of
# accuracy() — confirm against the installed NLTK version.
classifier_chunker = ClassifierChunkParser(train_sents)
print (classifier_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.2%%
    Precision:     87.8%%
    Recall:        91.1%%
    F-Measure:     89.5%%


# Testing

In [14]:
from nltk import word_tokenize, pos_tag
 
# Chunk a hand-written sample sentence: tokenize it, POS-tag it, then parse it
# with the classifier-based chunker and print the resulting tree.
print (classifier_chunker.parse(pos_tag(word_tokenize("Hello world! My name is William and I am typing on the computer."))))
 

(S
  (NP Hello/NNP world/NN)
  !/.
  (NP My/PRP$ name/NN)
  (VP is/VBZ)
  (NP William/NNP)
  and/CC
  (NP I/PRP)
  (VP am/VBP typing/VBG)
  (PP on/IN)
  (NP the/DT computer/NN)
  ./.)


# spaCy

In [3]:
import spacy

In [5]:
# Tokenize a string containing a run of spaces and print every token wrapped
# in double quotes, so that whitespace tokens are visible in the output.
nlp = spacy.load("en_core_web_sm")
doc = nlp('Hello     World!')
for token in doc:
    print('"{}"'.format(token.text))

"Hello"
"    "
"World"
"!"


In [22]:
# Noun chunks for two English sentences: for each chunk print its text, its
# label and the text of its root token. The two listings are separated by
# print("\n").
nlp = spacy.load("en_core_web_sm")
doc = nlp("Wall Street Journal just published an interesting piece on crypto currencies")
doc2 = nlp("My name is William and I am typing on the computer.")

for position, parsed in enumerate((doc, doc2)):
    if position:
        print("\n")
    for chunk in parsed.noun_chunks:
        print(chunk.text, chunk.label_, chunk.root.text)

Wall Street Journal NP Journal
an interesting piece NP piece
crypto currencies NP currencies


My name NP name
William NP William
I NP I
the computer NP computer


In [23]:
# Same noun-chunk listing as for English, but with the Spanish model.
nlp = spacy.load("es_core_news_sm")

# The accented character is written as real UTF-8 text ("criptográficas");
# a mis-decoded byte sequence here would hand the model a garbled word.
doc = nlp("El Wall Street Journal acaba de publicar una interesante pieza sobre monedas criptográficas")

# NOTE(review): "Me llamo es William" is not idiomatic Spanish ("Me llamo
# William" / "Mi nombre es William"); left unchanged because it is the input
# the recorded output was produced from.
doc2 = nlp("Me llamo es William, y yo estoy tecleando en la computadora.")

# For each chunk: its text, its label and the text of its root token
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)

print("\n")

for chunk in doc2.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)

El Wall Street Journal NP Wall
una interesante pieza NP pieza
monedas NP monedas


Me NP Me
William NP William
yo NP yo
la computadora NP computadora


NameError: name 'textacy' is not defined

# POS Tagging

In [11]:
# Print one row per token with its text, lemma, coarse POS, fine-grained tag,
# dependency label, shape, and the is_alpha / is_stop flags.
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for token in doc:
    row = (token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
           token.shape_, token.is_alpha, token.is_stop)
    print(*row)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False
