
# TA Workshop: POS Tagging and Lemmatization


In [None]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
# Fetch the 'punkt' and 'averaged_perceptron_tagger' data packages that the
# tokenizing / POS-tagging cells below rely on.
# The return value of the last call is what the notebook shows as this cell's output.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# ===== POS Tagging and NER using NLTK =====

# Example sentence. Each trailing backslash is a line continuation, so the
# literal below is a single line of text.
sent = '''Professor Tan Eng Chye, NUS Deputy President and Provost, and Professor \
Menahem Ben-Sasson, President of HUJ signed the joint degree agreement at NUS, \
in the presence of Ambassador of Israel to Singapore Her Excellency Amira Arnon \
and about 30 invited guests, on July 03, 2013.'''

# Order of the steps: raw text -> tokens -> POS-tagged tokens.
# The NE chunker applied later expects POS-tagged tokens as its input.
sent_tokens = word_tokenize(sent)
sent_pos = pos_tag(sent_tokens)

# Display the (token, tag) pairs
sent_pos

[('Professor', 'NNP'),
 ('Tan', 'NNP'),
 ('Eng', 'NNP'),
 ('Chye', 'NNP'),
 (',', ','),
 ('NUS', 'NNP'),
 ('Deputy', 'NNP'),
 ('President', 'NNP'),
 ('and', 'CC'),
 ('Provost', 'NNP'),
 (',', ','),
 ('and', 'CC'),
 ('Professor', 'NNP'),
 ('Menahem', 'NNP'),
 ('Ben-Sasson', 'NNP'),
 (',', ','),
 ('President', 'NNP'),
 ('of', 'IN'),
 ('HUJ', 'NNP'),
 ('signed', 'VBD'),
 ('the', 'DT'),
 ('joint', 'JJ'),
 ('degree', 'NN'),
 ('agreement', 'NN'),
 ('at', 'IN'),
 ('NUS', 'NNP'),
 (',', ','),
 ('in', 'IN'),
 ('the', 'DT'),
 ('presence', 'NN'),
 ('of', 'IN'),
 ('Ambassador', 'NNP'),
 ('of', 'IN'),
 ('Israel', 'NNP'),
 ('to', 'TO'),
 ('Singapore', 'NNP'),
 ('Her', 'NNP'),
 ('Excellency', 'NNP'),
 ('Amira', 'NNP'),
 ('Arnon', 'NNP'),
 ('and', 'CC'),
 ('about', 'IN'),
 ('30', 'CD'),
 ('invited', 'JJ'),
 ('guests', 'NNS'),
 (',', ','),
 ('on', 'IN'),
 ('July', 'NNP'),
 ('03', 'CD'),
 (',', ','),
 ('2013', 'CD'),
 ('.', '.')]

In [None]:
# Fetch the NE chunker model and the 'words' corpus before calling ne_chunk.
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Run the named-entity chunker on the POS-tagged tokens and print the result:
# a bracketed tree in which recognized entities appear as labelled
# sub-trees (e.g. ORGANIZATION, GPE, PERSON).
sent_chunk = ne_chunk(sent_pos)
print(sent_chunk)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
(S
  Professor/NNP
  Tan/NNP
  Eng/NNP
  Chye/NNP
  ,/,
  (ORGANIZATION NUS/NNP)
  Deputy/NNP
  President/NNP
  and/CC
  (ORGANIZATION Provost/NNP)
  ,/,
  and/CC
  (ORGANIZATION Professor/NNP Menahem/NNP)
  Ben-Sasson/NNP
  ,/,
  President/NNP
  of/IN
  (ORGANIZATION HUJ/NNP)
  signed/VBD
  the/DT
  joint/JJ
  degree/NN
  agreement/NN
  at/IN
  (ORGANIZATION NUS/NNP)
  ,/,
  in/IN
  the/DT
  presence/NN
  of/IN
  (ORGANIZATION Ambassador/NNP)
  of/IN
  (GPE Israel/NNP)
  to/TO
  (GPE Singapore/NNP)
  Her/NNP
  Excellency/NNP
  (PERSON Amira/NNP Arnon/NNP)
  and/CC
  about/IN
  30/CD
  invited/JJ
  guests/NNS
  ,/,
  on/IN
  July/NNP
  03/CD
  ,/,
  2013/CD
  ./.)


In [None]:
# ===== Now try creating your own named entity and noun phrase chunker ====
# Define tag patterns for the target phrases and let RegexpParser chunk the
# POS-tagged input with them. Only minimal patterns are given here:
#   NE: one or more consecutive NNP tags
#   NP: a DT tag immediately followed by an NN or NNS tag
# (so, for example, a DT JJ NN sequence is not covered by the NP pattern).

grammar = r"""
  NE: {<NNP>+}      # chunk NE sequences of proper nouns
  NP: {<DT><NN|NNS>}
   # chunk noun phrase by DT+NN  
      
"""

# Build the chunk parser from the grammar; later cells apply it via cp.parse(...)
cp = nltk.RegexpParser(grammar)


In [None]:
# Sanity check on a short sentence: tokenize, POS-tag, then chunk it with
# the custom grammar.
simple_tagged = pos_tag(word_tokenize("Donald Trump is a president."))
result = cp.parse(simple_tagged)
print(result)

(S (NE Donald/NNP Trump/NNP) is/VBZ (NP a/DT president/NN) ./.)


In [None]:
# result.draw() would create a graph with the tree when run locally,
# but it doesn't work in Colab, so it is left commented out.
#result.draw()

# Work-around: rebuild a Tree from the bracketed string form of `result` and
# print it as text. Not always correct -- e.g. in the output below the leaves
# are not laid out in sentence order.
from nltk import Tree
Tree.fromstring(str(result)).pretty_print()

               S                                         
   ____________|__________________________                
  |     |              NE                 NP             
  |     |       _______|______        ____|_______        
is/VBZ ./. Donald/NNP     Trump/NNP a/DT     president/NN



In [None]:
# Now try it on the longer sentence with more noun phrases.
# `sent_pos` holds the POS-tagged tokens of the example sentence; note that
# this rebinds `result` from the short-sentence parse to the new parse.
result = cp.parse(sent_pos)
print(result)

(S
  (NE Professor/NNP Tan/NNP Eng/NNP Chye/NNP)
  ,/,
  (NE NUS/NNP Deputy/NNP President/NNP)
  and/CC
  (NE Provost/NNP)
  ,/,
  and/CC
  (NE Professor/NNP Menahem/NNP Ben-Sasson/NNP)
  ,/,
  (NE President/NNP)
  of/IN
  (NE HUJ/NNP)
  signed/VBD
  the/DT
  joint/JJ
  degree/NN
  agreement/NN
  at/IN
  (NE NUS/NNP)
  ,/,
  in/IN
  (NP the/DT presence/NN)
  of/IN
  (NE Ambassador/NNP)
  of/IN
  (NE Israel/NNP)
  to/TO
  (NE Singapore/NNP Her/NNP Excellency/NNP Amira/NNP Arnon/NNP)
  and/CC
  about/IN
  30/CD
  invited/JJ
  guests/NNS
  ,/,
  on/IN
  (NE July/NNP)
  03/CD
  ,/,
  2013/CD
  ./.)



# Exercise: 
Modify the above tag patterns to capture the NEs and NPs in the example sentence. 

# Alternative way: using spaCy

Installation of spaCy and the required models:
    pip install -U spacy
    python -m spacy download en_core_web_sm
    python -m spacy download en_core_web_md

Restart the kernel after the above steps.

In [None]:
!pip install -U spacy

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/c5/5d/20f8252a9dfe7057721136d83cecb1ca1e0936b21fd7a0a4889d1d6650a8/spacy-3.0.1-cp36-cp36m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 267kB/s 
[?25hCollecting typer<0.4.0,>=0.3.0
  Downloading https://files.pythonhosted.org/packages/90/34/d138832f6945432c638f32137e6c79a3b682f06a63c488dcfaca6b166c64/typer-0.3.2-py3-none-any.whl
Collecting srsly<3.0.0,>=2.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/4c/b7/2a8da28c6b4db5137d6ba949f0160d4628582b1fef5665ead8bb96ac9346/srsly-2.4.0-cp36-cp36m-manylinux2014_x86_64.whl (456kB)
[K     |████████████████████████████████| 460kB 40.4MB/s 
Collecting catalogue<2.1.0,>=2.0.1
  Downloading https://files.pythonhosted.org/packages/48/5c/493a2f3bb0eac17b1d48129ecfd251f0520b6c89493e9fd0522f534a9e4a/catalogue-2.0.1-py3-none-any.whl
Collecting pydantic<1.8.0,>=1.7.1
[?25l  Downloading https://files.pythonhosted.org/packag

In [None]:
!python -m spacy download en_core_web_md

2021-02-08 09:08:06.470011: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
Collecting en-core-web-md==3.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1MB)
[K     |████████████████████████████████| 47.1MB 98kB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
import spacy
# Load the en_core_web_md model that was downloaded in the previous step.
nlp = spacy.load("en_core_web_md")
# Show the processing components of the loaded pipeline as (name, component) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7feac0150f10>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7feabffdc830>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7feabff8a250>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7feabff8a388>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7feabff1bf48>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7feabff30c88>)]

In [None]:
# Process a sentence with the loaded pipeline; the following cells inspect
# this object through the name `result`.
result = nlp("James likes the nice gift very much.")

In [None]:
from spacy import displacy

In [None]:
# Display the parsing result as a dependency graph (style="dep"),
# rendered inline in the notebook (jupyter=True).
displacy.render(result, style="dep", jupyter=True)

In [None]:
# Detailed results, one line per token. Columns, in order:
# text, lemma, pos_, tag_, dep_, head token, shape_, is_alpha, is_stop
for tok in result:
    fields = (tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_, tok.head,
              tok.shape_, tok.is_alpha, tok.is_stop)
    print(*fields)

James James PROPN NNP nsubj likes Xxxxx True False
likes like VERB VBZ ROOT likes xxxx True False
the the DET DT det gift xxx True True
nice nice ADJ JJ amod gift xxxx True False
gift gift NOUN NN dobj likes xxxx True False
very very ADV RB advmod much xxxx True True
much much ADV RB advmod likes xxxx True True
. . PUNCT . punct likes . False False


In [None]:
# Morphological information is available per token through `.morph`;
# print it next to the token text.
for word in result:
    print(word.text, word.morph)

James NounType=Prop|Number=Sing
likes Number=Sing|Person=Three|Tense=Pres|VerbForm=Fin
the Definite=Def|PronType=Art
nice Degree=Pos
gift Number=Sing
very 
much 
. PunctType=Peri


In [None]:
# Run the pipeline on the long example sentence `sent` and display the
# recognized entities inline (style="ent").
result1 = nlp(sent)
displacy.render(result1, style="ent", jupyter=True)

In [None]:
# NER results for the long sentence: entity text, start/end character
# offsets, and entity label.
for entity in result1.ents:
    char_span = (entity.start_char, entity.end_char)
    print(entity.text, *char_span, entity.label_)

Tan Eng Chye 10 22 PERSON
Menahem Ben-Sasson 72 90 PERSON
HUJ 105 108 ORG
Israel 184 190 GPE
Singapore 194 203 GPE
Amira Arnon 219 230 PERSON
about 30 235 243 CARDINAL
July 03, 2013 263 276 DATE


In [None]:
# Now let's work on an article. Upload the file to your Colab session first.
from nltk import sent_tokenize

# Read the whole article. The context manager closes the file handle as soon
# as the text has been read (the original left the file open).
with open("IE-example.txt", "r") as f:
    text = f.read()

# Split the article into sentences for sentence-by-sentence processing.
sents = sent_tokenize(text)

In [None]:
# Disable the pipeline components that you don't need (here: the parser).
# Process multiple sentences more efficiently using pipe() instead of
# calling nlp() on each sentence separately.

results = nlp.pipe(sents, disable=["parser"])
# Materialize the results into a list so later cells can iterate over them.
res_list = list(results)

In [None]:
# Print text and label of every named entity, sentence by sentence.
all_entities = (entity for doc in res_list for entity in doc.ents)
for entity in all_entities:
    print(entity.text, entity.label_)

Hilary Clinton PERSON
Donald Trump PERSON
Russia GPE
ISIS ORG
Clinton PERSON
Trump PERSON
Russian NORP
Vladimir Putin PERSON
Trump PERSON
NBC News ORG
Putin PERSON
US GPE
Barack Obama PERSON
Trump PERSON
US GPE
ISIS ORG
Pentagon ORG
Barack Obama PERSON
Hillary Clinton PERSON
Clinton PERSON
Iraq GPE
Syria GPE


In [None]:
# Check which tokens have a word vector.
result = nlp("James likes oranges and peaches.")
for word in result:
    print(word.text, word.has_vector)

# Shape of the vector of the last token in the sentence
result[-1].vector.shape

James True
likes True
oranges True
and True
peaches True
. True


(300,)

In [None]:
# Compare the token at index 2 ("oranges") first with the token at index 0
# ("James"), then with the token at index 4 ("peaches").
oranges = result[2]
for other_idx in (0, 4):
    print(oranges.similarity(result[other_idx]))

0.12540577
0.73022324
