
# TA Workshop: POS Tagging and Lemmatization


In [1]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Fetch the NLTK data packages needed for the tokenizing and POS-tagging
# steps below. download() reports packages that are already up to date
# instead of fetching them again, so re-running this cell is cheap.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [2]:
# ===== POS Tagging and NER using NLTK =====

# Example sentence, assembled from adjacent string literals so that it stays
# one single line of text (no embedded newlines).
sent = (
    "Professor Tan Eng Chye, NUS Deputy President and Provost, and Professor "
    "Menahem Ben-Sasson, President of HUJ signed the joint degree agreement at NUS, "
    "in the presence of Ambassador of Israel to Singapore Her Excellency Amira Arnon "
    "and about 30 invited guests, on July 03, 2013."
)

# The NE chunker needs POS-tagged input, and the POS tagger needs tokens,
# so the order is: raw text -> tokens -> (token, tag) pairs.
sent_tokens = word_tokenize(sent)
sent_pos = pos_tag(sent_tokens)
sent_pos

[('Professor', 'NNP'),
 ('Tan', 'NNP'),
 ('Eng', 'NNP'),
 ('Chye', 'NNP'),
 (',', ','),
 ('NUS', 'NNP'),
 ('Deputy', 'NNP'),
 ('President', 'NNP'),
 ('and', 'CC'),
 ('Provost', 'NNP'),
 (',', ','),
 ('and', 'CC'),
 ('Professor', 'NNP'),
 ('Menahem', 'NNP'),
 ('Ben-Sasson', 'NNP'),
 (',', ','),
 ('President', 'NNP'),
 ('of', 'IN'),
 ('HUJ', 'NNP'),
 ('signed', 'VBD'),
 ('the', 'DT'),
 ('joint', 'JJ'),
 ('degree', 'NN'),
 ('agreement', 'NN'),
 ('at', 'IN'),
 ('NUS', 'NNP'),
 (',', ','),
 ('in', 'IN'),
 ('the', 'DT'),
 ('presence', 'NN'),
 ('of', 'IN'),
 ('Ambassador', 'NNP'),
 ('of', 'IN'),
 ('Israel', 'NNP'),
 ('to', 'TO'),
 ('Singapore', 'NNP'),
 ('Her', 'NNP'),
 ('Excellency', 'NNP'),
 ('Amira', 'NNP'),
 ('Arnon', 'NNP'),
 ('and', 'CC'),
 ('about', 'IN'),
 ('30', 'CD'),
 ('invited', 'JJ'),
 ('guests', 'NNS'),
 (',', ','),
 ('on', 'IN'),
 ('July', 'NNP'),
 ('03', 'CD'),
 (',', ','),
 ('2013', 'CD'),
 ('.', '.')]

In [3]:
# Data packages fetched ahead of the ne_chunk() call below.
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Group the POS-tagged tokens into named-entity subtrees and print the tree.
sent_chunk = ne_chunk(sent_pos)
print(sent_chunk)

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...


(S
  Professor/NNP
  Tan/NNP
  Eng/NNP
  Chye/NNP
  ,/,
  (ORGANIZATION NUS/NNP)
  Deputy/NNP
  President/NNP
  and/CC
  (ORGANIZATION Provost/NNP)
  ,/,
  and/CC
  (ORGANIZATION Professor/NNP Menahem/NNP)
  Ben-Sasson/NNP
  ,/,
  President/NNP
  of/IN
  (ORGANIZATION HUJ/NNP)
  signed/VBD
  the/DT
  joint/JJ
  degree/NN
  agreement/NN
  at/IN
  (ORGANIZATION NUS/NNP)
  ,/,
  in/IN
  the/DT
  presence/NN
  of/IN
  (ORGANIZATION Ambassador/NNP)
  of/IN
  (GPE Israel/NNP)
  to/TO
  (GPE Singapore/NNP)
  Her/NNP
  Excellency/NNP
  (PERSON Amira/NNP Arnon/NNP)
  and/CC
  about/IN
  30/CD
  invited/JJ
  guests/NNS
  ,/,
  on/IN
  July/NNP
  03/CD
  ,/,
  2013/CD
  ./.)


[nltk_data]   Unzipping corpora\words.zip.


In [4]:
# ===== Now try creating your own named entity and noun phrase chunker =====
# Define tag patterns that capture the target phrases, then use
# RegexpParser to chunk the POS-tagged input with those patterns.
# Only minimal tag patterns are given here:
#   NE - one or more consecutive NNP tags
#   NP - a DT tag followed by an NN or NNS tag

grammar = r"""
  NE: {<NNP>+}      # chunk NE sequences of proper nouns
  NP: {<DT><NN|NNS>}
   # chunk noun phrase by DT+NN  
      
"""

# Build the chunk parser from the grammar above.
cp = nltk.RegexpParser(grammar)


In [5]:
# Sanity-check the chunker on a short sentence: tokenize, POS-tag, then chunk.
simple_tagged = pos_tag(word_tokenize("Donald Trump is a president."))
result = cp.parse(simple_tagged)
print(result)

(S (NE Donald/NNP Trump/NNP) is/VBZ (NP a/DT president/NN) ./.)


In [6]:
# result.draw() would open a graph of the tree when run locally,
# but it doesn't work in Colab.
#result.draw()

# Work-around (not always correct): rebuild a tree from the bracketed string
# form of the result and print it as text.
from nltk import Tree
bracketed = str(result)
Tree.fromstring(bracketed).pretty_print()

               S                                         
   ____________|__________________________                
  |     |              NE                 NP             
  |     |       _______|______        ____|_______        
is/VBZ ./. Donald/NNP     Trump/NNP a/DT     president/NN



In [7]:
# Now run the same chunker on the longer, already POS-tagged example sentence,
# which contains many more noun phrases. Note that `result` is rebound here.
result = cp.parse(sent_pos)
print(result)

(S
  (NE Professor/NNP Tan/NNP Eng/NNP Chye/NNP)
  ,/,
  (NE NUS/NNP Deputy/NNP President/NNP)
  and/CC
  (NE Provost/NNP)
  ,/,
  and/CC
  (NE Professor/NNP Menahem/NNP Ben-Sasson/NNP)
  ,/,
  (NE President/NNP)
  of/IN
  (NE HUJ/NNP)
  signed/VBD
  the/DT
  joint/JJ
  degree/NN
  agreement/NN
  at/IN
  (NE NUS/NNP)
  ,/,
  in/IN
  (NP the/DT presence/NN)
  of/IN
  (NE Ambassador/NNP)
  of/IN
  (NE Israel/NNP)
  to/TO
  (NE Singapore/NNP Her/NNP Excellency/NNP Amira/NNP Arnon/NNP)
  and/CC
  about/IN
  30/CD
  invited/JJ
  guests/NNS
  ,/,
  on/IN
  (NE July/NNP)
  03/CD
  ,/,
  2013/CD
  ./.)



# Exercise: 
Modify the tag patterns above so that they capture the named entities (NEs) and noun phrases (NPs) in the example sentence.

# Alternative way: using spaCy

Installation of spaCy and the required models:
    pip install -U spacy
    python -m spacy download en_core_web_sm
    python -m spacy download en_core_web_md

Restart the kernel after completing the steps above so that the newly installed packages can be imported.

In [8]:
!pip install -U spacy

Collecting spacy

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.5.0 requires typing-extensions~=3.7.4, but you have typing-extensions 3.10.0.0 which is incompatible.
gensim 3.8.3 requires Cython==0.29.14, but you have cython 0.29.22 which is incompatible.



  Downloading spacy-3.1.0-cp37-cp37m-win_amd64.whl (11.8 MB)
Collecting srsly<3.0.0,>=2.4.1
  Downloading srsly-2.4.1-cp37-cp37m-win_amd64.whl (450 kB)
Collecting wasabi<1.1.0,>=0.8.1
  Downloading wasabi-0.8.2-py3-none-any.whl (23 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.0-py3-none-any.whl (42 kB)
Collecting thinc<8.1.0,>=8.0.7
  Downloading thinc-8.0.7-cp37-cp37m-win_amd64.whl (1.0 MB)
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting catalogue<2.1.0,>=2.0.4
  Downloading catalogue-2.0.4-py3-none-any.whl (16 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-win_amd64.whl (1.9 MB)
Collecting spacy-legacy<3.1.0,>=3.0.7
  Downloading spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting smart-open<6.0.0,>=5.0.0
  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)
Collecting typing-extensions<4.0.0.0,>=3.7.4
  Downloading typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
Installing 

In [9]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.1.0/en_core_web_md-3.1.0-py3-none-any.whl (45.4 MB)
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.1.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


2021-07-17 14:43:54.088135: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2021-07-17 14:43:54.088654: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [10]:
import spacy
# Load the en_core_web_md model downloaded above, then display the
# (name, component) pairs that make up its processing pipeline.
nlp = spacy.load("en_core_web_md")
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x110b0065dc8>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x110b05399a8>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x110b0290dd8>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x110b05da8c8>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x110b05e9488>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x110b0290f28>)]

In [11]:
# Process a short sentence with the loaded pipeline; the cells below
# inspect the returned object. Note that this rebinds `result`.
result = nlp("James likes the nice gift very much.")

In [12]:
from spacy import displacy

In [13]:
# Display the parsing result as a dependency graph, rendered inline in the notebook.
displacy.render(result, style="dep", jupyter=True)

In [14]:
# Detailed results for each token, one line per token: text, lemma, POS,
# tag, dependency label, head token, shape, and the is_alpha / is_stop flags.
for token in result:
    fields = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.head, token.shape_, token.is_alpha, token.is_stop,
    )
    print(*fields)

James James PROPN NNP nsubj likes Xxxxx True False
likes like VERB VBZ ROOT likes xxxx True False
the the DET DT det gift xxx True True
nice nice ADJ JJ amod gift xxxx True False
gift gift NOUN NN dobj likes xxxx True False
very very ADV RB advmod much xxxx True True
much much ADV RB advmod likes xxxx True True
. . PUNCT . punct likes . False False


In [15]:
# Morphological information is attached to every token; print it next to
# the token text (it may be empty for some tokens).
for token in result:
    print(token.text, token.morph)

James NounType=Prop|Number=Sing
likes Number=Sing|Person=Three|Tense=Pres|VerbForm=Fin
the Definite=Def|PronType=Art
nice Degree=Pos
gift Number=Sing
very 
much 
. PunctType=Peri


In [16]:
# Run the pipeline on the long example sentence from the NLTK section and
# render it inline with displaCy's "ent" style.
result1 = nlp(sent)
displacy.render(result1, style="ent", jupyter=True)

In [17]:
# The NER results as text: entity text, start and end character offsets
# within the sentence, and the entity label.
for ent in result1.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*span_info)

Tan Eng Chye 10 22 PERSON
Provost 49 56 PERSON
Menahem Ben-Sasson 72 90 PERSON
HUJ 105 108 ORG
Israel 184 190 GPE
Singapore 194 203 GPE
Amira Arnon 219 230 PERSON
about 30 235 243 CARDINAL
July 03 263 270 DATE
2013 272 276 DATE


In [18]:
# Now let's work on an article. Upload the file to your Colab session first.
from nltk import sent_tokenize

# Read the whole article. The with-block closes the file handle as soon as
# the read finishes instead of leaving it open for the rest of the session.
with open("IE-example.txt", "r") as f:
    text = f.read()

# Split the article into sentences for the spaCy processing below.
sents = sent_tokenize(text)

In [19]:
# Disable the pipeline components that you don't need (here: the parser).
# pipe() processes multiple sentences more efficiently than calling nlp()
# on each one separately.

results = nlp.pipe(sents, disable=["parser"])
# Collect the processed results into a list so they can be looped over below.
res_list = list(results)

In [20]:
# Print the text and label of every entity found in each processed sentence.
for res in res_list:
    for ent in res.ents:
        print(ent.text, ent.label_)

Hilary Clinton PERSON
Donald Trump PERSON
Russia GPE
ISIS ORG
Clinton PERSON
Trump PERSON
Russian NORP
Vladimir Putin PERSON
Trump PERSON
NBC News ORG
Putin PERSON
US GPE
Barack Obama PERSON
Trump ORG
US GPE
ISIS ORG
Pentagon ORG
Obama GPE
Barack Obama PERSON
Hillary Clinton PERSON
Clinton PERSON
Iraq GPE
Syria GPE


In [21]:
# Check, token by token, whether the model has a word vector for it.
result = nlp("James likes oranges and peaches.")
for tok in result:
    print(tok.text, tok.has_vector)

# Shape of a token vector, read explicitly from the last token (the same
# token the loop variable is still bound to once the loop has finished).
result[-1].vector.shape

James True
likes True
oranges True
and True
peaches True
. True


(300,)

In [22]:
# Compare "oranges" (token 2) with "James" (token 0) ...
print(result[2].similarity(result[0]))
# ... and with "peaches" (token 4).
print(result[2].similarity(result[4]))

0.12540577
0.73022324
