Implementing Named Entity Recognition Using NLTK 

In [0]:
# Importing NLTK Libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

In [2]:
# Fetch the tokenizer and POS-tagger resources that the NLTK cells rely on
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# Sample passage used as input by the cells below
ex = 'the pinwheel mushroom, is a fungus in the family Marasmiaceae. Widespread in the Northern Hemisphere, it was first described scientifically in 1772 by mycologist Giovanni Antonio Scopoli. The mushrooms are characterized by thin whitish caps up to 2 cm (0.8 in) wide that are sunken in the center and pleated with scalloped margins. The wiry black hollow stalks measure up to 8.0 cm (3.1 in) long by 1.5 mm (0.06 in) thick. On the underside of the caps are widely spaced white gills, attached to a collar encircling the stalk. The mushrooms grow in groups or clusters on decaying wood such as fallen twigs and sticks, moss-covered logs, and stumps.'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [0]:
def preprocess(sentence):
    """Split ``sentence`` into word tokens and POS-tag them.

    The text is first passed through ``nltk.word_tokenize``; the resulting
    token list is then handed to ``nltk.pos_tag``, whose result is returned
    unchanged.
    """
    return nltk.pos_tag(nltk.word_tokenize(sentence))

# Tokenize and POS-tag the sample passage; `sent` is the input to the chunk parser below
sent = preprocess(ex)

In [4]:
# Chunk grammar with a single rule labelled NP: an optional determiner (<DT>?),
# followed by any number of adjectives (<JJ>*), followed by one <NN>-tagged noun.
# Tokens tagged NNS or NNP are not matched by this rule (see the printed tree).
pattern = 'NP: {<DT>?<JJ>*<NN>}'
# Build a regular-expression chunk parser from the grammar
cp = nltk.RegexpParser(pattern)
# Parse the POS-tagged tokens; matching runs are grouped under NP nodes in the tree
cs = cp.parse(sent)
print(cs)

(S
  (NP the/DT pinwheel/NN)
  (NP mushroom/NN)
  ,/,
  is/VBZ
  (NP a/DT fungus/NN)
  in/IN
  (NP the/DT family/NN)
  Marasmiaceae/NNP
  ./.
  Widespread/NNP
  in/IN
  the/DT
  Northern/NNP
  Hemisphere/NNP
  ,/,
  it/PRP
  was/VBD
  first/RB
  described/VBN
  scientifically/RB
  in/IN
  1772/CD
  by/IN
  (NP mycologist/NN)
  Giovanni/NNP
  Antonio/NNP
  Scopoli/NNP
  ./.
  The/DT
  mushrooms/NNS
  are/VBP
  characterized/VBN
  by/IN
  (NP thin/JJ whitish/NN)
  caps/VBZ
  up/IN
  to/TO
  2/CD
  (NP cm/NN)
  (/(
  0.8/CD
  in/IN
  )/)
  wide/JJ
  that/WDT
  are/VBP
  sunken/VBN
  in/IN
  (NP the/DT center/NN)
  and/CC
  pleated/VBN
  with/IN
  scalloped/JJ
  margins/NNS
  ./.
  (NP The/DT wiry/JJ black/JJ hollow/NN)
  stalks/NNS
  measure/VBP
  up/RB
  to/TO
  8.0/CD
  (NP cm/NN)
  (/(
  3.1/CD
  in/IN
  )/)
  long/RB
  by/IN
  1.5/CD
  mm/NNS
  (/(
  0.06/CD
  in/IN
  )/)
  (NP thick/NN)
  ./.
  On/IN
  (NP the/DT underside/NN)
  of/IN
  the/DT
  caps/NNS
  are/VBP
  widely/RB
  space

In [5]:
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
# In the output, B-NP marks the first token of an NP chunk, I-NP a following
# token of the same chunk, and O a token outside every chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('the', 'DT', 'B-NP'),
 ('pinwheel', 'NN', 'I-NP'),
 ('mushroom', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('is', 'VBZ', 'O'),
 ('a', 'DT', 'B-NP'),
 ('fungus', 'NN', 'I-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('family', 'NN', 'I-NP'),
 ('Marasmiaceae', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('Widespread', 'NNP', 'O'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('Northern', 'NNP', 'O'),
 ('Hemisphere', 'NNP', 'O'),
 (',', ',', 'O'),
 ('it', 'PRP', 'O'),
 ('was', 'VBD', 'O'),
 ('first', 'RB', 'O'),
 ('described', 'VBN', 'O'),
 ('scientifically', 'RB', 'O'),
 ('in', 'IN', 'O'),
 ('1772', 'CD', 'O'),
 ('by', 'IN', 'O'),
 ('mycologist', 'NN', 'B-NP'),
 ('Giovanni', 'NNP', 'O'),
 ('Antonio', 'NNP', 'O'),
 ('Scopoli', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('The', 'DT', 'O'),
 ('mushrooms', 'NNS', 'O'),
 ('are', 'VBP', 'O'),
 ('characterized', 'VBN', 'O'),
 ('by', 'IN', 'O'),
 ('thin', 'JJ', 'B-NP'),
 ('whitish', 'NN', 'I-NP'),
 ('caps', 'VBZ', 'O'),
 ('up', 'IN', 'O'),
 ('to', 'TO', 'O'),
 ('2', 'CD'

In [6]:
# Named-entity chunking of the sample passage with nltk.ne_chunk
# Resources the chunker needs
nltk.download('words')
nltk.download('maxent_ne_chunker')
# Reuse preprocess() instead of repeating the tokenize + POS-tag pipeline inline;
# it applies the same two NLTK calls, so the resulting tree is unchanged.
ne_tree = nltk.ne_chunk(preprocess(ex))
print(ne_tree)

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
(S
  the/DT
  pinwheel/NN
  mushroom/NN
  ,/,
  is/VBZ
  a/DT
  fungus/NN
  in/IN
  the/DT
  family/NN
  (PERSON Marasmiaceae/NNP)
  ./.
  Widespread/NNP
  in/IN
  the/DT
  (LOCATION Northern/NNP Hemisphere/NNP)
  ,/,
  it/PRP
  was/VBD
  first/RB
  described/VBN
  scientifically/RB
  in/IN
  1772/CD
  by/IN
  mycologist/NN
  (PERSON Giovanni/NNP Antonio/NNP Scopoli/NNP)
  ./.
  The/DT
  mushrooms/NNS
  are/VBP
  characterized/VBN
  by/IN
  thin/JJ
  whitish/NN
  caps/VBZ
  up/IN
  to/TO
  2/CD
  cm/NN
  (/(
  0.8/CD
  in/IN
  )/)
  wide/JJ
  that/WDT
  are/VBP
  sunken/VBN
  in/IN
  the/DT
  center/NN
  and/CC
  pleated/VBN
  with/IN
  scalloped/JJ
  margins/NNS
  ./.
  The/DT
  wiry/JJ
  black/JJ
  hollow/NN
  stalks/NNS
  measure/VBP
  up/R

Implementing Named Entity Recognition Using spaCy

In [7]:
# Importing spaCy libraries, then downloading and loading the English model
import spacy
from spacy import displacy
from collections import Counter
import spacy.cli
# Load the small English pipeline. Only download it when it is not installed
# yet, so that re-running the notebook does not fetch the package every time.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load('en_core_web_sm')


    Linking successful
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [10]:
# Sample passage (same text as the one used in the NLTK section)
ex = 'the pinwheel mushroom, is a fungus in the family Marasmiaceae. Widespread in the Northern Hemisphere, it was first described scientifically in 1772 by mycologist Giovanni Antonio Scopoli. The mushrooms are characterized by thin whitish caps up to 2 cm (0.8 in) wide that are sunken in the center and pleated with scalloped margins. The wiry black hollow stalks measure up to 8.0 cm (3.1 in) long by 1.5 mm (0.06 in) thick. On the underside of the caps are widely spaced white gills, attached to a collar encircling the stalk. The mushrooms grow in groups or clusters on decaying wood such as fallen twigs and sticks, moss-covered logs, and stumps.'
# Run the loaded spaCy pipeline over the passage
doc = nlp(ex)
# List every detected entity span together with its label
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('Marasmiaceae', 'GPE'),
 ('the Northern Hemisphere', 'LOC'),
 ('1772', 'DATE'),
 ('Giovanni Antonio Scopoli', 'PERSON'),
 ('2 cm', 'QUANTITY'),
 ('0.8', 'CARDINAL'),
 ('8.0 cm', 'QUANTITY'),
 ('3.1', 'CARDINAL'),
 ('1.5 mm', 'QUANTITY'),
 ('0.06', 'CARDINAL')]


In [11]:
# Show each token of the parsed passage with its IOB flag and entity type
# (the entity type is an empty string for tokens flagged 'O')
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(the, 'O', ''),
 (pinwheel, 'O', ''),
 (mushroom, 'O', ''),
 (,, 'O', ''),
 (is, 'O', ''),
 (a, 'O', ''),
 (fungus, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (family, 'O', ''),
 (Marasmiaceae, 'B', 'GPE'),
 (., 'O', ''),
 (Widespread, 'O', ''),
 (in, 'O', ''),
 (the, 'B', 'LOC'),
 (Northern, 'I', 'LOC'),
 (Hemisphere, 'I', 'LOC'),
 (,, 'O', ''),
 (it, 'O', ''),
 (was, 'O', ''),
 (first, 'O', ''),
 (described, 'O', ''),
 (scientifically, 'O', ''),
 (in, 'O', ''),
 (1772, 'B', 'DATE'),
 (by, 'O', ''),
 (mycologist, 'O', ''),
 (Giovanni, 'B', 'PERSON'),
 (Antonio, 'I', 'PERSON'),
 (Scopoli, 'I', 'PERSON'),
 (., 'O', ''),
 (The, 'O', ''),
 (mushrooms, 'O', ''),
 (are, 'O', ''),
 (characterized, 'O', ''),
 (by, 'O', ''),
 (thin, 'O', ''),
 (whitish, 'O', ''),
 (caps, 'O', ''),
 (up, 'O', ''),
 (to, 'O', ''),
 (2, 'B', 'QUANTITY'),
 (cm, 'I', 'QUANTITY'),
 ((, 'O', ''),
 (0.8, 'B', 'CARDINAL'),
 (in, 'O', ''),
 (), 'O', ''),
 (wide, 'O', ''),
 (that, 'O', ''),
 (are, 'O', ''),
 (sunken, '