**Practical 5: Write a program to perform Named Entity Recognition (NER) and Chunking on English text.**

# **Named Entity Recognition**

In [None]:
import nltk

# NLTK 3.9+ loads its tokenizer, tagger and NE-chunker models under new
# resource names, so fetch those as well; without them the cells below raise
# LookupError on a current NLTK install. On an older NLTK these ids may be
# missing from the download index, in which case download() is expected to
# report the failure and return False rather than raise.
for resource in ('punkt_tab', 'averaged_perceptron_tagger_eng', 'maxent_ne_chunker_tab'):
    nltk.download(resource, quiet=True)

# Legacy resource names used by NLTK releases before 3.9.
nltk.download('punkt')                       # word/sentence tokenizer models
nltk.download('averaged_perceptron_tagger')  # part-of-speech tagger
nltk.download('maxent_ne_chunker')           # named-entity chunker
nltk.download('words')                       # word list required by the NE chunker

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Importing the libraries
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

In [None]:
# Sample sentence for the NER demo. The line break and the indentation of the
# second line are part of the string literal; the tagged output of the next
# cell contains no whitespace tokens, so they do not affect tokenization.
dataset = """Abraham Lincoln was an American statesman and lawyer 
              who served as the 16th President of the United States"""

In [None]:
# Break the sentence into word tokens, then tag each token with its part of speech
dataset_tokens = word_tokenize(dataset)
dataset_tag = pos_tag(dataset_tokens)
dataset_tag

[('Abraham', 'NNP'),
 ('Lincoln', 'NNP'),
 ('was', 'VBD'),
 ('an', 'DT'),
 ('American', 'JJ'),
 ('statesman', 'NN'),
 ('and', 'CC'),
 ('lawyer', 'NN'),
 ('who', 'WP'),
 ('served', 'VBD'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('16th', 'CD'),
 ('President', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS')]

In [None]:
# Group the POS-tagged tokens into named-entity subtrees (e.g. PERSON, GPE)
data_ner = ne_chunk(tagged_tokens=dataset_tag)
# print() shows the tree in its bracketed text form
print(data_ner)

(S
  (PERSON Abraham/NNP)
  (PERSON Lincoln/NNP)
  was/VBD
  an/DT
  (GPE American/JJ)
  statesman/NN
  and/CC
  lawyer/NN
  who/WP
  served/VBD
  as/IN
  the/DT
  16th/CD
  President/NNP
  of/IN
  the/DT
  (GPE United/NNP States/NNPS))


# **Chunking**

In [None]:
# Importing the libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

In [None]:
# Sample text for the chunking demo (two sentences spread over three lines;
# the line breaks and indentation are part of the string literal).
# NOTE: "world’s" is written with a typographic apostrophe (U+2019), which
# word_tokenize emits as a token of its own rather than as part of a possessive.
dataset = """Taj Mahal is one of the world’s most celebrated structures 
          in the world.
          It is a stunning symbol of Indian rich history"""

In [None]:
# Tokenize the data.
# The text writes "world’s" with a typographic apostrophe (U+2019). Left as is,
# the tokenizer emits '’' and 's' as two separate tokens, the tagger labels the
# stray quote as a proper noun, and the chunker then wraps it in a chunk of its
# own. Mapping it to an ASCII apostrophe first lets the possessive come out as
# a single "'s" token.
new_data = word_tokenize(dataset.replace("\u2019", "'"))
print(new_data)

['Taj', 'Mahal', 'is', 'one', 'of', 'the', 'world', '’', 's', 'most', 'celebrated', 'structures', 'in', 'the', 'world', '.', 'It', 'is', 'a', 'stunning', 'symbol', 'of', 'Indian', 'rich', 'history']


In [None]:
# Attach a part-of-speech tag to every token
postagging = pos_tag(tokens=new_data)
print(postagging)

[('Taj', 'NNP'), ('Mahal', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('world', 'NN'), ('’', 'NNP'), ('s', 'VBZ'), ('most', 'JJS'), ('celebrated', 'JJ'), ('structures', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('a', 'DT'), ('stunning', 'JJ'), ('symbol', 'NN'), ('of', 'IN'), ('Indian', 'JJ'), ('rich', 'JJ'), ('history', 'NN')]


In [None]:
# Chunk grammar for RegexpParser: a single stage labelled "chunk" with three rules.
# Each rule groups a run of one or more consecutive tokens carrying the same tag:
#   <NNPS>+  plural proper nouns
#   <NNP>+   singular proper nouns
#   <NN>+    singular common nouns
# NOTE(review): no rule covers NNS, so plural common nouns stay outside any chunk.
sequence_chunk = """ 
           chunk:
               {<NNPS>+}
               {<NNP>+}
               {<NN>+} """

In [None]:
# Compile the chunk grammar into a parser
chunk = RegexpParser(grammar=sequence_chunk)

In [None]:
# Run the chunker over the tagged tokens and show the resulting tree as bracketed text
chunk_result = chunk.parse(chunk_struct=postagging)
print(chunk_result)

(S
  (chunk Taj/NNP Mahal/NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  (chunk world/NN)
  (chunk ’/NNP)
  s/VBZ
  most/JJS
  celebrated/JJ
  structures/NNS
  in/IN
  the/DT
  (chunk world/NN)
  ./.
  It/PRP
  is/VBZ
  a/DT
  stunning/JJ
  (chunk symbol/NN)
  of/IN
  Indian/JJ
  rich/JJ
  (chunk history/NN))
