In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import the necessary libraries including nltk, nltk.tokenize, and nltk.corpus. You can do this using the following code:

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, state_union #to import state_union dataset
from nltk.tokenize import PunktSentenceTokenizer

Load the unstructured dataset that you want to perform NER on. In this example, we will use the "state_union" corpus provided by NLTK.
You can load it using the following code:

In [None]:
# Fetch the State of the Union corpus so that state_union.raw() can read it.
nltk.download("state_union")



[nltk_data] Downloading package state_union to /root/nltk_data...
[nltk_data]   Unzipping corpora/state_union.zip.


True

In [None]:
# Read one speech from the corpus as a single raw string.
dataset = state_union.raw(fileids="2006-GWBush.txt")

In [None]:
# Sanity check: show the container type and the character count, one per line.
print(type(dataset), len(dataset), sep="\n")

<class 'str'>
33411


In [None]:
# Preview only the opening of the speech: the full text is tens of thousands
# of characters long, and printing all of it floods the notebook output.
print(dataset[:1000])

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
January 31, 2006

THE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream. Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King. (Applause.)

President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan. 31, 2006. White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together. We have gathered under this Capitol dome in moments of national mourning and national achievement. We have served America 

Use NLTK's PunktSentenceTokenizer to tokenize the dataset into sentences. You can do this using the following code

In [None]:
# Punkt is an unsupervised model: constructed with no arguments it has learned
# no abbreviations, so it ends a sentence at periods such as the one in
# "Mr. Speaker". Passing the text as training data lets it learn abbreviations
# and collocations from the speech before it is asked to split it.
sentence_tokenizer = PunktSentenceTokenizer(dataset)
sentences = sentence_tokenizer.tokenize(dataset)

Tokenize each sentence into words using NLTK's word_tokenize function. You can do this using the following code

In [None]:
# Punkt tokenizer models; fetched before word_tokenize is called below.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# One token list per sentence, in the same order as `sentences`.
words = list(map(word_tokenize, sentences))

Remove the stopwords from the tokenized words using NLTK's stopwords corpus. You can do this using the following code

In [None]:
# Stop-word lists read by stopwords.words() in the next cell.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words('english'))

# Keep the per-sentence nesting; drop any token whose case-folded form is an
# English stop word (so "The" and "THE" are removed along with "the").
filtered_words = [
    [token for token in tokens if token.casefold() not in stop_words]
    for tokens in words
]

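Finally, use NLTK's ne_chunk function to perform NER on the filtered words. ne_chunk takes part-of-speech-tagged tokens as input, so first download the tagger and chunker resources, then tag each sentence and pass the tagged result to ne_chunk. You can do this using the following code: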

In [None]:
# Part-of-speech tagger model; fetched before nltk.pos_tag is called below.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Named-entity chunker model; fetched before nltk.ne_chunk is called below.
nltk.download("maxent_ne_chunker")

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [None]:
# Word-list corpus; presumably a dependency of the NE chunker (downloaded
# alongside it before nltk.ne_chunk is used) - confirm against NLTK docs.
nltk.download("words")

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

PoS Tagging

In [None]:
# Each element of filtered_words is the token list of one sentence, so the
# tagger is run sentence by sentence and prints a list of (token, tag) pairs.
for sentence_tokens in filtered_words:
    tagged = nltk.pos_tag(sentence_tokens)
    print(tagged)

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('CONGRESS', 'NNP'), ('STATE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('PRESIDENT', 'NN'), (':', ':'), ('Thank', 'NNP'), ('.', '.')]
[('Mr', 'NNP'), ('.', '.')]
[('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'NN'), ('nation', 'NN'), ('lost', 'VBD'), ('beloved', 'VBN'), (',', ','), ('graceful', 'JJ'), (',', ','), ('courageous', 'JJ'), ('woman', 'NN'), ('called', 'VBN'), ('America', 'NNP'), ('founding', 'VBG'), ('ideals', 'NNS'), ('carried', 'VBD'), ('noble', 'JJ'), 

Named Entity Recognition

The function nltk.ne_chunk() recognizes named entities using a classifier; the classifier adds category labels such as PERSON,
ORGANIZATION, and GPE.

In [None]:
# Tag each sentence's tokens, then let the chunker group tagged tokens into
# labelled named-entity subtrees; the resulting tree is printed per sentence.
# NOTE(review): stop words were removed before tagging, which strips context
# the tagger and chunker could use (e.g. "Tonight" comes out as PERSON) -
# consider running this on the unfiltered `words` instead.
for sentence_tokens in filtered_words:
    tagged = nltk.pos_tag(sentence_tokens)
    named_entities = nltk.ne_chunk(tagged)
    print(named_entities)

(S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  JOINT/NNP
  SESSION/NNP
  (ORGANIZATION CONGRESS/NNP)
  STATE/NNP
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  PRESIDENT/NN
  :/:
  Thank/NNP
  ./.)
(S Mr/NNP ./.)
(S
  (GPE Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (PERSON Cheney/NNP)
  ,/,
  members/NNS
  (ORGANIZATION Congress/NNP)
  ,/,
  members/NNS
  (ORGANIZATION Supreme/NNP Court/NNP)
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  fellow/JJ
  citizens/NNS
  :/:
  Today/NN
  nation/NN
  lost/VBD
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  called/VBN
  (GPE America/NNP)
  founding/VBG
  ideals/NNS
  carried/VBD
  noble/JJ
  dream/NN
  ./.)
(S
  (PERSON Tonight/NNP)
  comforted/VBD
  hope/NN
  glad/NN
  reunion/NN
  husband/NN
  taken/VBN
  long/RB
  ago/RB
  ,/,
  grateful/JJ
  good/JJ
  life/NN
  (PERSON Coretta/NNP Scott/NNP King/NNP)
  ./.)
(S (/( (ORGAN