In [179]:
import warnings
warnings.filterwarnings("ignore")

# Start with loading all necessary libraries
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re

import matplotlib.pyplot as plt
%matplotlib inline

import nltk
import io
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
import fitz

pd.set_option("display.max_rows", None, "display.max_columns", None)

In [180]:
def extractTextFromPdf(path):
    extractedText = ''
    pdf_document = fitz.open(path)
    #for page_number in range(pdf_document.page_count):
    for page_number in range(0,1):
        page = pdf_document[page_number]
        extractedText += page.get_text()
    pdf_document.close()
    #print(text_2021)
    return extractedText

In [181]:
speechText = extractTextFromPdf(r'2022.pdf')

In [182]:
speechText

"Prime Minister's Office\nEnglish Rendering of Prime Minister's address from\nthe ramparts of Red Fort on  76th Independence Day\nPosted On: 15 AUG 2022 12:19PM by PIB Delhi\nBest wishes to my dear countrymen on the momentous occasion of completion of 75 years of Independence.\nMany congratulations to all! It is heartening to see our tricolor being unfurled with pride, honor and glory, not\nonly in all the corners of India, but also worldwide by Indians who love their country immensely. I extend my\nheartiest congratulations to all the people spread across the globe who love India. My heartfelt greetings to all\nmy dear Indians on this Amrit Mahotsav festival of celebrating our freedom. This is a day of historic\nimportance. This is an auspicious occasion to step forward on a new path, with a new resolution and a new\nstrength.\nIn the pursuit of Independence, the entire period of slavery was spent in struggle. No part of India or any time\nperiod remained untouched by the freedom stru

In [183]:
speechText = (' '.join(speechText.splitlines()))

In [184]:
print(speechText)

Prime Minister's Office English Rendering of Prime Minister's address from the ramparts of Red Fort on  76th Independence Day Posted On: 15 AUG 2022 12:19PM by PIB Delhi Best wishes to my dear countrymen on the momentous occasion of completion of 75 years of Independence. Many congratulations to all! It is heartening to see our tricolor being unfurled with pride, honor and glory, not only in all the corners of India, but also worldwide by Indians who love their country immensely. I extend my heartiest congratulations to all the people spread across the globe who love India. My heartfelt greetings to all my dear Indians on this Amrit Mahotsav festival of celebrating our freedom. This is a day of historic importance. This is an auspicious occasion to step forward on a new path, with a new resolution and a new strength. In the pursuit of Independence, the entire period of slavery was spent in struggle. No part of India or any time period remained untouched by the freedom struggle against 

## NER Using NLTK Chunking (Word base tonkenization and binary=True)

In [185]:
# Tokenise to words
words = nltk.word_tokenize(speechText)
words

['Prime',
 'Minister',
 "'s",
 'Office',
 'English',
 'Rendering',
 'of',
 'Prime',
 'Minister',
 "'s",
 'address',
 'from',
 'the',
 'ramparts',
 'of',
 'Red',
 'Fort',
 'on',
 '76th',
 'Independence',
 'Day',
 'Posted',
 'On',
 ':',
 '15',
 'AUG',
 '2022',
 '12:19PM',
 'by',
 'PIB',
 'Delhi',
 'Best',
 'wishes',
 'to',
 'my',
 'dear',
 'countrymen',
 'on',
 'the',
 'momentous',
 'occasion',
 'of',
 'completion',
 'of',
 '75',
 'years',
 'of',
 'Independence',
 '.',
 'Many',
 'congratulations',
 'to',
 'all',
 '!',
 'It',
 'is',
 'heartening',
 'to',
 'see',
 'our',
 'tricolor',
 'being',
 'unfurled',
 'with',
 'pride',
 ',',
 'honor',
 'and',
 'glory',
 ',',
 'not',
 'only',
 'in',
 'all',
 'the',
 'corners',
 'of',
 'India',
 ',',
 'but',
 'also',
 'worldwide',
 'by',
 'Indians',
 'who',
 'love',
 'their',
 'country',
 'immensely',
 '.',
 'I',
 'extend',
 'my',
 'heartiest',
 'congratulations',
 'to',
 'all',
 'the',
 'people',
 'spread',
 'across',
 'the',
 'globe',
 'who',
 'love'

In [186]:
#nltk.download('averaged_perceptron_tagger')


In [187]:
#pos tagging
pos_tags = nltk.pos_tag(words)

In [188]:
pos_tags

[('Prime', 'NNP'),
 ('Minister', 'NNP'),
 ("'s", 'POS'),
 ('Office', 'NNP'),
 ('English', 'NNP'),
 ('Rendering', 'NNP'),
 ('of', 'IN'),
 ('Prime', 'NNP'),
 ('Minister', 'NNP'),
 ("'s", 'POS'),
 ('address', 'NN'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('ramparts', 'NNS'),
 ('of', 'IN'),
 ('Red', 'NNP'),
 ('Fort', 'NNP'),
 ('on', 'IN'),
 ('76th', 'CD'),
 ('Independence', 'NNP'),
 ('Day', 'NNP'),
 ('Posted', 'VBD'),
 ('On', 'IN'),
 (':', ':'),
 ('15', 'CD'),
 ('AUG', 'NNP'),
 ('2022', 'CD'),
 ('12:19PM', 'CD'),
 ('by', 'IN'),
 ('PIB', 'NNP'),
 ('Delhi', 'NNP'),
 ('Best', 'NNP'),
 ('wishes', 'VBZ'),
 ('to', 'TO'),
 ('my', 'PRP$'),
 ('dear', 'JJ'),
 ('countrymen', 'NNS'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('momentous', 'JJ'),
 ('occasion', 'NN'),
 ('of', 'IN'),
 ('completion', 'NN'),
 ('of', 'IN'),
 ('75', 'CD'),
 ('years', 'NNS'),
 ('of', 'IN'),
 ('Independence', 'NNP'),
 ('.', '.'),
 ('Many', 'JJ'),
 ('congratulations', 'NNS'),
 ('to', 'TO'),
 ('all', 'DT'),
 ('!', '.'),
 ('It', 'PRP'),
 ('is',

In [189]:
#nltk.download('maxent_ne_chunker')



In [190]:
#Using ne_chunk and Binary = False
chunks = nltk.ne_chunk(pos_tags, binary=True)
for chunk in chunks:
    print(chunk)

('Prime', 'NNP')
('Minister', 'NNP')
("'s", 'POS')
('Office', 'NNP')
('English', 'NNP')
('Rendering', 'NNP')
('of', 'IN')
('Prime', 'NNP')
('Minister', 'NNP')
("'s", 'POS')
('address', 'NN')
('from', 'IN')
('the', 'DT')
('ramparts', 'NNS')
('of', 'IN')
(NE Red/NNP Fort/NNP)
('on', 'IN')
('76th', 'CD')
('Independence', 'NNP')
('Day', 'NNP')
('Posted', 'VBD')
('On', 'IN')
(':', ':')
('15', 'CD')
('AUG', 'NNP')
('2022', 'CD')
('12:19PM', 'CD')
('by', 'IN')
(NE PIB/NNP Delhi/NNP Best/NNP)
('wishes', 'VBZ')
('to', 'TO')
('my', 'PRP$')
('dear', 'JJ')
('countrymen', 'NNS')
('on', 'IN')
('the', 'DT')
('momentous', 'JJ')
('occasion', 'NN')
('of', 'IN')
('completion', 'NN')
('of', 'IN')
('75', 'CD')
('years', 'NNS')
('of', 'IN')
('Independence', 'NNP')
('.', '.')
('Many', 'JJ')
('congratulations', 'NNS')
('to', 'TO')
('all', 'DT')
('!', '.')
('It', 'PRP')
('is', 'VBZ')
('heartening', 'VBG')
('to', 'TO')
('see', 'VB')
('our', 'PRP$')
('tricolor', 'NN')
('being', 'VBG')
('unfurled', 'VBN')
('with'

In [191]:
entities = []
labels = []

for chunk in chunks:
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())

entities_labels = list(set(zip(entities, labels)))
entities_df =  pd.DataFrame(entities_labels)
#pd.set_option('display.max_rows', None)
entities_df.columns = ['Entities', 'Labels'] 
entities_df

Unnamed: 0,Entities,Labels
0,Subramania Bharati,NE
1,Red Fort,NE
2,Deendayal Upadhyay,NE
3,Ram Manohar Lohia,NE
4,Tatya Tope,NE
5,Rani Gaidinliu,NE
6,Gurudev Rabindranath Tagore,NE
7,Begum Hazrat Mahal,NE
8,Rani Lakshmibai,NE
9,PIB Delhi Best,NE


## NER Using NLTK Chunking (Word base tonkenization and binary=False)

In [192]:
#Using ne_chunk and Binary = False
chunks = nltk.ne_chunk(pos_tags, binary=False)

entities = []
labels = []

for chunk in chunks:
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())

entities_labels = list(set(zip(entities, labels)))
entities_df =  pd.DataFrame(entities_labels)
entities_df.columns = ['Entities', 'Labels'] 
entities_df

Unnamed: 0,Entities,Labels
0,Nehru,GPE
1,Dandi Yatra,PERSON
2,Govind Guru,PERSON
3,Maharishi Aurobindo,PERSON
4,Partition,ORGANIZATION
5,Sardar Vallabhbhai Patel,PERSON
6,Mangal Pandey,PERSON
7,Red Fort,ORGANIZATION
8,Shyama Prasad Mookerjee,PERSON
9,Amrit Mahotsav,PERSON


## NER Using NLTK Chunking (Sentence base tonkenization and binary=True)

In [193]:
entities = []
labels = []

sentences = nltk.sent_tokenize(speechText)

for sent in sentences:
    words = nltk.word_tokenize(sent)
    pos_Tags = nltk.pos_tag(words)
    chunks = nltk.ne_chunk(pos_Tags,binary=True) 
    for chunk in chunks:
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk))
            labels.append(chunk.label())


entities_labels = list(set(zip(entities, labels)))
entities_df =  pd.DataFrame(entities_labels)
entities_df.columns = ['Entities', 'Labels'] 
entities_df

Unnamed: 0,Entities,Labels
0,Subramania Bharati,NE
1,Red Fort,NE
2,Deendayal Upadhyay,NE
3,Ram Manohar Lohia,NE
4,Tatya Tope,NE
5,Rani Gaidinliu,NE
6,Gurudev Rabindranath Tagore,NE
7,Begum Hazrat Mahal,NE
8,Rani Lakshmibai,NE
9,PIB Delhi Best,NE


## NER Using NLTK Chunking (Sentence base tonkenization and binary=False)

In [194]:
entities = []
labels = []

sentences = nltk.sent_tokenize(speechText)

for sent in sentences:
    words = nltk.word_tokenize(sent)
    pos_Tags = nltk.pos_tag(words)
    chunks = nltk.ne_chunk(pos_Tags,binary=False) 
    for chunk in chunks:
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk))
            labels.append(chunk.label())


entities_labels = list(set(zip(entities, labels)))
entities_df =  pd.DataFrame(entities_labels)
entities_df.columns = ['Entities', 'Labels'] 
entities_df

Unnamed: 0,Entities,Labels
0,Nehru,GPE
1,Dandi Yatra,PERSON
2,Govind Guru,PERSON
3,Maharishi Aurobindo,PERSON
4,Partition,ORGANIZATION
5,Sardar Vallabhbhai Patel,PERSON
6,Mangal Pandey,PERSON
7,Red Fort,ORGANIZATION
8,Shyama Prasad Mookerjee,PERSON
9,Amrit Mahotsav,PERSON


## NER Using SPACY

In [195]:
import spacy
from spacy import displacy

In [196]:
#load spacy models 
#nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_md")
#nlp = spacy.load("en_core_web_lg")

In [197]:
doc = nlp(speechText)

entities = []
labels = []
description = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    description.append(spacy.explain(ent.label_))

df = pd.DataFrame({'Entities' : entities, 'Labels' : labels, 'Description' : description})
df.head(16)

Unnamed: 0,Entities,Labels,Description
0,"(Red, Fort)",LOC,"Non-GPE locations, mountain ranges, bodies of ..."
1,(76th),ORDINAL,"""first"", ""second"", etc."
2,"(Independence, Day)",EVENT,"Named hurricanes, battles, wars, sports events..."
3,"(15, AUG, 2022)",DATE,Absolute or relative dates or periods
4,"(PIB, Delhi, Best)",ORG,"Companies, agencies, institutions, etc."
5,"(75, years)",DATE,Absolute or relative dates or periods
6,(India),GPE,"Countries, cities, states"
7,(Indians),NORP,Nationalities or religious or political groups
8,(India),GPE,"Countries, cities, states"
9,(Indians),NORP,Nationalities or religious or political groups


## NER Using SPACY Transformers

In [205]:
#import spacy_transformers
#nlp = spacy.load("en_core_web_trf")

#!pip install spacy

In [206]:
#!pip install spacy-transformers
#!python3 -m spacy download en_core_web_trf

In [207]:
#!python -m spacy download en_core_web_trf

In [208]:
import spacy
import spacy_transformers

In [209]:
nlp = spacy.load("en_core_web_trf")

In [210]:
doc = nlp(speechText)

entities = []
labels = []
description = []

for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)    
    description.append(spacy.explain(ent.label_))
    
df = pd.DataFrame({'Entities' : entities, 'Labels' : labels, 'Description' : description})
df.head(16)

Unnamed: 0,Entities,Labels,Description
0,(English),LANGUAGE,Any named language
1,"(Red, Fort)",FAC,"Buildings, airports, highways, bridges, etc."
2,(76th),ORDINAL,"""first"", ""second"", etc."
3,"(15, AUG, 2022)",DATE,Absolute or relative dates or periods
4,"(12:19PM, by)",TIME,Times smaller than a day
5,(Delhi),GPE,"Countries, cities, states"
6,"(75, years)",DATE,Absolute or relative dates or periods
7,(India),GPE,"Countries, cities, states"
8,(Indians),NORP,Nationalities or religious or political groups
9,(India),GPE,"Countries, cities, states"


In [211]:
df

Unnamed: 0,Entities,Labels,Description
0,(English),LANGUAGE,Any named language
1,"(Red, Fort)",FAC,"Buildings, airports, highways, bridges, etc."
2,(76th),ORDINAL,"""first"", ""second"", etc."
3,"(15, AUG, 2022)",DATE,Absolute or relative dates or periods
4,"(12:19PM, by)",TIME,Times smaller than a day
5,(Delhi),GPE,"Countries, cities, states"
6,"(75, years)",DATE,Absolute or relative dates or periods
7,(India),GPE,"Countries, cities, states"
8,(Indians),NORP,Nationalities or religious or political groups
9,(India),GPE,"Countries, cities, states"


In [212]:
 displacy.render(doc, style='ent')