## Named Entity Recognition and De-Identification with SpaCy
### Task1 - NER(Named Entity Recognition)

In [86]:
#1.1)Copy the code examples to scrape the webpage in BeautifulSoup
import bs4 as bs#BeautifulSoup
import urllib.request

import spacy
import en_core_web_sm

from spacy import displacy
from collections import Counter
from pprint import pprint

In [87]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

In [88]:
#scrapping the text data from webpage with Bs
def _scrape_webpage(url):
    
    scraped_textdata = urllib.request.urlopen(url)
    textdata = scraped_textdata.read()
    parsed_textdata = bs.BeautifulSoup(textdata,'lxml')
    paragraphs = parsed_textdata.find_all('p')
    formated_text = ""
    for para in paragraphs:
        formated_text += para.text
    return formated_text

In [89]:
# I have taken this post from Washingtonpost, this article explains about Sun, its solar flares and radiation. posted in 
#washingtonpost on February 22, 2024.
mytext = _scrape_webpage('https://www.washingtonpost.com/weather/2024/02/22/solar-flares-cycle-xclass-radio-att/')

In [90]:
#1.2 Write the code for NER in SpaCy 
doc = nlp(mytext)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_) 


Three 0 5 CARDINAL
Wednesday and Thursday 65 87 DATE
first 93 98 ORDINAL
two 99 102 CARDINAL
seven hours 112 123 TIME
X1.6 153 157 ORG
third 186 191 ORDINAL
11-year 226 233 DATE
11-year 548 555 DATE
25 595 597 CARDINAL
this year 629 638 DATE
Earth 834 839 LOC
Earth 919 924 LOC
at least a day 959 973 DATE
three 978 983 CARDINAL
Earth 1056 1061 LOC
first 1071 1076 ORDINAL
two 1077 1080 CARDINAL
CME 1106 1109 ORG
third 1168 1173 ORDINAL
Earth 1235 1240 LOC
Earth 1319 1324 LOC
Earth 1357 1362 LOC
between 50 and 600 miles 1376 1400 CARDINAL
Earth 1474 1479 LOC
Three 1796 1801 CARDINAL
Pacific 1898 1905 LOC
Indian 1910 1916 NORP
R3 1942 1944 PRODUCT
1 1962 1963 CARDINAL
5 1972 1973 CARDINAL
the National Oceanic and Atmospheric Administration’s 1993 2046 ORG
about an hour 2191 2204 TIME
Earth 2223 2228 LOC
Thursday 2368 2376 DATE
morning 2377 2384 TIME
AT&T 2397 2401 ORG
Wednesday 2423 2432 DATE
The Space Weather Prediction Center 2449 2484 ORG
Kunches 2626 2633 PERSON
The Washington Post 268

In [91]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])


[(Three, 'B', 'CARDINAL'),
 (top, 'O', ''),
 (-, 'O', ''),
 (tier, 'O', ''),
 (X, 'O', ''),
 (-, 'O', ''),
 (class, 'O', ''),
 (solar, 'O', ''),
 (flares, 'O', ''),
 (launched, 'O', ''),
 (off, 'O', ''),
 (the, 'O', ''),
 (sun, 'O', ''),
 (between, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (and, 'I', 'DATE'),
 (Thursday, 'I', 'DATE'),
 (., 'O', ''),
 (The, 'O', ''),
 (first, 'B', 'ORDINAL'),
 (two, 'B', 'CARDINAL'),
 (occurred, 'O', ''),
 (seven, 'B', 'TIME'),
 (hours, 'I', 'TIME'),
 (apart, 'O', ''),
 (,, 'O', ''),
 (coming, 'O', ''),
 (in, 'O', ''),
 (at, 'O', ''),
 (X1.9, 'O', ''),
 (and, 'O', ''),
 (X1.6, 'B', 'ORG'),
 (magnitude, 'O', ''),
 (respectively, 'O', ''),
 (., 'O', ''),
 (The, 'O', ''),
 (third, 'B', 'ORDINAL'),
 (,, 'O', ''),
 (the, 'O', ''),
 (most, 'O', ''),
 (powerful, 'O', ''),
 (of, 'O', ''),
 (the, 'O', ''),
 (current, 'O', ''),
 (11, 'B', 'DATE'),
 (-, 'I', 'DATE'),
 (year, 'I', 'DATE'),
 (“, 'O', ''),
 (solar, 'O', ''),
 (cycle, 'O', ''),
 (,, 'O', ''),
 (”, 'O', ''

In [92]:
#1.2.1 Get the named entities
pprint([(X.text, X.label_) for X in doc.ents])

[('Three', 'CARDINAL'),
 ('Wednesday and Thursday', 'DATE'),
 ('first', 'ORDINAL'),
 ('two', 'CARDINAL'),
 ('seven hours', 'TIME'),
 ('X1.6', 'ORG'),
 ('third', 'ORDINAL'),
 ('11-year', 'DATE'),
 ('11-year', 'DATE'),
 ('25', 'CARDINAL'),
 ('this year', 'DATE'),
 ('Earth', 'LOC'),
 ('Earth', 'LOC'),
 ('at least a day', 'DATE'),
 ('three', 'CARDINAL'),
 ('Earth', 'LOC'),
 ('first', 'ORDINAL'),
 ('two', 'CARDINAL'),
 ('CME', 'ORG'),
 ('third', 'ORDINAL'),
 ('Earth', 'LOC'),
 ('Earth', 'LOC'),
 ('Earth', 'LOC'),
 ('between 50 and 600 miles', 'CARDINAL'),
 ('Earth', 'LOC'),
 ('Three', 'CARDINAL'),
 ('Pacific', 'LOC'),
 ('Indian', 'NORP'),
 ('R3', 'PRODUCT'),
 ('1', 'CARDINAL'),
 ('5', 'CARDINAL'),
 ('the National Oceanic and Atmospheric Administration’s', 'ORG'),
 ('about an hour', 'TIME'),
 ('Earth', 'LOC'),
 ('Thursday', 'DATE'),
 ('morning', 'TIME'),
 ('AT&T', 'ORG'),
 ('Wednesday', 'DATE'),
 ('The Space Weather Prediction Center', 'ORG'),
 ('Kunches', 'PERSON'),
 ('The Washington Post',

In [93]:
# Count every named entity
labels = [x.label_ for x in doc.ents]
Counter(labels)

Counter({'CARDINAL': 16,
         'DATE': 13,
         'ORDINAL': 6,
         'TIME': 8,
         'ORG': 11,
         'LOC': 13,
         'NORP': 1,
         'PRODUCT': 1,
         'PERSON': 2,
         'WORK_OF_ART': 1})

In [94]:
# 1.2.2 Get most frequent tokens
items = [x.text for x in doc.ents]
Counter(items).most_common(5)

[('Earth', 11), ('CME', 4), ('first', 3), ('two', 3), ('third', 3)]

In [95]:
spacy.explain("GPE")

'Countries, cities, states'

In [96]:
#1.2.3 Pick a random integer K using Python random module, then pick 
#three consecutive sentences starting with Kth, and print these sentences. Note 
#that you must make sure all picked sentences are in the document.  

# Extract individual sentences
sentences = list(doc.sents)

# Pick a random integer K
K = random.randint(0, len(sentences)-3)

# Print the three consecutive sentences starting with the Kth sentence
for i in range(K, K+3):
    print(sentences[i].text.strip())
    
#Extract part-of-speech and lemmatize these consecutive sentences    
    sentence = sentences[i]
    lemmatized_tokens = [token.lemma_ for token in sentence]
    pos_tags= [token.pos_ for token in sentence]
    print("Lemmatized sentence:", ' '.join(lemmatized_tokens))
    print("POS tags:", pos_tags)
    print()
    
## In here the output is given for each kth sentence. that is starting with Sentence K and its corresponding Lemmatized sentences, its part of speech. 

Flares and their associated radio bursts only impact dayside systems if at all,” Kunches said in an email.
Lemmatized sentence: flare and their associated radio burst only impact dayside system if at all , " kunche say in an email .
POS tags: ['NOUN', 'CCONJ', 'PRON', 'ADJ', 'NOUN', 'NOUN', 'ADV', 'VERB', 'VERB', 'NOUN', 'SCONJ', 'ADV', 'ADV', 'PUNCT', 'PUNCT', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'PUNCT']

“And, even if this was to occur during your daylight hours, chances are near nil that cell service would be affected.
Lemmatized sentence: " and , even if this be to occur during your daylight hour , chance be near nil that cell service would be affect .
POS tags: ['PUNCT', 'CCONJ', 'PUNCT', 'ADV', 'SCONJ', 'PRON', 'AUX', 'PART', 'VERB', 'ADP', 'PRON', 'NOUN', 'NOUN', 'PUNCT', 'NOUN', 'AUX', 'ADP', 'NOUN', 'SCONJ', 'NOUN', 'NOUN', 'AUX', 'AUX', 'VERB', 'PUNCT']

”Solar
Lemmatized sentence: ”Solar
POS tags: ['PUNCT']



In [97]:
# Get and print the entity annotation for each token of the Kth sentence  
kth_sentence = sentences[K]
for token in kth_sentence:
    print(token.text, token.ent_type_)

Flares 
and 
their 
associated 
radio 
bursts 
only 
impact 
dayside 
systems 
if 
at 
all 
, 
” 
Kunches PERSON
said 
in 
an 
email 
. 


In [98]:
displacy.render(kth_sentence, style="ent", jupyter=True)

In [99]:
#options = {"compact": True, "bg": "#09a3d5",
#          "color": "white", "font": "Source Sans Pro"}
displacy.render(kth_sentence, style="dep", jupyter=True)


In [100]:
# Visualize all the entities in the document
displacy.render(doc, style="ent", jupyter=True)

### Task-2 De-Identification

In [101]:
#De-identify all person names (PERSON) in the webpage document with [REDACTED] and visualize them. 

filtered_document = " "
for token in doc:
    if token.ent_type_ == "PERSON":
        filtered_document += "[REDACTED] "
    else:
        filtered_document += token.text_with_ws
        

In [102]:
displacy.render(nlp(filtered_document), jupyter=True, style='ent')

### References

[1]Dr.Liao's Code Example and tutorials :https://mymasonportal.gmu.edu/ultra/courses/_510861_1/cl/outline  
[2]Removing Personal information from Text: https://towardsdatascience.com/remove-personal-information-from-a-text-with-python-part-ii-ner-2e6529d409a6 posted on TowardsDataScience by Leo Van der Meulen on August 12, 2021