### Named Entity Recognition and De-Identification with SpaCy

#### 1. Named Entity Recognition:

##### Import Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
import random
from collections import Counter
from pprint import pprint

In [2]:
# Load the SpaCy English model
nlp = spacy.load('en_core_web_sm')

##### 1.1 Copy the code examples to scrape the webpage in BeautifulSoup

In [3]:
# Make a GET request to the webpage
url = 'https://www.nbcnews.com/politics/supreme-court/supreme-court-strikes-affirmative-action-programs-harvard-unc-rcna66770'
data = requests.get(url).text

soup = BeautifulSoup(data, 'html.parser') # Create a BeautifulSoup object to parse the HTML
article = soup.get_text() # Extract the text from the webpage
pprint(article)

('Supreme Court strikes down college affirmative action programsIE 11 is not '
 'supported. For an optimal experience visit our site on another browser.Skip '
 'to ContentNBC News LogoTrump adminPoliticsLocalNew YorkLos '
 'AngelesChicagoDallas-Fort WorthPhiladelphiaWashington, D.C.BostonBay '
 "AreaSouth FloridaSan DiegoConnecticutU.S. NewsWorldBusinessEditors' "
 'picksShoppingTiplineSportsHealthScienceCulture & TrendsShare & Save\xa0—My '
 'NewsManage ProfileEmail PreferencesSign OutSearchSearchProfile\xa0My '
 'NewsSign Out\xa0Sign InCreate your free profileSectionsU.S. NewsDecision '
 '2024PoliticsWorldBusinessSportsInvestigationsCulture & '
 'TrendsHealthScienceTech & MediaWeatherVideo FeaturesPhotosNBC SelectNBC '
 'Asian AmericaNBC BLKNBC LatinoNBC OUTLocalNew YorkLos '
 'AngelesChicagoDallas-Fort WorthPhiladelphiaWashington, D.C.BostonBay '
 'AreaSouth FloridaSan DiegoConnecticuttvTodayNightly NewsMSNBCMeet the '
 'PressDatelineFeaturedNBC News NowNightly FilmsStay TunedSpecia

##### 1.2 Write the code for NER in SpaCy

In [4]:
# Process the document using SpaCy
doc = nlp(article)
print(doc)

Supreme Court strikes down college affirmative action programsIE 11 is not supported. For an optimal experience visit our site on another browser.Skip to ContentNBC News LogoTrump adminPoliticsLocalNew YorkLos AngelesChicagoDallas-Fort WorthPhiladelphiaWashington, D.C.BostonBay AreaSouth FloridaSan DiegoConnecticutU.S. NewsWorldBusinessEditors' picksShoppingTiplineSportsHealthScienceCulture & TrendsShare & Save —My NewsManage ProfileEmail PreferencesSign OutSearchSearchProfile My NewsSign Out Sign InCreate your free profileSectionsU.S. NewsDecision 2024PoliticsWorldBusinessSportsInvestigationsCulture & TrendsHealthScienceTech & MediaWeatherVideo FeaturesPhotosNBC SelectNBC Asian AmericaNBC BLKNBC LatinoNBC OUTLocalNew YorkLos AngelesChicagoDallas-Fort WorthPhiladelphiaWashington, D.C.BostonBay AreaSouth FloridaSan DiegoConnecticuttvTodayNightly NewsMSNBCMeet the PressDatelineFeaturedNBC News NowNightly FilmsStay TunedSpecial FeaturesNewslettersPodcastsListen NowMore From NBCCNBCNBC.COM

##### 1.2.1 Count all the named entities in the document

In [5]:
# Check if there are named entities in the document
if len(doc.ents) == 0:
    print("No named entities found in the document.")
else:
    # Print the named entities
    for entity in doc.ents:
        print(entity.text, entity.label_)

Supreme Court ORG
11 CARDINAL
ContentNBC News ORG
LogoTrump PRODUCT
YorkLos ORG
AreaSouth FloridaSan ORG
& TrendsShare & Save ORG
Asian NORP
BLKNBC ORG
LatinoNBC NORP
YorkLos ORG
AngelesChicagoDallas-Fort WorthPhiladelphiaWashington PERSON
D.C.BostonBay AreaSouth FloridaSan ORG
NBC News ORG
AlertsThere PRODUCT
StoriesTrump adminPoliticsU.S. NewsWorldBusinessEditors' ORG
decades DATE
Lawrence HurleyWASHINGTON PERSON
The Supreme Court ORG
Thursday DATE
the University of North Carolina ORG
Harvard ORG
the Equal Protection Clause ORG
Constitution LAW
6-3 CARDINAL
UNC ORG
6-2 CARDINAL
Harvard ORG
Ketanji Brown Jackson PERSON
Constitution LAW
Donald Trump PERSON
America GPE
Michelle Obama PERSON
first ORDINAL
first ORDINAL
Joe Biden PERSON
North Carolina GPE
Chapel Hill LOC
March 28.Jonathan DATE
Drake / Reuters / ReduxThe court ORG
2003 DATE
Grutter PERSON
Bollinger PERSON
1978 DATE
Black NORP
John Roberts PERSON
Clarence Thomas PERSON
second ORDINAL
Grutter PERSON
Jackson PERSON
first ORDI

In [6]:
# Get the token-level entity annotations using the BILUO tagging scheme to describe the entity boundaries
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc]) 

[(Supreme, 'B', 'ORG'),
 (Court, 'I', 'ORG'),
 (strikes, 'O', ''),
 (down, 'O', ''),
 (college, 'O', ''),
 (affirmative, 'O', ''),
 (action, 'O', ''),
 (programsIE, 'O', ''),
 (11, 'B', 'CARDINAL'),
 (is, 'O', ''),
 (not, 'O', ''),
 (supported, 'O', ''),
 (., 'O', ''),
 (For, 'O', ''),
 (an, 'O', ''),
 (optimal, 'O', ''),
 (experience, 'O', ''),
 (visit, 'O', ''),
 (our, 'O', ''),
 (site, 'O', ''),
 (on, 'O', ''),
 (another, 'O', ''),
 (browser, 'O', ''),
 (., 'O', ''),
 (Skip, 'O', ''),
 (to, 'O', ''),
 (ContentNBC, 'B', 'ORG'),
 (News, 'I', 'ORG'),
 (LogoTrump, 'B', 'PRODUCT'),
 (adminPoliticsLocalNew, 'O', ''),
 (YorkLos, 'B', 'ORG'),
 (AngelesChicagoDallas, 'O', ''),
 (-, 'O', ''),
 (Fort, 'O', ''),
 (WorthPhiladelphiaWashington, 'O', ''),
 (,, 'O', ''),
 (D.C.BostonBay, 'O', ''),
 (AreaSouth, 'B', 'ORG'),
 (FloridaSan, 'I', 'ORG'),
 (DiegoConnecticutU.S., 'O', ''),
 (NewsWorldBusinessEditors, 'O', ''),
 (', 'O', ''),
 (picksShoppingTiplineSportsHealthScienceCulture, 'O', ''),
 (&,

In [7]:
named_entities = set([entity.text for entity in doc.ents])
num_entities = len(set(named_entities))
print("Number of named entities:", num_entities)

# Named entities 
entities = [(X.text, X.label_) for X in doc.ents]

# Count the named entities 
entity_counts = Counter(entities)

# Print the entity counts 
for entity, count in entity_counts.items():
    print(f"{entity[0]} ({entity[1]}): {count}")

Number of named entities: 97
Supreme Court (ORG): 3
11 (CARDINAL): 1
ContentNBC News (ORG): 1
LogoTrump (PRODUCT): 1
YorkLos (ORG): 2
AreaSouth FloridaSan (ORG): 1
& TrendsShare & Save (ORG): 1
Asian (NORP): 2
BLKNBC (ORG): 1
LatinoNBC (NORP): 1
AngelesChicagoDallas-Fort WorthPhiladelphiaWashington (PERSON): 1
D.C.BostonBay AreaSouth FloridaSan (ORG): 1
NBC News (ORG): 2
AlertsThere (PRODUCT): 1
StoriesTrump adminPoliticsU.S. NewsWorldBusinessEditors' (ORG): 1
decades (DATE): 2
Lawrence HurleyWASHINGTON (PERSON): 1
The Supreme Court (ORG): 1
Thursday (DATE): 2
the University of North Carolina (ORG): 1
Harvard (ORG): 5
the Equal Protection Clause (ORG): 1
Constitution (LAW): 4
6-3 (CARDINAL): 1
UNC (ORG): 3
6-2 (CARDINAL): 1
Ketanji Brown Jackson (PERSON): 1
Donald Trump (PERSON): 2
America (GPE): 2
Michelle Obama (PERSON): 1
first (ORDINAL): 5
Joe Biden (PERSON): 2
North Carolina (GPE): 3
Chapel Hill (LOC): 1
March 28.Jonathan (DATE): 1
Drake / Reuters / ReduxThe court (ORG): 1
2003 (D

##### 1.2.2 Count the most frequent tokens for the entire document

In [8]:
# Count the most frequent tokens for the entire document
token_counts = Counter([token.text.lower() for token in doc if not token.is_stop and not token.is_punct])
most_frequent_tokens = token_counts.most_common(10)

# Print the most frequent tokens
print("Most frequent tokens:")
for token, count in most_frequent_tokens:
    print(token, "-", count)

Most frequent tokens:
race - 22
court - 18
ruling - 18
  - 13
affirmative - 11
action - 11
admissions - 10
programs - 10
university - 10
justice - 10


##### 1.2.3 Pick a random integer K and select three consecutive sentences starting from Kth

In [9]:
# Pick a random integer K and select three consecutive sentences starting from Kth
sentences = list(doc.sents)
random_index = random.randint(0, len(sentences) - 3)
selected_sentences = sentences[random_index:random_index + 3]
print("Randomly selected sentences:")
for sentence in selected_sentences:
    print(sentence)

Randomly selected sentences:
The court ruled that both programs violate the Equal Protection Clause of the Constitution and are therefore unlawful.
The vote was 6-3 in the UNC case and 6-2 in the Harvard case, in which liberal Justice Ketanji Brown Jackson was recused.
The decision was hailed by prominent conservatives, who say the Constitution should be "colorblind," with former President Donald Trump calling it "a great day for America."


##### 1.2.4 Extract part-of-speech and lemmatize these consecutive sentences

In [10]:
# Extract part-of-speech and lemmatize these consecutive sentences
for sentence in selected_sentences:
    for token in sentence:
        print(token.text, "-", token.pos_, "-", token.lemma_)

The - DET - the
court - NOUN - court
ruled - VERB - rule
that - SCONJ - that
both - DET - both
programs - NOUN - program
violate - VERB - violate
the - DET - the
Equal - PROPN - Equal
Protection - PROPN - Protection
Clause - PROPN - Clause
of - ADP - of
the - DET - the
Constitution - PROPN - Constitution
and - CCONJ - and
are - AUX - be
therefore - ADV - therefore
unlawful - ADJ - unlawful
. - PUNCT - .
The - DET - the
vote - NOUN - vote
was - AUX - be
6 - NUM - 6
- - SYM - -
3 - NUM - 3
in - ADP - in
the - DET - the
UNC - PROPN - UNC
case - NOUN - case
and - CCONJ - and
6 - NUM - 6
- - SYM - -
2 - NUM - 2
in - ADP - in
the - DET - the
Harvard - PROPN - Harvard
case - NOUN - case
, - PUNCT - ,
in - ADP - in
which - PRON - which
liberal - ADJ - liberal
Justice - PROPN - Justice
Ketanji - PROPN - Ketanji
Brown - PROPN - Brown
Jackson - PROPN - Jackson
was - AUX - be
recused - VERB - recuse
. - PUNCT - .
The - DET - the
decision - NOUN - decision
was - AUX - be
hailed - VERB - hail
by - A

##### 1.2.5 Get and print the entity annotation for each token of the Kth sentence

In [11]:
# Get and print the entity annotation for each token of the Kth sentence
kth_sentence = sentences[random_index]
for token in kth_sentence:
    print(f"Token: {token.text}, Entity: {token.ent_type_}")

Token: The, Entity: 
Token: court, Entity: 
Token: ruled, Entity: 
Token: that, Entity: 
Token: both, Entity: 
Token: programs, Entity: 
Token: violate, Entity: 
Token: the, Entity: ORG
Token: Equal, Entity: ORG
Token: Protection, Entity: ORG
Token: Clause, Entity: ORG
Token: of, Entity: 
Token: the, Entity: 
Token: Constitution, Entity: LAW
Token: and, Entity: 
Token: are, Entity: 
Token: therefore, Entity: 
Token: unlawful, Entity: 
Token: ., Entity: 


##### 1.2.6 Visualize the entities and dependencies of Kth sentence

In [12]:
# Visualize the entities and dependencies of Kth sentence
displacy.render(kth_sentence, style='ent', jupyter=True)

# Visualize dependencies of the Kth sentence
displacy.render(kth_sentence, style="dep", jupyter=True)

# Visualize all the entities in the document
displacy.render(doc, style='ent', jupyter=True)

#### 2. De-identification:

##### 2.1 De-identify all person names (PERSON) in the webpage document with [REDACTED] and visualize them

In [13]:
# De-identify person names with [REDACTED] using doc.ents
redacted_text = []
for token in doc:
    if token.ent_type_ == "PERSON":
        redacted_text.append("[REDACTED]")
    else:
        redacted_text.append(token.text)

redacted_text = " ".join(redacted_text)

# Visualize the entities and dependencies of the modified document
redacted_doc = nlp(redacted_text)
displacy.render(redacted_doc, style="ent", jupyter=True)

# Replace person names with [REDACTED] using entity labels
for entity in doc.ents:
    if entity.label_ == 'PERSON':
        article = article.replace(entity.text, '[REDACTED]')

# Visualize the de-identified text
doc_redacted = nlp(article)
displacy.render(doc_redacted, style='ent', jupyter=True)