# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Named Entity Recognition</p>

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Importing Libraries</p>

In [99]:
# Importing necessary libraries
import spacy
import pandas as pd
import numpy as np
import re
import requests
from bs4 import BeautifulSoup
import en_core_web_sm
from collections import Counter
from spacy import displacy

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Loading SpaCy Language Model</p>

In [100]:
# Loading SpaCy Language Model
nlp = en_core_web_sm.load()

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">NER on Sample Text</p>

In [101]:
# NER on Sample Text
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
# Printing tokens in the sample text
for token in doc:
    print(token, end=" | ")
print()

My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Displaying NLP Tokens in DataFrame</p>

In [102]:
# Function to display NLP tokens in DataFrame
def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i, 'text': t.text, 'lemma_': t.lemma_, 'is_stop': t.is_stop, 'is_alpha': t.is_alpha, 'pos_': t.pos_, 'dep_': t.dep_, 'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
# Displaying tokens in DataFrame
print(display_nlp(doc))

        text     lemma_  is_stop  is_alpha   pos_      dep_ ent_type_ ent_iob_
0         My         my     True      True   PRON      poss                  O
1       best       good    False      True    ADJ      amod                  O
2     friend     friend    False      True   NOUN     nsubj                  O
3       Ryan       Ryan    False      True  PROPN  compound    PERSON        B
4     Peters     Peters    False      True  PROPN     appos    PERSON        I
5      likes       like    False      True   VERB      ROOT                  O
6      fancy      fancy    False      True    ADJ      amod                  O
7  adventure  adventure    False      True   NOUN  compound                  O
8      games       game    False      True   NOUN      dobj                  O


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Filtering Non-Stop and Non-Punctuation Tokens</p>

In [103]:
# Filtering non-stop and non-punctuation tokens from a text
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)
non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Extracting Nouns and Proper Nouns</p>

In [104]:
# Extracting nouns and proper nouns from a text
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)

[friend, Ryan, Peters, adventure, games]


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Extracting Named Entities</p>

In [105]:
# Extracting named entities from a text
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Ryan Peters, PERSON) 

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Named Entities in a Complex Sentence</p>

In [106]:
# Named entities in a complex sentence
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Visualizing Named Entities</p>

In [107]:
# Visualizing named entities
displacy.render(doc, style='ent', jupyter=True)

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Extracting Named Entities from Web Page</p>

In [108]:
# Extracting named entities from a web page
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.google.com/search?q=artificial+intelligence&oq=artificial&gs_lcrp=EgZjaHJvbWUqDQgAEAAYgwEYsQMYgAQyDQgAEAAYgwEYsQMYgAQyDAgBEEUYORixAxiABDINCAIQABiDARixAxiABDIKCAMQABixAxiABDINCAQQABiDARixAxiABDIKCAUQABixAxiABDINCAYQABiDARixAxiABDINCAcQLhiDARixAxiABDINCAgQABiDARixAxiABDIHCAkQABiABNIBCDQ2MzhqMGo3qAIAsAIA&sourceid=chrome&ie=UTF-8')
article = nlp(ny_bb)

print(len(article.ents))
displacy.render(article, style='ent', jupyter=True)

48


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Counting and Displaying Label Counts</p>

In [109]:
# Counting and displaying label counts
labels = [x.label_ for x in article.ents]
label_counts = Counter(labels)
print(label_counts)

items = [ent.text for ent in article.ents]
print(Counter(items).most_common(5))

Counter({'ORG': 29, 'CARDINAL': 5, 'PERSON': 4, 'PRODUCT': 3, 'GPE': 3, 'DATE': 2, 'TIME': 1, 'NORP': 1})
[('AI', 10), ('›', 9), ('Artificial Intelligence', 2), ('SAS', 2), ('a few seconds', 1)]


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Splitting Text into Sentences</p>

In [110]:
# Splitting text into sentences
sentences = [x for x in article.sents]
print(sentences[0])

artificial intelligence - Google SearchGoogle×Please click here if you are not redirected within a few seconds.    


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Visualizing Named Entities in a Sentence</p>

In [111]:
# Visualizing named entities in a sentence
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Token Analysis in a Sentence</p>

In [112]:
# Token analysis in a sentence
print([(x.orth_,x.pos_, x.lemma_) for x in
  [y for y in nlp(str(sentences[0])) if not y.is_stop and y.pos_ != 'PUNCT']])

[('artificial', 'ADJ', 'artificial'), ('intelligence', 'NOUN', 'intelligence'), ('Google', 'PROPN', 'Google'), ('SearchGoogle×Please', 'PROPN', 'SearchGoogle×Please'), ('click', 'VERB', 'click'), ('redirected', 'VERB', 'redirect'), ('seconds', 'NOUN', 'second'), ('   ', 'SPACE', '   ')]


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Visualizing Dependency Parsing in a Sentence</p>

In [113]:
# Visualizing dependency parsing in a sentence
displacy.render(nlp(str(sentences[0])), style='dep', jupyter=True, options={'distance': 120})

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Article 2</p>

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Extracting Named Entities from Web Page</p>

In [114]:
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Extracting named entities from another web page
ny_bb = url_to_string('https://www.cnn.com/middleeast/live-news/israel-hamas-war-gaza-news-03-26-24/index.html')
article = nlp(ny_bb)

print(len(article.ents))
displacy.render(article, style='ent', jupyter=True)

1208


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Counting and Displaying Label Counts</p>

In [115]:
# Counting and displaying label counts for the second article
labels = [x.label_ for x in article.ents]
label_counts = Counter(labels)
print(label_counts)

items = [ent.text for ent in article.ents]
print(Counter(items).most_common(5))

Counter({'GPE': 348, 'ORG': 312, 'PERSON': 180, 'NORP': 130, 'DATE': 129, 'CARDINAL': 80, 'TIME': 7, 'FAC': 6, 'LOC': 5, 'ORDINAL': 5, 'QUANTITY': 5, 'PRODUCT': 1})
[('CNN', 92), ('Gaza', 87), ('Israeli', 80), ('Israel', 77), ('US', 59)]


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Splitting Text into Sentences</p>

In [116]:
# Splitting text into sentences for the second article
sentences = [x for x in article.sents]
print(sentences[0])

                              March 26, 2024 Israel-Hamas war | CNN                                                                                                                                                                                                                                                                                                                                                                                                      CNN values your feedback                                                                                                                                                                                                                             1.


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Visualizing Named Entities in a Sentence</p>

In [117]:
# Visualizing named entities in a sentence for the second article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Token Analysis in a Sentence</p>

In [118]:
# Token analysis in a sentence for the second article
print([(x.orth_,x.pos_, x.lemma_) for x in
  [y for y in nlp(str(sentences[0])) if not y.is_stop and y.pos_ != 'PUNCT']])

[('                              ', 'SPACE', '                              '), ('March', 'PROPN', 'March'), ('26', 'NUM', '26'), ('2024', 'NUM', '2024'), ('Israel', 'PROPN', 'Israel'), ('Hamas', 'PROPN', 'Hamas'), ('war', 'NOUN', 'war'), ('|', 'NOUN', '|'), ('CNN', 'PROPN', 'CNN'), ('                                                                                                                                                                                                                                                                                                                                                                                                     ', 'SPACE', '                                                                                                                                                                                                                                                                                                                        

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Visualizing Dependency Parsing in a Sentence</p>

In [119]:
# Visualizing dependency parsing in a sentence for the second article
displacy.render(nlp(str(sentences[0])), style='dep', jupyter=True, options={'distance': 120})

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Spanish Article</p>

In [120]:
import spacy
# Load the Spanish language model
nlp = spacy.load("es_core_news_sm")

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Extracting Named Entities from Web Page</p>

In [121]:
# Extracting named entities from a Spanish web page
ny_bb = url_to_string('https://english.elpais.com/')
article = nlp(ny_bb)

print(len(article.ents))
displacy.render(article, style='ent', jupyter=True)

234


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Counting and Displaying Label Counts</p>

In [122]:
# Counting and displaying label counts for the Spanish article
labels = [x.label_ for x in article.ents]
label_counts = Counter(labels)
print(label_counts)

items = [ent.text for ent in article.ents]
print(Counter(items).most_common(5))

Counter({'MISC': 130, 'PER': 50, 'ORG': 39, 'LOC': 15})
[('PAÍS', 3), ('who', 2), ('the king of television who', 2), ('‘I', 2), ('been happier out of the closet', 2)]


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Splitting Text into Sentences</p>

In [123]:
# Splitting text into sentences for the Spanish article
sentences = [x for x in article.sents]
print(sentences[0])

EL PAÍS English____Mar 27, 2024|Updated 01:59 CET|Select:- - -EspañaAméricaMéxicoColombiaChileArgentinaUSAsubscribeHHOLALOG INInternationalU.S.Economy & BusinessScienceHealthTechnologyClimatePeopleLifestyleOpinionSportsMar 27, 2024|Updated 01:59 CET|subscribe_Supreme Court seems inclined not to restrict access to abortion pill Iker Seisdedos|WashingtonAt the oral arguments in the mifepristone case, a majority of justices appeared skeptical about the legal standing of a Christian doctors’ association to sue the Food and Drug AdministrationNew York judge imposes gag order on Donald Trump in hush money caseJudge Juan M. Merchan on Tuesday cited Trump’s previous comments about him and others involved in the case, as well as a looming April 15 trial date, in granting the prosecution’s request for a gag orderBaltimore’s largest bridge collapses after being struck by shipMiguel Jiménez|BaltimoreEmergency crews have rescued two people from the water and are searching for six others.


# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Visualizing Named Entities in a Sentence</p>

In [124]:
# Visualizing named entities in a sentence for the Spanish article
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Token Analysis in a Sentence</p>

In [125]:
# Token analysis in a sentence for the Spanish article
print([(x.orth_,x.pos_, x.lemma_) for x in
  [y for y in nlp(str(sentences[0])) if not y.is_stop and y.pos_ != 'PUNCT']])

[('PAÍS', 'PROPN', 'PAÍS'), ('English____Mar', 'PROPN', 'English____Mar'), ('27', 'NUM', '27'), ('2024|Updated', 'NUM', '2024|updated'), ('01:59', 'NUM', '01:59'), ('CET|Select:-', 'PROPN', 'CET|Select:-'), ('-EspañaAméricaMéxicoColombiaChileArgentinaUSAsubscribeHHOLALOG', 'PROPN', '-EspañaAméricaMéxicoColombiaChileArgentinaUSAsubscribeHHOLALOG'), ('INInternationalU.S.Economy', 'PROPN', 'INInternationalU.S.Economy'), ('&', 'PROPN', '&'), ('BusinessScienceHealthTechnologyClimatePeopleLifestyleOpinionSportsMar', 'PROPN', 'BusinessScienceHealthTechnologyClimatePeopleLifestyleOpinionSportsMar'), ('27', 'NUM', '27'), ('2024|Updated', 'PROPN', '2024|Updated'), ('01:59', 'NUM', '01:59'), ('CET|subscribe_Supreme', 'VERB', 'cet|subscribe_supreme'), ('Court', 'PROPN', 'Court'), ('seems', 'PROPN', 'seems'), ('inclined', 'VERB', 'inclined'), ('not', 'ADV', 'not'), ('to', 'PROPN', 'to'), ('restrict', 'NOUN', 'restrict'), ('access', 'NOUN', 'access'), ('to', 'PROPN', 'to'), ('abortion', 'PROPN', 'ab

# <p style="background-color:#8502d1; font-family:calibri; color:white; font-size:150%; text-align:center; border-radius:15px 50px;">Visualizing Dependency Parsing in a Sentence</p>

In [126]:
# Visualizing dependency parsing in a sentence
displacy.render(nlp(str(sentences[0])), style='dep', jupyter=True, options={'distance': 140})

#### This code demonstrates Named Entity Recognition (NER) using SpaCy on sample text and web articles. It showcases token analysis, entity extraction, label counting, and visualization. Through web scraping, it extracts entities from diverse sources, displaying a versatile application of NLP techniques for information extraction and analysis.