In [12]:
# pip install -U spacy
# python -m spacy download en_core_web_sm
# (local site-packages path noted by the author: c:\users\lenovo\appdata\local\programs\python\python310\lib\site-packages)

import spacy
import requests
from bs4 import BeautifulSoup

nlp = spacy.load('en_core_web_sm')


# Output category -> spaCy entity labels collected under it.
# Insertion order here is the key order of the dict returned by
# extract_entities. Several categories intentionally share a label
# ("places"/"country", "people"/"names", "units"/"sizes").
# NOTE(review): "MISC", "SYM" and "TITLE" are not part of the NER label
# scheme documented for en_core_web_sm, and "caste", "symbols" and
# "position" are empty in every recorded run of this notebook -- confirm
# which labels were actually intended before relying on those categories.
_ENTITY_CATEGORY_LABELS = {
    "places": ("GPE",),
    "people": ("PERSON",),
    "names": ("PERSON",),
    "caste": ("MISC",),
    "country": ("GPE",),
    "numbers": ("CARDINAL",),
    "symbols": ("SYM",),
    "currency": ("MONEY",),
    "units": ("QUANTITY",),
    "sizes": ("QUANTITY",),
    "professions": ("NORP", "ORG"),
    "position": ("TITLE",),
}


def extract_entities(text, nlp_model=None):
    """Group the named entities found in ``text`` into fixed categories.

    Args:
        text: The string to analyse. Empty or ``None`` text yields a result
            in which every category is an empty list, without running the
            pipeline.
        nlp_model: Optional callable used instead of the module-level ``nlp``
            pipeline. It is called as ``nlp_model(text)`` and must return an
            object whose ``ents`` items expose ``text`` and ``label_``.
            Defaults to ``None``, meaning the module-level ``nlp`` is used.

    Returns:
        dict mapping each category name ("places", "people", "names",
        "caste", "country", "numbers", "symbols", "currency", "units",
        "sizes", "professions", "position") to a list of entity strings in
        the order they occur in the document. Duplicates are kept. Each
        category gets its own list object.
    """
    entities = {category: [] for category in _ENTITY_CATEGORY_LABELS}
    if not text:
        return entities

    # Resolve the pipeline lazily so the global is only needed when no
    # explicit model was supplied.
    pipeline = nlp if nlp_model is None else nlp_model
    doc = pipeline(text)

    # Single pass over the entities; an entity is appended to every category
    # whose label set contains its label.
    for ent in doc.ents:
        for category, labels in _ENTITY_CATEGORY_LABELS.items():
            if ent.label_ in labels:
                entities[category].append(ent.text)
    return entities


def get_text_from_url(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Defaults to 10.

    Returns:
        The text of every ``<p>`` tag in document order, joined with single
        spaces. An empty string if the page contains no ``<p>`` tags.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed for another reason
            (connection error, timeout, ...).
    """
    # Without an explicit timeout requests may wait on a stalled server
    # indefinitely, which would hang the notebook cell.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently extracting text from an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    return ' '.join(p.get_text() for p in soup.find_all('p'))



In [13]:
url = "https://www.ndtv.com/world-news/hamas-may-seek-extension-of-4-day-truce-with-israel-report-4608769#pfrom=home-ndtv_topscroll"

# Download the page's paragraph text, then tag its named entities.
text_content = get_text_from_url(url)
entities = extract_entities(text_content)

# One line per category, in the key order of the returned dict.
for category in entities:
    values = entities[category]
    print(f"{category}: {values}")

places: ['Israel', 'the Gaza Strip', 'US', 'Gaza', 'Israel', 'Israel', 'Gaza', 'Thais', 'Gaza City', 'Israel', 'Israel', 'Qatar', 'the United States', 'Egypt', 'Gaza', 'Gaza', 'Gaza', 'Gaza', 'Gaza', 'Gaza City', 'Gaza City', 'Sheba', 'Ramallah', 'Beitunia', 'West Bank', 'Gaza', 'India']
people: ['Joe Biden', 'Biden', 'Abigail', 'Ron Krivoy', 'Vladimir Putin', 'Catherine Colonna', 'Biden', 'Benjamin Netanyahu', 'Netanyahu', 'Khan Yunis', 'Oussama al Bass', 'Al-Zahra', 'Elma Avraham', 'Shlomi Kodesh', 'Ely', 'Dafna', 'Sharon Avigdori', 'Noam', 'Noorhan Awad', 'Ahmed Al-Ghandour', 'Adnan Abu Hasna']
names: ['Joe Biden', 'Biden', 'Abigail', 'Ron Krivoy', 'Vladimir Putin', 'Catherine Colonna', 'Biden', 'Benjamin Netanyahu', 'Netanyahu', 'Khan Yunis', 'Oussama al Bass', 'Al-Zahra', 'Elma Avraham', 'Shlomi Kodesh', 'Ely', 'Dafna', 'Sharon Avigdori', 'Noam', 'Noorhan Awad', 'Ahmed Al-Ghandour', 'Adnan Abu Hasna']
caste: []
country: ['Israel', 'the Gaza Strip', 'US', 'Gaza', 'Israel', 'Israel'

In [14]:
url = "https://www.ndtv.com/india-news/telangana-election-2023-video-pm-narendra-modi-pauses-telangana-rally-speech-to-ask-people-to-climb-down-towers-4608949#pfrom=home-ndtv_topscroll"

# Download the page's paragraph text, then tag its named entities.
text_content = get_text_from_url(url)
entities = extract_entities(text_content)

# One line per category, in the key order of the returned dict.
for category in entities:
    values = entities[category]
    print(f"{category}: {values}")

places: ['Nirmal', 'Telangana', 'Telangana', 'Telangana', 'Nirmal', 'Hindi', 'Hyderabad', 'Secunderabad', 'Telangana', 'Hindi', 'Telangana', 'Telangana', 'Modi', 'Nirmal', 'India']
people: ['Narendra Modi', 'Tricolour', 'pic.twitter.com/IlsTOBvSqA', 'Incumbent Bharat Rashtra Samithi', 'Modi Sunday']
names: ['Narendra Modi', 'Tricolour', 'pic.twitter.com/IlsTOBvSqA', 'Incumbent Bharat Rashtra Samithi', 'Modi Sunday']
caste: []
country: ['Nirmal', 'Telangana', 'Telangana', 'Telangana', 'Nirmal', 'Hindi', 'Hyderabad', 'Secunderabad', 'Telangana', 'Hindi', 'Telangana', 'Telangana', 'Modi', 'Nirmal', 'India']
numbers: []
symbols: []
currency: []
units: []
sizes: []
professions: ['PM Modi', 'the Telangana Assembly', 'BRS', 'Congress', 'BJP', 'KCR', 'KCR', 'BJP']
position: []


In [15]:

url = "https://www.ndtv.com/india-news/prayed-for-140-crore-indians-pm-modi-at-tirupati-temple-4608995#pfrom=home-ndtv_topstories"

# Download the page's paragraph text, then tag its named entities.
text_content = get_text_from_url(url)
entities = extract_entities(text_content)

# One line per category, in the key order of the returned dict.
for category in entities:
    values = entities[category]
    print(f"{category}: {values}")

places: ['Tirumala', 'Tirumala', 'Tirumala', 'Tirumala', 'Modi', 'Tirumala', 'Telangana', 'India']
people: ['Narendra Modi', 'Vedic', 'Andhra', 'S Abdul Nazeer', 'Y S Jagan', 'Mohan Reddy']
names: ['Narendra Modi', 'Vedic', 'Andhra', 'S Abdul Nazeer', 'Y S Jagan', 'Mohan Reddy']
caste: []
country: ['Tirumala', 'Tirumala', 'Tirumala', 'Tirumala', 'Modi', 'Tirumala', 'Telangana', 'India']
numbers: ['140']
symbols: []
currency: []
units: []
sizes: []
professions: ['Indians', 'Indians', 'Namo', 'PromotedListen', 'NDTV', 'Track Latest News Live on NDTV.com']
position: []
