In [1]:
!pip install spacy nltk requests
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import os

import requests

def fetch_news_article(api_key, timeout=10):
    """Fetch the text of the first BBC News top headline from News API.

    Args:
        api_key: News API key, sent as the ``apiKey`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        The ``content`` field of the first article in the response, or
        ``None`` when the status code is not 200 or the response contains
        no articles.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    url = 'https://newsapi.org/v2/top-headlines'
    # Let requests build the query string so the key is URL-encoded instead
    # of being pasted verbatim into the URL.
    params = {'sources': 'bbc-news', 'apiKey': api_key}
    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        return None

    articles = response.json().get('articles')
    if not articles:
        return None
    return articles[0].get('content')

# Read the News API key from the environment so the secret is never stored in
# (or shared with) the notebook. Set NEWS_API_KEY before starting the kernel.
api_key = os.environ.get('NEWS_API_KEY')
if not api_key:
    raise RuntimeError("Set the NEWS_API_KEY environment variable to your News API key")
article = fetch_news_article(api_key)
print("Article:\n", article)

Article:
 Meteorologists in other regions have been able to link a few recent major floods to atmospheric rivers.
In April 2023, Iraq, Iran, Kuwait and Jordan were all hit by catastrophic flooding after inten… [+1646 chars]


In [3]:
import spacy

# Loaded spaCy pipelines keyed by model name, so each model is read from disk
# once rather than on every extraction call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def extract_entities_spacy(text, model_name="en_core_web_sm"):
    """Extract named entities from ``text`` with spaCy.

    Args:
        text: The text to analyse. ``None`` or an empty string yields an
            empty result instead of an error, since the article fetch can
            return ``None``.
        model_name: Name of the installed spaCy pipeline to use.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in document order.
    """
    if not text:
        return []
    doc = _get_spacy_pipeline(model_name)(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Run the spaCy extractor on the fetched article and show the
# (entity text, label) pairs it found.
spacy_entities = extract_entities_spacy(article)
print("Entities from spaCy:\n", spacy_entities)

Entities from spaCy:
 [('April 2023', 'DATE'), ('Iraq', 'GPE'), ('Iran', 'GPE'), ('Kuwait', 'GPE'), ('Jordan', 'GPE')]


In [4]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# nltk.data lookup path -> downloadable package id, for every resource the
# tokenizer, POS tagger and named-entity chunker below depend on.
_NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'chunkers/maxent_ne_chunker': 'maxent_ne_chunker',
    'corpora/words': 'words',
    'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger',
}


def _ensure_nltk_resources():
    """Download each required NLTK package only when it is not installed yet."""
    for resource_path, package in _NLTK_RESOURCES.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)


def extract_entities_nltk(text):
    """Extract named entities from ``text`` with NLTK's default NE chunker.

    Args:
        text: The text to analyse. ``None`` or an empty string yields an
            empty result without touching the NLTK data.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, sentence by
        sentence in reading order. Multi-word entities are joined with a
        single space.
    """
    if not text:
        return []

    # Previously every call triggered four nltk.download() requests; now the
    # network is used only for resources that are actually missing.
    _ensure_nltk_resources()

    entities = []
    for sentence in nltk.sent_tokenize(text):
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
        # binary=False keeps the specific entity classes (GPE, PERSON, ...)
        # instead of a single generic NE label.
        for subtree in nltk.ne_chunk(tagged_words, binary=False):
            # Entities come back as nested Trees; other tokens are skipped.
            if isinstance(subtree, nltk.Tree):
                entity = " ".join(word for word, _tag in subtree.leaves())
                entities.append((entity, subtree.label()))
    return entities

# Run the NLTK extractor on the same article so the two libraries can be
# compared on identical input.
nltk_entities = extract_entities_nltk(article)
print("Entities from NLTK:\n", nltk_entities)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Entities from NLTK:
 [('Iraq', 'GPE'), ('Iran', 'GPE'), ('Kuwait', 'GPE'), ('Jordan', 'PERSON')]


In [5]:
def compare_entities(spacy_entities, nltk_entities):
    """Print how the spaCy and NLTK entity lists overlap and differ.

    Both arguments are iterables of hashable items (here ``(text, label)``
    tuples). Items are compared as whole values, so the same text tagged
    with different labels counts as two distinct entities. Duplicates are
    collapsed. Nothing is returned; the three groups are printed.
    """
    found_by_spacy = set(spacy_entities)
    found_by_nltk = set(nltk_entities)

    # Heading/contents pairs, emitted in a fixed order.
    report = (
        ("Common Entities:\n", found_by_spacy & found_by_nltk),
        ("\nEntities unique to spaCy:\n", found_by_spacy - found_by_nltk),
        ("\nEntities unique to NLTK:\n", found_by_nltk - found_by_spacy),
    )
    for heading, group in report:
        print(heading, group)

# Print the entities both libraries agree on and those reported by only one.
compare_entities(spacy_entities, nltk_entities)

Common Entities:
 {('Kuwait', 'GPE'), ('Iraq', 'GPE'), ('Iran', 'GPE')}

Entities unique to spaCy:
 {('April 2023', 'DATE'), ('Jordan', 'GPE')}

Entities unique to NLTK:
 {('Jordan', 'PERSON')}
