In [2]:
import requests
import re
import spacy
import torch
from collections import Counter
from bs4 import BeautifulSoup
from transformers import BartTokenizer, pipeline, AutoModelForQuestionAnswering, AutoTokenizer

In [69]:
# Load NLP models
# `nlp` is the spaCy pipeline used by extract_important_entities() for named-entity recognition.
nlp = spacy.load("en_core_web_sm")
# `tokenizer` is the BART tokenizer that summarize_text() uses to truncate its input
# before handing the text to the summarization pipeline.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Load QA model
# The model and tokenizer come from the same checkpoint and are wrapped in a single
# question-answering pipeline, which ask_questions() calls once per entity.
qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)


def scrape_article(url, timeout=30):
    """
    Download a web page and return the text of its <p> elements as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get() can block forever on an unresponsive host.

    Returns:
        The text of every <p> element except the last one, joined with single
        spaces. Returns an empty string when the page has fewer than two
        paragraphs.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts,
            or an HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently scraping the error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    paragraphs = [p.get_text() for p in soup.find_all('p')]
    # The final <p> is deliberately dropped, as in the original implementation.
    # NOTE(review): presumably a footer/boilerplate paragraph -- confirm per site.
    return ' '.join(paragraphs[:-1])


def extract_important_entities(text, top_n=5):
    """
    Count the most frequently mentioned ORG / PERSON / GPE entities in `text`.

    Args:
        text: Text to run through the spaCy pipeline `nlp`.
        top_n: Maximum number of entities to return.

    Returns:
        A list of (entity_text, count) tuples, most frequent first, as
        produced by Counter.most_common(top_n).
    """
    wanted_labels = ["ORG", "PERSON", "GPE"]
    mention_counts = Counter()
    for span in nlp(text).ents:
        # Skip entity types we do not prioritize (dates, quantities, ...).
        if span.label_ not in wanted_labels:
            continue
        mention_counts[span.text] += 1
    return mention_counts.most_common(top_n)


def clean_text(text):
    """
    Normalize scraped text.

    Removes zero-width spaces (U+200B), collapses every run of whitespace
    into a single space, and trims leading/trailing whitespace.
    """
    without_zero_width = text.replace("\u200b", "")
    single_spaced = re.sub(r"\s+", " ", without_zero_width)
    return single_spaced.strip()


# Summarization pipelines keyed by device index, so the (large) BART model is
# loaded once per device instead of once per summarize_text() call.
_SUMMARIZER_CACHE = {}


def _get_summarizer(device_index):
    """Return the cached summarization pipeline for `device_index`, building it on first use."""
    if device_index not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[device_index] = pipeline(
            "summarization", model="facebook/bart-large-cnn", device=device_index
        )
    return _SUMMARIZER_CACHE[device_index]


def summarize_text(text, max_input_length=1023, max_length=200, min_length=100, device="cuda"):
    """
    Summarize `text` with facebook/bart-large-cnn.

    Args:
        text: Text to summarize.
        max_input_length: Maximum number of tokens of `text` kept before
            summarization; anything beyond that is truncated.
        max_length: Upper bound on the generated summary length (tokens).
        min_length: Lower bound on the generated summary length (tokens).
        device: "cuda" to run on GPU 0 when CUDA is available; any other
            value (or no CUDA) runs on CPU.

    Returns:
        The summary string, or "" when `text` is empty or whitespace-only.
    """
    # Nothing to summarize: skip model loading/generation entirely.
    if not text or not text.strip():
        return ""

    device_index = 0 if device == "cuda" and torch.cuda.is_available() else -1
    summarizer = _get_summarizer(device_index)

    # Truncate at token level with the module-level BART tokenizer, then
    # decode back to a string because the pipeline takes raw text.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_input_length)
    truncated_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)

    # Decoding and re-tokenizing is not guaranteed to yield the same token
    # count, so also let the pipeline truncate to the model's limit.
    summary = summarizer(
        truncated_text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']


def find_relevant_sentences(text, entity):
    """
    Return only the sentences of `text` that mention `entity`.

    Sentences are split on whitespace that follows '.', '!' or '?'. The
    match is a case-sensitive substring test. Matching sentences are
    re-joined with single spaces; the result is "" when nothing matches.
    """
    matching = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if entity in sentence:
            matching.append(sentence)
    return " ".join(matching)


def ask_questions(text, entities, min_score=0.00001):
    """
    Generates questions and extracts answers using the QA model.

    For each entity, a fixed-template question is asked against `text`
    through the module-level `qa_pipeline`.

    Args:
        text: Context passage the answers are extracted from.
        entities: Iterable of (entity_text, count) pairs, e.g. the output of
            extract_important_entities(); only entity_text is used.
        min_score: Minimum pipeline confidence score for an answer to be
            accepted. Answers scoring below it are replaced by
            "No clear answer found.".

    Returns:
        A list of (question, answer) tuples, one per entity, in input order.
    """
    fallback = "No clear answer found."
    # An empty context has nothing to extract from; answer with the fallback
    # instead of handing the QA pipeline an empty context.
    has_context = bool(text and text.strip())

    questions_and_answers = []
    for entity, _ in entities:
        question = f"What is said about {entity} in the text?"
        if has_context:
            result = qa_pipeline(question=question, context=text)
            answer = result['answer'] if result['score'] >= min_score else fallback
        else:
            answer = fallback
        questions_and_answers.append((question, answer))

    return questions_and_answers

def display_long_string(text, line_width=80):
    """
    Print `text` in consecutive fixed-width slices, one slice per line.

    The text is cut every `line_width` characters regardless of word
    boundaries; the final slice may be shorter.
    """
    chunks = [
        text[start:start + line_width]
        for start in range(0, len(text), line_width)
    ]
    for chunk in chunks:
        print(chunk)

Device set to use cuda:0


In [55]:
# 🔹 RUN SCRIPT 🔹
# Article to process; change this URL to analyze a different page.
url = 'https://edition.cnn.com/2025/02/23/world/charts-ukraine-war-status-dg/index.html?iid=cnn_buildContentRecirc_end_recirc'

# Get and clean article text
# scrape_article() joins the page's <p> texts; clean_text() strips zero-width
# spaces and collapses whitespace.
text = scrape_article(url)
cleaned_text = clean_text(text)

# Summarize the article (long version for q&a, short version for display)
# Both summaries are generated independently from the same cleaned text; only
# the min/max summary lengths differ.
summary_long = summarize_text(cleaned_text, max_length=400, min_length=300)
summary_short = summarize_text(cleaned_text, max_length=150, min_length=70)

Device set to use cuda:0
Device set to use cuda:0


In [71]:
# Extract important entities
# Entities are taken from the long summary (not the full article), so the
# counts reflect mentions within that summary only.
important_entities = extract_important_entities(summary_long, top_n=5)

# Generate Q&A
# The long summary is also the context the QA model answers from.
qa_results = ask_questions(summary_long, important_entities)

# Display results
print("\n🔹 SUMMARY:\n")
# The short summary is printed in 80-character slices.
display_long_string(summary_short, 80)
print("\n🔹 IMPORTANT ENTITIES:\n", important_entities)
print("\n🔹 Q&A:")
# One question/answer pair per extracted entity.
for q, a in qa_results:
    print(f"❓ {q}\n➡️ {a}\n")


🔹 SUMMARY:

Since Russia launched its full-scale invasion in 2022, Ukraine has lost about 11
% of its land. Millions of Ukrainians have been uprooted with thousands killed o
r injured. The United States has been the biggest single contributor of funding 
for Ukraine since the war began in 2022. Ukraine and its European allies are scr
ambling to adapt to the new approach from the United States.

🔹 IMPORTANT ENTITIES:
 [('Ukraine', 4), ('US', 2), ('UN', 2), ('Russia', 1), ('The United States', 1)]

🔹 Q&A:
❓ What is said about Ukraine in the text?
➡️ Millions of Ukrainians have been uprooted with thousands killed or injured

❓ What is said about US in the text?
➡️ The United States has been the biggest single contributor of funding for Ukraine

❓ What is said about UN in the text?
➡️ Human Rights Office

❓ What is said about Russia in the text?
➡️ Russia launched its full-scale invasion

❓ What is said about The United States in the text?
➡️ The United States has been the biggest single 