In [3]:
import os
import re
import time
import json
import torch
import requests
import pandas as pd
from tqdm import tqdm
from gtts import gTTS
from bs4 import BeautifulSoup
from urllib.parse import quote
from collections import Counter
from transformers import pipeline

# Load NLP pipelines for sentiment analysis and summarization
sentiment_analyzer = pipeline("sentiment-analysis")
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1
)


def extract_topic_name(input_text: str) -> str:
    """
    Extracts a clean topic name from a keyword or URL for naming outputs.

    Args:
        input_text (str): URL or keyword.

    Returns:
        str: Cleaned topic name.
    """
    topic_name = re.sub(r'[^A-Za-z0-9_-]', '', input_text)
    return topic_name if topic_name else "topic"


def get_article_links(topic_or_url: str, start_page: int = 1, end_page: int = 3, min_articles: int = 10) -> list:
    """
    Fetches article links from BBC either by topic or from a given BBC page.

    Args:
        topic_or_url (str): Either a BBC topic URL or keyword to search for.
        start_page (int): Start page for search pagination.
        end_page (int): End page for search pagination.
        min_articles (int): Minimum number of articles to collect.

    Returns:
        list: List of article URLs.
    """
    links = set()
    current_end = end_page

    while True:
        if topic_or_url.startswith("http"):
            response = requests.get(topic_or_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            page_links = [
                f"https://www.bbc.com{a['href']}"
                for a in soup.find_all('a', href=True)
                if a['href'].startswith("/news/articles/")
            ]
            links.update(page_links)
        else:
            # Search by keyword across multiple pages
            for page in tqdm(range(start_page, current_end + 1), desc=f"Scraping pages {start_page}-{current_end}"):
                search_url = f"https://www.bbc.co.uk/search?q={quote(topic_or_url)}&filter=news&page={page}"
                response = requests.get(search_url)
                soup = BeautifulSoup(response.text, 'html.parser')
                page_links = [
                    a['href']
                    for a in soup.find_all('a', href=True)
                    if '/news/articles/' in a['href']
                ]
                links.update(page_links)

        if len(links) >= min_articles or topic_or_url.startswith("http"):
            break
        else:
            current_end += 2
            print(f"⚠️ Only found {len(links)} articles, expanding search to page {current_end}...")
            time.sleep(1)

    return list(links)[:min_articles]


def get_article_content(article_url: str) -> tuple:
    """
    Extracts headline, publish date, and full text from a given BBC article.

    Args:
        article_url (str): BBC article URL.

    Returns:
        tuple: headline, publish date, and article text.
    """
    response = requests.get(article_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    headline = soup.find("h1").text.strip() if soup.find("h1") else "No headline"
    pub_date = soup.find("time").text.strip() if soup.find("time") else "No publish date"
    article_text = " ".join([p.text.strip() for p in soup.find_all("p")])
    return headline, pub_date, article_text


def summarize_text(text: str, desired_lines: int = 3) -> str:
    """
    Summarizes text using the BART-large CNN model.

    Args:
        text (str): Full article text.
        desired_lines (int): Approximate number of summary lines.

    Returns:
        str: Summarized text.
    """
    word_limit = 1024
    max_length = min(150, desired_lines * 25)
    min_length = max(10, desired_lines * 10)
    text = text[:word_limit]
    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']


def extract_topics(summary: str) -> list:
    """
    Extracts capitalized words or phrases as key topics from summary text.

    Args:
        summary (str): Summarized article text.

    Returns:
        list: List of extracted topics.
    """
    keywords = re.findall(r'\b[A-Z][a-z]+(?: [A-Z][a-z]+)*\b', summary)
    return list(set(keywords))[:5]


def create_hindi_tts(text: str, filename: str = "output.mp3") -> None:
    """
    Converts text into spoken Hindi audio and saves as an MP3.

    Args:
        text (str): Text to convert.
        filename (str): Output file name.
    """
    tts = gTTS(text=text, lang='hi')
    tts.save(filename)
    print(f" Hindi audio saved as {filename}")


def analyze_articles(links: list, company_name: str, desired_lines: int = 3) -> None:
    """
    Processes each article, summarizes content, analyzes sentiment, and generates insights.

    Args:
        links (list): List of article URLs.
        company_name (str): Topic or company name.
        desired_lines (int): Number of summary lines per article.
    """
    articles = []
    sentiments = []

    for link in tqdm(links, desc="Analyzing articles"):
        headline, pub_date, text = get_article_content(link)
        combined_text = f"{headline}. {text}"
        summary = summarize_text(combined_text, desired_lines)
        sentiment = sentiment_analyzer(summary)[0]
        topics = extract_topics(summary)

        articles.append({
            "Title": headline,
            "Published Date": pub_date,
            "Summary": summary,
            "Sentiment": sentiment['label'],
            "Topics": topics,
            "Link": link
        })
        sentiments.append(sentiment['label'])

    sentiment_counts = dict(Counter(sentiments))

    # Generate comparative insights between articles
    comparisons = [
        {
            "Comparison": f"{a1['Title']} vs. {a2['Title']}",
            "Impact": f"Article 1 sentiment: {a1['Sentiment']}, Article 2 sentiment: {a2['Sentiment']}"
        }
        for a1 in articles for a2 in articles if a1 != a2
    ][:2]

    topic_sets = [set(a['Topics']) for a in articles]
    common_topics = list(set.intersection(*topic_sets)) if topic_sets else []
    unique_topics = [list(topics - set(common_topics)) for topics in topic_sets]

    final_sentiment = max(sentiment_counts, key=sentiment_counts.get).lower()
    hindi_summary_text = f"{company_name} की खबरों का समग्र झुकाव {final_sentiment} है।"

    result = {
        "Company": company_name,
        "Articles": articles,
        "Comparative Sentiment Insights": {
            "Sentiment Distribution": sentiment_counts,
            "Comparisons": comparisons,
            "Topic Analysis": {
                "Common Topics": common_topics,
                "Unique Topics per Article": unique_topics
            }
        },
        "Overall Sentiment Conclusion": f"{company_name}'s news coverage is {final_sentiment}.",
        "Hindi Sentiment Summary": hindi_summary_text,
        "Hindi_TTS_Audio_File": f"{company_name.lower()}_sentiment_hindi.mp3"
    }

    # Save analysis output as JSON
    with open(f"{company_name.lower()}_summary.json", "w", encoding='utf-8') as file:
        json.dump(result, file, indent=4, ensure_ascii=False)

    # Generate TTS audio in Hindi
    create_hindi_tts(hindi_summary_text, f"{company_name.lower()}_sentiment_hindi.mp3")

    print(json.dumps(result, indent=4, ensure_ascii=False))


if __name__ == "__main__":
    # Interactive prompt for user input
    query = input(" Enter a BBC topic URL or keyword: ").strip()
    company = extract_topic_name(query)
    lines = int(input(" How many lines of summary per article? (default: 3): ") or 3)
    start_page = int(input(" Start page number (default: 1): ") or 1)
    end_page = int(input(" End page number (default: 3): ") or 3)

    article_links = get_article_links(query, start_page, end_page, min_articles=10)
    print(f" Found {len(article_links)} articles.")

    if article_links:
        analyze_articles(article_links, company, lines)
    else:
        print(" No articles found.")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


👉 Enter a BBC topic URL or keyword: tesla
👉 How many lines of summary per article? (default: 3): 3
👉 Start page number (default: 1): 1
👉 End page number (default: 3): 3


Scraping pages 1-3: 100%|██████████| 3/3 [00:01<00:00,  2.09it/s]


✅ Found 10 articles.


Analyzing articles: 100%|██████████| 10/10 [02:29<00:00, 14.91s/it]

✅ Hindi audio saved as tesla_sentiment_hindi.mp3
{
    "Company": "tesla",
    "Articles": [
        {
            "Title": "Vehicles damaged at Belfast Tesla dealership",
            "Published Date": "17 March 2025",
            "Summary": "20 vehicles damaged at a Tesla dealership on Boucher Road in Belfast. Police received a report of damage to vehicles on a commercial premises on Sunday. Most of the vehicles \"have had their wing mirrors knocked off\"",
            "Sentiment": "NEGATIVE",
            "Topics": [
                "Tesla",
                "Most",
                "Sunday",
                "Police",
                "Belfast"
            ],
            "Link": "https://www.bbc.co.uk/news/articles/cp8vd0j5zk2o"
        },
        {
            "Title": "Tesla surprises with better than expected car sales",
            "Published Date": "2 July 2024",
            "Summary": "Elon Musk's electric car-maker delivered nearly 444,000 vehicles in the three months ended 30 Jun




In [2]:
!pip install gtts

Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.4
