In [None]:
!pip install feedparser

import feedparser

from urllib.parse import urlparse

# Google News RSS feeds for air quality topics
rss_urls = [
    "https://news.google.com/rss/search?q=air+quality&hl=en-US&gl=US&ceid=US:en",
    "https://news.google.com/rss/search?q=AQI&hl=en-US&gl=US&ceid=US:en",
    "https://news.google.com/rss/search?q=air+pollution&hl=en-US&gl=US&ceid=US:en"
]

# Keywords to ensure the article is about air quality (matched case-insensitively)
air_quality_keywords = ["air quality", "AQI", "air pollution", "smog", "particulate matter", "PM2.5", "PM10", "wildfire smoke"]

# Keywords to filter U.S.-related news (matched case-sensitively, as before)
us_keywords = ["United States", "U.S.", "USA", "America", "California", "New York", "Texas", "EPA", "Washington", "Los Angeles"]

# U.S.-specific news domains; entries starting with "." are treated as host suffixes
us_domains = [".gov", ".us", "cnn.com", "nytimes.com", "washingtonpost.com", "npr.org", "latimes.com", "foxnews.com"]


def _host_matches(url, domains):
    """Return True when the hostname of `url` belongs to one of `domains`.

    A domain written with a leading dot (".gov") matches any host ending in
    it; a bare domain ("cnn.com") matches that host and its subdomains.
    Comparing hostnames instead of searching the whole URL stops ".us" from
    matching ".usatoday.com" and ".gov" from matching ".gov.uk".
    """
    try:
        host = (urlparse(url).hostname or "").lower()
    except ValueError:
        # Malformed URL: treat it as not matching rather than aborting the run.
        return False
    if not host:
        return False
    for domain in domains:
        domain = domain.lower()
        if domain.startswith("."):
            if host.endswith(domain):
                return True
        elif host == domain or host.endswith("." + domain):
            return True
    return False


articles = []
seen_links = set()  # the three feeds overlap heavily; keep each article once

for url in rss_urls:
    feed = feedparser.parse(url)
    # feedparser reports fetch/parse problems through `bozo` instead of raising.
    if feed.get("bozo") and not feed.entries:
        print(f"Failed to read feed {url}: {feed.get('bozo_exception')}")
        continue

    for entry in feed.entries:
        # .get() avoids AttributeError on entries missing a field.
        title = entry.get("title", "")
        summary = entry.get("summary", "")
        link = entry.get("link", "")
        if not link or link in seen_links:
            continue

        # Join with a newline so a keyword can never match across the
        # title/summary boundary.
        text = "\n".join((title, summary))
        text_lower = text.lower()

        # Ensure the article is about air quality
        if not any(keyword.lower() in text_lower for keyword in air_quality_keywords):
            continue

        # NOTE(review): feedparser is expected to expose an RSS
        # <source url="..."> element as entry.source.href; confirm that
        # Google News populates it with the publisher's site.
        source_url = (entry.get("source") or {}).get("href", "")

        # Check if it is U.S.-specific
        is_us = (
            any(keyword in text for keyword in us_keywords)
            or _host_matches(link, us_domains)
            or _host_matches(source_url, us_domains)
        )
        if is_us:
            seen_links.add(link)
            articles.append({"title": title, "url": link})

# Display filtered articles
for article in articles:
    print(f"Title: {article['title']}\nURL: {article['url']}\n")

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6047 sha256=ca4f200edfd9ab1df0ff67dfa93abd718e0973de7335735cd43992edc799b550
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0
Title: U.S. Supreme Court hears Utah, Oklahoma's legal challenge to air quality 

In [None]:
!pip install playwright beautifulsoup4 requests openai
!playwright install

import os
from urllib.parse import parse_qs, urlparse

import openai
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

# Read the OpenAI API key from the environment so the secret is never stored
# in the notebook source or its saved outputs.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    print("OPENAI_API_KEY is not set; summarization requests will fail.")

# Function to extract article URLs from Google News search results
def get_google_news_urls(google_news_url):
    """Fetch a Google News search page and return the article URLs it links to.

    Args:
        google_news_url: URL of the search results page to download.

    Returns:
        A list of unique article URLs, in the order they first appear on the
        page. Returns an empty list if the request fails or the response
        status is not 200.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(google_news_url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"Failed to retrieve search results: {e}")
        return []
    if response.status_code != 200:
        print("Failed to retrieve search results")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    links = []

    # Find all article links: anchors whose href carries the target in a
    # `url` query parameter.
    for a in soup.find_all("a", href=True):
        try:
            # parse_qs percent-decodes the value, so the result can be loaded
            # directly.
            targets = parse_qs(urlparse(a["href"]).query).get("url", [])
        except ValueError:
            continue  # malformed href; skip it
        if targets and targets[0].startswith("http"):
            links.append(targets[0])

    # Remove duplicates while preserving page order, so callers that slice
    # the result get a deterministic selection.
    return list(dict.fromkeys(links))

# Function to extract text from an article using Playwright
def get_article_text(url):
    """Load `url` in headless Chromium and return the page body's text.

    Navigation is limited to 10000 ms. If loading the page or reading the
    body raises, the failure is printed and an empty string is returned.
    The browser is closed in every case. The returned text has surrounding
    whitespace stripped.
    """
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch(headless=True)
        tab = chromium.new_page()
        body_text = ""  # remains empty unless extraction succeeds
        try:
            tab.goto(url, timeout=10000)
            body_text = tab.inner_text("body")  # text content of <body>
        except Exception as err:
            print(f"Failed to extract {url}: {err}")
        finally:
            chromium.close()
    return body_text.strip()

# Function to summarize text using OpenAI GPT
def summarize_text(text, model="gpt-4", max_chars=8000):
    """Ask an OpenAI chat model for a three-bullet summary of `text`.

    Args:
        text: Article text to summarize.
        model: Chat model name; defaults to "gpt-4".
        max_chars: Maximum number of characters of `text` sent to the API;
            longer input is truncated.

    Returns:
        The summary text from the first choice of the response, or the
        string "Error in summarization" if the request fails for any reason
        (the failure is printed).
    """
    text = text[:max_chars]  # Limit text size for OpenAI API

    prompt = f"Summarize this article in 3 bullet points:\n\n{text}"
    messages = [{"role": "user", "content": prompt}]

    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed `openai.ChatCompletion`, so the legacy call
            # below always raises there and would be swallowed by the except.
            # The module-level client uses `openai.api_key`.
            response = openai.chat.completions.create(model=model, messages=messages)
            return response.choices[0].message.content
        # Legacy openai<1.0 interface.
        response = openai.ChatCompletion.create(model=model, messages=messages)
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Deliberately best-effort: one failed summary must not stop the run.
        print(f"Failed to summarize: {e}")
        return "Error in summarization"

# Main execution: fetch search results, then extract and summarize up to five articles
if __name__ == "__main__":
    google_news_url = "https://www.google.com/search?q=Toronto+Raptors&tbm=nws"

    print("Fetching Google News results...")
    urls = get_google_news_urls(google_news_url)

    if urls:
        print(f"Found {len(urls)} articles. Extracting and summarizing...\n")
        # Only the first 5 articles are processed; numbering shown is 1-based.
        for position, link in enumerate(urls[:5], start=1):
            print(f"Article {position}: {link}")
            body = get_article_text(link)

            if not body:
                print("Failed to extract article text.\n")
                continue

            digest = summarize_text(body)
            print(f"Summary:\n{digest}\n")
    else:
        print("No articles found.")

Collecting playwright
  Downloading playwright-1.51.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<13,>=12 (from playwright)
  Downloading pyee-12.1.1-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.51.0-py3-none-manylinux1_x86_64.whl (45.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-12.1.1-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.51.0 pyee-12.1.1
Downloading Chromium 134.0.6998.35 (playwright build v1161)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1161/chromium-linux.zip[22m
[1G164.9 MiB [] 0% 0.0s[0K[1G164.9 MiB [] 0% 686.5s[0K[1G164.9 MiB [] 0% 683.5s[0K[1G164.9 MiB [] 0% 536.5s[0K[1G164.9 MiB [] 0% 451.9s[0K[1G164.9 MiB [] 0% 380.8s[0K[1G164.9 MiB [] 0% 429.7s[0K[1G164.9 MiB [] 0% 392.2s[0K[1G164.9 MiB [] 0% 362.4s[0K[1G164.9 Mi