In [11]:
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE = "https://www.foxnews.com"
CATEGORY = BASE + "/category/world/global-economy"

def get_articles(page_number):
    """Fetch one listing page of the category and extract its article teasers.

    Args:
        page_number: 1-based page index. Page 1 is the bare category URL;
            any other page is requested with a ``?page=N`` query string.

    Returns:
        A list of dicts with the keys ``"headline"``, ``"url"`` and
        ``"summary"``. The list is empty when no article markup is found or
        when the HTTP request fails (the failure is printed, not raised), so
        a caller that stops on an empty page also stops cleanly on errors
        instead of losing what it has already collected.
    """
    url = CATEGORY if page_number == 1 else f"{CATEGORY}?page={page_number}"

    print("Scraping:", url)
    try:
        r = requests.get(url, timeout=10)
        # Treat 4xx/5xx responses as failures rather than parsing an error page.
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")

    articles = []

    # Each article teaser is expected in a div with class 'content'.
    for div in soup.find_all("div", class_="content"):
        # Headline: some pages use h4, others h2.
        heading = div.find("h4") or div.find("h2")
        if not heading:
            continue
        headline = heading.get_text(strip=True)

        # URL: prefer the headline's own link so the URL matches the headline;
        # fall back to the first link with an href anywhere in the div.
        a_tag = heading.find("a", href=True) or div.find("a", href=True)
        if not a_tag:
            continue
        # urljoin resolves site-relative ("/x"), protocol-relative ("//host/x")
        # and plain relative links, and leaves absolute URLs unchanged.
        href = urljoin(BASE, a_tag["href"])

        # Summary (optional): first paragraph in the div, if any.
        p_tag = div.find("p")
        summary = p_tag.get_text(strip=True) if p_tag else ""

        articles.append({
            "headline": headline,
            "url": href,
            "summary": summary
        })

    return articles

def scrape_all(max_pages=None):
    """Walk the category's listing pages and collect every unique article.

    Pages are requested in order starting at 1, with a one-second pause
    between requests. The crawl stops when any of the following happens:

    * a page yields no articles at all;
    * a page yields only articles whose URL was already collected (this is
      what prevents an endless loop when the site keeps serving content for
      arbitrarily large page numbers);
    * ``max_pages`` pages have been fetched.

    Args:
        max_pages: Optional upper bound on the number of pages to request.
            ``None`` (the default) means no explicit cap.

    Returns:
        A list of article dicts as produced by ``get_articles``, with
        duplicates (by ``"url"``) removed and first-seen order preserved.
    """
    all_articles = []
    seen_urls = set()
    page = 1

    while max_pages is None or page <= max_pages:
        articles = get_articles(page)
        if not articles:
            print("No articles on this page. Stopping.")
            break

        # Keep only articles not seen on this or any earlier page.
        new_count = 0
        for art in articles:
            if art["url"] in seen_urls:
                continue
            seen_urls.add(art["url"])
            all_articles.append(art)
            new_count += 1

        if new_count == 0:
            print("No new articles on this page. Stopping.")
            break

        page += 1
        time.sleep(1)  # be polite between requests

    return all_articles

def save_to_csv(articles, filename="global_economy_foxnews.csv"):
    """Write the scraped articles to a UTF-8 CSV file with a header row.

    Args:
        articles: Sequence of dicts keyed by ``"headline"``, ``"url"`` and
            ``"summary"``.
        filename: Destination path; an existing file is overwritten.
    """
    columns = ["headline", "url", "summary"]
    print(f"Saving {len(articles)} articles to {filename}...")
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        sheet = csv.DictWriter(out_file, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(articles)
    print("Saved successfully!")

# -------------------
# Run scraper
# -------------------
# Crawl the category's listing pages (scrape_all keeps going until a page
# yields no articles), then write the collected rows to the default CSV file
# used by save_to_csv. `articles` stays available as a notebook-level variable.
articles = scrape_all()
save_to_csv(articles)


Scraping: https://www.foxnews.com/category/world/global-economy
Scraping: https://www.foxnews.com/category/world/global-economy?page=2
Scraping: https://www.foxnews.com/category/world/global-economy?page=3
Scraping: https://www.foxnews.com/category/world/global-economy?page=4
Scraping: https://www.foxnews.com/category/world/global-economy?page=5
Scraping: https://www.foxnews.com/category/world/global-economy?page=6
Scraping: https://www.foxnews.com/category/world/global-economy?page=7
Scraping: https://www.foxnews.com/category/world/global-economy?page=8
Scraping: https://www.foxnews.com/category/world/global-economy?page=9
Scraping: https://www.foxnews.com/category/world/global-economy?page=10
Scraping: https://www.foxnews.com/category/world/global-economy?page=11
Scraping: https://www.foxnews.com/category/world/global-economy?page=12
Scraping: https://www.foxnews.com/category/world/global-economy?page=13
Scraping: https://www.foxnews.com/category/world/global-economy?page=14
Scraping

In [12]:
import requests
from bs4 import BeautifulSoup
from newspaper import Article
import pandas as pd
import time
from urllib.parse import urljoin

# --- SELENIUM IMPORTS ---
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

# ----------------------------------------------------
# 1. CONFIGURATION
# ----------------------------------------------------

# The URL for the first page of CNN's economy search results
SEARCH_URL = "https://edition.cnn.com/search?q=economy&from=0&size=10&page=1&sort=newest&type=article&section="
# Base used with urljoin to turn the hrefs found on the search page into full URLs.
BASE_DOMAIN = "https://edition.cnn.com"
# Accumulates one result dict per successfully scraped article (filled by the main loop).
scraped_articles = []
# URLs already handed to scrape_full_article, so the same article is not fetched twice.
processed_urls = set()

# ----------------------------------------------------
# 2. STEP 1: EXTRACT LINKS AND HEADLINES (USING SELENIUM)
# ----------------------------------------------------

def _parse_search_results(page_source):
    """Extract url/headline/raw_date dicts from rendered search-page HTML."""
    soup = BeautifulSoup(page_source, 'html.parser')
    links = []

    for headline_span in soup.find_all('span', class_='container__headline-text'):
        # The headline span is expected to sit inside the article's <a> tag;
        # results without an enclosing link are skipped.
        link_tag = headline_span.find_parent('a', href=True)
        if not link_tag:
            continue

        # The result container, when present, is where the timestamp is looked up.
        article_item_div = headline_span.find_parent('div', class_='cnn-search__result-contents')
        date_tag = article_item_div.find('div', class_='timestamp') if article_item_div else None

        links.append({
            'url': urljoin(BASE_DOMAIN, link_tag.get('href')),
            'headline': headline_span.text.strip(),
            'raw_date': date_tag.text.strip() if date_tag else "Date Not Found"
        })

    return links


def get_article_links_selenium(search_url):
    """Load a JavaScript-rendered search page in headless Chrome and list its results.

    Args:
        search_url: URL of the search results page to open.

    Returns:
        A list of dicts with the keys ``'url'``, ``'headline'`` and
        ``'raw_date'`` (``"Date Not Found"`` when no timestamp element is
        found). An empty list is returned when the WebDriver cannot be
        started, when the page fails to load, or when no headline element
        appears within 10 seconds. The browser is always closed before
        returning.
    """
    print(f"-> Using Selenium (with webdriver-manager) to fetch dynamic page: {search_url}")

    # Configure Chrome options: run headless for speed and server environments.
    opts = Options()
    opts.add_argument("--headless=new")
    opts.add_argument("--no-sandbox")  # Required for some Linux/server environments
    opts.add_argument("--disable-dev-shm-usage")  # Fixes errors in Docker/Linux containers

    try:
        # ChromeDriverManager().install() locates or downloads a compatible driver.
        driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=opts
        )
    except WebDriverException as e:
        print(f"FATAL ERROR: Could not initialize WebDriver. Check if Chrome is installed and retry. Error: {e}")
        return []

    try:
        driver.get(search_url)
        # Wait until the elements containing the headline text appear (10-second timeout).
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'container__headline-text'))
        )
        # Grab the fully rendered HTML while the browser is still open.
        page_source = driver.page_source
    except TimeoutException:
        # Must precede WebDriverException, of which TimeoutException is a subclass.
        print("Timed out waiting for article content to load. Content may not be available.")
        return []
    except WebDriverException as e:
        print(f"Error while loading the search page: {e}")
        return []
    finally:
        # Runs on every path, so a failed navigation no longer leaks a Chrome process.
        driver.quit()

    return _parse_search_results(page_source)

# ----------------------------------------------------
# 3. STEP 2: SCRAPE FULL CONTENT FOR EACH ARTICLE
# ----------------------------------------------------

def scrape_full_article(article_data):
    """Download one article with Newspaper and return its cleaned-up record.

    Args:
        article_data: Dict with the keys ``'url'``, ``'headline'`` and
            ``'raw_date'`` as collected from the search page.

    Returns:
        A dict with ``'Headline'``, ``'Publication_Date'``, ``'Full_Text'``
        and ``'URL'``, or ``None`` when the URL was already processed or the
        article could not be downloaded/parsed (the error is printed).
    """
    target = article_data['url']

    # Each URL is attempted at most once, whether or not the attempt succeeds.
    if target in processed_urls:
        return None
    processed_urls.add(target)

    try:
        parsed = Article(target, language='en')
        parsed.download()
        parsed.parse()

        headline = article_data['headline']
        # Prefer the date Newspaper parsed; otherwise keep the search page's raw text.
        if parsed.publish_date:
            published = parsed.publish_date.strftime('%Y-%m-%d')
        else:
            published = article_data['raw_date']

        record = {
            'Headline': headline,
            'Publication_Date': published,
            'Full_Text': parsed.text,
            'URL': target
        }
    except Exception as err:
        # Covers download failures, paywalls and malformed articles alike.
        print(f"Error processing article {target}: {err}")
        return None

    return record

# ----------------------------------------------------
# 4. MAIN EXECUTION
# ----------------------------------------------------

if __name__ == "__main__":

    # Step 1: collect the result links from the rendered search page.
    article_links = get_article_links_selenium(SEARCH_URL)

    if article_links:
        print(f"Found {len(article_links)} article links. Beginning content scraping...")

        # Step 2: fetch every article body, keeping only the successful ones.
        for i, link_info in enumerate(article_links):
            result = scrape_full_article(link_info)
            if result:
                scraped_articles.append(result)

            # Pause between requests to reduce the chance of being blocked.
            time.sleep(1.5)

            # Progress message after every fifth link.
            if not (i + 1) % 5:
                print(f"--- Processed {i + 1} full articles ---")

        # Step 3: report and persist whatever was collected.
        if not scraped_articles:
            print("No articles were successfully scraped from the links found.")
        else:
            df = pd.DataFrame(scraped_articles)
            # Fix the column order used in the CSV.
            df = df[['Headline', 'Publication_Date', 'URL', 'Full_Text']]

            print("\n--- SCRAPING COMPLETE ---")
            print(f"Successfully scraped {len(df)} full articles.")
            print("\nHead of DataFrame:")
            print(df.head())
            df.to_csv('cnn_economy_news_final.csv', index=False)
            print("\nData saved to cnn_economy_news_final.csv")
    else:
        print("Execution stopped. Could not retrieve article links via Selenium.")

-> Using Selenium (with webdriver-manager) to fetch dynamic page: https://edition.cnn.com/search?q=economy&from=0&size=10&page=1&sort=newest&type=article&section=
FATAL ERROR: Could not initialize WebDriver. Check if Chrome is installed and retry. Error: Message: unknown error: cannot find Chrome binary
Stacktrace:
#0 0x59fcddfb54e3 <unknown>
#1 0x59fcddce4c76 <unknown>
#2 0x59fcddd0b757 <unknown>
#3 0x59fcddd0a029 <unknown>
#4 0x59fcddd48ccc <unknown>
#5 0x59fcddd4847f <unknown>
#6 0x59fcddd3fde3 <unknown>
#7 0x59fcddd152dd <unknown>
#8 0x59fcddd1634e <unknown>
#9 0x59fcddf753e4 <unknown>
#10 0x59fcddf793d7 <unknown>
#11 0x59fcddf83b20 <unknown>
#12 0x59fcddf7a023 <unknown>
#13 0x59fcddf481aa <unknown>
#14 0x59fcddf9e6b8 <unknown>
#15 0x59fcddf9e847 <unknown>
#16 0x59fcddfae243 <unknown>
#17 0x7d2bed28aaa4 <unknown>
#18 0x7d2bed317c6c <unknown>

Execution stopped. Could not retrieve article links via Selenium.
