In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin

# ----------------------------------------------------
# 1. CONFIGURATION
# ----------------------------------------------------

# URL template for the category listing pages. The single positional `{}`
# placeholder is filled with the page number via str.format() in
# scrape_fox_news_page().
SEARCH_URL_PATTERN = "https://www.foxnews.com/category/world/global-economy?page={}"
# Site root; passed to urljoin() so relative article links become absolute URLs.
BASE_DOMAIN = "https://www.foxnews.com"

# Upper bound on the page counter in the main loop. The loop also stops early
# as soon as a page yields no articles (including when a page fails to load).
MAX_PAGES_TO_TRY = 320 

# ----------------------------------------------------
# 2. EXTRACTION FUNCTION (Handles a single page)
# ----------------------------------------------------

def scrape_fox_news_page(page_num):
    """Download one listing page and return its articles as a list of dicts.

    Args:
        page_num: Page number substituted into SEARCH_URL_PATTERN.

    Returns:
        A list of dicts with the keys 'News_Type', 'Headline', 'Date',
        'Summary' and 'URL', one per article that has a headline link.
        An empty list is returned when the request fails or when the
        article-list container is not present in the page.
    """

    def _stripped(tag, fallback):
        # Whitespace-trimmed text of `tag`, or `fallback` when the tag is missing.
        return tag.text.strip() if tag else fallback

    page_url = SEARCH_URL_PATTERN.format(page_num)
    print(f"-> Loading Page {page_num}: {page_url}")

    try:
        # A browser-like User-Agent is sent with every listing request.
        resp = requests.get(
            page_url,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
            timeout=20,
        )
        resp.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching page {page_num}: {err}")
        return []

    # Parse with lxml and locate the wrapper that holds every article card.
    container = BeautifulSoup(resp.content, "lxml").find('div', class_='content article-list')
    if not container:
        # Missing wrapper: either the markup changed or the page has no articles.
        print(f"Warning: Article list container not found on Page {page_num}. Stopping.")
        return []

    rows = []
    for node in container.find_all('article', class_='article'):
        # An article is only recorded when its headline carries a link.
        title = node.find('h4', class_='title')
        anchor = title.find('a', href=True) if title else None
        if not anchor:
            continue

        # Eyebrow category: text of the <a> inside the eyebrow span, if any.
        eyebrow = node.find('span', class_='eyebrow')
        if eyebrow and eyebrow.a:
            category = eyebrow.a.text.strip()
        else:
            category = "N/A"

        rows.append({
            'News_Type': category,
            'Headline': title.text.strip(),
            'Date': _stripped(node.find('span', class_='time'), "Date Not Found"),
            'Summary': _stripped(node.find('p', class_='dek'), "Summary Not Found"),
            # urljoin resolves relative hrefs against the site root.
            'URL': urljoin(BASE_DOMAIN, anchor.get('href')),
        })

    return rows

# ----------------------------------------------------
# 3. MAIN EXECUTION: LOOP THROUGH ALL PAGES
# ----------------------------------------------------

if __name__ == "__main__":

    # Walk the listing pages in order, stopping at the first page that yields
    # nothing or once MAX_PAGES_TO_TRY pages have been processed.
    final_data = []
    pages_done = 0

    for page_no in range(1, MAX_PAGES_TO_TRY + 1):
        batch = scrape_fox_news_page(page_no)

        if not batch:
            # An empty result is treated as the end of the available history.
            print(f"\nStopping loop: Page {page_no} returned no articles.")
            break

        final_data.extend(batch)
        pages_done = page_no

        print(f"--- Finished processing Page {page_no}. Articles collected: {len(batch)} ---")

        # Pause between requests so the site is not hit in rapid succession.
        time.sleep(2)

    # --- Report and persist the results ---
    if not final_data:
        print("\nFATAL: No articles were successfully scraped across all pages.")
    else:
        df = pd.DataFrame(final_data)

        print("\n=== FINAL SCRAPING SUMMARY ===")
        print(f"Successfully scraped a total of {len(df)} articles across {pages_done} pages.")

        # Write every collected row to CSV (without the index column).
        output_filename = 'fox_news_global_economy_all_pages.csv'
        df.to_csv(output_filename, index=False)

        print(f"Data saved to '{output_filename}'")
        print("\nHead of Final DataFrame:")
        print(df[['News_Type', 'Headline', 'Date', 'URL']].head(10))

-> Loading Page 1: https://www.foxnews.com/category/world/global-economy?page=1
--- Finished processing Page 1. Articles collected: 30 ---
-> Loading Page 2: https://www.foxnews.com/category/world/global-economy?page=2
--- Finished processing Page 2. Articles collected: 31 ---
-> Loading Page 3: https://www.foxnews.com/category/world/global-economy?page=3
--- Finished processing Page 3. Articles collected: 31 ---
-> Loading Page 4: https://www.foxnews.com/category/world/global-economy?page=4
--- Finished processing Page 4. Articles collected: 31 ---
-> Loading Page 5: https://www.foxnews.com/category/world/global-economy?page=5
--- Finished processing Page 5. Articles collected: 31 ---
-> Loading Page 6: https://www.foxnews.com/category/world/global-economy?page=6
--- Finished processing Page 6. Articles collected: 31 ---
-> Loading Page 7: https://www.foxnews.com/category/world/global-economy?page=7
--- Finished processing Page 7. Articles collected: 31 ---
-> Loading Page 8: https://

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def scrape_article(url):
    """Fetch a single article page and extract its body text and date.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A tuple ``(article_text, article_date)``:

        * ``article_text`` - the text of every ``<p>`` inside the
          ``div.article-body`` element joined with newlines, or ``None``
          when that element is not present.
        * ``article_date`` - the text of the first ``<time>`` element, or
          ``None`` when there is none.

        ``(None, None)`` is returned when the request fails (network error,
        timeout, HTTP error status) or the page cannot be parsed. Failures
        are reported on stdout instead of being raised, so one bad URL does
        not abort a long scraping run.
    """
    # Same browser-like User-Agent the listing scraper sends.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx responses as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching article {url}: {e}")
        return None, None

    try:
        soup = BeautifulSoup(response.text, 'html.parser')

        # ----- Extract date -----
        date_tag = soup.find('time')
        article_date = date_tag.get_text(strip=True) if date_tag else None

        # ----- Extract article body -----
        article_div = soup.find('div', class_='article-body')
        if not article_div:
            return None, article_date

        article_text = "\n".join(
            p.get_text(strip=True) for p in article_div.find_all('p')
        )
        return article_text, article_date

    except Exception as e:
        # Deliberately broad: parsing is best-effort, but the failure is
        # reported rather than silently discarded.
        print(f"Error parsing article {url}: {e}")
        return None, None


# ------- Load dataframe --------
# CSV produced by the listing scraper; it has the columns
# News_Type, Headline, Date, Summary and URL.
df = pd.read_csv("fox_news_global_economy_all_pages.csv")

# Create new empty columns; rows that are skipped or fail keep None.
df["Article_Text"] = None
df["Corrected_Date"] = None

# Loop through rows, fetching the full article for each non-video entry.
for idx, row in tqdm(df.iterrows(), total=len(df)):

    # The column is named "News_Type" in the CSV ("News_type" raised KeyError).
    news_type = str(row["News_Type"]).lower()

    # Skip videos. The value was lower-cased above, so compare against a
    # lower-case needle (the previous "VIDEO" check could never match).
    if "video" in news_type:
        continue

    url = row["URL"]

    text, date = scrape_article(url)

    df.at[idx, "Article_Text"] = text
    df.at[idx, "Corrected_Date"] = date

# Save the updated dataframe
df.to_csv("df_with_articles.csv", index=False)


Treasury Secretary Scott Bessent discusses President Donald Trump’s economic success, U.S.-China relations and more on ‘Hannity.’
China’s systematic and orchestrated campaign of intellectual property theft and technological advancement poses an unprecedented threat to America’s national security, demanding a comprehensive and urgent response. The United States, once undisputed in its technological prowess, now grapples with threats emerging from the Chinese Communist Party (CCP) agenda that are both insidious and far-reaching in terms of the threats to our national security.  
As the world enters an era defined by new forms of power, emerging technologies such as artificial intelligence (AI), quantum computing and hypersonic weapons are at the forefront of U.S.-China competition. The National Security Commission on Artificial Intelligence warned as far back as 2021 that, "China possesses the might, talent, and ambition to surpass the United States as the world’s leader in AI in the nex