In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time
from urllib.parse import urljoin
from tqdm import tqdm
import concurrent.futures
import os

In [15]:
# ----------------------------------------------------
# 1. PATHS
# ----------------------------------------------------

# Folder that receives the raw scrape output; created if it does not exist yet.
RAW_DATA_DIR = r"C:\Users\Enkhsaikhan\Final_paper_text_as_data\raw_data"
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# CSV file that will hold the collected article links.
OUTPUT_FILE = os.path.join(RAW_DATA_DIR, "foxnews_links.csv")

# ----------------------------------------------------
# 2. CONFIG
# ----------------------------------------------------

# Site root used to turn relative hrefs into absolute URLs.
BASE_DOMAIN = "https://www.foxnews.com"

# Listing page template; "{}" is replaced by the page number.
SEARCH_URL_PATTERN = "https://www.foxnews.com/category/us/economy?page={}"

# Upper bound on listing pages requested (increase to cover all pages).
MAX_PAGES_TO_TRY = 129

# Stop only after this many consecutive empty pages.
MAX_EMPTY_CONSECUTIVE = 5

# Browser-like request headers sent with every listing request.
HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/121.0 Safari/537.36",
        ]
    )
}

# ----------------------------------------------------
# 3. SCRAPER FUNCTION
# ----------------------------------------------------

def scrape_fox_news_page(page_num, seen_links):
    """
    Collect the not-yet-seen article URLs from one category listing page.

    Parameters
    ----------
    page_num
        Value substituted into SEARCH_URL_PATTERN to build the page URL.
    seen_links : set
        URLs collected so far. Every URL returned by this call is also
        added to this set (it is mutated in place).

    Returns
    -------
    list
        Absolute article URLs found on the page that were not already in
        ``seen_links``, in page order. An empty list is returned when the
        response status is not 200, when the article container is missing,
        or when any exception is raised (the error is printed, not raised).
    """
    page_url = SEARCH_URL_PATTERN.format(page_num)
    print(f"Scraping Page {page_num}: {page_url}")

    try:
        response = requests.get(page_url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            print(f"Status code {response.status_code} — skipping page.")
            return []

        listing = BeautifulSoup(response.content, "lxml").find(
            "div", class_="content article-list"
        )
        if not listing:
            print("No article container found — page may be empty.")
            return []

        fresh_links = []
        for card in listing.find_all("article", class_="article"):
            # Cards whose eyebrow label mentions VIDEO are skipped.
            label = card.find("span", class_="eyebrow")
            if label and "VIDEO" in label.get_text(strip=True).upper():
                continue

            # The article link lives inside the card's <h4 class="title">.
            title = card.find("h4", class_="title")
            anchor = None
            if title:
                anchor = title.find("a", href=True)
            if not anchor:
                continue

            absolute_url = urljoin(BASE_DOMAIN, anchor["href"])
            if absolute_url in seen_links:
                continue

            seen_links.add(absolute_url)
            fresh_links.append(absolute_url)

        return fresh_links

    except Exception as err:
        print(f"Error on page {page_num}: {err}")
        return []

# ----------------------------------------------------
# 4. MAIN LOOP
# ----------------------------------------------------

all_links = []        # every unique article URL, in discovery order
seen_links = set()    # membership index shared with scrape_fox_news_page
consecutive_empty = 0 # current streak of pages that yielded nothing new

for page in range(1, MAX_PAGES_TO_TRY + 1):
    page_links = scrape_fox_news_page(page, seen_links)

    # A productive page resets the streak; an unproductive one extends it.
    consecutive_empty = 0 if page_links else consecutive_empty + 1
    all_links += page_links

    if consecutive_empty:
        print(f"No new links found on page {page} ({consecutive_empty}/{MAX_EMPTY_CONSECUTIVE})")
        if consecutive_empty >= MAX_EMPTY_CONSECUTIVE:
            print(f"Stopping after {MAX_EMPTY_CONSECUTIVE} consecutive empty pages.")
            break

    # Politeness delay between listing requests.
    time.sleep(1)

print("-" * 40)
print(f"Total unique Fox News links collected: {len(all_links)}")

# ----------------------------------------------------
# 5. SAVE TO CSV
# ----------------------------------------------------

# One-column table of links, written to OUTPUT_FILE.
df = pd.DataFrame({"link": all_links})
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")

print(f"Links saved to: {OUTPUT_FILE}")
print(df.head())

Scraping Page 1: https://www.foxnews.com/category/us/economy?page=1
Scraping Page 2: https://www.foxnews.com/category/us/economy?page=2
Scraping Page 3: https://www.foxnews.com/category/us/economy?page=3
Scraping Page 4: https://www.foxnews.com/category/us/economy?page=4
Scraping Page 5: https://www.foxnews.com/category/us/economy?page=5
Scraping Page 6: https://www.foxnews.com/category/us/economy?page=6
Scraping Page 7: https://www.foxnews.com/category/us/economy?page=7
Scraping Page 8: https://www.foxnews.com/category/us/economy?page=8
Scraping Page 9: https://www.foxnews.com/category/us/economy?page=9
Scraping Page 10: https://www.foxnews.com/category/us/economy?page=10
Scraping Page 11: https://www.foxnews.com/category/us/economy?page=11
Scraping Page 12: https://www.foxnews.com/category/us/economy?page=12
Scraping Page 13: https://www.foxnews.com/category/us/economy?page=13
Scraping Page 14: https://www.foxnews.com/category/us/economy?page=14
Scraping Page 15: https://www.foxnews.

In [16]:
# Sanity check: number of unique article URLs collected by the listing crawl.
len(all_links)

756

In [17]:
# -------------------------------
# 1. Prepare data for processing
# -------------------------------
# Work items for the scraper: each URL in all_links paired with its list position,
# so a result can be written back to the matching row of final_data.
indexed_data = [
    {"index": position, "url": link}
    for position, link in enumerate(all_links)
]

# Result rows, one per URL in the same order as all_links; the scraped fields
# start out as None.
final_data = [
    {"url": link, **dict.fromkeys(("headline", "body", "date"))}
    for link in all_links
]

# -------------------------------
# 2. Scraper function
# -------------------------------
def scrape_article(article_data):
    """
    Scrapes a Fox News article given its URL.

    Parameters
    ----------
    article_data : dict
        Must contain "url" (the article address) and "index" (echoed back
        unchanged so the caller can match the result to its input row).

    Returns
    -------
    tuple
        (index, headline, body_text, date)

        * headline  - text of the <h1 class="headline speakable"> element,
          or None if it is not present.
        * body_text - non-empty <p> texts of the first matching body
          container joined with newlines, or None if no container is found.
        * date      - the part of the first <time> tag's ``datetime``
          attribute before "T", or None if there is no such attribute.

        On a non-200 response or any exception the result is
        (index, None, None, None). The failure is reported with
        ``tqdm.write`` instead of being raised, so one bad article does not
        abort a batch but still leaves a trace explaining the empty row.
    """
    url = article_data["url"]
    idx = article_data["index"]
    article_text = None
    article_date = None
    article_headline = None

    try:
        # Send the same browser-like HEADERS as the listing scraper; the
        # default python-requests User-Agent is more likely to be rejected.
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            tqdm.write(f"Status code {response.status_code} for {url} — skipping article.")
            return idx, None, None, None

        soup = BeautifulSoup(response.text, "html.parser")

        # --- Headline ---
        headline_tag = soup.find("h1", class_="headline speakable")
        if headline_tag:
            article_headline = headline_tag.get_text(strip=True)

        # --- Article body ---
        # Try the known body containers in order of preference and keep
        # the first one that exists.
        article_div = None
        for css_class in ("article-body", "article-content", "page-content"):
            article_div = soup.find("div", class_=css_class)
            if article_div:
                break

        if article_div:
            paragraph_texts = (
                p.get_text(strip=True) for p in article_div.find_all("p")
            )
            article_text = "\n".join(text for text in paragraph_texts if text)

        # --- Article date ---
        time_tag = soup.find("time")
        if time_tag and time_tag.has_attr("datetime"):
            article_date = time_tag["datetime"].split("T")[0]

        return idx, article_headline, article_text, article_date

    except Exception as exc:
        # Best-effort scraping: report the problem and return an empty row
        # rather than aborting the whole run.
        tqdm.write(f"Error scraping {url}: {exc}")
        return idx, None, None, None

# -------------------------------
# 3. Run scraper through a thread pool
# -------------------------------
print(f"Scraping {len(indexed_data)} articles...")

# The pool is limited to one worker to avoid rate-limiting.
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
    futures = [executor.submit(scrape_article, item) for item in indexed_data]

    # Results arrive in completion order; the echoed index identifies the row
    # of final_data that each result belongs to.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(indexed_data)):
        idx, headline, text, date = future.result()
        final_data[idx].update(headline=headline, body=text, date=date)

# -------------------------------
# 4. Save results to CSV
# -------------------------------
# The output folder is (re)declared here and created if missing; OUTPUT_FILE now
# points at the articles CSV.
RAW_DATA_DIR = r"C:\Users\Enkhsaikhan\Final_paper_text_as_data\raw_data"
os.makedirs(RAW_DATA_DIR, exist_ok=True)
OUTPUT_FILE = os.path.join(RAW_DATA_DIR, "foxnews_articles.csv")

# One row per entry of final_data.
df_articles = pd.DataFrame(final_data)
df_articles.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")

print("-" * 40)
print(f"Scraping finished: {len(df_articles)} articles saved.")
print(df_articles.head())

Scraping 756 articles...


100%|██████████| 756/756 [14:42<00:00,  1.17s/it]


----------------------------------------
Scraping finished: 756 articles saved.
                                                 url  \
0  https://www.foxnews.com/politics/affordability...   
1  https://www.foxnews.com/politics/perception-vs...   
2  https://www.foxnews.com/opinion/sec-turner-hom...   
3  https://www.foxnews.com/opinion/im-new-virgini...   
4  https://www.foxnews.com/tech/3d-printed-housin...   

                                            headline  \
0  Affordability: The issue that boosted Trump an...   
1  ‘Perception vs. reality’: Trump’s economy pick...   
2  SEC TURNER: Homeownership is making a comeback...   
3  I’m the new Virginia governor and affordabilit...   
4  3D-printed housing project for student apartme...   

                                                body        date  
0  O'Leary Ventures chairman Kevin O'Leary analyz...  2025-12-27  
1  Unleash Prosperity co-founder Stephen Moore sa...  2025-12-27  
2  Citizens Alliance CEO Cliff Maloney joins 