In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import boto3
from urllib.parse import urljoin
from tqdm import tqdm
import concurrent.futures

In [2]:
# Paths and file names used by the cells in this notebook.
# Folders are relative to the notebook's working directory and end with "/"
# because later cells build full paths by plain string concatenation.
raw_folder = "../raw_scraped_data/"
analyzed_folder = "../analyzed_data/"
# JSONL file that receives the scraped articles (title, date, body, url).
raw_file = "fox_news_economic_news.jsonl"
# JSONL file that receives the articles after analysis; its 'url' column is
# also read at start-up to skip articles handled by earlier runs.
analyzed_file = "fox_news_economic_sentiment.jsonl"
# NOTE(review): presumably the AWS region for the Bedrock/S3 helpers; it is not
# referenced in the visible cells - confirm it is still needed.
REGION = "eu-west-1"

In [4]:
# Load the URLs recorded by previous runs so they are not scraped again.
try:
    analyzed_df = pd.read_json(analyzed_folder + analyzed_file, lines=True)
except FileNotFoundError:
    # First run (or a wrong path): there is nothing to skip yet. Say so loudly,
    # because starting empty means every listed article will be re-scraped.
    print(f"No analyzed file found at {analyzed_folder + analyzed_file}; starting with no known URLs.")
    already_analyzed_urls = []
else:
    # An empty file yields a frame without a 'url' column; treat it as "nothing seen".
    if 'url' in analyzed_df.columns:
        already_analyzed_urls = analyzed_df['url'].dropna().tolist()
    else:
        already_analyzed_urls = []

# Set for fast duplicate checking, preloaded with the already analyzed URLs.
seen_links = set(already_analyzed_urls)

In [10]:
# ----------------------------------------------------
# 1. CONFIGURATION
# ----------------------------------------------------

# Listing-page URL template; the positional "{}" placeholder is filled with the
# page number via str.format() in scrape_fox_news_page.
SEARCH_URL_PATTERN = "https://www.foxnews.com/category/us/economy?page={}"
# Base used with urljoin() to turn the article hrefs found on a listing page
# into full URLs.
BASE_DOMAIN = "https://www.foxnews.com"

# Upper bound on the listing page numbers the main loop will request.
MAX_PAGES_TO_TRY = 100

# ----------------------------------------------------
# 2. EXTRACTION FUNCTION (Handles a single page)
# ----------------------------------------------------

def scrape_fox_news_page(page_num):
    """Download one listing page and return stubs for its not-yet-seen articles.

    Args:
        page_num: Page number substituted into SEARCH_URL_PATTERN.

    Returns:
        A list of dicts with the keys 'title', 'date', 'body' and 'url'.
        'date' and 'body' are always None here. The list is empty when the
        request fails, when the article-list container is missing, or when
        every article on the page is a video or is already in seen_links.
    """
    page_url = SEARCH_URL_PATTERN.format(page_num)
    print(f"-> Loading Page {page_num}: {page_url}")

    try:
        # Send a browser-like User-Agent along with the request
        response = requests.get(
            page_url,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'},
            timeout=20,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page {page_num}: {e}")
        return []

    # lxml is used as the parser backend for speed
    parsed_page = BeautifulSoup(response.content, "lxml")

    # All article cards live inside this container
    container = parsed_page.find('div', class_='content article-list')
    if not container:
        # Either the markup changed or the page has no articles
        print(f"Warning: Article list container not found on Page {page_num}. Stopping.")
        return []

    new_articles = []
    for card in container.find_all('article', class_='article'):
        # Category label ("eyebrow") shown above the headline
        eyebrow = card.find('span', class_='eyebrow')
        if eyebrow and eyebrow.a:
            category = eyebrow.a.text.strip()
        else:
            category = "N/A"

        # Video entries are not wanted
        if "VIDEO" in category.upper():
            continue

        # Headline element and the link inside it; cards without either are ignored
        headline = card.find('h4', class_='title')
        if not headline:
            continue
        anchor = headline.find('a', href=True)
        if not anchor:
            continue

        # Relative hrefs are resolved against the site's base domain
        url = urljoin(BASE_DOMAIN, anchor.get('href'))
        if url in seen_links:
            print(f"Skipping already analyzed URL: {url}")
            continue

        new_articles.append({
            'title': headline.text.strip(),
            'date': None,
            'body': None,
            'url': url
        })

    return new_articles

# ----------------------------------------------------
# 3. MAIN EXECUTION: LOOP THROUGH ALL PAGES
# ----------------------------------------------------

if __name__ == "__main__":

    # Stop once this many pages in a row contribute no new article. A page
    # contributes nothing when it is empty, failed to load, or lists only
    # articles that were already analyzed, so continuing up to MAX_PAGES_TO_TRY
    # would only send needless requests. Raise this value for a deeper crawl.
    MAX_CONSECUTIVE_EMPTY_PAGES = 3

    final_data = []
    collected_urls = set()  # URLs gathered in this run, to avoid cross-page duplicates
    consecutive_empty_pages = 0

    for current_page in range(1, MAX_PAGES_TO_TRY + 1):
        # Scrape the data for the current page
        page_data = scrape_fox_news_page(current_page)

        # Keep only articles not collected earlier in this run; the same article
        # can show up on two pages when the listing shifts between requests.
        new_articles = []
        for article in page_data:
            if article['url'] in collected_urls:
                continue
            collected_urls.add(article['url'])
            new_articles.append(article)

        # Add the collected articles to the final list
        final_data.extend(new_articles)

        print(f"--- Finished processing Page {current_page}. Articles collected: {len(new_articles)} ---")

        if new_articles:
            consecutive_empty_pages = 0
            continue

        consecutive_empty_pages += 1
        if consecutive_empty_pages >= MAX_CONSECUTIVE_EMPTY_PAGES:
            print(f"No new articles on {consecutive_empty_pages} consecutive pages. Stopping at Page {current_page}.")
            break

-> Loading Page 1: https://www.foxnews.com/category/us/economy?page=1
Skipping already analyzed URL: https://www.foxnews.com/politics/trump-drops-receipts-us-savings-since-bidens-oval-office-exit
--- Finished processing Page 1. Articles collected: 4 ---
-> Loading Page 2: https://www.foxnews.com/category/us/economy?page=2
Skipping already analyzed URL: https://www.foxnews.com/politics/trump-touts-bringing-country-back-from-brink-ruin
Skipping already analyzed URL: https://www.foxnews.com/politics/trump-set-address-nation-primetime-white-house-speech-americans-report-economic-squeeze
Skipping already analyzed URL: https://www.foxnews.com/politics/fox-news-poll-prices-pinch-voters-see-trump-focused-elsewhere
Skipping already analyzed URL: https://www.foxnews.com/media/washington-post-blast-rent-controls-failed-policy-leaves-renters-worse-off-than-before
Skipping already analyzed URL: https://www.foxnews.com/politics/two-key-senate-republicans-join-push-overturn-trumps-federal-union-order

KeyboardInterrupt: 

In [11]:
# Display the article stubs collected by the listing-page loop
# ('date' and 'body' are still None at this point).
final_data

[{'title': 'NC Senate showdown escalates as Trump rallies behind Whatley to keep GOP seat',
  'date': None,
  'body': None,
  'url': 'https://www.foxnews.com/politics/nc-senate-showdown-escalates-trump-rallies-behind-whatley-keep-gop-seat'},
 {'title': "Commerce secretary predicts 'extraordinary year' ahead as inflation drops to 2.7% in November",
  'date': None,
  'body': None,
  'url': 'https://www.foxnews.com/media/commerce-secretary-predicts-extraordinary-year-ahead-inflation-drops-2-7-november'},
 {'title': 'Trump to hand out $2.6B in ‘warrior dividends’ — and the surprising pot he’s pulling the money from',
  'date': None,
  'body': None,
  'url': 'https://www.foxnews.com/politics/trump-hand-out-2-6b-warrior-dividends-surprising-pot-hes-pulling-money-from'},
 {'title': 'Harvard economist says Trump inflation report leaves ‘no other way to spin it’ but good news',
  'date': None,
  'body': None,
  'url': 'https://www.foxnews.com/media/harvard-economist-says-trump-inflation-report-

In [12]:

def scrape_article(article_data):
    """
    Fetch one article page and extract its body text and publication date.

    Args:
        article_data: dict with the keys "url" (page to fetch) and "index"
            (position of the article in the caller's list, passed through
            unchanged so results can be mapped back).

    Returns:
        A tuple (index, text, date).
        - text: the <p> texts inside <div class="article-body"> joined with
          newlines, or None if that div is not found.
        - date: the part before "T" of the datetime attribute of the <time>
          tag inside <span class="article-date">, or None if not found.
        On a non-200 response or any exception the result is
        (index, None, None) and the reason is printed, so failed articles are
        visible instead of silently ending up with empty fields.
    """
    url = article_data["url"]
    idx = article_data["index"]  # passed through to keep track of the article

    # Defaults used when the corresponding element is missing
    article_text = None
    article_date = None

    try:
        # A timeout keeps one slow page from blocking the whole run
        response = requests.get(url, timeout=10)

        if response.status_code != 200:
            print(f"Error fetching article {url}: HTTP {response.status_code}")
            return idx, None, None

        soup = BeautifulSoup(response.text, 'html.parser')

        # ----- Extract article body -----
        article_div = soup.find('div', class_='article-body')
        if article_div:
            paragraphs = article_div.find_all('p')
            article_text = "\n".join(p.get_text(strip=True) for p in paragraphs)

        # ----- Extract article date -----
        date_span = soup.find('span', class_='article-date')
        if date_span:
            time_tag = date_span.find('time')
            if time_tag and time_tag.has_attr('datetime'):
                # Keep only the date part of a "<date>T<time>" value
                article_date = time_tag['datetime'].split("T")[0]

        return idx, article_text, article_date

    except Exception as e:
        # Best effort: one bad article must not abort the batch, but report it
        print(f"Error scraping article {url}: {e}")
        return idx, None, None

# --- 2. Setup Data for Processing ---
# Futures can complete in any order, so every task carries the position of its
# article in final_data; the result is written back to that position.
indexed_data = [
    {"index": position, "url": entry["url"]}
    for position, entry in enumerate(final_data)
]

# --- 3. Run in Parallel ---
# max_workers=1 keeps a single request in flight at a time; Fox rate-limits aggressively.

print(f"Scraping {len(indexed_data)} articles...")

with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
    # Queue one task per article
    futures = [executor.submit(scrape_article, task) for task in indexed_data]

    # Consume results as they complete; tqdm renders the progress bar
    completed = concurrent.futures.as_completed(futures)
    for finished in tqdm(completed, total=len(indexed_data)):
        idx, text, date = finished.result()

        # Write the scraped fields back onto the matching entry of final_data
        final_data[idx].update(body=text, date=date)


Scraping 4 articles...


100%|██████████| 4/4 [00:00<00:00,  4.05it/s]


In [13]:
# Append this run's scraped articles to the raw JSONL file, one JSON object per line
with open(raw_folder + raw_file, "a", encoding="utf-8") as raw_out:
    for article in final_data:
        # ensure_ascii=False keeps non-ASCII characters unescaped in the file
        print(json.dumps(article, ensure_ascii=False), file=raw_out)

In [16]:
from helper_functions.bedrock_article_analyser import is_related_to_economy, get_sentiment
# ==========================================
# 2. Main Execution Flow
# ==========================================
# For every scraped article: ask is_related_to_economy() whether the body is
# about the economy and, if so, ask get_sentiment() for a sentiment value.
# Articles that receive a sentiment are collected in processed_data.
# Each row is the same dict object that final_data holds, so the 'is_economy'
# and 'sentiment' keys added here are visible through final_data as well.

processed_data = []

for row in final_data:
    title = row.get('title')
    body = row.get('body')

    print(f"Checking: '{title}'")

    # A failed scrape leaves body as None (or empty); there is nothing to send
    # to the model, so skip the row instead of calling Bedrock with no text.
    if not body:
        print("  [!] No article body was scraped. Skipping.")
        print("-" * 40)
        continue

    # Step 1: Filter with Bedrock
    if is_related_to_economy(body):
        print("  [✓] Economy Related. Analyzing sentiment...")

        # Step 2: Analyze Sentiment with Bedrock
        sentiment_val = get_sentiment(body)

        if sentiment_val is not None:
            row['is_economy'] = True
            row['sentiment'] = sentiment_val

            processed_data.append(row)
            print(f"  -> Sentiment: {row['sentiment']} (0=NEGATIVE, 1=POSITIVE)")
        else:
            # Make the dropped row visible instead of skipping it silently
            print("  [!] No sentiment returned. Skipping.")
    else:
        print("  [x] Unrelated. Skipping.")
    print("-" * 40)

# Final Output
print("\n--- Final Results ---")
for item in processed_data:
    print(f"{item.get('date')} | {item.get('sentiment')} | {item.get('title')}")

Checking: 'NC Senate showdown escalates as Trump rallies behind Whatley to keep GOP seat'
  [✓] Economy Related. Analyzing sentiment...
  -> Sentiment: 1 (0=NEGATIVE, 1=POSITIVE
----------------------------------------
Checking: 'Commerce secretary predicts 'extraordinary year' ahead as inflation drops to 2.7% in November'
  [✓] Economy Related. Analyzing sentiment...
  -> Sentiment: 1 (0=NEGATIVE, 1=POSITIVE
----------------------------------------
Checking: 'Trump to hand out $2.6B in ‘warrior dividends’ — and the surprising pot he’s pulling the money from'
  [✓] Economy Related. Analyzing sentiment...
  -> Sentiment: 1 (0=NEGATIVE, 1=POSITIVE
----------------------------------------
Checking: 'Harvard economist says Trump inflation report leaves ‘no other way to spin it’ but good news'
  [✓] Economy Related. Analyzing sentiment...
  -> Sentiment: 1 (0=NEGATIVE, 1=POSITIVE
----------------------------------------

--- Final Results ---
2025-12-19 | 1 | NC Senate showdown escalates as

In [17]:
# Append every article of this run (all of final_data) to the analyzed JSONL file,
# one JSON object per line
with open(analyzed_folder+analyzed_file, 'a') as analyzed_out:
    analyzed_out.writelines(json.dumps(record) + '\n' for record in final_data)

In [18]:
from helper_functions.topic_scraper import econ_topic_scaper

# Run the project's topic helper on the analyzed JSONL file, passing the file's
# full path as well as its folder and file name separately.
analyzed_path = analyzed_folder + analyzed_file
econ_topic_scaper(analyzed_path, analyzed_folder, analyzed_file)

--- Loading data from: ../analyzed_data/fox_news_economic_sentiment.jsonl ---
Processing 550 rows...

--- Topic Totals (Articles containing topic) ---
TOPIC                | TOTAL
------------------------------
topic_taxes          | 436  
topic_housing        | 331  
topic_jobs           | 286  
topic_inflation      | 228  
topic_energy         | 220  
topic_stocks         | 79   
topic_crypto         | 11   
------------------------------

Saving to ../analyzed_data/fox_news_economic_sentiment.jsonl...
[✓] Done. File updated.


In [19]:
from helper_functions.upload_file_to_s3 import upload_file_to_s3

# Upload the analyzed JSONL file under the news_sentiments/fox/ key prefix
upload_file_to_s3(
    analyzed_folder + analyzed_file,
    "business-news-sentiments",
    f"news_sentiments/fox/{analyzed_file}",
)

✅ Success! Uploaded ../analyzed_data/fox_news_economic_sentiment.jsonl to s3://business-news-sentiments/news_sentiments/fox/fox_news_economic_sentiment.jsonl
