In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import boto3
from urllib.parse import urljoin
from tqdm import tqdm
import concurrent.futures

In [2]:
# Paths and shared settings used by the cells below.
# Folder the scraped (raw) JSONL file is written to and later read back from.
raw_folder = "../raw_scraped_data/"
# Folder the filtered / sentiment-annotated JSONL file is written to.
analyzed_folder = "../analyzed_data/"
# File names inside the two folders above (paths are built by plain string concatenation).
raw_file = "fox_news_economic_news.jsonl"
analyzed_file = "fox_news_economic_sentiment.jsonl"
# AWS region passed to the Bedrock runtime client.
REGION = "eu-west-1"

In [None]:
# ----------------------------------------------------
# 1. CONFIGURATION
# ----------------------------------------------------

# Listing-page URL template; the "{}" placeholder is filled with the page number via str.format()
SEARCH_URL_PATTERN = "https://www.foxnews.com/category/us/economy?page={}"
# Site root that article hrefs are joined onto (urljoin) to build full article URLs
BASE_DOMAIN = "https://www.foxnews.com"

# Upper bound on the number of listing pages the pagination loop will request.
MAX_PAGES_TO_TRY = 60

# ----------------------------------------------------
# 2. EXTRACTION FUNCTION (Handles a single page)
# ----------------------------------------------------

def scrape_fox_news_page(page_num):
    """Download one listing page and return stubs for its non-video articles.

    Args:
        page_num: Page number substituted into SEARCH_URL_PATTERN.

    Returns:
        A list of dicts with keys 'title', 'date', 'body' and 'url'. 'date' and
        'body' are always None here. The list is empty when the request fails
        or when the article-list container is not present in the page.
    """
    page_url = SEARCH_URL_PATTERN.format(page_num)
    print(f"-> Loading Page {page_num}: {page_url}")

    # Browser-like user agent sent with the request
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    try:
        response = requests.get(page_url, headers=request_headers, timeout=20)
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print(f"Error fetching page {page_num}: {exc}")
        return []

    # lxml parser, chosen for speed
    soup = BeautifulSoup(response.content, "lxml")

    # Wrapper element that holds every article teaser on the page
    container = soup.find('div', class_='content article-list')
    if not container:
        # Either the markup changed or this page has no listing at all
        print(f"Warning: Article list container not found on Page {page_num}. Stopping.")
        return []

    collected = []
    for teaser in container.find_all('article', class_='article'):
        # Eyebrow category label; only used to filter out video entries
        eyebrow = teaser.find('span', class_='eyebrow')
        if eyebrow and eyebrow.a:
            category = eyebrow.a.text.strip()
        else:
            category = "N/A"
        if "VIDEO" in category.upper():
            continue

        # Headline element and the link inside it; teasers lacking either are ignored
        headline = teaser.find('h4', class_='title')
        if not headline:
            continue
        anchor = headline.find('a', href=True)
        if not anchor:
            continue

        collected.append({
            'title': headline.text.strip(),
            'date': None,
            'body': None,
            # Relative hrefs are resolved against the site root
            'url': urljoin(BASE_DOMAIN, anchor.get('href')),
        })

    return collected

# ----------------------------------------------------
# 3. MAIN EXECUTION: LOOP THROUGH ALL PAGES
# ----------------------------------------------------

if __name__ == "__main__":

    # Every article stub collected across all listing pages; later cells fill in
    # the 'body' and 'date' fields of these dicts.
    final_data = []
    current_page = 1

    while current_page <= MAX_PAGES_TO_TRY:
        # Scrape the data for the current page
        page_data = scrape_fox_news_page(current_page)

        # An empty result means the page failed to load or had no article list,
        # so there is nothing further to paginate through. Without this check
        # the loop would keep requesting pages up to MAX_PAGES_TO_TRY.
        if not page_data:
            print(f"--- Page {current_page} returned no articles. Stopping pagination. ---")
            break

        # Add the collected articles to the final list
        final_data.extend(page_data)

        print(f"--- Finished processing Page {current_page}. Articles collected: {len(page_data)} ---")

        # Increment to the next page
        current_page += 1

In [None]:

def scrape_article(article_data):
    """
    Downloads one article page and extracts its body text and publication date.

    Args:
        article_data: dict with "url" (the article address) and "index" (the
            article's position in the caller's list, echoed back so results
            that finish out of order can be matched up again).

    Returns:
        A tuple (index, text, date).
        text is the <p> contents of the 'article-body' div joined with newlines;
        date is the part of the <time datetime="..."> value before "T".
        Either is None when the element is missing; both are None when the
        response status is not 200 or any error occurs while scraping.
    """
    url = article_data["url"]
    idx = article_data["index"] # We pass the index to keep track

    # Same browser-like user agent the listing-page scraper sends, so both
    # request types present themselves to the site consistently.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    # Initialize defaults
    article_text = None
    article_date = None

    try:
        # standard timeout is good practice
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            # Report why the article ends up without body/date instead of failing silently
            print(f"Skipping article {idx}: HTTP {response.status_code} for {url}")
            return idx, None, None

        soup = BeautifulSoup(response.text, 'html.parser')

        # ----- Extract article body -----
        article_div = soup.find('div', class_='article-body')
        if article_div:
            paragraphs = article_div.find_all('p')
            article_text = "\n".join(p.get_text(strip=True) for p in paragraphs)

        # ----- Extract article date -----
        date_span = soup.find('span', class_='article-date')
        if date_span:
            time_tag = date_span.find('time')
            if time_tag and time_tag.has_attr('datetime'):
                # Keep only the date portion in front of the "T" separator
                article_date = time_tag['datetime'].split("T")[0]

        return idx, article_text, article_date

    except Exception as e:
        # Best effort: one bad article must not abort the whole batch, but the
        # reason is printed so dropped articles can be diagnosed.
        print(f"Error scraping article {idx} ({url}): {e}")
        return idx, None, None

# --- 2. Setup Data for Processing ---
# Workers complete in arbitrary order, so every job carries the position of its
# article in final_data; results are written back using that position.
indexed_data = [
    {"index": position, "url": entry["url"]}
    for position, entry in enumerate(final_data)
]

# --- 3. Run in Parallel ---
# The pool runs at most 5 article downloads concurrently.

print(f"Scraping {len(indexed_data)} articles...")

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Queue one download job per article
    futures = [executor.submit(scrape_article, job) for job in indexed_data]

    # Consume results in completion order; tqdm renders the progress bar
    completed = concurrent.futures.as_completed(futures)
    for finished in tqdm(completed, total=len(indexed_data)):
        position, body_text, published = finished.result()

        # Fill in the matching entry of the original list in place
        target = final_data[position]
        target["body"] = body_text
        target["date"] = published


In [None]:
# Keep only articles that have a date and whose date string is >= "2025-01-01".
# The comparison is done on strings, so it relies on the dates being
# zero-padded year-month-day text.
final_data = [
    entry
    for entry in final_data
    if entry["date"] and entry["date"] >= "2025-01-01"
]

# Write the filtered raw data as JSON Lines: one article object per line.
with open(raw_folder + raw_file, 'w') as out_file:
    out_file.writelines(json.dumps(entry) + '\n' for entry in final_data)
print(f"\nUpdated {len(final_data)} articles.")

In [None]:
# ==========================================
# 1. Configuration & Client Initialization
# ==========================================

# Bedrock runtime client shared by both classifier functions below; it is
# created in REGION (Comprehend is no longer used).
bedrock_runtime = boto3.client('bedrock-runtime', region_name=REGION)

# Use the EU Inference Profile ID for Nova Micro
# (passed as modelId on every invoke_model call)
NOVA_MODEL_ID = "eu.amazon.nova-micro-v1:0"

def is_related_to_economy(title):
    """
    Uses Amazon Nova Micro to decide whether a news title is economy-related.

    Args:
        title: The news title to classify.

    Returns:
        True if the model's answer contains 'YES'.
        False if it does not, if title is empty/None (no request is sent),
        or if the Bedrock call or response parsing fails.
    """
    # Nothing to classify: avoid sending "Title: None" to the model
    if not title:
        return False

    # Nova models accept a specific system prompt structure.
    # Each fragment ends with a space so the sentences stay separated once the
    # adjacent string literals are concatenated.
    system_prompt = (
        "You are a news classifier. "
        "Your task is to analyze the user's news title and determine if it "
        "loosely relates to one or more of the following: Economy, Stock Market, Financial Policy, Business, Prices, Inflation, Markets, Commodities, Incomes, Tax, Taxes, Crypto, GDP, Employment, Trade. "
        "If you are not sure, default to YES. "
        "Respond with ONLY one word: 'YES' or 'NO'."
    )

    # Nova request payload structure
    payload = {
        "system": [{"text": system_prompt}],
        "messages": [
            {
                "role": "user",
                "content": [{"text": f"Title: {title}"}]
            }
        ],
        "inferenceConfig": {
            "max_new_tokens": 5,  # We only need a short YES/NO response
            "temperature": 0.0    # Set to 0 for deterministic (consistent) results
        }
    }

    try:
        response = bedrock_runtime.invoke_model(
            modelId=NOVA_MODEL_ID,
            body=json.dumps(payload)
        )

        # Parse Nova response body
        response_body = json.loads(response.get('body').read())

        # Navigate the Nova output structure
        # output -> message -> content -> list -> text
        model_answer = response_body["output"]["message"]["content"][0]["text"]

        # Clean and check answer
        clean_answer = model_answer.strip().upper()
        return "YES" in clean_answer

    except Exception as e:
        # Any failure is reported and treated as "not related"
        print(f"Error checking title '{title}': {e}")
        return False

def get_sentiment(text):
    """
    Uses Amazon Nova Micro to classify an article's tone about the US economy.

    Args:
        text: Article body. Only the first 10,000 characters are sent.

    Returns:
        1 if the model's answer contains 'POSITIVE',
        0 for any other answer (treated as negative),
        None if text is empty/None or the Bedrock call / response parsing fails.
    """
    if not text:
        return None

    # Truncate to a reasonable limit for the context window if needed
    truncated_text = text[:10000]

    # Each fragment ends with a space so the sentences stay separated once the
    # adjacent string literals are concatenated.
    system_prompt = (
        "You are a political analyst. "
        "Read the provided article text and answer the following question: "
        "'Is the article more positive or negative considering the current/future state of the US economy? Be sure to consider both explicit statements and implicit tones.' "
        "Respond with ONLY one of the following words: 'POSITIVE', 'NEGATIVE'. "
        "Do not provide any explanation."
    )

    payload = {
        "system": [{"text": system_prompt}],
        "messages": [
            {
                "role": "user",
                "content": [{"text": f"Article Text: {truncated_text}"}]
            }
        ],
        "inferenceConfig": {
            "max_new_tokens": 5,   # a single-word answer is all that is needed
            "temperature": 0.0     # deterministic output
        }
    }

    try:
        response = bedrock_runtime.invoke_model(
            modelId=NOVA_MODEL_ID,
            body=json.dumps(payload)
        )

        # output -> message -> content -> list -> text
        response_body = json.loads(response.get('body').read())
        model_answer = response_body["output"]["message"]["content"][0]["text"]
        clean_answer = model_answer.strip().upper()

        # Map text answer to integer
        if "POSITIVE" in clean_answer:
            return 1
        else:
            return 0 # Defaults to Negative for other outputs

    except Exception as e:
        # Failures are reported and surfaced as None so the caller can skip the row
        print(f"Bedrock Sentiment Error: {e}")
        return None

# ==========================================
# 2. Main Execution Flow
# ==========================================

# Load the raw scraped articles from disk (JSON Lines: one article object per line)
articles_table = []
# Ensure raw_folder and raw_file are defined in your environment
try:
    with open(raw_folder + raw_file, 'r') as f:
        for line in f:
            articles_table.append(json.loads(line))
except NameError:
    print("Error: raw_folder or raw_file not defined. Please define them before running.")
    articles_table = []

print(f"--- Processing {len(articles_table)} articles in {REGION} ---\n")

# Rows that passed the economy filter AND received a sentiment value.
# The row dicts are annotated in place, so articles_table sees the same
# 'is_economy' / 'sentiment' fields.
processed_data = []

for row in articles_table:
    title = row.get('title')
    body = row.get('body')

    print(f"Checking: '{title}'")

    # Step 1: Filter with Bedrock.
    # The classifier's prompt is written for a news *title*, so the title is
    # what gets classified; the full body is only used for sentiment below.
    if is_related_to_economy(title):
        print("  [✓] Economy Related. Analyzing sentiment...")

        # Step 2: Analyze Sentiment with Bedrock
        sentiment_val = get_sentiment(body)

        # None means there was no body or the model call failed: leave the row unflagged
        if sentiment_val is not None:
            row['is_economy'] = True
            row['sentiment'] = sentiment_val

            processed_data.append(row)
            print(f"  -> Sentiment: {row['sentiment']} (0=NEGATIVE, 1=POSITIVE)")
    else:
        print("  [x] Unrelated. Skipping.")
    print("-" * 40)

# Final Output
print("\n--- Final Results ---")
for item in processed_data:
    print(f"{item.get('date')} | {item.get('sentiment')} | {item.get('title')}")

In [None]:
# Build a frame from every loaded article, then keep only the rows whose
# 'is_economy' flag is True, and write them out as JSON Lines.
articles_df = pd.DataFrame(articles_table)
economy_mask = articles_df["is_economy"] == True
fox_news_economic_sentiment = articles_df[economy_mask]
output_path = analyzed_folder + analyzed_file
fox_news_economic_sentiment.to_json(output_path, orient="records", lines=True)

In [3]:
# Run the project's topic helper on the analyzed JSONL file. The first argument
# is the file to read; the second and third are a folder and file name that
# here point at that same file.
# NOTE(review): the recorded output suggests the helper adds topic_* columns and
# overwrites its input in place — confirm against helper_functions.topic_scraper.
from helper_functions.topic_scraper import econ_topic_scaper
econ_topic_scaper(analyzed_folder + analyzed_file,analyzed_folder, analyzed_file)

--- Loading data from: ../analyzed_data/fox_news_economic_sentiment.jsonl ---
Processing 456 rows...

--- Topic Totals (Articles containing topic) ---
TOPIC                | TOTAL
------------------------------
topic_taxes          | 382  
topic_jobs           | 259  
topic_housing        | 251  
topic_inflation      | 236  
topic_energy         | 202  
topic_stocks         | 70   
topic_crypto         | 10   
------------------------------

Saving to ../analyzed_data/fox_news_economic_sentiment.jsonl...
[✓] Done. File updated.


In [4]:
# Clear the combined-networks prefix in the "business-news-sentiments" bucket.
# NOTE(review): this is destructive and runs against a hard-coded bucket/prefix;
# presumably the combined output is regenerated by a downstream job — confirm
# before re-running this cell.
from helper_functions.delete_folder_contents_from_s3 import delete_folder_contents_from_s3
delete_folder_contents_from_s3("business-news-sentiments", "news_sentiments_monthly/combined_networks/")

Deleted batch of 1 files...
Folder cleared.


In [3]:
# Upload the analyzed JSONL file to the "business-news-sentiments" bucket under
# the key news_sentiments/fox/<analyzed_file>.
# NOTE(review): argument order looks like (local path, bucket, object key) — confirm
# against helper_functions.upload_file_to_s3.
from helper_functions.upload_file_to_s3 import upload_file_to_s3
upload_file_to_s3(analyzed_folder + analyzed_file,"business-news-sentiments", f"news_sentiments/fox/{analyzed_file}")

✅ Success! Uploaded ../analyzed_data/fox_news_economic_sentiment.jsonl to s3://business-news-sentiments/news_sentiments/fox/fox_news_economic_sentiment.jsonl
