In [None]:
import concurrent.futures
import json
import os
import re
from urllib.parse import quote_plus

import boto3
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm


In [2]:
# --- Shared configuration ---
# AWS region handed to the boto3 clients created further down.
REGION = "eu-west-1"

# Folders keep their trailing slash because file paths are built with plain
# string concatenation (folder + file name).
raw_folder, analyzed_folder = "../raw_scraped_data/", "../analyzed_data/"

# JSONL file names: scraped articles first, sentiment-annotated articles second.
raw_file, analyzed_file = "cnn_economic_news.jsonl", "cnn_economic_sentiment.jsonl"

In [None]:
# Search terms submitted one by one to CNN's site search.
key_words = [
    "price", "prices", "inflation", "cost", "costs", "market", "markets",
    "stocks", "economy", "money", "tax", "taxes", "business", "commodities",
    "finances", "financial policy", "economic policy", "fiscal policy", "GDP",
    "unemployment", "interest rates", "recession", "economic growth",
    "budget deficit", "trade deficit", "consumer spending", "investment",
    "monetary policy", "fiscal stimulus", "housing market", "labor market",
    "wages", "corporate earnings", "supply chain", "energy prices",
    "commodity prices", "financial markets", "stock market volatility",
    "economic outlook", "economic indicators", "central bank policy",
    "inflation expectations", "currency exchange rates", "debt levels",
    "credit markets", "business cycles", "economic uncertainty",
    "global economy", "economic reforms", "tax policy changes",
]

# --- Setup Firefox Options ---
options = Options()
options.add_argument("--headless")
options.add_argument("--width=1920")
options.add_argument("--height=1080")
options.page_load_strategy = 'eager'

# --- Setup Service ---
service = Service(log_output="geckodriver.log")

# Lists to store data
unique_urls_list = []  # Unique article links, in discovery order
seen_links = set()     # Same links, kept in a set for fast duplicate checks

# Bound before the try block so the `finally` clause never touches a driver
# object left over from an earlier run of this cell.
driver = None

print("Attempting to launch Firefox...")

try:
    driver = webdriver.Firefox(options=options, service=service)
    driver.set_page_load_timeout(30)
    print("Firefox launched successfully!")

    for key_word in key_words:
        # Only page 1 (up to 100 newest articles) is scraped per keyword.
        # quote_plus encodes multi-word keywords ("interest rates" -> "interest+rates").
        url = (
            f"https://edition.cnn.com/search?q={quote_plus(key_word)}"
            "&from=0&size=100&page=1&sort=newest&types=article&section="
        )

        print(f"Scraping Keyword: {key_word}...")

        try:
            driver.get(url)
        except Exception:
            # Best effort: the headlines may already be in the DOM even if the
            # page did not finish loading, so fall through to the wait below.
            print(f"Page load timeout for '{key_word}' (continuing anyway)...")

        try:
            # Wait for headlines
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "container__headline-text"))
            )
        except Exception:
            print(f"No results found for {key_word}, skipping...")
            continue

        # Parse content
        soup = BeautifulSoup(driver.page_source, "html.parser")
        headlines = soup.find_all("span", class_="container__headline-text")

        new_count = 0
        for h in headlines:
            link = h.get("data-zjs-href")

            # Fallback if the link is not in the data attribute
            if not link:
                parent = h.find_parent("a")
                if parent:
                    link = parent.get("href")

            # --- UNIQUENESS CHECK ---
            if link and link not in seen_links:
                seen_links.add(link)
                unique_urls_list.append(link)
                new_count += 1

        print(f"  Found {new_count} new unique links.")

except Exception as e:
    print(f"CRITICAL ERROR: {e}")

finally:
    # Always release the browser process, but only if it was actually started.
    if driver is not None:
        driver.quit()

print("-" * 30)
print("Scraping complete.")
print(f"Total unique links collected: {len(unique_urls_list)}")

# Printing the first 5 links as a preview
print("Preview of unique links:", unique_urls_list[:5])

In [None]:
# Keep only links with a "/2025/" path segment and drop everything from
# "cnn-underscored", CNN's section about deals and product reviews.
unique_urls_list = [
    candidate
    for candidate in unique_urls_list
    if "cnn-underscored" not in candidate and "/2025/" in candidate
]

len(unique_urls_list)

In [None]:
# --- 1. Setup Global Session ---
# A single Session is shared by every request so connections to the same
# domain are pooled and reused.
session = requests.Session()

# Present a browser-like User-Agent on every request made through the session.
session.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)

def process_url(url):
    """
    Download one article page and extract its headline, date and body text.

    Args:
        url: Article URL. The publication date is taken from the first
            ``/YYYY/MM/DD`` path segment, wherever it appears in the URL.

    Returns:
        A dict with the keys "title", "date", "body" and "link".
        "date" is "YYYY-MM-DD", or None when the URL has no date segment.
        "title" and "body" are None when the response status is not 200 or
        the corresponding element is missing from the page.
        Returns None (after printing the error) if anything raises, so the
        caller can simply skip failed articles.
    """
    try:
        # Locate the date by pattern rather than by fixed character offsets,
        # which only work for one specific scheme/host prefix.
        date_match = re.search(r"/(\d{4})/(\d{2})/(\d{2})(?:/|$)", url)
        date = "-".join(date_match.groups()) if date_match else None

        response = session.get(url, timeout=10)

        # Non-200 responses (e.g. 404) still yield a record, without content.
        if response.status_code != 200:
            return {"title": None, "date": date, "body": None, "link": url}

        soup = BeautifulSoup(response.text, "lxml")

        # Headline: first <h1> on the page.
        headline_tag = soup.find("h1")
        headline = headline_tag.get_text(strip=True) if headline_tag else None

        # Article text: every <p> inside the article container, one per line.
        article_div = soup.find("div", class_="article__content-container")
        if article_div:
            paragraphs = article_div.find_all("p")
            article_text = "\n".join(p.get_text(strip=True) for p in paragraphs)
        else:
            article_text = None

        return {"title": headline, "date": date, "body": article_text, "link": url}

    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller skip it.
        print(f"Failed to scrape {url}: {e}")
        return None

# --- 2. Run Parallel Scrape ---

final_data = []

# max_workers=10 is a safe starting point.
print(f"Scraping {len(unique_urls_list)} articles...")

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit all tasks
    futures = [executor.submit(process_url, url) for url in unique_urls_list]

    # Collect results as they complete; process_url returns None on failure,
    # and those entries are skipped.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        result = future.result()
        if result:
            final_data.append(result)

# --- 3. Save to JSONL ---
# Create the output folder first so a missing directory cannot throw away
# the results of the whole scrape.
if raw_folder:
    os.makedirs(raw_folder, exist_ok=True)

with open(raw_folder + raw_file, 'w', encoding="utf-8") as f:
    for article in final_data:
        f.write(json.dumps(article) + '\n')

print(f"All articles saved to {raw_folder + raw_file}")

In [None]:
# ==========================================
# 1. Configuration & Client Initialization
# ==========================================

# Nova Micro is addressed through its EU inference profile ID.
NOVA_MODEL_ID = "eu.amazon.nova-micro-v1:0"

# Bedrock runtime client used for every model invocation in this notebook.
bedrock_runtime = boto3.client(
    'bedrock-runtime',
    region_name=REGION,
)

def is_related_to_economy(title):
    """
    Ask Amazon Nova Micro whether a news title loosely relates to the economy.

    Args:
        title: Headline text to classify. An empty or None title is treated
            as unrelated and no model call is made.

    Returns:
        True when the model's reply contains "YES".
        False when it does not, when `title` is empty, or when the Bedrock
        call or response parsing raises (the error is printed).
    """
    # Nothing to classify: avoid sending "Title: None" to the model.
    if not title:
        return False

    # Each fragment ends with a space so the concatenated sentences stay separated.
    system_prompt = (
        "You are a news classifier. "
        "Your task is to analyze the user's news title and determine if it "
        "loosely relates to one or more of the following: Economy, Stock Market, Financial Policy, Business, Prices, Inflation, Markets, Commodities, Incomes, Tax, Crypto, GDP, Employment, Trade. "
        "If you are not sure, default to YES. "
        "Respond with ONLY one word: 'YES' or 'NO'."
    )

    # Nova request payload structure
    payload = {
        "system": [{"text": system_prompt}],
        "messages": [
            {
                "role": "user",
                "content": [{"text": f"Title: {title}"}]
            }
        ],
        "inferenceConfig": {
            "max_new_tokens": 5,  # We only need a short YES/NO response
            "temperature": 0.0    # Set to 0 for deterministic (consistent) results
        }
    }

    try:
        response = bedrock_runtime.invoke_model(
            modelId=NOVA_MODEL_ID,
            body=json.dumps(payload)
        )

        # Parse Nova response body
        response_body = json.loads(response.get('body').read())

        # Navigate the Nova output structure:
        # output -> message -> content -> list -> text
        model_answer = response_body["output"]["message"]["content"][0]["text"]

        # Substring match tolerates trailing punctuation or whitespace in the reply.
        clean_answer = model_answer.strip().upper()
        return "YES" in clean_answer

    except Exception as e:
        print(f"Error checking title '{title}': {e}")
        return False

def get_sentiment(text):
    """
    Ask Amazon Nova Micro whether an article is positive or negative about
    the current/future state of the US economy.

    Args:
        text: Article body. Only the first 10,000 characters are sent.

    Returns:
        1 when the model's reply contains "POSITIVE".
        0 when the model's reply contains "NEGATIVE".
        None when `text` is empty, when the reply contains neither word, or
        when the Bedrock call or response parsing raises (the problem is
        printed in the latter two cases).
    """
    if not text:
        return None

    # Truncate to a reasonable limit for the context window
    truncated_text = text[:10000]

    # Each fragment ends with a space so the concatenated sentences stay separated.
    system_prompt = (
        "You are a political analyst. "
        "Read the provided article text and answer the following question: "
        "'Is the article more positive or negative considering the current/future state of the US economy? Be sure to consider both explicit statements and implicit tones.' "
        "Respond with ONLY one of the following words: 'POSITIVE', 'NEGATIVE'. "
        "Do not provide any explanation."
    )

    payload = {
        "system": [{"text": system_prompt}],
        "messages": [
            {
                "role": "user",
                "content": [{"text": f"Article Text: {truncated_text}"}]
            }
        ],
        "inferenceConfig": {
            "max_new_tokens": 5,
            "temperature": 0.0
        }
    }

    try:
        response = bedrock_runtime.invoke_model(
            modelId=NOVA_MODEL_ID,
            body=json.dumps(payload)
        )

        response_body = json.loads(response.get('body').read())
        model_answer = response_body["output"]["message"]["content"][0]["text"]
        clean_answer = model_answer.strip().upper()

        # Map the text answer to an integer label.
        if "POSITIVE" in clean_answer:
            return 1
        if "NEGATIVE" in clean_answer:
            return 0

        # An unrecognised reply is reported instead of being counted as negative.
        print(f"Bedrock Sentiment Error: unexpected answer '{model_answer}'")
        return None

    except Exception as e:
        print(f"Bedrock Sentiment Error: {e}")
        return None

# ==========================================
# 2. Main Execution Flow
# ==========================================

# Load the scraped articles: one JSON object per line of the raw JSONL file.
articles_table = []
try:
    with open(raw_folder + raw_file, 'r', encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if line.strip():
                articles_table.append(json.loads(line))
except NameError:
    print("Error: raw_folder or raw_file not defined. Please define them before running.")
    articles_table = []

print(f"--- Processing {len(articles_table)} articles in {REGION} ---\n")

processed_data = []

for row in articles_table:
    title = row.get('title')
    body = row.get('body')

    print(f"Checking: '{title}'")

    # Step 1: Filter with Bedrock. The classifier and its prompt are written
    # for headlines, so it receives the title rather than the article body.
    if is_related_to_economy(title):
        print("  [✓] Economy Related. Analyzing sentiment...")

        # Step 2: Analyze Sentiment with Bedrock
        sentiment_val = get_sentiment(body)

        if sentiment_val is not None:
            # The row dict is annotated in place, so articles_table also
            # carries these keys for the rows that were kept.
            row['is_economy'] = True
            row['sentiment'] = sentiment_val

            processed_data.append(row)
            print(f"  -> Sentiment: {row['sentiment']} (0=NEGATIVE, 1=POSITIVE)")
    else:
        print("  [x] Unrelated. Skipping.")
    print("-" * 40)

# Final Output
print("\n--- Final Results ---")
for item in processed_data:
    print(f"{item.get('date')} | {item.get('sentiment')} | {item.get('title')}")

In [None]:
articles_df = pd.DataFrame(articles_table)

# "is_economy" and "sentiment" exist only on rows that passed both Bedrock
# steps; if no row passed, the columns are missing entirely and the filter
# below would raise KeyError.
for required_column in ("is_economy", "sentiment"):
    if required_column not in articles_df.columns:
        articles_df[required_column] = None

# `== True` (instead of plain truthiness) also excludes the missing-value
# placeholders that unflagged rows get in this column.
cnn_economic_sentiment = articles_df.loc[articles_df["is_economy"] == True]

# Make sure the destination folder exists before writing the JSONL file.
if analyzed_folder:
    os.makedirs(analyzed_folder, exist_ok=True)
cnn_economic_sentiment.to_json(analyzed_folder+analyzed_file, orient="records", lines=True)

In [None]:
# Row count of the economy-related subset, followed by the sum of its
# "sentiment" column (displayed as the cell output).
print(cnn_economic_sentiment.shape[0])
cnn_economic_sentiment["sentiment"].sum()

In [None]:
# Destination bucket, object key and local source path for the upload.
S3_WIKI_BUCKET = "business-news-sentiments"
file_key_s3 = "news_sentiments/cnn/" + analyzed_file
file_path_local = f"{analyzed_folder}{analyzed_file}"

# S3 client in the same region as the other AWS clients.
s3 = boto3.client('s3', region_name=REGION)

# Upload straight from disk; any failure is reported rather than raised.
try:
    s3.upload_file(file_path_local, S3_WIKI_BUCKET, file_key_s3)
except Exception as e:
    print(f"❌ Error uploading file: {e}")
else:
    print(f"✅ Success! Uploaded {file_path_local} to s3://{S3_WIKI_BUCKET}/{file_key_s3}")