In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import json
import boto3
import pandas as pd
import concurrent.futures


In [2]:
#Paths and other constants
raw_folder = "../raw_scraped_data/"
analyzed_folder = "../analyzed_data/"
raw_file = "cnn_economic_news.jsonl"
analyzed_file = "cnn_economic_sentiment.jsonl"
REGION = "eu-west-1"

In [6]:
key_words = ["price", "prices", "inflation", "cost", "costs", "market", "markets", "stocks", "economy", "money", "tax", "taxes", "business", "commodities", "finances", "financial policy", "economic policy", "fiscal policy", "GDP", "unemployment", "interest rates", "recession", "economic growth", "budget deficit", "trade deficit", "consumer spending", "investment", "monetary policy", "fiscal stimulus", "housing market", "labor market", "wages", "corporate earnings", "supply chain", "energy prices", "commodity prices", "financial markets", "stock market volatility", "economic outlook", "economic indicators", "central bank policy", "inflation expectations", "currency exchange rates", "debt levels", "credit markets", "business cycles", "economic uncertainty", "global economy", "economic reforms", "tax policy changes"]

# --- Setup Firefox Options ---
options = Options()
options.add_argument("--headless")
options.add_argument("--width=1920")
options.add_argument("--height=1080")
options.page_load_strategy = 'eager' 

# --- Setup Service ---
service = Service(log_output="geckodriver.log") 

# Lists to store data
unique_urls_list = [] # Stores just the unique links (strings)
already_analyzed_urls = pd.read_json(analyzed_folder + analyzed_file, lines=True)['link'].tolist()
seen_links = set()  # Set for fast duplicate checking
seen_links.update(already_analyzed_urls)  # Preload with already analyzed URLs

print("Attempting to launch Firefox...")

try:
    driver = webdriver.Firefox(options=options, service=service)
    driver.set_page_load_timeout(30) 
    print("Firefox launched successfully!")

    for key_word in key_words:
        # Note: We are scraping page 1 (size 100) for each keyword
        url = f"https://edition.cnn.com/search?q={key_word}&from=0&size=100&page=1&sort=newest&types=article&section="
        
        print(f"Scraping Keyword: {key_word}...")
        
        try:
            driver.get(url)
        except Exception:
            print(f"Page load timeout for '{key_word}' (continuing anyway)...")

        try:
            # Wait for headlines
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "container__headline-text"))
            )
        except Exception:
            print(f"No results found for {key_word}, skipping...")
            continue
        
        # Parse content
        soup = BeautifulSoup(driver.page_source, "html.parser")
        headlines = soup.find_all("span", class_="container__headline-text")

        new_count = 0
        for h in headlines:
            link = h.get("data-zjs-href") 
            
            # Fallback if link is not in the data attribute
            if not link:
                parent = h.find_parent("a")
                if parent:
                    link = parent.get("href")
            
            # --- UNIQUENESS CHECK ---
            if link and link not in seen_links:
                seen_links.add(link)
                
                # Add just the link to your simple list
                unique_urls_list.append(link)
                
                new_count += 1
        
        print(f"  Found {new_count} new unique links.")

except Exception as e:
    print(f"CRITICAL ERROR: {e}")

finally:
    if 'driver' in locals():
        driver.quit()

print("-" * 30)
print(f"Scraping complete.")
print(f"Total unique links collected: {len(unique_urls_list)}")

# Printing the first 5 links as a preview
print("Preview of unique links:", unique_urls_list[:5])

Attempting to launch Firefox...
Firefox launched successfully!
Scraping Keyword: price...
  Found 2 new unique links.
Scraping Keyword: prices...
  Found 0 new unique links.
Scraping Keyword: inflation...
  Found 0 new unique links.
Scraping Keyword: cost...
  Found 1 new unique links.
Scraping Keyword: costs...
  Found 1 new unique links.
Scraping Keyword: market...
  Found 0 new unique links.
Scraping Keyword: markets...
  Found 0 new unique links.
Scraping Keyword: stocks...
  Found 3 new unique links.
Scraping Keyword: economy...
  Found 1 new unique links.
Scraping Keyword: money...
  Found 0 new unique links.
Scraping Keyword: tax...
  Found 0 new unique links.
Scraping Keyword: taxes...
  Found 1 new unique links.
Scraping Keyword: business...
  Found 0 new unique links.
Scraping Keyword: commodities...
  Found 0 new unique links.
Scraping Keyword: finances...
  Found 1 new unique links.
Scraping Keyword: financial policy...
  Found 0 new unique links.
Scraping Keyword: economic

In [7]:
# --- 1. Setup Global Session ---
# Using a session creates a connection pool, speeding up requests to the same domain.
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
})

def process_url(url):
    """
    Scrapes a single URL and returns the data dictionary.
    """
    try:
        # Date extraction
        year = url[20:24]
        month = url[25:27]
        day = url[28:30]
        date = f"{year}-{month}-{day}"

        # Request
        response = session.get(url, timeout=10)
        
        # If the page errors out (e.g. 404), return partial data or skip
        if response.status_code != 200:
            return {"title": None, "date": date, "body": None, "link": url}

        soup = BeautifulSoup(response.text, "lxml")

        # Headline
        headline_tag = soup.find("h1")
        headline = headline_tag.get_text(strip=True) if headline_tag else None

        # Article text
        article_div = soup.find("div", class_="article__content-container")
        if article_div:
            paragraphs = article_div.find_all("p")
            article_text = "\n".join([p.get_text(strip=True) for p in paragraphs])
        else:
            article_text = None

        return {"title": headline, "date": date, "body": article_text, "link": url}

    except Exception as e:
        # Return None or a log object so the main loop knows it failed
        return None

# --- 2. Run Parallel Scrape ---

final_data = []

print(f"Scraping {len(unique_urls_list)} articles...")

with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit all tasks
    futures = [executor.submit(process_url, url) for url in unique_urls_list]
    
    # Process as they complete
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(unique_urls_list)):
        result = future.result()
        if result:
            final_data.append(result)

# --- 3. Append Data to JSONL ---
with open(raw_folder + raw_file, "a", encoding="utf-8") as f:
    for entry in final_data:
        json_line = json.dumps(entry, ensure_ascii=False)
        f.write(json_line + "\n")


print(f"All articles saved to {raw_folder + raw_file}")

Scraping 14 articles...


  0%|          | 0/14 [00:00<?, ?it/s]

100%|██████████| 14/14 [00:03<00:00,  3.54it/s]

All articles saved to ../raw_scraped_data/cnn_economic_news.jsonl





In [10]:
from helper_functions.bedrock_article_analyser import is_related_to_economy, get_sentiment
# ==========================================
# 2. Main Execution Flow
# ==========================================

processed_data = []

for row in final_data:
    title = row.get('title')
    body = row.get('body')
    
    print(f"Checking: '{title}'")
    
    # Step 1: Filter with Bedrock
    if is_related_to_economy(body):
        print("  [✓] Economy Related. Analyzing sentiment...")
        
        # Step 2: Analyze Sentiment with Bedrock
        sentiment_val = get_sentiment(body)
        
        if sentiment_val is not None:
            row['is_economy'] = True
            row['sentiment'] = sentiment_val
            
            processed_data.append(row)
            print(f"  -> Sentiment: {row['sentiment']} (0=NEGATIVE, 1=POSITIVE)")
    else:
        print("  [x] Unrelated. Skipping.")
    print("-" * 40)

# Final Output
print("\n--- Final Results ---")
for item in processed_data:
    print(f"{item.get('date')} | {item.get('sentiment')} | {item.get('title')}")

Checking: 'None'
  [x] Unrelated. Skipping.
----------------------------------------
Checking: 'Americans have a new thing to worry about: A stuck job market with no quick fix'
  [✓] Economy Related. Analyzing sentiment...
  -> Sentiment: 0 (0=NEGATIVE, 1=POSITIVE)
----------------------------------------
Checking: 'Congress leaves town until 2026, letting enhanced Obamacare tax credits expire in two weeks'
  [✓] Economy Related. Analyzing sentiment...
  -> Sentiment: 0 (0=NEGATIVE, 1=POSITIVE)
----------------------------------------
Checking: 'Trump debuted a new affordability script. It’s unclear to advisers whether he’ll stick to it'
  [✓] Economy Related. Analyzing sentiment...
  -> Sentiment: 0 (0=NEGATIVE, 1=POSITIVE)
----------------------------------------
Checking: 'None'
  [x] Unrelated. Skipping.
----------------------------------------
Checking: 'Big Tech can’t save your investments'
  [✓] Economy Related. Analyzing sentiment...
  -> Sentiment: 0 (0=NEGATIVE, 1=POSITIVE)
-

In [12]:
#Append final_data to analyzed_file

with open(analyzed_folder+analyzed_file, 'a') as f:
    for entry in final_data:
        f.write(json.dumps(entry) + '\n')


In [13]:
from helper_functions.topic_scraper import econ_topic_scaper
econ_topic_scaper(analyzed_folder + analyzed_file,analyzed_folder, analyzed_file)

--- Loading data from: ../analyzed_data/cnn_economic_sentiment.jsonl ---
Processing 2809 rows...

--- Topic Totals (Articles containing topic) ---
TOPIC                | TOTAL
------------------------------
topic_taxes          | 1938 
topic_inflation      | 1888 
topic_housing        | 1688 
topic_jobs           | 1328 
topic_energy         | 1291 
topic_stocks         | 839  
topic_crypto         | 109  
------------------------------

Saving to ../analyzed_data/cnn_economic_sentiment.jsonl...
[✓] Done. File updated.


In [14]:
from helper_functions.upload_file_to_s3 import upload_file_to_s3
upload_file_to_s3(analyzed_folder + analyzed_file,"business-news-sentiments", f"news_sentiments/cnn/{analyzed_file}")

✅ Success! Uploaded ../analyzed_data/cnn_economic_sentiment.jsonl to s3://business-news-sentiments/news_sentiments/cnn/cnn_economic_sentiment.jsonl
