In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import json
import pandas as pd
import concurrent.futures
import os
import time
import re
import random

Scaring links using key words

In [None]:
# ===============================
# CNN ARTICLE LINK SCRAPER
# ===============================
# -------------------------------
# -------------------------------
# 1. Keywords
# -------------------------------
key_words = [
    "inflation", "stocks", "economy", "tax", "taxes", "business", "finances", "financial policy", "economic policy",
    "fiscal policy", "GDP", "unemployment", "interest rates", "recession", "economic growth", "budget deficit", "trade deficit",
    "consumer spending", "investment", "monetary policy", "fiscal stimulus", "housing market", "labor market", "wages",
    "corporate earnings", "supply chain", "energy prices", "commodity prices", "financial markets",
    "stock market volatility", "economic outlook", "economic indicators", "central bank policy",
    "inflation expectations", "currency exchange rates", "credit markets", "business cycles", "economic uncertainty", "global economy",
    "economic reforms", "tax policy changes"
]

# -------------------------------
# 2. Paths
# -------------------------------
RAW_DATA_DIR = r"C:\Users\Enkhsaikhan\Final_paper_text_as_data\raw_data"
os.makedirs(RAW_DATA_DIR, exist_ok=True)

CNN_LINKS_FILE = os.path.join(RAW_DATA_DIR, "cnn_links.csv")

# -------------------------------
# 3. Load existing links (safe)
# -------------------------------
if os.path.exists(CNN_LINKS_FILE):
    seen_links = set(
        pd.read_csv(CNN_LINKS_FILE)["link"].dropna().tolist()
    )
else:
    seen_links = set()

unique_urls_list = []

# -------------------------------
# 4. Chrome setup
# -------------------------------
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

service = Service()  # Selenium Manager handles chromedriver

print("Launching Chrome...")

try:
    driver = webdriver.Chrome(options=chrome_options, service=service)
    driver.set_page_load_timeout(30)
    print("Chrome launched successfully.")

    # -------------------------------
    # 5. Scrape links
    # -------------------------------
    for key_word in key_words:
        search_url = (
            f"https://edition.cnn.com/search"
            f"?q={key_word}&from=0&size=100&page=1"
            f"&sort=newest&types=article&section="
        )

        print(f"Scraping keyword: {key_word}")

        try:
            driver.get(search_url)
        except Exception:
            print(f"  Timeout loading page for '{key_word}', continuing...")

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, "container__headline-text")
                )
            )
        except Exception:
            print(f"  No results found for '{key_word}'")
            continue

        soup = BeautifulSoup(driver.page_source, "html.parser")
        headlines = soup.find_all(
            "span", class_="container__headline-text"
        )

        new_count = 0

        for h in headlines:
            link = h.get("data-zjs-href")

            if not link:
                parent = h.find_parent("a")
                if parent:
                    link = parent.get("href")

            if not link:
                continue

            # Normalize CNN links
            if link.startswith("/"):
                link = "https://edition.cnn.com" + link

            if link not in seen_links:
                seen_links.add(link)
                unique_urls_list.append(link)
                new_count += 1

        print(f"  New links found: {new_count}")

except Exception as e:
    print(f"CRITICAL ERROR: {e}")

finally:
    if "driver" in locals():
        driver.quit()

# -------------------------------
# 6. Save links
# -------------------------------
if unique_urls_list:
    df_links = pd.DataFrame({"link": unique_urls_list})

    df_links.to_csv(
        CNN_LINKS_FILE,
        mode="a" if os.path.exists(CNN_LINKS_FILE) else "w",
        header=not os.path.exists(CNN_LINKS_FILE),
        index=False
    )

print("-" * 40)
print("Scraping complete.")
print(f"Total new links saved: {len(unique_urls_list)}")
print(f"File location: {CNN_LINKS_FILE}")
print("Preview:", unique_urls_list[:5])


Launching Chrome...
Chrome launched successfully.
Scraping keyword: inflation
  New links found: 96
Scraping keyword: stocks
  New links found: 95
Scraping keyword: economy
  New links found: 86
Scraping keyword: tax
  New links found: 82
Scraping keyword: taxes
  New links found: 24
Scraping keyword: business
  New links found: 40
Scraping keyword: finances
  New links found: 85
Scraping keyword: financial policy
  New links found: 72
Scraping keyword: economic policy
  New links found: 43
Scraping keyword: fiscal policy
  New links found: 87
Scraping keyword: GDP
  New links found: 62
Scraping keyword: unemployment
  New links found: 82
Scraping keyword: interest rates
  New links found: 82
Scraping keyword: recession
  New links found: 48
Scraping keyword: economic growth
  New links found: 25
Scraping keyword: budget deficit
  New links found: 58
Scraping keyword: trade deficit
  New links found: 67
Scraping keyword: consumer spending
  New links found: 53
Scraping keyword: investm

Scraping whole articles using scraped links

In [6]:
RAW_DATA_DIR = r"C:\Users\Enkhsaikhan\Final_paper_text_as_data\raw_data"
LINKS_FILE = os.path.join(RAW_DATA_DIR, "cnn_links.csv")
OUTPUT_FILE = os.path.join(RAW_DATA_DIR, "cnn_articles.csv")

In [9]:
urls = (
    pd.read_csv(LINKS_FILE)["link"]
    .dropna()
    .unique()
    .tolist()
)

print(f"Loaded {len(urls)} article links.")


Loaded 2357 article links.


In [11]:
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://edition.cnn.com/",
    "Connection": "keep-alive",
}

def scrape_article(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=15)
        if r.status_code != 200:
            return None

        soup = BeautifulSoup(r.text, "lxml")

        # ---- Date from URL ----
        m = re.search(r"/(\d{4})/(\d{2})/(\d{2})/", url)
        date = f"{m.group(1)}-{m.group(2)}-{m.group(3)}" if m else None

        # ---- Title ----
        title_tag = soup.find("h1")
        title = title_tag.get_text(strip=True) if title_tag else None

        # ---- Article body (multiple CNN layouts) ----
        paragraphs = []

        # Layout 1 (most common)
        article = soup.find("article")
        if article:
            paragraphs = article.find_all("p")

        # Layout 2
        if not paragraphs:
            body = soup.find("div", {"data-component-name": "article-body"})
            if body:
                paragraphs = body.find_all("p")

        # Layout 3 (legacy)
        if not paragraphs:
            body = soup.find("div", class_="article__content")
            if body:
                paragraphs = body.find_all("p")

        text = "\n".join(
            p.get_text(strip=True)
            for p in paragraphs
            if p.get_text(strip=True)
        )

        if not text:
            return None

        return {
            "title": title,
            "date": date,
            "body": text,
            "link": url
        }

    except Exception:
        return None
    
results = []

print("Scraping articles...")

for url in tqdm(urls):
    article = scrape_article(url)
    if article:
        results.append(article)

    # Random delay to avoid detection
    time.sleep(random.uniform(0.5, 1.2))

df = pd.DataFrame(results)
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")

print("-" * 40)
print(f"Scraping finished: {len(df)} articles saved.")
print(df.head())

Scraping articles...


100%|██████████| 2357/2357 [2:36:24<00:00,  3.98s/it]     


----------------------------------------
Scraping finished: 2272 articles saved.
                                               title        date  \
0  The US economy expanded at the fastest pace in...  2025-12-23   
1  Inflation cooled in November to 2.7%, but econ...  2025-12-18   
2  The No. 1 cause of America’s affordability pro...  2025-12-16   
3  The solution to America’s affordability proble...  2025-12-15   
4  Analysis: How the White House is using mislead...  2025-12-11   

                                                body  \
0  An initial reading of third-quarter gross dome...   
1  Inflation unexpectedly – and sharply – slowed ...   
2  America’scost-of-living problemis simple math:...   
3  The best way to fix Americans’ cost-of-living ...   
4  The year-over-yearinflation ratein January, th...   

                                                link  
0   https://www.cnn.com/2025/12/23/economy/us-gdp-q3  
1  https://www.cnn.com/2025/12/18/economy/us-cpi-...  
2  https