In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
import time

# Scrape CNN search results into cnn_headlines.csv.
#
# For each of the 18 result pages the search URL is loaded in Chrome, the
# rendered HTML is parsed with BeautifulSoup, and one CSV row
# (headline text, article link) is written per headline element found.
# An exception raised while loading or waiting on a page (e.g. the wait timing
# out) propagates, but the browser is always shut down first.
with open("cnn_headlines.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Headline", "Link"])  # header row

    driver = webdriver.Chrome()
    try:
        for page in range(1, 19):  # Pages 1 to 18
            # `from` is the zero-based offset of the first result (10 per page).
            # The original URL had `page==` (doubled "="), which sent "=<n>"
            # as the page value.
            url = f"https://edition.cnn.com/search?q=economy+inflation+finance+stock+market+&from={(page-1)*10}&size=10&page={page}&sort=relevance&types=article&section="
            driver.get(url)

            # Wait (up to 10 s) for at least one headline element to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "container__headline-text"))
            )

            # Small pause to ensure all content loads
            time.sleep(1)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            headlines = soup.find_all("span", class_="container__headline-text")

            for h in headlines:
                headline = h.get_text(strip=True)
                # URL of the article; None when the attribute is missing, in
                # which case the CSV cell is written empty.
                link = h.get("data-zjs-href")

                # Write to CSV
                writer.writerow([headline, link])

                print("Headline:", headline)
                print("Link:", link)
                print("-" * 80)
    finally:
        # Always release the browser/driver process, even if a page failed.
        driver.quit()

print("Scraping complete. Results saved to cnn_headlines.csv")


Headline: There’s an economic explanation for why everything feels so tense right now
Link: https://www.cnn.com/2025/10/28/business/economy-tense-nightcap
--------------------------------------------------------------------------------
Headline: As the shutdown drags on, here’s how it can drag down the economy
Link: https://www.cnn.com/2025/10/31/economy/us-government-shutdown-economy-snowball
--------------------------------------------------------------------------------
Headline: The government is back open. Here’s what that means for economic data
Link: https://www.cnn.com/2025/11/14/economy/federal-data-what-to-expect-economic-data
--------------------------------------------------------------------------------
Headline: Walmart just proved it’s America’s solution to the affordability crisis
Link: https://www.cnn.com/2025/11/20/business/walmart-economy-consumers
--------------------------------------------------------------------------------
Headline: What will happen next in this

In [59]:
import requests
from bs4 import BeautifulSoup
import csv
import os

# Download every article listed in cnn_headlines.csv and save its headline,
# link, publication date and body text to a second CSV.
#
# Articles that cannot be fetched (network error, timeout, HTTP error status)
# are reported and skipped instead of aborting the whole run.

# Input CSV with scraped links
input_file = "cnn_headlines.csv"

# Output CSV for full articles
# NOTE(review): absolute, machine-specific path -- consider a path relative to
# the notebook so it runs on other machines. Left unchanged here so existing
# consumers of this file keep working.
output_file = r"C:\Users\Enkhsaikhan\Data-engineering-scrap\Scaping_data\cnn_full_articles.csv"

# Seconds to wait for the server before giving up on one article.
# Without a timeout, requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 15

# Ensure the folder exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Read links from the input CSV.
# Empty Link cells are skipped: the csv module writes None as an empty string,
# and requesting "" would raise instead of fetching anything.
links = []
with open(input_file, mode="r", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        link = (row.get("Link") or "").strip()
        if link:
            links.append(link)

failed_count = 0

# Open output CSV to save full articles; one HTTP session is reused for all
# requests so connections to the same host can be kept alive.
with open(output_file, mode="w", newline="", encoding="utf-8") as file, \
        requests.Session() as session:
    writer = csv.writer(file)
    writer.writerow(["Headline", "Link", "Date", "Article Text"])  # header

    for url in links:
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx as failures rather than parsing an error page.
            response.raise_for_status()
        except requests.RequestException as exc:
            failed_count += 1
            print("Skipped article:", url)
            print("Reason:", exc)
            print("-" * 80)
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Headline from article page
        headline_tag = soup.find("h1")
        headline = headline_tag.get_text(strip=True) if headline_tag else None

        # Date
        date_tag = soup.find("div", class_="timestamp__published")
        time_tag = date_tag.find("time") if date_tag else None
        date = time_tag.get_text(strip=True) if time_tag else None

        # Article text
        article_div = soup.find("div", class_="article__content-container")
        if article_div:
            paragraphs = article_div.find_all("p")
            # get_text(strip=True) strips each text fragment and joins them
            # with no separator, gluing words together around inline tags
            # (links, <em>, ...). Collapse whitespace on the full text instead.
            article_text = "\n".join(
                " ".join(p.get_text().split()) for p in paragraphs
            )
        else:
            article_text = None

        # Write to CSV
        writer.writerow([headline, url, date, article_text])

        print("Saved article:", headline)
        print("Date:", date)
        print("-" * 80)

if failed_count:
    print(f"{failed_count} article(s) could not be downloaded and were skipped")
print(f"All articles saved to {output_file}")


Saved article: There’s an economic explanation for why everything feels so tense right now
Date: Oct 28, 2025, 5:00 AM ET
--------------------------------------------------------------------------------
Saved article: As the shutdown drags on, here’s how it can drag down the economy
Date: Oct 31, 2025, 5:30 AM ET
--------------------------------------------------------------------------------
Saved article: The government is back open. Here’s what that means for economic data
Date: Nov 14, 2025, 11:48 AM ET
--------------------------------------------------------------------------------
Saved article: Walmart just proved it’s America’s solution to the affordability crisis
Date: Nov 20, 2025, 7:13 AM ET
--------------------------------------------------------------------------------
Saved article: What will happen next in this topsy-turvy stock market? Choose your own Wall Street adventure!
Date: Oct 17, 2025, 12:40 PM ET
-----------------------------------------------------------------