In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager # 自动化下载 chromedriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# 设置 Chrome 选项 (可选，但建议用于无头模式等)
options = webdriver.ChromeOptions()
options.add_argument('--headless') # 在后台运行，不打开浏览器窗口。调试时可以注释掉这行。
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")


# 使用 WebDriverManager 自动处理 chromedriver
driver = None # 初始化 driver 为 None，用于 finally 块
data = {}

# Helper function to extract data from a row (辅助函数：从行中提取数据)
def extract_row_data(soup_obj, aria_label_value):
    """
    根据 aria-label 查找对应的行，并提取 value1 和 value2。
    移除数字中的逗号。
    """
    label_cell = soup_obj.find('td', attrs={'aria-label': aria_label_value})
    if label_cell:
        row = label_cell.find_parent('tr')
        if row:
            cells = row.find_all('td')
            # 确保至少有3个 td (标签, 值1, 值2)
            if len(cells) >= 3:
                # 获取文本并移除可能的逗号和空格
                val1 = cells[1].get_text(strip=True).replace(',', '')
                val2 = cells[2].get_text(strip=True).replace(',', '')
                return {'value1': val1, 'value2': val2}
    return None # 如果未找到或解析失败，返回 None

# 定义需要抓取的项目及其对应的 aria-label
items_to_scrape = {
    "New Highs": "Issues At New Highs",
    "New Lows": "Issues At New Lows",
    "Advancing Issues": "Issues Advancing",   # 新增
    "Declining Issues": "Issues Declining" # 新增
}

try:
    print("Setting up WebDriver...")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    url = "https://www.wsj.com/market-data/stocks/us"
    print(f"Attempting to fetch URL: {url}")
    driver.get(url)

    wait_time = 20 # 秒
    print(f"Waiting up to {wait_time} seconds for content to load...")

    # --- 潜在步骤: 关闭 cookie 横幅 (如果出现) ---
    # 请根据实际情况调整或取消这部分代码
    # try:
    #     # 尝试查找并点击接受 cookie 的按钮。WSJ 的 ID 可能是 'onetrust-accept-btn-handler'
    #     cookie_button_id = 'onetrust-accept-btn-handler'
    #     cookie_button = WebDriverWait(driver, 10).until(
    #         EC.element_to_be_clickable((By.ID, cookie_button_id))
    #     )
    #     cookie_button.click()
    #     print("Cookie banner clicked (assumed accept).")
    #     time.sleep(2) # 等待页面响应
    # except Exception as e:
    #     print(f"No cookie banner found or could not click: {e}")
    # --- 结束潜在步骤 ---


    # 等待页面上的关键元素出现。我们等待 "New Highs" 标签出现，这表明相关的表格数据已加载。
    key_element_aria_label = "Issues At New Highs"
    key_element_xpath = f"//td[@aria-label='{key_element_aria_label}']"

    WebDriverWait(driver, wait_time).until(
        EC.presence_of_element_located((By.XPATH, key_element_xpath))
    )
    print(f"Content with '{key_element_aria_label}' seems to have loaded.")

    # 获取 JavaScript 执行后的页面源代码
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # 遍历定义的项目并抓取数据
    for display_name, aria_label in items_to_scrape.items():
        print(f"Attempting to extract data for: {display_name} (aria-label='{aria_label}')")
        row_data = extract_row_data(soup, aria_label)
        if row_data:
            data[display_name] = row_data
            print(f"  Successfully extracted: {row_data}")
        else:
            print(f"  Warning: Could not find or parse data for '{display_name}' (aria-label: '{aria_label}')")


except Exception as e:
    print(f"An error occurred during scraping: {e}")
    # 如果出错，可以保存页面源码以便调试
    # if driver:
    #     with open("wsj_error_page_combined.html", "w", encoding="utf-8") as f:
    #         f.write(driver.page_source)
    #     print("Saved error page source to wsj_error_page_combined.html")

finally:
    # 确保浏览器在任何情况下都被关闭
    if driver:
        driver.quit()
        print("Browser closed.")

print("\n--- Extracted Data ---")
output_filename="wsj_data.txt"
if data:
    for key, values in data.items():
        print(f"{key}: Value 1 = {values['value1']}, Value 2 = {values['value2']}")

    # --- 将数据保存到 TXT 文件 (JSON 格式) ---
    try:
        with open(output_filename, 'w', encoding='utf-8') as f_out:
            json.dump(data, f_out, indent=4, ensure_ascii=False) # indent 使 JSON 文件更易读
        print(f"\nData successfully saved to {output_filename}")
    except IOError as e_io:
        print(f"Error saving data to file {output_filename}: {e_io}")
    except Exception as e_json:
        print(f"Error during JSON serialization: {e_json}")
    # --- 保存结束 ---
else:
    print("No data was extracted. Nothing to save to file.")

Setting up WebDriver...
Attempting to fetch URL: https://www.wsj.com/market-data/stocks/us
Waiting up to 20 seconds for content to load...
Content with 'Issues At New Highs' seems to have loaded.
Attempting to extract data for: New Highs (aria-label='Issues At New Highs')
  Successfully extracted: {'value1': '48', 'value2': '70'}
Attempting to extract data for: New Lows (aria-label='Issues At New Lows')
  Successfully extracted: {'value1': '47', 'value2': '111'}
Attempting to extract data for: Advancing Issues (aria-label='Issues Advancing')
  Successfully extracted: {'value1': '1225', 'value2': '1823'}
Attempting to extract data for: Declining Issues (aria-label='Issues Declining')
  Successfully extracted: {'value1': '1537', 'value2': '2671'}
Browser closed.

--- Extracted Data ---
New Highs: Value 1 = 48, Value 2 = 70
New Lows: Value 1 = 47, Value 2 = 111
Advancing Issues: Value 1 = 1225, Value 2 = 1823
Declining Issues: Value 1 = 1537, Value 2 = 2671
Error during JSON serialization