In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import json # 导入 json 模块

# 设置 Chrome 选项
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")

driver = None
data = {} # Initialize data dictionary
output_filename = "wsj_market_data.txt"

def extract_row_data(soup_obj, aria_label_value):
    label_cell = soup_obj.find('td', attrs={'aria-label': aria_label_value})
    if label_cell:
        row = label_cell.find_parent('tr')
        if row:
            cells = row.find_all('td')
            if len(cells) >= 3:
                val1 = cells[1].get_text(strip=True).replace(',', '')
                val2 = cells[2].get_text(strip=True).replace(',', '')
                return {'value1': val1, 'value2': val2}
    return None

items_to_scrape = {
    "New Highs": "Issues At New Highs",
    "New Lows": "Issues At New Lows",
    "Advancing Issues": "Issues Advancing",
    "Declining Issues": "Issues Declining"
}

try:
    print("Setting up WebDriver...")
    for attempt in range(3):
        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            print("WebDriver setup successful.")
            break
        except Exception as e_wd:
            print(f"WebDriver setup attempt {attempt + 1} failed: {e_wd}")
            if attempt < 2:
                print("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                print("Max retries reached for WebDriver setup. Exiting.")
                raise

    if not driver:
        print("Failed to initialize WebDriver after multiple attempts.")
        exit()

    url = "https://www.wsj.com/market-data/stocks/us"
    print(f"Attempting to fetch URL: {url}")
    driver.get(url)

    wait_time = 20
    print(f"Waiting up to {wait_time} seconds for page elements to load...")

    # --- 尝试处理 Cookie 横幅 ---
    try:
        cookie_button_id = 'onetrust-accept-btn-handler'
        cookie_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, cookie_button_id))
        )
        cookie_button.click()
        print("Cookie banner likely accepted.")
        time.sleep(3)
    except Exception as e_cookie:
        print(f"No cookie banner found or could not click (ID: {cookie_button_id}): {e_cookie}")
    # --- Cookie 处理结束 ---

    # Wait for a key element from the main table to ensure it's loaded
    key_table_element_xpath = f"//td[@aria-label='{items_to_scrape['New Highs']}']"
    WebDriverWait(driver, wait_time).until(
        EC.presence_of_element_located((By.XPATH, key_table_element_xpath))
    )
    print(f"Main table content (e.g., '{items_to_scrape['New Highs']}') seems to have loaded.")

    # Also wait for the "Markets Diary" title or timestamp to be present
    # The timestamp span has class WSJBase--card__timestamp--3F2HxyAE
    # Its parent div has class WSJBase--card__title--3kQqqhNt
    diary_timestamp_xpath = "//span[contains(@class, 'WSJBase--card__timestamp')]" # More robust selector
    # Alternative, more specific if class is stable:
    # diary_timestamp_xpath = "//span[@class='WSJBase--card__timestamp--3F2HxyAE']"
    WebDriverWait(driver, wait_time).until(
        EC.presence_of_element_located((By.XPATH, diary_timestamp_xpath))
    )
    print("Markets Diary timestamp element seems to have loaded.")
    
    time.sleep(2) # Extra pause for full rendering

    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # --- Extract Market Timestamp ---
    market_timestamp_str = "N/A" # Default value
    timestamp_span = soup.find('span', class_='WSJBase--card__timestamp--3F2HxyAE')
    # You can also find the parent div first if needed for more context or stability:
    # diary_title_div = soup.find('div', class_='WSJBase--card__title--3kQqqhNt')
    # if diary_title_div:
    #     timestamp_span = diary_title_div.find('span', class_='WSJBase--card__timestamp--3F2HxyAE')

    if timestamp_span:
        market_timestamp_str = timestamp_span.get_text(strip=True)
        print(f"Extracted Market Timestamp: {market_timestamp_str}")
    else:
        print("Warning: Market timestamp span not found using class 'WSJBase--card__timestamp--3F2HxyAE'.")
    
    data['market_timestamp'] = market_timestamp_str # Add to data dictionary
    # --- Timestamp Extraction End ---

    # --- Extract Table Data ---
    table_data = {}
    for display_name, aria_label in items_to_scrape.items():
        # print(f"Attempting to extract data for: {display_name} (aria-label='{aria_label}')")
        row_data = extract_row_data(soup, aria_label)
        if row_data:
            table_data[display_name] = row_data
            # print(f"  Successfully extracted: {row_data}")
        else:
            print(f"  Warning: Could not find or parse data for '{display_name}' (aria-label: '{aria_label}')")
    
    data['metrics'] = table_data # Store table metrics under a sub-key for better organization
    # --- Table Data Extraction End ---

except Exception as e:
    print(f"An error occurred during scraping: {e}")
    if driver:
        try:
            with open("wsj_error_page_debug.html", "w", encoding="utf-8") as f_debug:
                f_debug.write(driver.page_source)
            print("Saved error page source to wsj_error_page_debug.html")
        except Exception as e_save:
            print(f"Could not save debug page source: {e_save}")

finally:
    if driver:
        driver.quit()
        print("Browser closed.")

print("\n--- Extracted Data (including Timestamp) ---")
if data.get('market_timestamp') != "N/A" or data.get('metrics'): # Check if any data was actually extracted
    print(f"Market Timestamp: {data.get('market_timestamp', 'N/A')}")
    if data.get('metrics'):
        for key, values in data['metrics'].items():
            print(f"  {key}: Value 1 = {values['value1']}, Value 2 = {values['value2']}")

    # --- 将数据保存到 TXT 文件 (JSON 格式) ---
    try:
        with open(output_filename, 'w', encoding='utf-8') as f_out:
            json.dump(data, f_out, indent=4, ensure_ascii=False)
        print(f"\nData successfully saved to {output_filename}")
    except IOError as e_io:
        print(f"Error saving data to file {output_filename}: {e_io}")
    except Exception as e_json:
        print(f"Error during JSON serialization: {e_json}")
    # --- 保存结束 ---
else:
    print("No significant data was extracted. Nothing to save to file.")

Setting up WebDriver...
WebDriver setup successful.
Attempting to fetch URL: https://www.wsj.com/market-data/stocks/us
Waiting up to 20 seconds for page elements to load...
No cookie banner found or could not click (ID: onetrust-accept-btn-handler): Message: 
Stacktrace:
	GetHandleVerifier [0x0074FC03+61635]
	GetHandleVerifier [0x0074FC44+61700]
	(No symbol) [0x005705D3]
	(No symbol) [0x005B899E]
	(No symbol) [0x005B8D3B]
	(No symbol) [0x00600E12]
	(No symbol) [0x005DD2E4]
	(No symbol) [0x005FE61B]
	(No symbol) [0x005DD096]
	(No symbol) [0x005AC840]
	(No symbol) [0x005AD6A4]
	GetHandleVerifier [0x009D4523+2701795]
	GetHandleVerifier [0x009CFCA6+2683238]
	GetHandleVerifier [0x009EA9EE+2793134]
	GetHandleVerifier [0x007668C5+155013]
	GetHandleVerifier [0x0076CFAD+181357]
	GetHandleVerifier [0x00757458+92440]
	GetHandleVerifier [0x00757600+92864]
	GetHandleVerifier [0x00741FF0+5296]
	BaseThreadInitThunk [0x75935D49+25]
	RtlInitializeExceptionChain [0x7731D03B+107]
	RtlGetAppContainerNamed

In [None]:
import json

def load_data_from_file(filepath="wsj_market_data.txt"):
    """
    从指定的 TXT 文件加载 JSON 数据。
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f_in:
            data = json.load(f_in)
        return data
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from '{filepath}'. The file might be corrupted or not in JSON format.")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while reading '{filepath}': {e}")
        return None

if __name__ == "__main__":
    market_data_filepath = "wsj_market_data.txt"
    loaded_data = load_data_from_file(market_data_filepath)

    if loaded_data:
        print("--- Successfully Loaded Market Data ---")
        
        # 读取并打印市场时间戳
        market_time = loaded_data.get('market_timestamp', 'Timestamp not found in data')
        print(f"Market Timestamp: {market_time}")

        # 读取并打印指标数据 (if you used the nested structure)
        metrics_data = loaded_data.get('metrics')
        if metrics_data:
            print("\nMetrics:")
            for key, values in metrics_data.items():
                print(f"  {key}: Value 1 = {values.get('value1', 'N/A')}, Value 2 = {values.get('value2', 'N/A')}")
        else:
            print("No metrics data found.")
            # If you didn't nest 'metrics', you'd iterate directly over loaded_data for those keys,
            # skipping 'market_timestamp'. Example for flat structure:
            # print("\nMetrics:")
            # for key, values in loaded_data.items():
            #     if key != 'market_timestamp' and isinstance(values, dict): # Check if it's metric data
            #         print(f"  {key}: Value 1 = {values.get('value1', 'N/A')}, Value 2 = {values.get('value2', 'N/A')}")


        # Example of accessing specific data:
        # if metrics_data and "New Highs" in metrics_data:
        #     new_highs_val1 = metrics_data["New Highs"].get("value1")
        #     print(f"\nSpecific access - New Highs Value 1: {new_highs_val1}")
            
    else:
        print("Failed to load data from the file.")

In [7]:
#贪婪恐惧指数cnn
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import json # 导入 json 模块

# 设置 Chrome 选项
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36")
# options.add_argument("--window-size=1920,1080") # 有时无头模式下需要指定窗口大小

driver = None
data = {} # 初始化数据字典
output_filename = "cnn_fear_greed_data.txt" # 修改输出文件名

# 注意：CNN页面结构相对简单，可能不需要复杂的extract_row_data函数，直接在主逻辑中提取

try:
    print("Setting up WebDriver...")
    for attempt in range(3):
        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            print("WebDriver setup successful.")
            break
        except Exception as e_wd:
            print(f"WebDriver setup attempt {attempt + 1} failed: {e_wd}")
            if attempt < 2:
                print("Retrying in 5 seconds...")
                time.sleep(5)
            else:
                print("Max retries reached for WebDriver setup. Exiting.")
                raise

    if not driver:
        print("Failed to initialize WebDriver after multiple attempts.")
        exit()

    url = "https://www.cnn.com/markets/fear-and-greed" # 修改URL
    print(f"Attempting to fetch URL: {url}")
    driver.get(url)

    wait_time = 10 # 增加等待时间，CNN页面有时加载较慢
    print(f"Waiting up to {wait_time} seconds for page elements to load...")

    # --- 尝试处理 Cookie 横幅 (CNN的Cookie横幅ID可能不同，需要检查) ---
    # CNN Cookie banner might be different. Common ones are related to 'onetrust'
    # Inspect the page to find the correct ID or class if this doesn't work.
    # Sometimes it's a button within a div with id 'onetrust-banner-sdk'
    # For CNN, it seems to be a button inside a shadow DOM or a more complex structure.
    # Let's try a general approach if a simple ID click fails.
    # Often, the cookie banner might not interfere with headless scraping of main content.
    # If it does, more advanced handling (like executing JavaScript to click) might be needed.
    # For now, we'll keep a simple attempt.
    try:
        # Try a common ID first
        cookie_button_id = 'onetrust-accept-btn-handler' # This is a common ID
        cookie_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, cookie_button_id))
        )
        cookie_button.click()
        print("Cookie banner (ID) likely accepted.")
        time.sleep(3) # Give time for banner to disappear
    except Exception as e_cookie_id:
        print(f"Could not click cookie banner by ID '{cookie_button_id}': {e_cookie_id}")
        # try:
        #     # Fallback: Look for a button with common text if ID fails (less reliable)
        #     # Note: This XPath is very generic and might click the wrong button on other pages
        #     cookie_button_xpath = "//button[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'accept') or contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agree')]"
        #     cookie_button = WebDriverWait(driver, 5).until(
        #         EC.element_to_be_clickable((By.XPATH, cookie_button_xpath))
        #     )
        #     cookie_button.click()
        #     print("Cookie banner (XPath Text) likely accepted.")
        #     time.sleep(3)
        # except Exception as e_cookie_xpath:
        #     print(f"No common cookie banner found or could not click (XPath Text): {e_cookie_xpath}")
        #     # If the cookie banner is persistent and problematic, you might need to investigate its shadow DOM structure
        #     # or find a more reliable selector. Often, headless mode bypasses some overlays.
        print("Continuing without explicit cookie banner click, hoping it doesn't interfere.")


    # Wait for the specific element containing the Fear & Greed value
    # The HTML provided: <span class="market-fng-gauge__dial-number-value">64</span>
    target_value_class = "market-fng-gauge__dial-number-value"
    WebDriverWait(driver, wait_time).until(
        EC.presence_of_element_located((By.CLASS_NAME, target_value_class))
    )
    print(f"Fear & Greed index value element (class: '{target_value_class}') seems to have loaded.")

    # Also wait for the timestamp element if desired
    # The timestamp is usually in a div like: <div class="market-fng-gauge__timestamp">Last updated ...</div>
    timestamp_class = "market-fng-gauge__timestamp"
    WebDriverWait(driver, wait_time).until(
        EC.presence_of_element_located((By.CLASS_NAME, timestamp_class))
    )
    print(f"Fear & Greed timestamp element (class: '{timestamp_class}') seems to have loaded.")

    time.sleep(2) # Extra pause for full rendering, especially if JS updates are slow

    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # --- Extract Fear & Greed Index Value ---
    fear_greed_index_str = "N/A" # Default value
    value_span = soup.find('span', class_=target_value_class)
    if value_span:
        fear_greed_index_str = value_span.get_text(strip=True)
        print(f"Extracted Fear & Greed Index String: {fear_greed_index_str}")
        try:
            data['fear_greed_index'] = int(fear_greed_index_str)
        except ValueError:
            data['fear_greed_index'] = fear_greed_index_str # Store as string if conversion fails
            print(f"Warning: Could not convert '{fear_greed_index_str}' to an integer.")
    else:
        print(f"Warning: Fear & Greed index value span not found using class '{target_value_class}'.")
        data['fear_greed_index'] = "N/A"
    # --- Value Extraction End ---

    # --- Extract Market Timestamp ---
    market_timestamp_str = "N/A" # Default value
    timestamp_div = soup.find('div', class_=timestamp_class)
    if timestamp_div:
        market_timestamp_str = timestamp_div.get_text(strip=True)
        # Clean up the timestamp string if needed, e.g., remove "Last updated "
        if "Last updated " in market_timestamp_str:
            market_timestamp_str = market_timestamp_str.replace("Last updated ", "").strip()
        print(f"Extracted Market Timestamp: {market_timestamp_str}")
    else:
        print(f"Warning: Market timestamp div not found using class '{timestamp_class}'.")

    data['market_timestamp'] = market_timestamp_str # Add to data dictionary
    # --- Timestamp Extraction End ---


except Exception as e:
    print(f"An error occurred during scraping: {e}")
    if driver:
        try:
            # Save page source for debugging if an error occurs
            debug_filename = "cnn_error_page_debug.html"
            with open(debug_filename, "w", encoding="utf-8") as f_debug:
                f_debug.write(driver.page_source)
            print(f"Saved error page source to {debug_filename}")
        except Exception as e_save:
            print(f"Could not save debug page source: {e_save}")

finally:
    if driver:
        driver.quit()
        print("Browser closed.")

print("\n--- Extracted Data ---")
# Check if any meaningful data was extracted
if data.get('fear_greed_index', "N/A") != "N/A" or data.get('market_timestamp', "N/A") != "N/A":
    print(f"Market Timestamp: {data.get('market_timestamp', 'N/A')}")
    print(f"Fear & Greed Index: {data.get('fear_greed_index', 'N/A')}")

    # --- 将数据保存到 TXT 文件 (JSON 格式) ---
    try:
        with open(output_filename, 'w', encoding='utf-8') as f_out:
            json.dump(data, f_out, indent=4, ensure_ascii=False)
        print(f"\nData successfully saved to {output_filename}")
    except IOError as e_io:
        print(f"Error saving data to file {output_filename}: {e_io}")
    except Exception as e_json: # Catch any other exceptions during JSON processing
        print(f"Error  during JSON serialization or file writing: {e_json}")
    # --- 保存结束 ---
else:
    print("No significant data was extracted. Nothing to save to file.")

Setting up WebDriver...
WebDriver setup successful.
Attempting to fetch URL: https://www.cnn.com/markets/fear-and-greed
Waiting up to 30 seconds for page elements to load...
Could not click cookie banner by ID 'onetrust-accept-btn-handler': Message: 
Stacktrace:
	GetHandleVerifier [0x00ADFC03+61635]
	GetHandleVerifier [0x00ADFC44+61700]
	(No symbol) [0x009005D3]
	(No symbol) [0x0094899E]
	(No symbol) [0x00948D3B]
	(No symbol) [0x00990E12]
	(No symbol) [0x0096D2E4]
	(No symbol) [0x0098E61B]
	(No symbol) [0x0096D096]
	(No symbol) [0x0093C840]
	(No symbol) [0x0093D6A4]
	GetHandleVerifier [0x00D64523+2701795]
	GetHandleVerifier [0x00D5FCA6+2683238]
	GetHandleVerifier [0x00D7A9EE+2793134]
	GetHandleVerifier [0x00AF68C5+155013]
	GetHandleVerifier [0x00AFCFAD+181357]
	GetHandleVerifier [0x00AE7458+92440]
	GetHandleVerifier [0x00AE7600+92864]
	GetHandleVerifier [0x00AD1FF0+5296]
	BaseThreadInitThunk [0x75935D49+25]
	RtlInitializeExceptionChain [0x7731D03B+107]
	RtlGetAppContainerNamedObjectPat