In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import json

# --- Configuration ---
# Page that is scraped, and the file the scraped result is written to as JSON.
URL = "https://www.wsj.com/market-data/stocks/us"
OUTPUT_FILENAME = "wsj_market_data.txt"
# Retry budgets: how many times WebDriver creation and the scraping pass are tried.
MAX_WEBDRIVER_SETUP_ATTEMPTS = 3
MAX_SCRAPING_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 5 # Delay between retries (used for both WebDriver setup and scraping)
# Timeout, in seconds, for the explicit waits on the key page elements.
WAIT_TIME_SECONDS = 20

# Maps the display name used in the output to the aria-label value of the
# table cell that identifies the corresponding row on the page.
ITEMS_TO_SCRAPE = {
    "New Highs": "Issues At New Highs",
    "New Lows": "Issues At New Lows",
    "Advancing Issues": "Issues Advancing",
    "Declining Issues": "Issues Declining"
}
# --- End Configuration ---

# --- Helper Function ---
def extract_row_data(soup_obj, aria_label_value):
    """
    Pull the two value cells from the table row identified by an aria-label.

    Looks up the first <td> whose 'aria-label' attribute equals
    aria_label_value, walks up to its enclosing <tr>, and reads the text of
    the second and third <td> cells of that row with commas removed.

    Args:
        soup_obj: Parsed document exposing BeautifulSoup's find() API.
        aria_label_value: Exact aria-label text of the row's label cell.

    Returns:
        {'value1': str, 'value2': str}, or None when the label cell or its
        row is missing, or when the row has fewer than three <td> cells.
    """
    anchor_td = soup_obj.find('td', attrs={'aria-label': aria_label_value})
    if not anchor_td:
        return None

    parent_tr = anchor_td.find_parent('tr')
    if not parent_tr:
        return None

    tds = parent_tr.find_all('td')
    if len(tds) < 3:
        return None

    # Cell 0 is the label; cells 1 and 2 carry the values.
    first, second = (td.get_text(strip=True).replace(',', '') for td in tds[1:3])
    return {'value1': first, 'value2': second}
# --- End Helper Function ---

# --- Main Script ---
# Shared state for the run: the browser handle (stays None until WebDriver
# setup succeeds) and the data from the first successful scraping attempt.
driver = None
final_data = {}

# Chrome options; the arguments are added in the order listed.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
):
    options.add_argument(chrome_flag)

# All browser work runs inside a single try/finally so the driver is closed
# no matter how setup or scraping ends. On success the scraped payload is
# left in `final_data` for the save step that follows this block.
try:
    # --- WebDriver Setup with Retries ---
    print("Setting up WebDriver...")
    for attempt in range(MAX_WEBDRIVER_SETUP_ATTEMPTS):
        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            print("WebDriver setup successful.")
            break  # Exit loop on successful setup
        except Exception as e_wd:
            print(f"WebDriver setup attempt {attempt + 1}/{MAX_WEBDRIVER_SETUP_ATTEMPTS} failed: {e_wd}")
            if attempt < MAX_WEBDRIVER_SETUP_ATTEMPTS - 1:
                print(f"Retrying WebDriver setup in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                print("Max retries reached for WebDriver setup. Exiting.")
                raise  # Re-raise the last exception; the outer handler reports it
    if not driver:
        # Only reachable when the setup loop never ran (zero attempts configured).
        print("Failed to initialize WebDriver. Exiting script.")
        # SystemExit is raised directly instead of calling exit(): exit() is a
        # convenience of the interactive interpreter, and a failed setup
        # should end with a non-zero status. The finally clause still runs.
        raise SystemExit(1)
    # --- End WebDriver Setup ---

    # --- Scraping Loop with Retries ---
    for scrape_attempt in range(MAX_SCRAPING_ATTEMPTS):
        print(f"\n--- Scraping Attempt {scrape_attempt + 1}/{MAX_SCRAPING_ATTEMPTS} ---")
        current_attempt_data = {}  # Reset data for this attempt

        try:
            print(f"Attempting to fetch URL: {URL}")
            driver.get(URL)

            print(f"Waiting up to {WAIT_TIME_SECONDS} seconds for page elements to load...")

            # --- Handle Cookie Banner ---
            # Best effort: a missing or unclickable banner is logged and ignored.
            try:
                cookie_button_id = 'onetrust-accept-btn-handler'
                cookie_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.ID, cookie_button_id))
                )
                cookie_button.click()
                print("Cookie banner accepted.")
                time.sleep(3)  # Give time for banner to disappear
            except Exception as e_cookie:
                print(f"No cookie banner found or could not click (ID: {cookie_button_id}): {e_cookie}")
            # --- End Cookie Handling ---

            # --- Wait for Key Elements ---
            # The "New Highs" label cell is used as the signal that the table rendered.
            key_table_element_xpath = f"//td[@aria-label='{ITEMS_TO_SCRAPE['New Highs']}']"
            WebDriverWait(driver, WAIT_TIME_SECONDS).until(
                EC.presence_of_element_located((By.XPATH, key_table_element_xpath))
            )
            print(f"Main table content (e.g., '{ITEMS_TO_SCRAPE['New Highs']}') seems to have loaded.")

            diary_timestamp_xpath = "//span[contains(@class, 'WSJBase--card__timestamp')]"
            WebDriverWait(driver, WAIT_TIME_SECONDS).until(
                EC.presence_of_element_located((By.XPATH, diary_timestamp_xpath))
            )
            print("Markets Diary timestamp element seems to have loaded.")
            time.sleep(2)  # Extra pause for full rendering
            # --- End Wait for Key Elements ---

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # --- Extract Market Timestamp ---
            market_timestamp_str = "N/A"
            # Match on a substring of the class attribute rather than the exact class list.
            timestamp_span = soup.find('span', class_=lambda x: x and 'WSJBase--card__timestamp' in x)
            if timestamp_span:
                market_timestamp_str = timestamp_span.get_text(strip=True)
                print(f"Extracted Market Timestamp: {market_timestamp_str}")
            else:
                print("Warning: Market timestamp span not found.")
            current_attempt_data['market_timestamp'] = market_timestamp_str
            # --- End Timestamp Extraction ---

            # --- Extract Table Data ---
            table_data = {}
            for display_name, aria_label in ITEMS_TO_SCRAPE.items():
                row_data = extract_row_data(soup, aria_label)
                if row_data:
                    table_data[display_name] = row_data
                else:
                    print(f"  Warning: Could not find or parse data for '{display_name}' (aria-label: '{aria_label}')")
            current_attempt_data['metrics'] = table_data
            # --- End Table Data Extraction ---

            # --- Check if data is sufficient ---
            is_timestamp_valid = current_attempt_data.get('market_timestamp', "N/A") != "N/A"
            are_metrics_present = bool(current_attempt_data.get('metrics'))

            # An attempt counts as successful if it found the timestamp OR at least one metric.
            if is_timestamp_valid or are_metrics_present:
                print(f"Data successfully scraped on attempt {scrape_attempt + 1}.")
                final_data = current_attempt_data  # Store successful data
                break  # Exit scraping loop on success
            else:
                print(f"Scraping attempt {scrape_attempt + 1} yielded empty data.")
                if scrape_attempt < MAX_SCRAPING_ATTEMPTS - 1:
                    print(f"Retrying in {RETRY_DELAY_SECONDS} seconds...")
                    time.sleep(RETRY_DELAY_SECONDS)
                else:
                    print("Max scraping attempts reached. No valid data obtained.")

        except Exception as e_scrape:
            # Report the exception bound by this handler. Referencing any other
            # name here would raise NameError and abort the remaining retries.
            print(f"An error occurred during scraping attempt {scrape_attempt + 1}: {e_scrape}")
            try:
                # Try to save page source only if driver is available
                if driver:
                    debug_filename = f"wsj_error_page_attempt_{scrape_attempt + 1}.html"
                    with open(debug_filename, "w", encoding="utf-8") as f_debug:
                        f_debug.write(driver.page_source)
                    print(f"Saved error page source to {debug_filename}")
            except Exception as e_save_debug:
                print(f"Could not save debug page source: {e_save_debug}")

            if scrape_attempt < MAX_SCRAPING_ATTEMPTS - 1:
                print(f"Retrying scraping in {RETRY_DELAY_SECONDS} seconds due to error...")
                time.sleep(RETRY_DELAY_SECONDS)
            else:
                print("Max scraping attempts reached after errors.")
        # No driver.quit() here, we reuse it for the next attempt or quit it in the outer finally

    # --- End Scraping Loop ---

except Exception as e_outer:
    print(f"An critical error occurred outside the scraping loop: {e_outer}")

finally:
    # Always release the browser, whether scraping succeeded or not.
    if driver:
        driver.quit()
        print("Browser closed.")

# --- Process and Save Final Data ---
# Print whatever was scraped and write it to OUTPUT_FILENAME as JSON; when
# neither a timestamp nor any metric was obtained, nothing is written.
print("\n--- Final Extracted Data ---")
scraped_timestamp = final_data.get('market_timestamp', 'N/A')
scraped_metrics = final_data.get('metrics')

if scraped_timestamp == "N/A" and not scraped_metrics:
    print("No significant data was extracted after all attempts. Nothing to save to file.")
else:
    print(f"Market Timestamp: {scraped_timestamp}")
    if scraped_metrics:
        for metric_name, metric_values in scraped_metrics.items():
            first_value = metric_values['value1']
            second_value = metric_values['value2']
            print(f"  {metric_name}: Value 1 = {first_value}, Value 2 = {second_value}")

    try:
        with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f_out:
            json.dump(final_data, f_out, indent=4, ensure_ascii=False)
    except IOError as e_io:
        print(f"Error saving data to file {OUTPUT_FILENAME}: {e_io}")
    except Exception as e_json:
        print(f"Error during JSON serialization: {e_json}")
    else:
        print(f"\nData successfully saved to {OUTPUT_FILENAME}")

print("Script finished.")

Setting up WebDriver...
WebDriver setup successful.
Attempting to fetch URL: https://www.wsj.com/market-data/stocks/us
Waiting up to 20 seconds for page elements to load...
No cookie banner found or could not click (ID: onetrust-accept-btn-handler): Message: 
Stacktrace:
	GetHandleVerifier [0x0074FC03+61635]
	GetHandleVerifier [0x0074FC44+61700]
	(No symbol) [0x005705D3]
	(No symbol) [0x005B899E]
	(No symbol) [0x005B8D3B]
	(No symbol) [0x00600E12]
	(No symbol) [0x005DD2E4]
	(No symbol) [0x005FE61B]
	(No symbol) [0x005DD096]
	(No symbol) [0x005AC840]
	(No symbol) [0x005AD6A4]
	GetHandleVerifier [0x009D4523+2701795]
	GetHandleVerifier [0x009CFCA6+2683238]
	GetHandleVerifier [0x009EA9EE+2793134]
	GetHandleVerifier [0x007668C5+155013]
	GetHandleVerifier [0x0076CFAD+181357]
	GetHandleVerifier [0x00757458+92440]
	GetHandleVerifier [0x00757600+92864]
	GetHandleVerifier [0x00741FF0+5296]
	BaseThreadInitThunk [0x75935D49+25]
	RtlInitializeExceptionChain [0x7731D03B+107]
	RtlGetAppContainerNamed

In [None]:
import json

def load_data_from_file(filepath="wsj_market_data.txt"):
    """
    Load JSON data from the given TXT file.

    Args:
        filepath: Path of the UTF-8 text file that contains JSON.

    Returns:
        The decoded JSON value, or None when the file is missing, is not
        valid JSON, or cannot be read for any other reason. A message
        describing the failure is printed in each of those cases.
    """
    payload = None
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            payload = json.loads(handle.read())
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from '{filepath}'. The file might be corrupted or not in JSON format.")
    except Exception as e:
        print(f"An unexpected error occurred while reading '{filepath}': {e}")
    return payload

if __name__ == "__main__":
    # Load the file written by the WSJ scraper and print its contents.
    market_data_filepath = "wsj_market_data.txt"
    loaded_data = load_data_from_file(market_data_filepath)

    if not loaded_data:
        print("Failed to load data from the file.")
    else:
        print("--- Successfully Loaded Market Data ---")

        # Read and print the market timestamp.
        market_time = loaded_data.get('market_timestamp', 'Timestamp not found in data')
        print(f"Market Timestamp: {market_time}")

        # Read and print the metrics, which are expected under the nested
        # 'metrics' key. If the file used a flat layout instead, the metric
        # dicts would sit beside 'market_timestamp' at the top level and
        # would have to be iterated from loaded_data directly.
        metrics_data = loaded_data.get('metrics')
        if not metrics_data:
            print("No metrics data found.")
        else:
            print("\nMetrics:")
            for metric_name, metric_values in metrics_data.items():
                first_value = metric_values.get('value1', 'N/A')
                second_value = metric_values.get('value2', 'N/A')
                print(f"  {metric_name}: Value 1 = {first_value}, Value 2 = {second_value}")

        # A single figure can be read with, for example,
        # metrics_data["New Highs"].get("value1").

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import json

# --- Global Settings ---
# Number of top-level scraping attempts and the pause between them.
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 10
# The output is written as JSON even though the file keeps a .txt extension.
OUTPUT_FILENAME = "cnn_fear_greed_data.txt"

# Chrome options (defined once); the arguments are added in the order listed.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
):
    options.add_argument(chrome_flag)
# An explicit window size ("--window-size=1920,1080") is currently not applied.

def scrape_cnn_data(attempt_num):
    """
    Scrape the CNN Fear & Greed index and its timestamp in a fresh browser.

    Starts its own Chrome WebDriver (up to three setup tries), loads the
    Fear & Greed page, makes a best-effort click on the cookie banner, waits
    for the gauge value and timestamp elements, and parses the page source
    with BeautifulSoup. The browser is always closed before returning.

    Args:
        attempt_num: Attempt number, used in log messages and in the names
            of the debug HTML files written on failure.

    Returns:
        A tuple (data_dict, success_bool).
        - Success: data_dict holds 'fear_greed_index' (an int when the text
          parses as one, otherwise the raw string) and 'market_timestamp';
          success_bool is True.
        - Incomplete extraction: the partial data_dict is returned with
          False and the page source is saved for debugging.
        - Any exception: ({}, False).
    """
    print(f"\n--- Scraping Attempt {attempt_num} of {MAX_RETRIES} ---")
    local_driver = None
    scraped_data = {}
    attempt_success = False

    try:
        print("Setting up WebDriver for this attempt...")
        for wd_setup_attempt in range(3):  # WebDriver setup retry loop
            try:
                local_driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
                print("WebDriver setup successful for this attempt.")
                break
            except Exception as e_wd:
                print(f"WebDriver setup (inner) attempt {wd_setup_attempt + 1} failed: {e_wd}")
                if wd_setup_attempt < 2:
                    print("Retrying WebDriver setup in 5 seconds...")
                    time.sleep(5)
                else:
                    print("Max retries reached for WebDriver setup for this attempt.")
                    # Caught by the outer handler of this function; chained so
                    # the underlying setup failure stays attached as the cause.
                    raise RuntimeError("WebDriver setup failed after multiple retries.") from e_wd

        if not local_driver:
            print("Failed to initialize WebDriver for this attempt. This attempt will fail.")
            return {}, False

        url = "https://www.cnn.com/markets/fear-and-greed"
        print(f"Attempting to fetch URL: {url}")
        local_driver.get(url)

        # Increased wait time as CNN can be slow and has dynamic content
        wait_time = 25
        print(f"Waiting up to {wait_time} seconds for page elements to load...")

        # --- Optional: Cookie Banner Handling ---
        # CNN's cookie banner can be tricky. This is a best-effort attempt.
        # If it fails or is not needed, scraping might still proceed.
        try:
            # Common cookie banner button ID, but might change or be in shadow DOM
            cookie_button_id = 'onetrust-accept-btn-handler'
            cookie_button_xpath = f"//button[@id='{cookie_button_id}']"

            # Sometimes the banner is within an iframe or shadow DOM,
            # making direct clicks hard. If this simple click fails,
            # the script will continue, hoping the banner doesn't obstruct.
            WebDriverWait(local_driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, cookie_button_xpath))
            ).click()
            print("Cookie banner likely accepted.")
            time.sleep(3)  # Give time for banner to disappear and page to settle
        except Exception as e_cookie:
            print(f"Could not click cookie banner (ID: {cookie_button_id}): {e_cookie}. Continuing...")
            # Try to scroll down a bit if banner is an overlay
            try:
                local_driver.execute_script("window.scrollBy(0, 200);")
                time.sleep(1)
            except Exception as e_scroll:
                # Still best effort, but only ordinary errors are swallowed so
                # that KeyboardInterrupt / SystemExit propagate.
                print(f"Scroll fallback after cookie banner failure did not work: {e_scroll}")

        # Wait for the specific element containing the Fear & Greed value
        target_value_class = "market-fng-gauge__dial-number-value"
        WebDriverWait(local_driver, wait_time).until(
            EC.presence_of_element_located((By.CLASS_NAME, target_value_class))
        )
        # Also wait for the timestamp element
        timestamp_class = "market-fng-gauge__timestamp"
        WebDriverWait(local_driver, wait_time).until(
            EC.presence_of_element_located((By.CLASS_NAME, timestamp_class))
        )
        print("Key elements (value and timestamp) seem to have loaded.")
        time.sleep(3)  # Extra pause for full rendering

        page_source = local_driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # --- Extract Fear & Greed Index Value ---
        fear_greed_index_str = "N/A"
        value_span = soup.find('span', class_=target_value_class)
        if value_span:
            fear_greed_index_str = value_span.get_text(strip=True)
            print(f"Extracted Fear & Greed Index String: {fear_greed_index_str}")
            try:
                scraped_data['fear_greed_index'] = int(fear_greed_index_str)
            except ValueError:
                # Keep the raw text so the value is not lost when it is not an integer.
                scraped_data['fear_greed_index'] = fear_greed_index_str
                print(f"Warning: Could not convert Fear & Greed Index '{fear_greed_index_str}' to an integer.")
        else:
            print(f"Warning: Fear & Greed index value span not found using class '{target_value_class}'.")
            scraped_data['fear_greed_index'] = "N/A"

        # --- Extract Market Timestamp ---
        market_timestamp_str = "N/A"
        timestamp_div = soup.find('div', class_=timestamp_class)
        if timestamp_div:
            market_timestamp_str = timestamp_div.get_text(strip=True)
            if "Last updated " in market_timestamp_str:  # Clean up the string
                market_timestamp_str = market_timestamp_str.replace("Last updated ", "").strip()
            print(f"Extracted Market Timestamp: {market_timestamp_str}")
        else:
            print(f"Warning: Market timestamp div not found using class '{timestamp_class}'.")
        scraped_data['market_timestamp'] = market_timestamp_str

        # --- Determine if this attempt was successful ---
        # Both the index value and the timestamp are required.
        if scraped_data.get('fear_greed_index', "N/A") != "N/A" and \
           scraped_data.get('market_timestamp', "N/A") != "N/A":
            print("Data extraction successful for this attempt.")
            attempt_success = True
        else:
            print("Data extraction was incomplete for this attempt.")
            # Save page source for debugging if extraction failed but page seemed to load
            debug_filename = f"cnn_extraction_fail_debug_attempt_{attempt_num}.html"
            try:
                with open(debug_filename, "w", encoding="utf-8") as f_debug:
                    f_debug.write(local_driver.page_source)
                print(f"Saved incomplete extraction page source to {debug_filename}")
            except Exception as e_save:
                print(f"Could not save debug page source for incomplete extraction: {e_save}")

        return scraped_data, attempt_success

    except Exception as e:
        print(f"An error occurred during scraping attempt {attempt_num}: {e}")
        if local_driver:  # If driver was initialized, try to save page source
            try:
                # Save page source for debugging if an error occurs
                debug_filename = f"cnn_error_page_debug_attempt_{attempt_num}.html"
                with open(debug_filename, "w", encoding="utf-8") as f_debug:
                    f_debug.write(local_driver.page_source)
                print(f"Saved error page source to {debug_filename}")
            except Exception as e_save:
                print(f"Could not save debug page source on error: {e_save}")
        return {}, False  # Return empty data and failure for this attempt
    finally:
        # Runs on every exit path so the browser is never left open.
        if local_driver:
            local_driver.quit()
            print(f"Browser closed for attempt {attempt_num}.")


# --- Main Execution Logic with Retries ---
if __name__ == "__main__":
    # Run scrape_cnn_data up to MAX_RETRIES times, keep the first successful
    # result, then print it and write it to OUTPUT_FILENAME as JSON.
    final_extracted_data = {}
    overall_success = False

    for attempt in range(1, MAX_RETRIES + 1):
        data_from_attempt, success_status = scrape_cnn_data(attempt)

        if not success_status:
            if attempt >= MAX_RETRIES:
                print("\nAll scraping attempts failed.")
            else:
                print(f"Attempt {attempt} failed. Retrying in {RETRY_DELAY_SECONDS} seconds...")
                time.sleep(RETRY_DELAY_SECONDS)
            continue

        final_extracted_data = data_from_attempt
        overall_success = True
        print(f"\nSuccessfully fetched data on attempt {attempt}.")
        break  # First success ends the retry loop

    print("\n--- Final Extracted Data ---")
    if not (overall_success and final_extracted_data):
        print("No significant data was extracted after all retries. Nothing to save to file.")
    else:
        timestamp_text = final_extracted_data.get('market_timestamp', 'N/A')
        index_value = final_extracted_data.get('fear_greed_index', 'N/A')
        print(f"Market Timestamp: {timestamp_text}")
        print(f"Fear & Greed Index: {index_value}")

        try:
            with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f_out:
                json.dump(final_extracted_data, f_out, indent=4, ensure_ascii=False)
        except IOError as e_io:
            print(f"Error saving data to file {OUTPUT_FILENAME}: {e_io}")
        except Exception as e_json:
            print(f"Error during JSON serialization or file writing: {e_json}")
        else:
            print(f"\nData successfully saved to {OUTPUT_FILENAME}")



ModuleNotFoundError: No module named 'webdriver_manager'