# 初始化设置与工具函数

In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
import random
import pandas as pd
import os
import socket

# 配置Chrome浏览器
options = webdriver.ChromeOptions()
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 添加用户配置，减少被识别为爬虫的概率
options.add_argument(r"C:\Users\ZhaoQY\Anaconda3\chromedriver.exe")  # 替换为你的Chrome用户数据路径


# 本地chromedriver路径（根据你的环境修改）
chromedriver_path = r'C:\Users\ZhaoQY\Anaconda3\chromedriver.exe'
if not os.path.exists(chromedriver_path):
    raise FileNotFoundError(f"未找到chromedriver: {chromedriver_path}")

# 目标URL
base_url = "https://xueqiu.com/hq/detail?market=CN&first_name=0&second_name=0&type=sh_sz"

# 检查网络连接
def check_internet_connection():
    try:
        socket.create_connection(("8.8.8.8", 53), timeout=3)
        print("网络连接正常")
        return True
    except OSError:
        print("网络连接失败，请检查网络设置")
        return False

# 验证URL是否可访问
def verify_url_accessible(url):
    try:
        domain = url.split('/')[2]
        socket.gethostbyname(domain)
        print(f"域名 {domain} 解析正常")
        return True
    except Exception as e:
        print(f"域名解析失败: {e}")
        return False

# 随机等待时间，模拟人类浏览行为
def random_sleep(min_seconds=2, max_seconds=5):
    sleep_time = random.uniform(min_seconds, max_seconds)
    time.sleep(sleep_time)
    return sleep_time

# 股票信息提取函数

In [28]:
# 评论分页函数（兼容所有调用方式，彻底去除页数限制）
def extract_all_comments(driver, *args, **kwargs):  # 兼容任意参数，避免调用错误
    all_comments = []
    retry_count = 0
    page = 1
    
    # 通用路径模板
    content_xpath_templates = [
        "{article}/div[2]/div[2]/div/div/div",
        "{article}/div[2]/div[2]/div[1]/div/div",
        "{article}/div[2]/div[2]/div[1]/div[1]/div",
        "{article}/div[2]/div[2]/div/div",
        "{article}/div[2]/div[2]/div"
    ]
    
    expand_xpath_templates = [
        "{article}/div[2]/div[2]/div/div/a",
        "{article}/div[2]/div[2]/div[1]/div/a",
        "{article}/div[2]/div[2]/div[1]/div[1]/a",
        "{article}/div[2]/div[2]/div/a"
    ]
    
    while retry_count < 3:
        try:
            comment_container_xpath = "//*[@id='app']/div[2]/div[2]/div[8]/div[3]"
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, comment_container_xpath))
            )
            print("✅ 评论区容器已加载")
            
            while True:  # 无页数限制，仅通过下一页按钮控制
                print(f"\n===== 开始处理评论第{page}页 =====")
                page_comments = []
                
                for comment_idx in range(1, 11):
                    try:
                        article_xpath = f"//*[@id='app']/div[2]/div[2]/div[8]/div[3]/article[{comment_idx}]"
                        print(f"\n----- 尝试提取第{page}页第{comment_idx}条 -----")
                        print(f"基础路径: {article_xpath}")
                        
                        # 等待评论元素加载
                        try:
                            WebDriverWait(driver, 10).until(
                                EC.presence_of_element_located((By.XPATH, article_xpath))
                            )
                        except TimeoutException:
                            print(f"ℹ️ 第{comment_idx}条评论未加载，跳过")
                            continue
                        
                        # 尝试点击展开按钮
                        expand_button = None
                        for exp_template in expand_xpath_templates:
                            exp_xpath = exp_template.format(article=article_xpath)
                            buttons = driver.find_elements(By.XPATH, exp_xpath)
                            if buttons and buttons[0].is_displayed():
                                expand_button = buttons[0]
                                print(f"✅ 找到展开按钮（匹配模板：{exp_template}）")
                                break
                        
                        if expand_button:
                            try:
                                driver.execute_script("arguments[0].click();", expand_button)
                                print("✅ 已点击展开按钮")
                                time.sleep(random.uniform(0.8, 1.2))
                            except:
                                print("⚠️ 展开按钮点击失败，继续提取")
                        
                        # 提取评论内容
                        comment_content = None
                        for cont_template in content_xpath_templates:
                            cont_xpath = cont_template.format(article=article_xpath)
                            print(f"尝试内容模板: {cont_template}")
                            elements = driver.find_elements(By.XPATH, cont_xpath)
                            if elements and elements[0].text.strip():
                                comment_content = elements[0].text.strip()
                                print(f"✅ 提取到内容（长度: {len(comment_content)}）")
                                break
                        
                        if comment_content:
                            page_comments.append({
                                "页码": page,
                                "评论序号": comment_idx,
                                "内容": comment_content
                            })
                            print(f"✅ 第{page}页第{comment_idx}条提取完成")
                        else:
                            print(f"ℹ️ 第{comment_idx}条未匹配到有效内容路径")
                    
                    except Exception as e:
                        print(f"❌ 第{comment_idx}条处理失败: {e}")
                        continue
                
                if page_comments:
                    all_comments.extend(page_comments)
                    print(f"✅ 第{page}页共提取 {len(page_comments)} 条评论")
                else:
                    print(f"⚠️ 第{page}页未提取到评论，尝试翻页")
                
                # 定位下一页按钮
                next_button_xpath = "//*[@id='app']/div[2]/div[2]/div[8]/div[4]/a[contains(text(), '下一页')]"
                print(f"尝试定位第{page}页下一页按钮（通过文字：{next_button_xpath}）")
                
                try:
                    next_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, next_button_xpath))
                    )
                    print(f"✅ 找到第{page}页下一页按钮（文本：{next_button.text.strip()}）")
                except TimeoutException:
                    print("❌ 未找到“下一页”按钮，已到最后一页")
                    break
                
                # 检查按钮是否不可用
                is_disabled = False
                try:
                    btn_class = next_button.get_attribute("class")
                    if "disabled" in btn_class or "gray" in btn_class:
                        is_disabled = True
                        print("⚠️ 下一页按钮不可用（已到最后一页）")
                except:
                    pass
                if is_disabled:
                    break
                
                # 点击下一页
                driver.execute_script("arguments[0].click();", next_button)
                print(f"✅ 已点击“下一页”按钮，前往第{page+1}页")
                
                # 等待新页面加载
                try:
                    WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.XPATH, comment_container_xpath))
                    )
                    print(f"✅ 第{page+1}页加载完成")
                except TimeoutException:
                    print(f"⚠️ 第{page+1}页加载超时，重试")
                    next_button.click()
                    time.sleep(3)
                
                page += 1
                time.sleep(random.uniform(2, 4))
            
            if all_comments:
                break
            else:
                retry_count += 1
                print(f"⚠️ 未提取到评论，第{retry_count}次重试...")
                time.sleep(5)
        
        except Exception as e:
            retry_count += 1
            print(f"❌ 评论提取出错，第{retry_count}次重试: {e}")
            driver.refresh()
            time.sleep(5)
    
    print(f"\n===== 评论提取完成，共提取 {len(all_comments)} 条评论 =====")
    return all_comments
    

In [21]:

# 1. 提取股票基本信息
def extract_stock_basic_info(driver):
    try:
        # 雪球股票详情页基本信息定位（兼容不同结构）
        name_code_xpaths = [
            "//div[@class='stock-name']",
            "//h1[@class='name']",
            "//div[@class='stock-title']"
        ]
        price_change_xpaths = [
            "//div[@class='price-change']",
            "//span[@class='change']",
            "//div[@class='percent']"
        ]
        
        # 提取名称和代码
        name_code_text = ""
        for xpath in name_code_xpaths:
            try:
                element = WebDriverWait(driver, 8).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
                name_code_text = element.text.strip()
                break
            except:
                continue
        
        # 解析股票代码（格式：股票名称(SH60XXXX) 或 股票名称(SZ30XXXX)）
        code_match = re.search(r'\((SH|SZ)\d{6}\)', name_code_text)
        stock_code = code_match.group().replace("(", "").replace(")", "") if code_match else ""
        pure_name = name_code_text.replace(f"({stock_code})", "").strip() if code_match else name_code_text
        
        # 提取涨跌幅
        change_text = ""
        for xpath in price_change_xpaths:
            try:
                element = driver.find_element(By.XPATH, xpath)
                change_text = element.text.strip()
                break
            except:
                continue
        
        return {
            "股票名称": pure_name,
            "股票代码": stock_code,
            "涨跌幅": change_text
        }
    
    except Exception as e:
        print(f"提取基本信息出错: {e}")
        return {"股票名称": "", "股票代码": "", "涨跌幅": ""}


# 2. 评论分页爬取（带进度记忆恢复功能）
def extract_all_comments(driver, stock_name, stock_code):
    all_comments = []
    max_retry = 3
    retry_count = 0
    last_page = 1
    last_comment_idx = 0
    main_window = driver.current_window_handle
    
    # 路径模板（保持不变）
    content_xpath_templates = [
        "{article}/div[2]/div[2]/div/div/div",
        "{article}/div[2]/div[2]/div[1]/div/div",
        "{article}/div[2]/div[2]/div[1]/div[1]/div",
        "{article}/div[2]/div[2]/div/div",
        "{article}/div[2]/div[2]/div"
    ]
    
    expand_xpath_templates = [
        "{article}/div[2]/div[2]/div/div/a",
        "{article}/div[2]/div[2]/div[1]/div/a",
        "{article}/div[2]/div[2]/div[1]/div[1]/a",
        "{article}/div[2]/div[2]/div/a"
    ]
    
    while retry_count < max_retry:
        try:
            if not driver.window_handles:
                raise NoSuchWindowException("页面已关闭")
            
            comment_container_xpath = "//*[@id='app']/div[2]/div[2]/div[8]/div[3]"
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, comment_container_xpath))
            )
            print("✅ 评论区容器已加载")
            
            # 关键修复：初始化current_page（解决未赋值错误）
            current_page = last_page  # 从上次记录的页码开始
            
            # 恢复进度
            if last_page > 1 or last_comment_idx > 0:
                print(f"⏳ 恢复进度：第{last_page}页，从第{last_comment_idx + 1}条开始")
                # 跳转到目标页码（仅当current_page < last_page时需要跳转）
                while current_page < last_page:
                    next_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), '下一页')]"))
                    )
                    driver.execute_script("arguments[0].click();", next_button)
                    WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.XPATH, comment_container_xpath))
                    )
                    current_page += 1
                    print(f"已跳转至第{current_page}页")
            
            # 爬取评论
            while True:
                print(f"\n===== 开始处理{stock_name}评论第{current_page}页 =====")
                page_comments = []
                
                # 从上次中断位置继续
                start_idx = last_comment_idx + 1 if current_page == last_page else 1
                for comment_idx in range(start_idx, 11):
                    try:
                        article_xpath = f"//*[@id='app']/div[2]/div[2]/div[8]/div[3]/article[{comment_idx}]"
                        print(f"\n----- 尝试提取第{current_page}页第{comment_idx}条 -----")
                        
                        # 等待评论元素加载（延长等待时间至15秒）
                        try:
                            WebDriverWait(driver, 15).until(
                                EC.presence_of_element_located((By.XPATH, article_xpath))
                            )
                        except TimeoutException:
                            print(f"ℹ️ 第{comment_idx}条评论未加载，跳过")
                            continue
                        
                        # 尝试点击展开按钮
                        expand_button = None
                        for exp_template in expand_xpath_templates:
                            exp_xpath = exp_template.format(article=article_xpath)
                            buttons = driver.find_elements(By.XPATH, exp_xpath)
                            if buttons and buttons[0].is_displayed():
                                expand_button = buttons[0]
                                print(f"✅ 找到展开按钮（匹配模板：{exp_template}）")
                                break
                        
                        if expand_button:
                            try:
                                driver.execute_script("arguments[0].click();", expand_button)
                                time.sleep(random.uniform(1.5, 2.5))  # 延长等待，模拟人工
                            except:
                                print("⚠️ 展开按钮点击失败，继续提取")
                        
                        # 提取评论内容
                        comment_content = None
                        for cont_template in content_xpath_templates:
                            cont_xpath = cont_template.format(article=article_xpath)
                            elements = driver.find_elements(By.XPATH, cont_xpath)
                            if elements and elements[0].text.strip():
                                comment_content = elements[0].text.strip()
                                print(f"✅ 提取到内容（长度: {len(comment_content)}）")
                                break
                        
                        if comment_content:
                            page_comments.append({
                                "页码": current_page,
                                "评论序号": comment_idx,
                                "内容": comment_content
                            })
                            print(f"✅ 第{current_page}页第{comment_idx}条提取完成")
                            last_comment_idx = comment_idx  # 更新进度
                            last_page = current_page  # 同步页码记录
                        else:
                            print(f"ℹ️ 第{comment_idx}条未匹配到有效内容路径")
                    
                    except Exception as e:
                        print(f"❌ 第{comment_idx}条处理失败: {e}")
                        continue
                
                if page_comments:
                    all_comments.extend(page_comments)
                    print(f"✅ 第{current_page}页共提取 {len(page_comments)} 条评论")
                
                # 定位下一页按钮
                next_button_xpath = "//*[@id='app']/div[2]/div[2]/div[8]/div[4]/a[contains(text(), '下一页')]"
                try:
                    next_button = WebDriverWait(driver, 15).until(  # 延长等待时间
                        EC.element_to_be_clickable((By.XPATH, next_button_xpath))
                    )
                except TimeoutException:
                    print("❌ 未找到“下一页”按钮，已到最后一页")
                    break
                
                # 检查按钮是否不可用
                is_disabled = False
                try:
                    btn_class = next_button.get_attribute("class")
                    if "disabled" in btn_class or "gray" in btn_class:
                        is_disabled = True
                        print("⚠️ 下一页按钮不可用（已到最后一页）")
                except:
                    pass
                if is_disabled:
                    break
                
                # 点击下一页（增加延迟，避免过快操作）
                driver.execute_script("arguments[0].click();", next_button)
                print(f"✅ 已点击“下一页”按钮，前往第{current_page + 1}页")
                time.sleep(random.uniform(3, 5))  # 延长翻页间隔，减少反爬
                
                # 等待新页面加载
                try:
                    WebDriverWait(driver, 20).until(  # 延长加载等待
                        EC.presence_of_element_located((By.XPATH, comment_container_xpath))
                    )
                    print(f"✅ 第{current_page + 1}页加载完成")
                except TimeoutException:
                    print(f"⚠️ 第{current_page + 1}页加载超时，重试点击下一页")
                    next_button.click()
                    time.sleep(5)
                
                current_page += 1
                last_comment_idx = 0  # 新页面从第0条开始
            
            if all_comments:
                break
            else:
                retry_count += 1
                print(f"⚠️ 未提取到评论，第{retry_count}次重试当前股票...")
                time.sleep(5)
        
        except Exception as e:
            retry_count += 1
            print(f"❌ 评论提取出错，第{retry_count}次重试: {e}")
            
            # 处理会话失效（核心修复）
            if "session" in str(e).lower() or "window" in str(e).lower():
                print(f"⚠️ 会话失效/页面关闭，尝试重新初始化浏览器...")
                # 重新初始化浏览器
                driver.quit()
                options = webdriver.ChromeOptions()
                options.add_argument("--start-maximized")
                options.add_experimental_option("excludeSwitches", ["enable-automation"])
                options.add_experimental_option("useAutomationExtension", False)
                driver = webdriver.Chrome(options=options)
                driver.get(base_url)
                time.sleep(5)
                print("请重新完成登录/验证（30秒内）...")
                time.sleep(30)  # 给用户时间手动处理验证
                # 重新打开股票页面
                try:
                    stock_link = WebDriverWait(driver, 15).until(
                        EC.element_to_be_clickable((By.XPATH, f"//*[contains(text(), '{stock_name}')]/ancestor::a"))
                    )
                    stock_link.click()
                    time.sleep(3)
                    driver.switch_to.window(driver.window_handles[-1])
                    main_window = driver.window_handles[0]
                except:
                    print(f"❌ 重新打开{stock_name}失败，跳过")
                    return all_comments
            else:
                driver.refresh()
                time.sleep(5)
    
    print(f"\n===== {stock_name}评论提取完成，共提取 {len(all_comments)} 条评论 =====")
    return all_comments

# 3. 保存数据到Excel
def save_to_excel(all_stock_data):
    try:
        excel_data = []
        for stock in all_stock_data:
            basic_info = stock["基本信息"]
            comments = stock["评论"]
            
            for comment in comments:
                row = {
                    "股票代码": basic_info.get("股票代码", ""),
                    "股票名称": basic_info.get("股票名称", ""),
                    "涨跌幅": basic_info.get("涨跌幅", ""),
                    "评论页码": comment.get("页码", ""),
                    "评论序号": comment.get("评论序号", ""),
                    "评论内容": comment.get("内容", "")
                }
                excel_data.append(row)
        
        # 创建DataFrame并保存
        df = pd.DataFrame(excel_data)
        excel_path = "雪球股票评论.xlsx"
        df.to_excel(excel_path, index=False)
        print(f"\n成功保存 {len(excel_data)} 条评论数据到 {excel_path}")
        
    except Exception as e:
        print(f"保存Excel失败: {e}")



# 主爬虫程序

In [None]:
def main():
    all_stock_data = []
    driver = None
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    # 增加反爬参数（模拟真实浏览器）
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.7204.185 Safari/537.36")
    
    try:
        if not check_internet_connection():
            raise ConnectionError("网络连接失败")
        if not verify_url_accessible(base_url):
            raise ConnectionError("目标URL无法访问")
        
        # 初始化浏览器
        service = Service(chromedriver_path)
        driver = webdriver.Chrome(service=service, options=options)
        driver.maximize_window()
        driver.get(base_url)
        time.sleep(random.uniform(5, 8))  # 延长初始加载时间
        
        # 处理验证（明确提示滑动验证）
        print("请检查浏览器窗口，如有滑动验证请手动完成，完成后按Enter继续...")
        input("确认验证完成后按Enter继续...")
        time.sleep(random.uniform(3, 5))
        
        # 爬取参数
        total_stock_pages = 2
        stocks_per_page = 5
        max_stock_retries = 2  # 单支股票重试次数
        
        for stock_page in range(1, total_stock_pages + 1):
            print(f"\n===== 正在爬取股票列表第{stock_page}页 =====")
            
            try:
                if stock_page > 1:
                    try:
                        page_button = WebDriverWait(driver, 15).until(
                            EC.element_to_be_clickable((By.XPATH, f"//div[@class='hq-pager']//a[text()='{stock_page}']"))
                        )
                        page_button.click()
                    except:
                        print(f"⚠️ 定位第{stock_page}页失败，尝试下一页按钮")
                        next_page_btn = WebDriverWait(driver, 10).until(
                            EC.element_to_be_clickable((By.XPATH, "//div[@class='hq-pager']//a[contains(text(), '下一页')]"))
                        )
                        next_page_btn.click()
                    time.sleep(random.uniform(5, 8))  # 延长分页等待
                
                for stock_idx in range(1, stocks_per_page + 1):
                    stock_retry = 0
                    stock_success = False
                    
                    while stock_retry <= max_stock_retries and not stock_success:
                        try:
                            print(f"\n----- 正在爬取第{stock_page}页第{stock_idx}支股票（第{stock_retry + 1}次尝试） -----")
                            
                            # 检查会话有效性（核心修复）
                            if not driver.session_id or not driver.window_handles:
                                print("⚠️ 会话失效，重新初始化浏览器...")
                                driver.quit()
                                driver = webdriver.Chrome(service=service, options=options)
                                driver.get(base_url)
                                time.sleep(5)
                                print("请重新完成登录/验证（30秒）...")
                                time.sleep(30)  # 给时间处理验证
                            
                            # 定位股票链接
                            stock_link = WebDriverWait(driver, 20).until(  # 延长等待
                                EC.element_to_be_clickable((By.XPATH,
                                    f"//*[@id='app']/div/div[2]/div/div/div[2]/div[2]/div[1]/div/table/tbody/tr[{stock_idx}]/td[2]/span/a"))
                            )
                            stock_name = stock_link.text
                            print(f"正在处理股票: {stock_name}")
                            
                            stock_link.click()
                            time.sleep(random.uniform(5, 8))  # 延长打开时间
                            
                            if len(driver.window_handles) > 1:
                                driver.switch_to.window(driver.window_handles[-1])
                            else:
                                print("⚠️ 未打开新窗口，使用当前窗口")
                            
                            # 提取基本信息
                            stock_basic = extract_stock_basic_info(driver)
                            if not stock_basic["股票名称"]:
                                stock_basic["股票名称"] = stock_name
                            print(f"股票基本信息: {stock_basic}")
                            
                            # 提取评论
                            stock_comments = extract_all_comments(driver, stock_basic["股票名称"], stock_basic["股票代码"])
                            print(f"共提取到{len(stock_comments)}条评论（当前股票）")
                            
                            # 保存数据
                            stock_data = {
                                "基本信息": stock_basic,
                                "评论": stock_comments
                            }
                            all_stock_data.append(stock_data)
                            stock_success = True
                            
                            # 关闭详情页
                            if len(driver.window_handles) > 1:
                                driver.close()
                                driver.switch_to.window(driver.window_handles[0])
                            time.sleep(random.uniform(3, 5))
                            
                        except Exception as e:
                            stock_retry += 1
                            print(f"处理第{stock_page}页第{stock_idx}支股票时出错（第{stock_retry}次）: {e}")
                            
                            # 出错后恢复会话
                            try:
                                if driver and len(driver.window_handles) > 1:
                                    driver.close()
                                    driver.switch_to.window(driver.window_handles[0])
                                time.sleep(3)
                            except:
                                pass
                            
                            if stock_retry > max_stock_retries:
                                print(f"⚠️ 已达最大重试次数，跳过当前股票")
        
            except Exception as e:
                print(f"爬取第{stock_page}页出错: {e}")
                driver.refresh()
                time.sleep(5)
                continue
        
        save_to_excel(all_stock_data)
        print("所有爬取任务已完成")
        
    except Exception as e:
        print(f"爬取过程出错：{e}")
    finally:
        if driver:
            driver.quit()
            print("浏览器已关闭")

if __name__ == "__main__":
    main()

网络连接正常
域名 xueqiu.com 解析正常
请检查浏览器窗口，如有滑动验证请手动完成，完成后按Enter继续...


确认验证完成后按Enter继续... 



===== 正在爬取股票列表第1页 =====

----- 正在爬取第1页第1支股票（第1次尝试） -----
正在处理股票: N天富龙
股票基本信息: {'股票名称': 'N天富龙', '股票代码': '', '涨跌幅': ''}
✅ 评论区容器已加载

===== 开始处理N天富龙评论第1页 =====

----- 尝试提取第1页第1条 -----
ℹ️ 第1条评论未加载，跳过

----- 尝试提取第1页第2条 -----
ℹ️ 第2条评论未加载，跳过

----- 尝试提取第1页第3条 -----
ℹ️ 第3条评论未加载，跳过

----- 尝试提取第1页第4条 -----
ℹ️ 第4条评论未加载，跳过

----- 尝试提取第1页第5条 -----
ℹ️ 第5条评论未加载，跳过

----- 尝试提取第1页第6条 -----
ℹ️ 第6条评论未加载，跳过

----- 尝试提取第1页第7条 -----
