### 爬取单个帖子的评论

In [None]:
import requests
from lxml import etree
import csv
import time
import re
from datetime import datetime

from bs4 import BeautifulSoup  # 需要导入 BeautifulSoup

def get_comments(post_url, page_num, csvwrite, header):
    # 逐页请求评论
    for page in range(1, page_num + 1):
        # 构造当前页的URL，pn=page表示页码
        current_url = f"{post_url}&pn={page}"
        print(f"Fetching comments from page {page}: {current_url}")
        res = requests.get(current_url, headers=header)
        parser = etree.HTMLParser(encoding='utf-8')
        content = etree.HTML(res.text, parser=parser)
        res.close()
        
        html = res.text
        soup = BeautifulSoup(html, 'html.parser')
        posts = soup.find_all('div', class_='l_post')

        for post in posts:
            # 获取一级评论
            comment_text = post.find('div', class_='d_post_content').text.strip()
            user_name = post.find('a', class_='p_author_name').text.strip()
            post_time = post.find('span', class_='tail-info').text.strip()

            # 只保存有效的评论（长度大于2）
            if len(comment_text) > 2:
                dic = {'name': user_name,
                       'comment': comment_text,
                       'time': post_time,
                       'level': 1}  # 一级评论
                csvwrite.writerow(dic.values())

        # 获取二级评论
        num = post_url.split('/')[-1].split('?')[0]
        url2 = f"https://tieba.baidu.com/p/totalComment?t=1729087530500&tid={num}&fid=422204&pn={page}&see_lz=0"
        res2 = requests.get(url2, headers=header)
        content2 = res2.json()
        res2.close()

        # 获取二级评论列表
        comment_list = content2['data']['comment_list']
        for i in comment_list:
            comment_info = comment_list[i]['comment_info']
            cot = comment_list[i]['comment_list_num']
            if cot > 0:
                for j in range(0, len(comment_info)-1):
                    if len(str(comment_info[j]['content'])) < 2:
                        continue
                    if str(comment_info[j]['content'])[0:2] == "回复":
                        # 过滤HTML标签
                        result = re.sub(r'<.*?>', '', str(comment_info[j]['content']), re.S)
                        result = re.sub(r'.*?:', '', result, re.S)
                        dic = {
                            'name': str(comment_info[j]['show_nickname']),
                            'comment': str(result),
                            'time': str(datetime.fromtimestamp(comment_info[j]['now_time'])),
                            'level': 2
                        }
                        csvwrite.writerow(dic.values())
                        continue
                    dic = {
                        'name': str(comment_info[j]['show_nickname']),
                        'comment': str(comment_info[j]['content']).split("<")[0],
                        'time': str(datetime.fromtimestamp(comment_info[j]['now_time'])),
                        'level': 2
                    }
                    csvwrite.writerow(dic.values())

def main(post_url, page_num, header):
    # 打开csv文件
    with open("deepseek.csv", mode="w", encoding="utf-8-sig", newline="") as f:
        csvwrite = csv.writer(f)
        get_comments(post_url, page_num, csvwrite, header)

if __name__ == '__main__':
    # 设置目标帖子的URL和请求头
    post_url = "https://tieba.baidu.com/p/9378684167?"  # 替换成你要爬取的帖子链接
    page_num = 5  # 设置要爬取的页数
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
        "Cookie": 'XFI=fb2fdf90-16ac-11f0-b72e-573b3b145be3; XFCS=68140E4D4B133E1E85DADE9CA5C1A592347400332ADBE75F1691FF5F3C0D0376; XFT=B7fLWdv+aHEh7MQKOg6CUmC4DZQFw7hFywm19S040DE=; BIDUPSID=CB8F163C8C1D9DE7045CCDBBBDD0DF58; PSTM=1727798294; BAIDUID=DFC53B2DFB6DC2AEEAA63B0E89E869BF:FG=1; BAIDUID_BFESS=DFC53B2DFB6DC2AEEAA63B0E89E869BF:FG=1; H_PS_PSSID=60273_61027_61098_61134_61140_61156_61178_61210_61213_61209_61243_61279_61296; BDUSS=hoYWZTUzljbXlGMkJmQ3dKeHZSOXZjazhaWWsyOGRmMm1vQ2R2R2J1WkN1LWRuSVFBQUFBJCQAAAAAAAAAAAEAAAC2y59syb3A783exL7T0Le2tvkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEIuwGdCLsBnQz; BDUSS_BFESS=hoYWZTUzljbXlGMkJmQ3dKeHZSOXZjazhaWWsyOGRmMm1vQ2R2R2J1WkN1LWRuSVFBQUFBJCQAAAAAAAAAAAEAAAC2y59syb3A783exL7T0Le2tvkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEIuwGdCLsBnQz; H_WISE_SIDS=110085_1992049_626068_628198_632154_632300_633613_639038_632291_639930_639976_640333_637861_641078_641171_641425_641463_637756_640773_642206_642323_641767_642407_641423_642220_642662_642537_639679_642951_642987_642976_642147_643201_643245_643255_643276_643580_641262_643762_643765_643920_643404_643983_643815_643445_644084_644324_644323_644343_644367_644369_644316_644408_644457_644497_644552_643221_644570_644581_644622_644717_644645_644732_643284_627286_644752_644843_644864_644900_644921_644956_644939_644973_644960_644895_645089_643279_643294_645233_645259_645327_645348_645362_645379_645459_645315_645426_645485; H_WISE_SIDS_BFESS=110085_1992049_626068_628198_632154_632300_633613_639038_632291_639930_639976_640333_637861_641078_641171_641425_641463_637756_640773_642206_642323_641767_642407_641423_642220_642662_642537_639679_642951_642987_642976_642147_643201_643245_643255_643276_643580_641262_643762_643765_643920_643404_643983_643815_643445_644084_644324_644323_644343_644367_644369_644316_644408_644457_644497_644552_643221_644570_644581_644622_644717_644645_644732_643284_627286_644752_644843_644864_644900_644921_644956_644939_644973_644960_644895_645089_643279_643294_645233_645259_645327_645348_645362_645379_645459_645315_645426_645485; STOKEN=ba94bd1042065569cad1fc249cab9ba6f718da143def918dd8c1b6b7d9049bed; BAIDU_WISE_UID=wapp_1741955573440_98; USER_JUMP=-1; Hm_lvt_292b2e1608b0823c1cb6beef7243ef34=1741955573,1744359239; HMACCOUNT=C2A9454786D9FDA0; BAIDU_SSP_lcr=https://www.google.com/; st_key_id=17; arialoadData=false; 1822411702_FRSVideoUploadTip=1; video_bubble1822411702=1; XFI=ef682050-16ac-11f0-b242-57f1d14b4308; XFCS=CEBEED9E276A3EA0F15827EED390F3DD489023C1BD2A4D1D49517909A467B21E; XFT=3Gj58dRh+76bSGssyIzSvN/iBA0zSzV416m4EQfEin8=; wise_device=0; ZFY=YHEshGVaGm6adicpK92N9BPfDZEYwAibtaCiekhhDjU:C; Hm_lpvt_292b2e1608b0823c1cb6beef7243ef34=1744359265; st_data=7f223ef17d468d9c5cb3c7858271e7774d01e5139f5fa87b7ec7fdcdf47ef9e59124b7c0d35bd70dc2a70c7eb59de18ac513922a332dd8b764a557adc910409225ffb4b0f6d9639d8deddcbec8da28ef8e4b2b0972887d710f0e4b46a169f0c04c15b7a4efb5c679154df62c9a3c1b79732cce8658ce6f9a660aec033461e3937a01d328e076d5226b8046692caa932b; st_sign=79e3145b; RT="z=1&dm=baidu.com&si=c5cabe2f-b6df-482a-ae47-b51e02cc17e8&ss=m9cigtpp&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=4x6&ul=6u0my',  # 替换为你的cookie
    }
    main(post_url, page_num, header)


### 爬取单个帖子的评论(改成自动识别帖子页数)

In [None]:
import requests
from lxml import etree
import csv
import time
import re
from datetime import datetime
from bs4 import BeautifulSoup

def detect_total_pages(post_url, header):
    """
    自动检测目标帖子共有多少页。
    通过请求第一页，用 etree 解析 HTML，并用 XPath 定位分页节点来获取总页数。
    如果解析失败，则默认只有一页。
    """
    current_url = f"{post_url}&pn=1"
    print(f"Detecting total pages from: {current_url}")
    res = requests.get(current_url, headers=header)
    parser = etree.HTMLParser(encoding='utf-8')
    content = etree.HTML(res.text, parser=parser)
    res.close()
    try:
        # 根据实际情况修改 XPath，这里参考之前的写法：
        total_pages = int(content.xpath("/html/body/div[2]/div/div[2]/div/div[3]/div[1]/ul/li[2]/span[2]/text()")[0])
    except Exception as e:
        total_pages = 1
    return total_pages

def get_comments(post_url, page_num, csvwrite, header):
    """
    根据传入的总页数，逐页抓取评论数据。
    先用 BeautifulSoup 获取页面中一级评论，
    再请求对应的二级评论接口提取评论回复内容。
    """
    for page in range(1, page_num + 1):
        current_url = f"{post_url}&pn={page}"
        print(f"Fetching comments from page {page}: {current_url}")
        res = requests.get(current_url, headers=header)
        soup = BeautifulSoup(res.text, 'html.parser')
        res.close()
        
        # 获取一级评论部分
        posts = soup.find_all('div', class_='l_post')
        for post in posts:
            comment_div = post.find('div', class_='d_post_content')
            # 考虑到部分帖子可能为空
            comment_text = comment_div.get_text(strip=True) if comment_div else ""
            author_tag = post.find('a', class_='p_author_name')
            user_name = author_tag.get_text(strip=True) if author_tag else ""
            time_tag = post.find('span', class_='tail-info')
            post_time = time_tag.get_text(strip=True) if time_tag else ""
            
            if len(comment_text) > 2:
                dic = {
                    'name': user_name,
                    'comment': comment_text,
                    'time': post_time,
                    'level': 1  # 一级评论
                }
                csvwrite.writerow(dic.values())

        # 获取二级评论（评论回复）
        # 这里从帖子 URL 中提取帖子编号 tid
        num = post_url.split('/')[-1].split('?')[0]
        url2 = f"https://tieba.baidu.com/p/totalComment?t=1729087530500&tid={num}&fid=422204&pn={page}&see_lz=0"
        res2 = requests.get(url2, headers=header)
        try:
            content2 = res2.json()
        except Exception as e:
            content2 = {}
        res2.close()
        
        if 'data' in content2 and 'comment_list' in content2['data']:
            comment_list = content2['data']['comment_list']
            for key in comment_list:
                comment_info = comment_list[key].get('comment_info', [])
                cot = comment_list[key].get('comment_list_num', 0)
                if cot > 0:
                    # 注意这里遍历评论列表时减去1（可根据实际情况调整遍历范围）
                    for idx in range(0, len(comment_info)-1):
                        content_str = str(comment_info[idx].get('content', ''))
                        if len(content_str) < 2:
                            continue
                        if content_str.startswith("回复"):
                            # 过滤 HTML 标签，保留回复内容
                            result = re.sub(r'<.*?>', '', content_str, flags=re.S)
                            result = re.sub(r'.*?:', '', result, flags=re.S)
                            dic = {
                                'name': str(comment_info[idx].get('show_nickname', '')),
                                'comment': result,
                                'time': str(datetime.fromtimestamp(comment_info[idx].get('now_time', 0))),
                                'level': 2
                            }
                            csvwrite.writerow(dic.values())
                            continue
                        dic = {
                            'name': str(comment_info[idx].get('show_nickname', '')),
                            'comment': content_str.split("<")[0],
                            'time': str(datetime.fromtimestamp(comment_info[idx].get('now_time', 0))),
                            'level': 2
                        }
                        csvwrite.writerow(dic.values())

        # 为了防止访问过快，适当延时
        time.sleep(1 + 0.5 * time.time() % 1)

def main(post_url, header):
    # 自动检测帖子总页数
    total_pages = detect_total_pages(post_url, header)
    print(f"Total pages detected: {total_pages}")
    # 打开 CSV 文件写入评论数据
    with open("deepseek.csv", mode="w", encoding="utf-8-sig", newline="") as f:
        csvwrite = csv.writer(f)
        get_comments(post_url, total_pages, csvwrite, header)

if __name__ == '__main__':
    # 替换为你目标帖子的链接，注意帖子的 URL 需要带上问号以便后续附加参数
    post_url = "https://tieba.baidu.com/p/9378684167?"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
        "Cookie": "YOUR_COOKIE_HERE"  # 替换为有效的 Cookie
    }
    main(post_url, header)


Detecting total pages from: https://tieba.baidu.com/p/9378684167?
Total pages detected: 1
Fetching comments from page 1: https://tieba.baidu.com/p/9378684167?&pn=1


### 爬取一个贴吧下的所有帖子的链接

In [8]:
import requests
import re
import time
import random
import logging
import io
import sys

class TiebaSpider:
    def __init__(self):
        self.all_tb_urls = []  # 存储所有帖子链接
        self.search_urls = {
            'deepseek': 'https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0'  # 目标贴吧的第一页URL
        }
        logging.basicConfig(level=logging.INFO)

    def get_random_headers(self):
        """返回随机的 User-Agent header"""
        return {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
        }

    def fetch_page_content(self, url, headers=None, retries=3):
        """获取网页内容"""
        headers = headers or self.get_random_headers()
        for attempt in range(retries):
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.encoding = response.apparent_encoding
                return response.text
            except requests.RequestException as e:
                time.sleep(2)
                logging.warning(f"请求 {url} 失败 ({attempt + 1}/{retries})：{e}")

    def parse_tburl_re(self, html_content):
        """使用正则表达式提取帖子 URL，并保存到 all_tb_urls 列表中"""
        try:
            # 正则模式，匹配 /p/ 后面跟随数字的部分
            pattern = r'/p/\d+'
            urls = re.findall(pattern, html_content)

            # 去重并拼接成完整的 URL
            full_urls = [f"https://tieba.baidu.com{url}" for url in set(urls)]
            self.all_tb_urls.extend(full_urls)

            logging.info(f"提取到 {len(full_urls)} 个帖子 URL")
        except Exception as e:
            logging.error(f"正则解析失败: {e}")

    def save_url(self, file_path="url.txt"):
        """将 all_tb_urls 列表中的所有 URL 保存到本地文本文件"""
        with open(file_path, "w", encoding="utf-8") as f:
            for url in self.all_tb_urls:
                f.write(url + "\n")
        logging.info(f"所有帖子 URL 已保存到 {file_path}")

    def run(self):
        """入口函数，爬取所有分页的 URL，并保存结果"""
        url_counter = 0  # 计数器，统计已爬取URL数量
        # 获取目标贴吧的 URL
        for category, url in self.search_urls.items():
            logging.info(f"开始爬取贴吧 {category} ，目标URL: {url}")
            url_counter += 1
            ############################## 翻页逻辑 ############################
            # 逐页爬取，页面从 pn=0 开始，每次增加 50 直到 pn=13200
            for i in range(0, 13250, 50):  # pn=13200 为最后一页
                root_url = f"{url}&pn={i}"
                logging.info(f"爬取分页 URL: {root_url}")
                html_content = self.fetch_page_content(root_url)
                if html_content:
                    self.parse_tburl_re(html_content)
                time.sleep(random.uniform(3, 5))  # 随机延时，避免反爬

        # 保存爬取到的 URL
        self.save_url()

if __name__ == '__main__':
    # 输入你的 cookie
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
        "Cookie": 'XFI=fb2fdf90-16ac-11f0-b72e-573b3b145be3; XFCS=68140E4D4B133E1E85DADE9CA5C1A592347400332ADBE75F1691FF5F3C0D0376; XFT=B7fLWdv+aHEh7MQKOg6CUmC4DZQFw7hFywm19S040DE=; BIDUPSID=CB8F163C8C1D9DE7045CCDBBBDD0DF58; PSTM=1727798294; BAIDUID=DFC53B2DFB6DC2AEEAA63B0E89E869BF:FG=1; BAIDUID_BFESS=DFC53B2DFB6DC2AEEAA63B0E89E869BF:FG=1; H_PS_PSSID=60273_61027_61098_61134_61140_61156_61178_61210_61213_61209_61243_61279_61296; BDUSS=hoYWZTUzljbXlGMkJmQ3dKeHZSOXZjazhaWWsyOGRmMm1vQ2R2R2J1WkN1LWRuSVFBQUFBJCQAAAAAAAAAAAEAAAC2y59syb3A783exL7T0Le2tvkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEIuwGdCLsBnQz; BDUSS_BFESS=hoYWZTUzljbXlGMkJmQ3dKeHZSOXZjazhaWWsyOGRmMm1vQ2R2R2J1WkN1LWRuSVFBQUFBJCQAAAAAAAAAAAEAAAC2y59syb3A783exL7T0Le2tvkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEIuwGdCLsBnQz; H_WISE_SIDS=110085_1992049_626068_628198_632154_632300_633613_639038_632291_639930_639976_640333_637861_641078_641171_641425_641463_637756_640773_642206_642323_641767_642407_641423_642220_642662_642537_639679_642951_642987_642976_642147_643201_643245_643255_643276_643580_641262_643762_643765_643920_643404_643983_643815_643445_644084_644324_644323_644343_644367_644369_644316_644408_644457_644497_644552_643221_644570_644581_644622_644717_644645_644732_643284_627286_644752_644843_644864_644900_644921_644956_644939_644973_644960_644895_645089_643279_643294_645233_645259_645327_645348_645362_645379_645459_645315_645426_645485; H_WISE_SIDS_BFESS=110085_1992049_626068_628198_632154_632300_633613_639038_632291_639930_639976_640333_637861_641078_641171_641425_641463_637756_640773_642206_642323_641767_642407_641423_642220_642662_642537_639679_642951_642987_642976_642147_643201_643245_643255_643276_643580_641262_643762_643765_643920_643404_643983_643815_643445_644084_644324_644323_644343_644367_644369_644316_644408_644457_644497_644552_643221_644570_644581_644622_644717_644645_644732_643284_627286_644752_644843_644864_644900_644921_644956_644939_644973_644960_644895_645089_643279_643294_645233_645259_645327_645348_645362_645379_645459_645315_645426_645485; STOKEN=ba94bd1042065569cad1fc249cab9ba6f718da143def918dd8c1b6b7d9049bed; BAIDU_WISE_UID=wapp_1741955573440_98; USER_JUMP=-1; Hm_lvt_292b2e1608b0823c1cb6beef7243ef34=1741955573,1744359239; HMACCOUNT=C2A9454786D9FDA0; BAIDU_SSP_lcr=https://www.google.com/; st_key_id=17; arialoadData=false; 1822411702_FRSVideoUploadTip=1; video_bubble1822411702=1; XFI=ef682050-16ac-11f0-b242-57f1d14b4308; XFCS=CEBEED9E276A3EA0F15827EED390F3DD489023C1BD2A4D1D49517909A467B21E; XFT=3Gj58dRh+76bSGssyIzSvN/iBA0zSzV416m4EQfEin8=; wise_device=0; ZFY=YHEshGVaGm6adicpK92N9BPfDZEYwAibtaCiekhhDjU:C; Hm_lpvt_292b2e1608b0823c1cb6beef7243ef34=1744359265; st_data=7f223ef17d468d9c5cb3c7858271e7774d01e5139f5fa87b7ec7fdcdf47ef9e59124b7c0d35bd70dc2a70c7eb59de18ac513922a332dd8b764a557adc910409225ffb4b0f6d9639d8deddcbec8da28ef8e4b2b0972887d710f0e4b46a169f0c04c15b7a4efb5c679154df62c9a3c1b79732cce8658ce6f9a660aec033461e3937a01d328e076d5226b8046692caa932b; st_sign=79e3145b; RT="z=1&dm=baidu.com&si=c5cabe2f-b6df-482a-ae47-b51e02cc17e8&ss=m9cigtpp&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=4x6&ul=6u0my',  # 替换为你的cookie
    }
    
    spider = TiebaSpider()
    spider.run()


INFO:root:开始爬取贴吧 deepseek ，目标URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=0
INFO:root:提取到 50 个帖子 URL
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=50
INFO:root:提取到 52 个帖子 URL
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=100
INFO:root:提取到 50 个帖子 URL
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=150
INFO:root:提取到 52 个帖子 URL
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=200
INFO:root:提取到 51 个帖子 URL
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=250
INFO:root:提取到 51 个帖子 URL
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=300
INFO:root:提取到 51 个帖子 URL
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=350
INFO:root:提取到 51 个帖子 URL
INFO:root:爬取分页 URL: https://tieba.baidu.com/f?kw=deepseek&ie=utf-8&pn=0&pn=400
INFO:r

In [None]:
import requests
import random
import time
import csv
from datetime import datetime
from bs4 import BeautifulSoup
import os

def get_proxy():
    """获取新的代理 IP"""
    api_url = "https://dps.kdlapi.com/api/getdps/?secret_id=o9usi6thnwncxf61fkz2&signature=d6hlpaqm2jnb9t3v6r46e8ee71iq15n4&num=1&pt=1&sep=1"
    
    try:
        # 获取API接口返回的代理IP
        proxy_ip = requests.get(api_url).text.strip()
        print(f"代理 IP: {proxy_ip}")  # 打印返回的代理 IP，调试用

        if proxy_ip:
            # 用户名密码认证(私密代理/独享代理)
            username = ""  # 代理用户名
            password = ""  # 代理密码
            proxies = {
                "http": f"http://{username}:{password}@{proxy_ip}/",
                "https": f"http://{username}:{password}@{proxy_ip}/"
            }

            # 验证代理是否有效
            test_url = "https://httpbin.org/ip"
            try:
                test_res = requests.get(test_url, proxies=proxies, timeout=5)
                if test_res.status_code == 200:
                    print("代理连接正常")
                    return proxies
                else:
                    print("代理连接失败，重试中...")
                    return None
            except requests.RequestException:
                print("代理不可用，重试中...")
                return None
        else:
            print("未能获取代理 IP")
            return None
    except Exception as e:
        print(f"获取代理时发生错误: {e}")
        return None

def safe_request(url, headers, proxy, max_retries=3):
    """
    封装一个带重试的requests.get，请求失败或代理过期时，会重新获取代理。
    返回成功的响应 res 和最新的可用代理 proxies。
    如果多次重试仍失败，则抛出异常。
    """
    import requests

    attempt = 0
    current_proxy = proxy

    while attempt < max_retries:
        try:
            res = requests.get(url, headers=headers, proxies=current_proxy, timeout=10)
            res.raise_for_status()
            # 如果请求成功，就返回
            return res, current_proxy
        except (requests.exceptions.RequestException, requests.exceptions.ProxyError) as e:
            print(f"请求失败/代理失效，正在更换代理...（重试次数：{attempt+1}）")
            current_proxy = get_proxy()  # 重新获取代理
            if not current_proxy:
                print("获取新代理失败，无法继续。")
            attempt += 1

    # 如果执行到这里，说明多次尝试仍失败
    raise RuntimeError("多次请求失败，放弃。")

def detect_total_pages(post_url, header, proxy):
    """自动检测目标帖子共有多少页"""
    current_url = f"{post_url}?&pn=1"
    print(f"正在检测总页数: {current_url}")

    # 使用 safe_request，最多重试3次
    res, new_proxy = safe_request(current_url, header, proxy, max_retries=3)
    # 将返回的新proxy（可能重试时已经重新获取了）更新回去
    soup = BeautifulSoup(res.text, 'html.parser')
    
    try:
        # 获取总页数
        total_pages = int(
            soup.find("li", class_="l_reply_num").find_all("span", class_="red")[1].text.strip()
        )
        # 如果总页数超过100页，则限制为100页
        if total_pages > 10:
            total_pages = 10
    except Exception as e:
        total_pages = 1

    return total_pages, new_proxy

def get_comments(post_url, page_num, csvwrite, header, visited_urls, proxy):
    """逐页抓取评论数据，避免重复爬取"""
    page_num = min(page_num, 10)  # 限制最多爬取10页
    
    for page in range(1, page_num + 1):
        current_url = f"{post_url}?&pn={page}"
        if current_url in visited_urls:
            print(f"跳过已访问的 URL: {current_url}")
            continue

        print(f"正在抓取第 {page} 页的评论: {current_url}")
        visited_urls.add(current_url)  # 记录已爬取的 URL

        # 改用 safe_request
        res, proxy = safe_request(current_url, header, proxy, max_retries=3)
        soup = BeautifulSoup(res.text, 'html.parser')

        posts = soup.find_all('div', class_='l_post')

        for post in posts:
            # 获取一级评论
            comment_text = post.find('div', class_='d_post_content').text.strip()
            user_name = post.find('a', class_='p_author_name').text.strip()
            post_time = post.find('span', class_='tail-info').text.strip()

            dic = {'用户名': user_name,
                   '评论': comment_text,
                   '时间': post_time,
                   '级别': 1}
            if len(comment_text) > 2:
                csvwrite.writerow(dic.values())
            
            print(dic)

        # 再抓取二级评论
        num = post_url.split('/')[-1].split('?')[0]
        url2 = f"https://tieba.baidu.com/p/totalComment?t=1729087530500&tid={num}&fid=422204&pn={page}&see_lz=0"

        # 同样用 safe_request
        res2, proxy = safe_request(url2, header, proxy, max_retries=3)
        try:
            content2 = res2.json()
        except Exception as e:
            content2 = {}

        if 'data' in content2 and 'comment_list' in content2['data']:
            comment_list = content2['data']['comment_list']
            for key in comment_list:
                comment_info = comment_list[key].get('comment_info', [])
                cot = comment_list[key].get('comment_list_num', 0)
                if cot > 0:
                    for idx in range(0, len(comment_info)-1):
                        content_str = str(comment_info[idx].get('content', ''))
                        if len(content_str) < 2:
                            continue
                        if content_str.startswith("回复"):
                            result = re.sub(r'<.*?>', '', content_str, flags=re.S)
                            result = re.sub(r'.*?:', '', result, flags=re.S)
                            dic = {
                                '用户名': str(comment_info[idx].get('show_nickname', '')),
                                '评论': result,
                                '时间': str(datetime.fromtimestamp(comment_info[idx].get('now_time', 0))),
                                '级别': 2
                            }
                            csvwrite.writerow(dic.values())
                            continue
                        dic = {
                            '用户名': str(comment_info[idx].get('show_nickname', '')),
                            '评论': content_str.split("<")[0],
                            '时间': str(datetime.fromtimestamp(comment_info[idx].get('now_time', 0))),
                            '级别': 2
                        }
                        csvwrite.writerow(dic.values())

        time.sleep(random.uniform(2, 5))  # 防止访问过快

    # 函数末尾可以把最新的 proxy 返回上去（如果想在外面继续用）
    return proxy

def main(header):
    visited_urls = set()
    if os.path.exists("visited_urls.txt"):
        with open("visited_urls.txt", "r", encoding="utf-8") as file:
            visited_urls = {url.strip() for url in file.readlines()}

    proxy = get_proxy()  # 初始代理
    if not proxy:
        print("初始化代理请求失败，停止运行...")
        return

    with open("temp.txt", "r", encoding="utf-8") as f:
        urls = f.readlines()

    with open("deepseek吧.csv", mode="a", encoding="utf-8-sig", newline="") as f:
        csvwrite = csv.writer(f)

        for index, post_url in enumerate(urls, 1):
            post_url = post_url.strip()
            if not post_url or post_url in visited_urls:
                continue

            print(f"正在处理 {post_url}")
            
            # 这里detect_total_pages也用safe_request
            # 但为了保持现有结构，我们在detect_total_pages内部就会调用safe_request了
            # 并返回更新过的proxy
            total_pages, proxy = detect_total_pages(post_url, header, proxy)
            print(f"检测到的总页数: {total_pages}")

            # get_comments 也返回一个proxy
            proxy = get_comments(post_url, total_pages, csvwrite, header, visited_urls, proxy)

            # 爬完一个帖子，写入visited_urls
            with open("visited_urls.txt", "a", encoding="utf-8") as file:
                file.write(post_url + "\n")

            # 每爬 20 个帖子更换一次代理（按需保留）
            if index % 20 == 0:
                newp = get_proxy()
                if newp:
                    proxy = newp
                else:
                    print("更换代理失败，停止运行...")
                    return

            time.sleep(random.uniform(2, 6))

            # 将当前已爬取的 URL 保存到 visited_urls.txt
            with open("visited_urls.txt", "a", encoding="utf-8") as file:
                file.write(post_url + "\n")
                
if __name__ == '__main__':
    # 输入你的 cookie
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
        "Cookie": 'BAIDUID_BFESS=F335713ECCD2C666EFBBCCE5DEC39F99:FG=1; BAIDU_WISE_UID=wapp_1744361269880_973; USER_JUMP=-1; Hm_lvt_292b2e1608b0823c1cb6beef7243ef34=1744361269; HMACCOUNT=840D496E89D39F23; BAIDU_SSP_lcr=https://www.bing.com/; st_key_id=17; ppfuid=FOCoIC3q5fKa8fgJnwzbE67EJ49BGJeplOzf+4l4EOvDuu2RXBRv6R3A1AZMa49I27C0gDDLrJyxcIIeAeEhD8JYsoLTpBiaCXhLqvzbzmvy3SeAW17tKgNq/Xx+RgOdb8TWCFe62MVrDTY6lMf2GrfqL8c87KLF2qFER3obJGm51EODDlnqgz44AdUN5VVLGEimjy3MrXEpSuItnI4KD46xh6KcipCBP3WBNo1ZhUI4n1pPf9rvqnYNm2WK8+4sTzI0INRFxIzG2DKu3k+1Wd4MsFPXec0tFo4TuM3rq1TGgLbz7OSojK1zRbqBESR5Pdk2R9IA3lxxOVzA+Iw1TWLSgWjlFVG9Xmh1+20oPSbrzvDjYtVPmZ+9/6evcXmhcO1Y58MgLozKnaQIaLfWRIXZy/8llaWZittcIfh39p9be1wmaetIVuYIt+6PfjR+9o+ncGd7mqfxmbDMsEYcm3eTNkbS2el0J2+pbyoXJb3meBF6m1/WfXOHFmmRXSJdqfhwWryYacZfOfIdenDLSkJsc4rBzsbBPyjKAzWGBO7nCxNtgYtDo26K+8ukl31Y+/geIrmTvn+xVA1gAbbf1lkKhylX1zGsOVlJip30kecMEGvjdNWpsel/qfsfe5JBpqDTksMVoBr7nszRboiUHbedcq1mi/UXvX2b3lxbCLv4Mxoy+dFS3Fr9jSAmssiPARPZutqXQT8krr+KVakPUpdbkwv/8CHDu0C/Z5vtDeiYLQpEgFjmQoey69Fz+kM7Y5cg925MGCeBU4jWp2g2g01EE//Fu8ees+YfjecK5f1GFLdNDYkT5rF/Db0bOHGA/eECxI2daONtpFiihnGHCEhpOuhw8bYyjFOBzzWtHbYRe1LB3+zVvXhPoMLk0Emuxi1nzsvfmqeIhQULJo6Xn4DhfPYwBy251VHo7a6qr3VUkP+IMyTm9Axs3hGeRnsjM4C5HoV4JNFGc/zMnn1DkF5Vd39B2V0qbksU5wyvYVy2RLgR4VNDG2xA+Qxeuf4f4AUZAz08LV42LYM5lu+JtHw1NjGWsaQWWRfGLq36HGHJg3XMRSnqCBUJlagKAFPt2HF0LdsSk4WWcldb97Ar584nVGbSjPXEUVH0VgbUEm+dADzPoLP+NPMYOyhwgfADiqWaXyKT4UNESYXsPBkdGk6mLCaNSEQsDN1G2677Se3qjzDcyXBnEmHEFptRbmyJzKJ73veHPqfFYtsHO9jH0XnhYk8zKdRuqQ7dnuNIDwxm3UCPo22uFI0ZcgPvQm013qNuYC5cBHgQZWyv4MIy38pMuhT3p2yYLf70CvUwIkw='
    }

    main(header)


代理 IP: 58.19.54.137:40725
代理连接正常
正在处理 https://tieba.baidu.com/p/9641692393
正在检测总页数: https://tieba.baidu.com/p/9641692393?&pn=1
检测到的总页数: 1
正在抓取第 1 页的评论: https://tieba.baidu.com/p/9641692393?&pn=1
{'用户名': '春·正·夫', '评论': '这才是世上最绝望的死法为什么别人的deepseek一天能回答4，50个问题，我问了3个他就一直繁忙啊，半夜凌晨1点多也是繁忙（这个是我在应用商城里下的，有什么办法解决，还是要换一个）', '时间': '来自Android客户端', '级别': 1}
{'用户名': '那又怎么样z', '评论': '下载个问小白', '时间': '来自Android客户端', '级别': 1}
{'用户名': '贴吧用户_aJe1e2Q', '评论': '高强度使用，使用api', '时间': '来自Android客户端', '级别': 1}
{'用户名': '贴吧用户_aJe1e2Q', '评论': '偶尔使用，忍着', '时间': '来自Android客户端', '级别': 1}
{'用户名': 'MorgenSE', '评论': '如果你不进行文字创作的话，可以下个腾讯元宝，那个版本ds基本秒回，智商差不多', '时间': '来自Android客户端', '级别': 1}
{'用户名': 'Daisy-Mo', '评论': '遇到服务器繁忙，需要新开一个话题界面，不然会一直服务器繁忙。', '时间': '来自Android客户端', '级别': 1}
{'用户名': '云浓风重', '评论': 'xiaoyi.huawei.com', '时间': '来自Android客户端', '级别': 1}
{'用户名': '吉良吉影', '评论': '用别的', '时间': '来自Android客户端', '级别': 1}
{'用户名': '荒\u2060', '评论': '用元宝啊', '时间': '来自Android客户端', '级别': 1}
正在处理 https://tieba.baidu.com/p/9644160679
正在检测总页数: http

KeyboardInterrupt: 