# 安装必要的库

In [7]:
pip install pandas
pip install pillow
pip install requests
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


# 对小红书首页及其二级页面进行数据爬取及解析操作，并进行数据保存

In [16]:
import requests
from bs4 import BeautifulSoup
import re
import time
import csv

# Channel ids to crawl; each one is appended to base_url as the
# "channel_id" query parameter when the channel page is requested.
url_list = [
    'homefeed_recommend',
    'homefeed.movie_and_tv_v3',
    'homefeed.gaming_v3',
]

# Xiaohongshu "explore" landing page, used both on its own and as the
# prefix for the per-channel URLs.
base_url = "https://www.xiaohongshu.com/explore"

def _get_response(url, timeout):
    """Send a GET request; return the response, or None if the request itself failed."""
    try:
        return requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts must not abort the whole crawl.
        print(f"请求失败：{exc}")
        return None


def _text_or(element, default):
    """Return the text of a parsed element, or *default* when the element is missing."""
    return element.text if element is not None else default


def _split_hashtags(text):
    """Separate '#tag' tokens from *text*; return (text_without_tags, tags)."""
    tags = re.findall(r'#(\S+)', text)
    if not tags:
        return text, []
    # Drop the hashtags, then collapse the whitespace runs they leave behind.
    cleaned = ' '.join(re.sub(r'#\S+', '', text).split())
    return cleaned, tags


def _fetch_post_body(post_url, timeout):
    """Download a post's detail page and return (body_text, tags)."""
    detail_response = _get_response(post_url, timeout)
    if detail_response is None or detail_response.status_code != 200:
        return "无法获取内容", []

    detail_soup = BeautifulSoup(detail_response.text, 'html.parser')
    desc = detail_soup.find('div', class_='desc')
    if desc is None:
        return "正文内容未找到", []
    return _split_hashtags(desc.text.strip())


def scrape_posts(url, csv_writer, max_posts=120, timeout=10):
    """Scrape the note cards of one listing page and write one CSV row per note.

    For every ``<section class="note-item">`` found on the page, the title,
    author, like count, detail-page link and cover image URL are extracted.
    When a detail link is present, that page is fetched too and its
    ``<div class="desc">`` text is split into body text and hashtags.

    Args:
        url: Listing page to download.
        csv_writer: Object with a ``writerow`` method (e.g. ``csv.writer``).
            Each row is: title, author, like count, link, body text,
            comma-joined tags, cover image URL.
        max_posts: Maximum number of note cards processed per page.
        timeout: Per-request timeout in seconds.

    Returns:
        None. If the listing page cannot be fetched, a message is printed and
        no rows are written.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from urllib.parse import urljoin

    site_root = "https://www.xiaohongshu.com"

    print(f"正在爬取页面：{url}")

    listing_response = _get_response(url, timeout)
    if listing_response is None:
        return
    if listing_response.status_code != 200:
        print(f"请求失败，状态码：{listing_response.status_code}")
        return

    soup = BeautifulSoup(listing_response.text, 'html.parser')
    note_items = soup.find_all('section', class_='note-item')

    for note in note_items[:max_posts]:
        # The title text lives in a <span> inside the title link; either may be absent.
        title_link = note.find('a', class_='title')
        title_span = title_link.find('span') if title_link is not None else None
        post_title = _text_or(title_span, "标题未找到")

        author = _text_or(note.find('span', class_='name'), "作者未找到")
        like_count = _text_or(note.find('span', class_='count'), "点赞数未找到")

        # A CSS selector matches the three classes regardless of their order
        # or of extra classes; class_='cover mask ld' would only match that
        # exact attribute string.
        link_element = note.select_one('a.cover.mask.ld')
        href = link_element.get('href') if link_element is not None else None
        if href:
            # urljoin copes with both relative and already-absolute hrefs.
            full_post_link = urljoin(site_root, href)
        else:
            full_post_link = "链接未找到"

        cover_image_element = note.find('img')
        if cover_image_element is not None:
            cover_image_url = cover_image_element.get('src', "封面图未找到")
        else:
            cover_image_url = "封面图未找到"

        if full_post_link != "链接未找到":
            cleaned_text, tags = _fetch_post_body(full_post_link, timeout)
        else:
            cleaned_text, tags = "链接未找到", []

        csv_writer.writerow([
            post_title,
            author,
            like_count,
            full_post_link,
            cleaned_text,
            ', '.join(tags),
            cover_image_url,
        ])

        # Throttle so the site is not hit with back-to-back requests.
        time.sleep(1)

def main():
    """Crawl the explore page and every channel in ``url_list`` into one CSV file.

    Writes ``xiaohongshu_posts.csv`` (UTF-8 with BOM) in the current working
    directory: a header row first, then whatever rows ``scrape_posts``
    produces for the landing page and for each channel page in turn.
    """
    header_row = ['标题', '作者', '点赞数', '链接', '正文内容', '标签', '封面图链接']

    with open('xiaohongshu_posts.csv', 'w', newline='', encoding='utf-8-sig') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header_row)

        # Landing page first ...
        scrape_posts(base_url, writer)

        # ... then each channel page, pausing between channels.
        for channel in url_list:
            scrape_posts("{}?channel_id={}".format(base_url, channel), writer)
            time.sleep(2)

    print("爬取完成！")

# Entry point: start the crawl only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

正在爬取页面：https://www.xiaohongshu.com/explore
正在爬取页面：https://www.xiaohongshu.com/explore?channel_id=homefeed_recommend
正在爬取页面：https://www.xiaohongshu.com/explore?channel_id=homefeed.movie_and_tv_v3
正在爬取页面：https://www.xiaohongshu.com/explore?channel_id=homefeed.gaming_v3
爬取完成！
