In [2]:
# Crawl note data from a tag page
import requests
import csv
import json
from datetime import datetime

def scrape_xiaohongshu(base_url, output_path='Xiaohongshu Tag Page Data.csv', timeout=10):
    """Crawl every page of a tag-page note listing and save the notes to a CSV file.

    The first request goes to ``base_url`` as given. Each follow-up request adds
    a ``cursor`` query parameter taken from the last note of the previous page.
    Crawling stops when a page has no notes, when the response is not valid JSON
    or has no ``data.notes`` entry, or when the cursor is missing or did not
    advance (which would otherwise request the same page forever).

    Args:
        base_url: URL of the first listing page.
        output_path: CSV file to create or overwrite.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds), so a
            stalled connection cannot hang the notebook cell indefinitely.

    Raises:
        KeyError: If a note lacks one of the exported fields.
        Exception: Errors raised by ``requests.get`` (connection failures,
            timeouts) are not caught and propagate to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }

    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Note ID', 'Title', 'Likes', 'User Nickname', 'User ID', 'Image URL', 'Create Time'])

        cursor = ''
        while True:
            # Hand the cursor to requests as a query parameter so it is
            # URL-encoded instead of being pasted into the URL verbatim.
            params = {'cursor': cursor} if cursor else None
            response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
            try:
                data = response.json()
            except ValueError:
                # ValueError is the common base of json.JSONDecodeError and the
                # simplejson variant that Response.json() may raise.
                print("Unable to parse JSON: ", response.text)
                break

            # An error payload may not carry data/notes at all; stop with a
            # message instead of failing with KeyError.
            payload = data.get('data') if isinstance(data, dict) else None
            if not isinstance(payload, dict) or 'notes' not in payload:
                print("Unexpected response structure: ", data)
                break

            notes = payload['notes']
            if not notes:
                break

            for note in notes:
                note_id = note['id']
                title = note['title']
                likes = note['likes']
                user_nickname = note['user']['nickname']
                user_id = note['user']['userid']
                # Only the first image is exported; notes without images get ''.
                image_url = note['images_list'][0]['url'] if note['images_list'] else ''
                # create_time is treated as epoch milliseconds and rendered in local time.
                create_time_ms = note['create_time']
                create_time = datetime.fromtimestamp(create_time_ms / 1000).strftime('%Y-%m-%d %H:%M:%S')

                writer.writerow([note_id, title, likes, user_nickname, user_id, image_url, create_time])

            next_cursor = notes[-1].get('cursor')
            if not next_cursor or next_cursor == cursor:
                # No way to advance: stop rather than re-request the same page.
                break
            cursor = next_cursor

# Run the tag-page crawl. The query string selects the tag page (page_id),
# the sort order (hot) and the page size (6).
base_url = 'https://www.xiaohongshu.com/web_api/sns/v3/page/notes?page_size=6&sort=hot&page_id=64fe7e93e0687e0001d2dcb3'
scrape_xiaohongshu(base_url=base_url)

In [None]:
# Crawl comment data for a specific note
import requests
import csv

def scrape_comments(note_id, initial_cursor='', output_path='Xiaohongshu Note Comments Data.csv', timeout=10):
    """Crawl all comment pages of one note and save the comments to a CSV file.

    Each page of top-level comments is handed to ``process_comments`` (defined
    elsewhere in this notebook), which writes the rows. The next page is
    requested with the ``cursor`` value returned in the current page. Crawling
    stops when a page has no comments, when the response is not valid JSON or
    has no ``data.comments`` entry, or when the returned cursor is empty or
    did not advance.

    Args:
        note_id: ID of the note whose comments are crawled.
        initial_cursor: Cursor sent with the first request ('' by default).
        output_path: CSV file to create or overwrite.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).

    Raises:
        Exception: Errors raised by ``requests.get`` (connection failures,
            timeouts) are not caught and propagate to the caller.
    """
    base_url = "https://edith.xiaohongshu.com/api/sns/web/v2/comment/page"
    params = {
        'note_id': note_id,
        'cursor': initial_cursor,
        'image_formats': 'jpg,webp,avif'
    }
    # NOTE(review): this Cookie embeds a session token (web_session=...) in the
    # notebook source. Move it to an environment variable or secret store and
    # rotate the session before sharing or committing the notebook.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Cookie': 'abRequestId=0c3c6cb4-55a3-5fe1-9e83-b8256aae2d84; a1=18b4ff17de8wc6dr7viwf7ow22xwvs8s8j1yb1sng30000326940; webId=68369f8f2b369f65ba484733d2749b18; gid=yYD4iiyYJKY0yYD4iiyWfVyMdYESKfFWh3EiyIkWlEJJCvq8EhMYMD888qJKj48804DKD8Y4; unread={%22ub%22:%22656bf955000000000602378a%22%2C%22ue%22:%2265727c500000000038023457%22%2C%22uc%22:23}; xsecappid=xhs-pc-web; web_session=040069b4473273f90a70bb824f374b332cda18; websectiga=2a3d3ea002e7d92b5c9743590ebd24010cf3710ff3af8029153751e41a6af4a3; sec_poison_id=35fcb4c8-c29d-4482-a53e-001baa796971; webBuild=3.20.2'
    }

    # IDs of comments already written, shared with process_comments.
    processed_comments = set()

    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['User ID', 'Nickname', 'Content', 'Like Count', 'IP Location'])

        while True:
            response = requests.get(base_url, params=params, headers=headers, timeout=timeout)
            try:
                data = response.json()
            except ValueError:
                # ValueError is the common base of json.JSONDecodeError and the
                # simplejson variant that Response.json() may raise.
                print("Unable to parse JSON: ", response.text)
                break

            # An error payload (e.g. rejected session) may not carry
            # data/comments; stop with a message instead of a KeyError.
            payload = data.get('data') if isinstance(data, dict) else None
            if not isinstance(payload, dict) or 'comments' not in payload:
                print("Unexpected response structure: ", data)
                break

            if not payload['comments']:
                break

            process_comments(payload['comments'], writer, note_id, processed_comments)

            next_cursor = payload.get('cursor')
            if not next_cursor or next_cursor == params['cursor']:
                # No way to advance: stop rather than re-request the same page.
                break
            params['cursor'] = next_cursor

def process_comments(comments, writer, note_id, processed_comments):
    """Write every not-yet-seen comment in ``comments``, including nested replies.

    Args:
        comments: Iterable of comment dicts; each must have an ``'id'`` key.
        writer: Row writer passed through to ``write_comment``.
        note_id: ID of the note the comments belong to; forwarded when more
            replies have to be fetched.
        processed_comments: Set of comment IDs already written. It is updated
            in place, and comments whose ID is already present are skipped
            together with their replies.
    """
    for entry in comments:
        entry_id = entry['id']
        if entry_id in processed_comments:
            continue

        write_comment(entry, writer)
        processed_comments.add(entry_id)

        # Replies delivered inline with the comment are handled recursively.
        replies = entry.get('sub_comments')
        if replies:
            process_comments(replies, writer, note_id, processed_comments)

        # The flag signals further replies that were not delivered inline.
        if entry.get('sub_comment_has_more'):
            fetch_and_process_additional_sub_comments(note_id, entry_id, writer, processed_comments)

def write_comment(comment, writer):
    """Write one comment as a CSV row.

    The row is ``[user_id, nickname, content, like_count, ip_location]``.
    ``like_count`` falls back to 0 and ``ip_location`` to '' when the comment
    has no such key.

    Args:
        comment: Comment dict with ``'user_info'`` (holding ``'user_id'`` and
            ``'nickname'``) and ``'content'``.
        writer: Object with a ``writerow`` method, e.g. a ``csv.writer``.

    Raises:
        KeyError: If ``user_info``, ``user_id``, ``nickname`` or ``content`` is missing.
    """
    author = comment['user_info']
    row = [
        author['user_id'],
        author['nickname'],
        comment['content'],
        comment.get('like_count', 0),
        comment.get('ip_location', ''),
    ]
    writer.writerow(row)

def fetch_and_process_additional_sub_comments(note_id, root_comment_id, writer, processed_comments, timeout=10):
    """Fetch the remaining replies of one top-level comment and write them out.

    Reply pages are requested starting from an empty cursor, 10 replies per
    request, and each page is handed to ``process_comments`` (defined elsewhere
    in this notebook). Fetching stops when a page has no comments, when the
    response is not valid JSON or has no ``data`` entry, or when the returned
    cursor is empty or did not advance.

    Args:
        note_id: ID of the note the comment belongs to.
        root_comment_id: ID of the top-level comment whose replies are fetched.
        writer: Row writer forwarded to ``process_comments``.
        processed_comments: Set of already-written comment IDs, forwarded to
            ``process_comments``.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).

    Raises:
        Exception: Errors raised by ``requests.get`` (connection failures,
            timeouts) are not caught and propagate to the caller.
    """
    sub_comments_url = "https://edith.xiaohongshu.com/api/sns/web/v2/comment/sub/page"

    # Headers and most parameters do not change between pages, so they are
    # built once; only params['cursor'] is updated inside the loop.
    # NOTE(review): this Cookie embeds a session token (web_session=...) in the
    # notebook source. Move it to an environment variable or secret store and
    # rotate the session before sharing or committing the notebook.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Cookie': 'abRequestId=0c3c6cb4-55a3-5fe1-9e83-b8256aae2d84; a1=18b4ff17de8wc6dr7viwf7ow22xwvs8s8j1yb1sng30000326940; webId=68369f8f2b369f65ba484733d2749b18; gid=yYD4iiyYJKY0yYD4iiyWfVyMdYESKfFWh3EiyIkWlEJJCvq8EhMYMD888qJKj48804DKD8Y4; unread={%22ub%22:%22656bf955000000000602378a%22%2C%22ue%22:%2265727c500000000038023457%22%2C%22uc%22:23}; xsecappid=xhs-pc-web; web_session=040069b4473273f90a70bb824f374b332cda18; websectiga=2a3d3ea002e7d92b5c9743590ebd24010cf3710ff3af8029153751e41a6af4a3; sec_poison_id=35fcb4c8-c29d-4482-a53e-001baa796971; webBuild=3.20.2'
    }
    params = {
        'note_id': note_id,
        'root_comment_id': root_comment_id,
        'num': 10,
        'cursor': "",
        'image_formats': 'jpg,webp,avif'
    }

    while True:
        response = requests.get(sub_comments_url, params=params, headers=headers, timeout=timeout)
        try:
            data = response.json()
        except ValueError:
            # ValueError is the common base of json.JSONDecodeError and the
            # simplejson variant that Response.json() may raise.
            print("Unable to parse JSON: ", response.text)
            break

        # An error payload may not carry data/comments; stop with a message
        # instead of failing with KeyError.
        payload = data.get('data') if isinstance(data, dict) else None
        if not isinstance(payload, dict):
            print("Unexpected response structure: ", data)
            break

        if not payload.get('comments'):
            break

        process_comments(payload['comments'], writer, note_id, processed_comments)

        next_cursor = payload.get('cursor')
        if not next_cursor or next_cursor == params['cursor']:
            # No way to advance: stop rather than re-request the same page.
            break
        params['cursor'] = next_cursor

# Run the comment crawl for the single note identified below.
note_id = '655f4fd4000000001b00dafd'
scrape_comments(note_id=note_id)