1. Final script crawler

In [None]:
from DrissionPage import ChromiumPage, WebPage
import time
from urllib.parse import quote
from bs4 import BeautifulSoup
import random
from tqdm import tqdm
import pandas as pd
from datetime import datetime

In [None]:
def sign_in():
    """Open the Xiaohongshu home page and pause so the user can log in.

    Crawling Xiaohongshu requires a logged-in account, so a ChromiumPage is
    pointed at the home page and the script then sleeps while the user scans
    the QR code by hand.
    """
    browser = ChromiumPage()
    browser.get('https://www.xiaohongshu.com')
    print("please scan the qr-code to sign in")
    # Fixed 20 s pause; nothing here verifies that the login succeeded.
    time.sleep(20)

In [None]:
def search(keyword):
    """Load the Xiaohongshu note-search results page for *keyword*.

    A fresh ChromiumPage is stored in the module-level ``page`` so that the
    other helpers in this file operate on the same page. The keyword is
    percent-encoded with ``quote`` before being embedded in the URL.
    """
    global page
    page = ChromiumPage()
    encoded_keyword = quote(keyword)
    search_url = (
        'https://www.xiaohongshu.com/search_result'
        f'?keyword={encoded_keyword}&source=web_search_result_notes'
    )
    page.get(search_url)
    # Give the results page time to render before anything reads it.
    time.sleep(5)

In [None]:
def get_links():
    """Collect post links from the search results currently on the page.

    Reads the module-level ``page`` and rebinds the module-level ``links`` to
    a new list holding one absolute URL per ``.note-item`` card that has a
    cover anchor with a non-empty ``href``. Any exception raised while
    scanning is printed rather than propagated, leaving ``links`` with
    whatever was gathered up to that point.
    """
    global links
    links = []
    try:
        container = page.ele('.feeds-page')
        sections = container.eles('.note-item')
        for section in sections:
            soup = BeautifulSoup(section.html, 'html.parser')
            anchor = soup.find('a', class_='cover ld mask')
            if anchor is None:
                # A card without the cover anchor would otherwise raise
                # AttributeError on None and abort the scan of every
                # remaining card; skip just this one instead.
                continue
            href = anchor.get('href')
            if href:
                links.append("https://www.xiaohongshu.com" + href)
    except Exception as e:
        print(f"Error in get_links: {e}")

In [None]:
def page_scroll_down():
    """Scroll the module-level ``page`` to the bottom after a short random pause.

    The search results are not loaded all at once, so scrolling down is how
    further results are requested. The pause is drawn uniformly from
    0.5 to 1.5 seconds.
    """
    print("********scroll down the page********")
    time.sleep(random.uniform(0.5, 1.5))
    page.scroll.to_bottom()

In [None]:
def craw(times):
    """Harvest post links from the current search page over ``times`` scroll rounds.

    Each round calls ``get_links()``, scrolls down, then waits for new content
    to load. When the loop ends, the module-level ``links`` is rebound to the
    de-duplicated union of the links seen in every round, in first-seen order.

    Args:
        times: Number of harvest-and-scroll rounds to run.
    """
    global links
    # get_links() rebuilds ``links`` from scratch on every call, so without
    # accumulating here only the final round's cards would survive the loop.
    # NOTE(review): this matters if the page drops cards that have scrolled
    # out of view - confirm against the live site.
    seen = {}  # dict used as an insertion-ordered set
    for _ in tqdm(range(times)):
        get_links()
        for link in links:
            seen.setdefault(link, None)
        page_scroll_down()
        # After scrolling, the page still needs some time to load the new
        # content, otherwise the next round may find nothing new.
        time.sleep(5)
    links = list(seen)

In [None]:
# The ten search keywords whose result pages are harvested for post links.
keywords = ["婚姻", "结婚", "婚姻观", "结婚的意义", "婚姻关系", "不婚", "未婚", "已婚", "夫妻", "婚姻家庭"]
# Scroll rounds per keyword; about 20 usually reaches the bottom of a results page.
times = 20

sign_in()
df1 = []

for keyword in keywords:
    search(keyword)
    craw(times)
    # One row per harvested link, tagged with the keyword that produced it.
    data_for_keyword = pd.DataFrame(links, columns=['note_link']).assign(source_keyword=keyword)
    df1.append(data_for_keyword)
# Stack the per-keyword frames into a single table with a fresh 0..n-1 index.
link_df = pd.concat(df1, ignore_index=True)
#print(link_df)

In [None]:
def open_url(url):
    """Load *url* into a new ``WebPage`` stored in the module-level ``page``.

    Xiaohongshu redirects to a slider verification page (title '滑块验证')
    when too much is scraped at once. When that title is detected the page's
    mode is switched and the URL is requested again.
    """
    global page
    # NOTE(review): 's' is presumably DrissionPage's session (non-browser)
    # mode and change_mode() presumably switches to the browser - confirm.
    page = WebPage('s')
    target = f'{url}'
    page.get(target)
    if page.title != '滑块验证':
        return
    page.change_mode()
    page.get(target)

In [None]:
def get_author_info(page):
    """Return the note author's name as ``{'author_name': <text>}``.

    The name is the text of the ``.username`` element nested under
    ``.author-container`` -> ``.info`` on *page*. Every lookup uses
    ``timeout=0``, i.e. no waiting is requested for elements to appear.
    """
    username_element = (
        page.ele('.author-container', timeout=0)
        .ele('.info', timeout=0)
        .ele('.username', timeout=0)
    )
    return {'author_name': username_element.text}

In [None]:
def _fill_missing_year(date):
    """Prefix the current year when *date* contains exactly one '-' (month-day only)."""
    if date.count('-') == 1:
        return f"{datetime.now().year}-{date}"
    return date


def get_note_content(page):
    """Extract title, description, tags and publish date from a note page.

    Args:
        page: Page object exposing ``ele``; the ``.note-content`` element it
            returns must expose ``ele`` and ``eles``.

    Returns:
        dict with keys ``note_title`` (str, "" when unavailable),
        ``note_desc`` (str, "" when unavailable), ``tags`` (list of str) and
        ``note_date`` (str).

    Raises:
        Whatever the ``.bottom-container`` lookup raises when the date element
        is unavailable; unlike title, description and tags, the date is not
        optional.
    """
    # The div containing the note details.
    note_content = page.ele('.note-content', timeout=0)

    # Title and description are optional: fall back to "" when the lookup
    # fails. ``except Exception`` (rather than a bare except) keeps Ctrl-C
    # able to interrupt a long crawl.
    try:
        note_title = note_content.ele('.title', timeout=0).text
    except Exception:
        note_title = ""

    try:
        note_desc = note_content.ele('.desc', timeout=0).text
    except Exception:
        note_desc = ""

    # Tags: handled one at a time so that a single tag without text does not
    # discard the tags that come after it.
    tags = []
    try:
        tag_elements = note_content.eles('.tag', timeout=0)
    except Exception:
        tag_elements = []
    for tag in tag_elements:
        try:
            tag_texts = tag.texts()
        except Exception:
            continue
        if tag_texts:
            tags.append(tag_texts[0])

    # Publish date.
    raw_date = note_content.ele('.bottom-container', timeout=0).text
    if "编辑于" in raw_date:
        # Take the first whitespace-delimited token after the "编辑于" marker.
        # Splitting on the marker (instead of indexing the second
        # space-separated part) also copes with text that has no space after
        # the marker, which previously raised IndexError.
        tokens = raw_date.split("编辑于", 1)[1].split()
        date = tokens[0] if tokens else ""
    else:
        # No marker: keep the full text unchanged.
        date = raw_date
    date = _fill_missing_year(date)

    return {'note_title': note_title, 'note_desc': note_desc,
            'tags': tags, 'note_date': date}

In [None]:
def get_count(page):
    """Read the like, collect and comment counts from the page's meta tags.

    Returns a dict with keys ``like_count``, ``collect_count`` and
    ``chat_count``. Each value is the ``content`` attribute of the first
    ``<meta>`` tag whose ``name`` is the corresponding ``og:xhs:note_*``
    entry; a missing tag raises ``IndexError``.
    """
    soup = BeautifulSoup(page.html, 'html.parser')
    meta_names = {
        'like_count': 'og:xhs:note_like',
        'collect_count': 'og:xhs:note_collect',
        'chat_count': 'og:xhs:note_comment',
    }
    return {
        key: soup.find_all('meta', attrs={'name': meta_name})[0]['content']
        for key, meta_name in meta_names.items()
    }

In [None]:
def get_note_page_info(url):
    """Open a note page and gather its details into one flat record.

    Args:
        url: Address of the note page to open.

    Returns:
        dict with keys ``title``, ``content``, ``time``, ``author_name``,
        ``liked_count``, ``collected_count``, ``comment_count``, ``tag_list``,
        ``note_url`` and ``source_keyword`` (left as "" for the caller to
        fill in), or ``None`` when extraction fails.
    """
    open_url(url)
    try:
        # Author info, note details and counts all come from the module-level
        # ``page`` that open_url() just loaded.
        author_info = get_author_info(page)
        content = get_note_content(page)
        count = get_count(page)

        # Unified column names for the result table.
        return {
            'title': content['note_title'],
            'content': content['note_desc'],
            'time': content['note_date'],
            'author_name': author_info['author_name'],
            'liked_count': count['like_count'],
            'collected_count': count['collect_count'],
            'comment_count': count['chat_count'],
            'tag_list': tags_or_empty(content),
            'note_url': url,
            'source_keyword': "",
        } if False else {
            'title': content['note_title'],
            'content': content['note_desc'],
            'time': content['note_date'],
            'author_name': author_info['author_name'],
            'liked_count': count['like_count'],
            'collected_count': count['collect_count'],
            'comment_count': count['chat_count'],
            'tag_list': content['tags'],
            'note_url': url,
            'source_keyword': "",
        }
    except Exception as e:
        # Still best-effort (the caller skips None), but the failure is now
        # reported, and KeyboardInterrupt is no longer swallowed so a long
        # crawl can be stopped with Ctrl-C.
        print(f"Error in get_note_page_info for {url}: {e}")
        return None

In [None]:
# Visit every harvested link and collect the note details (title, content, counts, ...).
all_note_info = []
for index, row in link_df.iterrows():
    note_info = get_note_page_info(row["note_link"])
    if not note_info:
        # Nothing usable came back for this link; move on to the next one.
        continue
    # The record arrives with a blank source_keyword; fill it from the link table.
    note_info['source_keyword'] = row['source_keyword']
    all_note_info.append(note_info)

# One row per successfully scraped note.
result_df = pd.DataFrame(all_note_info)
#print(result_df)

In [None]:
# Write the scraped notes to 'xhs_content.csv' (path relative to the current
# working directory); index=False leaves the DataFrame index out of the file.
result_df.to_csv('xhs_content.csv', index=False)