# Thread Scraping

Here we provide an example of how to scrape threads using donbot together with the `scrapy` library. Scrapy can scrape threads much faster than donbot alone, because it is designed to issue multiple requests in parallel and to crawl across multiple pages of a website.

This example demonstrates the interoperability of donbot with other libraries and its usefulness for basic research activities. In this case, we use scrapy to manage requests across multiple forum threads and to store the asynchronously collected post data, and donbot to parse the HTML and extract the posts.

In [None]:
# @title ## Provide a thread url . Press the play button to the left to generate!
thread = "https://forum.mafiascum.net/viewtopic.php?t=12551" # @param {type:"string"}
# NOTE(review): the `# @title` / `# @param` comments above look like Google Colab
# form annotations -- presumably their text must stay exactly as written.
# `urls` is the list of thread URLs that `MafiaScumSpider.start_requests` iterates over.
urls = [thread]

try:
    import scrapy
except ImportError:
    !pip install -q scrapy
    import scrapy
import math
from math import floor
import logging
import json
from scrapy.crawler import CrawlerProcess
from lxml import html
from tqdm import tqdm
import os
from google.colab import files
from lxml import html
from lxml.html import HtmlElement

# Number of posts on each thread page; the spider uses it to compute the
# `start=` offsets of the page URLs it requests.
posts_per_page = 25


def count_posts(thread_html: HtmlElement) -> int:
    """
    Counts the number of posts in the specified thread.

    The count is read from the first non-blank text node that sits directly
    inside a ``<div class="pagination">`` element; every digit in that text
    node is concatenated and parsed as a single integer.

    Parameters
    ----------
    thread_html : HtmlElement
        The HTML of a page from the thread to count posts in.

    Returns
    -------
    int
        The number of posts in the specified thread.

    Raises
    ------
    IndexError
        If the page has no non-blank pagination text.
    ValueError
        If the pagination text contains no digits.
    """
    pagination_texts = thread_html.xpath("//div[@class='pagination']/text()")

    # A bare `next()` would leak StopIteration out of this function; when the
    # caller is a generator that turns into a RuntimeError (PEP 479). Raise the
    # IndexError that callers in this file already handle instead.
    summary = next((text for text in pagination_texts if text.strip()), None)
    if summary is None:
        raise IndexError("no pagination text found; cannot count posts")

    return int("".join(char for char in summary if char.isdigit()))


def get_thread_page_urls(
    thread: str,
    thread_page_html: HtmlElement,
    start: int = 0,
    end: int = -1,
    posts_per_page: int = 25,
) -> list[str]:
    """
    Get the URLs of the pages of a thread.

    Parameters
    ----------
    thread : str
        The URL of the thread. ``&start=<offset>`` is appended to it, so it
        is expected to already contain a query string.
    thread_page_html : HtmlElement
        The HTML of a page from the thread. Only consulted (through
        `count_posts`) when `end` is -1.
    start : int
        Lowest post number that the returned pages must cover.
    end : int
        Highest post number that the returned pages must cover. The default
        of -1 means "up to the last post of the thread".
    posts_per_page : int
        Number of posts shown on each page; also the step between the
        ``start`` offsets of consecutive page URLs.

    Returns
    -------
    list[str]
        The URLs of the pages of the thread, in ascending page order.
    """
    if end == -1:
        # The offset formula below places post number n on the page with
        # offset (n // posts_per_page) * posts_per_page, i.e. it treats post
        # numbers as zero-based. A thread with N posts therefore ends at post
        # N - 1; using N itself would add a page past the end whenever N is
        # an exact multiple of posts_per_page.
        end = count_posts(thread_page_html) - 1

    first_offset = start // posts_per_page * posts_per_page
    last_offset = end // posts_per_page * posts_per_page

    return [
        f"{thread}&start={offset}"
        for offset in range(first_offset, last_offset + 1, posts_per_page)
    ]


def get_post(post_html: HtmlElement) -> dict:
    """
    Extracts the data of a post from the post HTML.

    Parameters
    ----------
    post_html : HtmlElement
        The HTML of a post.

    Returns
    -------
    dict
        The post's data, with keys `number`, `id`, `user`, `user_id`,
        `content`, and `time` (in that order).

    Raises
    ------
    IndexError
        If any of the expected elements is missing from `post_html`.
    """
    # Post number: first text node of the number span, minus its first character.
    number_text = post_html.xpath(".//span[@class='post-number-bolded']//text()")[0]
    number = int(number_text[1:])

    # Post id: the first link's href, cut just past the last '#' and the
    # character following it (presumably an anchor such as "#p123" -> "123").
    first_href = post_html.xpath(".//a/@href")[0]
    post_id = first_href[first_href.rfind("#") + 2 :]

    # Author name and id; the id is whatever follows the last '=' in the
    # author link's href.
    user = post_html.xpath(
        ".//a[@class='username' or @class='username-coloured']/text()"
    )[0]
    profile_href = post_html.xpath(
        ".//a[@class='username' or @class='username-coloured']/@href"
    )[0]
    user_id = profile_href[profile_href.rfind("=") + 1 :]

    # Content: serialize the content div, then drop the first 21 and last 6
    # characters -- presumably the `<div class="content">` opening tag and the
    # closing `</div>`.
    content_bytes = html.tostring(post_html.xpath(".//div[@class='content']")[0])
    content = content_bytes.decode("UTF-8").strip()[21:-6]

    # Timestamp: last text node of the author line, keeping what follows "» ".
    author_line = post_html.xpath(".//p[@class='author modified']/text()")[-1]
    posted_at = author_line[author_line.find("» ") + 2 :].strip()

    return {
        "number": number,
        "id": post_id,
        "user": user,
        "user_id": user_id,
        "content": content,
        "time": posted_at,
    }


def get_posts(
    thread_page_html: HtmlElement, start: int = 0, end: int | float = -1
) -> list[dict]:
    """
    Retrieve posts from a thread page.

    Parameters
    ----------
    thread_page_html : HtmlElement
        The HTML of a page from the thread to retrieve posts from.
    start : int
        Lowest post number to retrieve (inclusive).
    end : int, optional
        Highest post number to retrieve (inclusive). The default of -1
        disables the upper bound.

    Returns
    -------
    list[dict]
        One dict per matching post, as produced by `get_post`, in page order.
    """
    # -1 is the "no upper limit" sentinel.
    upper = float("inf") if end == -1 else end

    # Parse lazily so each post is converted exactly once, in document order.
    parsed_posts = (
        get_post(post_node)
        for post_node in thread_page_html.xpath("//div[@class='postbody']")
    )
    return [post for post in parsed_posts if start <= post["number"] <= upper]


class PostItem(scrapy.Item):
    """
    Scrapy item holding one scraped post.

    The fields are the keys of the dict returned by `get_post` plus the
    `pagelink`, `forum`, and `thread` values added by the spider.
    """

    number = scrapy.Field()
    id = scrapy.Field()
    user = scrapy.Field()
    # `get_post` returns a `user_id` key; a scrapy.Item raises KeyError when it
    # is given a key that is not a declared field, so it must be listed here.
    user_id = scrapy.Field()
    content = scrapy.Field()
    time = scrapy.Field()
    pagelink = scrapy.Field()
    forum = scrapy.Field()
    thread = scrapy.Field()


# The following pipeline stores all scraped items (from all spiders)
# into a single jsonl file, containing one item per line serialized
# in JSON format:
class JsonWriterPipeline:
    """Item pipeline that writes each item to ``posts.jsonl``, one JSON object per line."""

    def open_spider(self, spider):
        """Open the output file for writing when the spider starts."""
        self.file = open("posts.jsonl", "w")

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write `item` as a single JSON line and return the item unchanged."""
        record = dict(item)
        self.file.write(f"{json.dumps(record)}\n")
        return item


class MafiaScumSpider(scrapy.Spider):
    """
    Spider that scrapes every page of each thread listed in `urls`.

    For each thread URL it fetches the thread once to read the post count,
    requests each page of the thread, and yields one `PostItem` per post.
    """

    name = "mafiascum"

    # settings
    custom_settings = {
        "LOG_LEVEL": logging.WARNING,
        "ITEM_PIPELINES": {"__main__.JsonWriterPipeline": 1},
    }

    def start_requests(self):
        """Generates a scrapy.Request for each URL in the module-level `urls` list."""
        for url in tqdm(urls):
            yield scrapy.Request(url=url, callback=self.request_each_page)

    def request_each_page(self, response):
        """Generates scrapy.Request objects for each page of a thread."""
        try:
            post_count = count_posts(html.fromstring(response.body))
        except (IndexError, StopIteration):
            # No pagination text on the page: occurs when the requested thread
            # doesn't exist or is empty (?). StopIteration must be caught here
            # as well, because letting it escape a generator turns it into a
            # RuntimeError (PEP 479).
            return

        thread = response.url
        # Page offsets are 0, posts_per_page, 2 * posts_per_page, ... for every
        # offset below the post count. Stepping up to the count itself (rather
        # than to the count rounded down to a full page) keeps the final,
        # partially filled page -- and the only page of a short thread.
        for page_id in range(0, post_count, posts_per_page):
            yield scrapy.Request(
                f"{thread}&start={page_id}",
                callback=self.process_posts,
            )

    def process_posts(self, response):
        """Extracts post data from a page of a thread."""
        # Imported here so this block brings its own dependency into scope.
        from urllib.parse import parse_qs, urlsplit

        page_link = response.url

        # Read the thread (`t`) and forum (`f`) ids from the query string
        # instead of slicing around "&t=" / "f=": the slicing only worked when
        # `f=` came before `t=` and returned garbage for URLs such as
        # "viewtopic.php?t=12551". A missing parameter becomes "".
        query = parse_qs(urlsplit(page_link).query)
        thread = query.get("t", [""])[0]
        forum = query.get("f", [""])[0]

        for post in get_posts(html.fromstring(response.body)):
            yield PostItem(
                {"pagelink": page_link, "forum": forum, "thread": thread, **post}
            )


if __name__ == "__main__":
    # Start scraping...
    # Run MafiaScumSpider in a CrawlerProcess; items are written to
    # posts.jsonl by JsonWriterPipeline (enabled in the spider's settings).
    process = CrawlerProcess()
    process.crawl(MafiaScumSpider)
    process.start()
    # Called once `process.start()` has returned; `files` comes from
    # google.colab -- presumably this offers the output file as a browser download.
    files.download('posts.jsonl')