# Thread Scraping

Here we provide an example of how to scrape threads using donbot and the `scrapy` library. Scrapy can scrape threads much faster than donbot alone, because it is designed to issue many requests in parallel and to crawl across multiple pages of a website.

This example demonstrates the interoperability of donbot with other libraries and its usefulness for basic research activities. In this case, we use scrapy to manage requests across multiple forum threads and to store the asynchronously collected post data, and donbot to parse the HTML and extract the posts.

In [None]:
# @title ## Provide a thread URL. Press the play button to the left to generate!
thread = "https://forum.mafiascum.net/viewtopic.php?t=12551" # @param {type:"string"}
# Seed URLs for the crawl; MafiaScumSpider.start_requests iterates over this list.
urls = [thread]

try:
    import scrapy
except ImportError:
    !pip install -q scrapy
    import scrapy
import math
import logging
import json
from scrapy.crawler import CrawlerProcess
from lxml import html
try:
    from donbot.operations import count_posts, get_posts
except ImportError:
    !pip install -q donbot-python
    from donbot.operations import count_posts, get_posts
from tqdm import tqdm
import os
from google.colab import files


# Step size for the `start` offset used when requesting the pages of a thread
# (presumably the forum's posts-per-page setting -- TODO confirm).
posts_per_page = 25


class PostItem(scrapy.Item):
    """Scrapy item holding one forum post plus where it was scraped from."""

    # The next five fields are never set explicitly in this file; presumably
    # they are the keys of each post dict returned by donbot's get_posts, which
    # MafiaScumSpider.process_posts unpacks into the item -- TODO confirm.
    number = scrapy.Field()
    id = scrapy.Field()
    user = scrapy.Field()
    content = scrapy.Field()
    time = scrapy.Field()
    # The remaining fields are filled in by MafiaScumSpider.process_posts.
    pagelink = scrapy.Field()  # URL of the thread page the post came from
    forum = scrapy.Field()  # forum id extracted from the page URL
    thread = scrapy.Field()  # thread id extracted from the page URL


class JsonWriterPipeline:
    """Item pipeline that collects every scraped item in ``posts.jsonl``.

    Items from all spiders end up in that single file, one JSON-serialized
    item per line.
    """

    def process_item(self, item, spider):
        """Write ``item`` to the output file as one JSON line and pass it on."""
        self.file.write(json.dumps(dict(item)) + "\n")
        return item

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        self.file = open("posts.jsonl", "w")

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()


class MafiaScumSpider(scrapy.Spider):
    """Spider that scrapes every post of each thread listed in ``urls``.

    The crawl has three stages: ``start_requests`` fetches each thread URL,
    ``request_each_page`` reads the thread's post count and schedules one
    request per page, and ``process_posts`` turns each page into ``PostItem``
    objects (which the configured pipeline writes to disk).
    """

    name = "mafiascum"

    # settings
    custom_settings = {
        "LOG_LEVEL": logging.WARNING,
        "ITEM_PIPELINES": {"__main__.JsonWriterPipeline": 1},
    }

    def start_requests(self):
        """Yield one request per thread URL in the module-level ``urls`` list."""
        for url in tqdm(urls):
            yield scrapy.Request(url=url, callback=self.request_each_page)

    def request_each_page(self, response):
        """Yield one request per page of the thread shown in ``response``.

        Pages are addressed by appending ``&start=<offset>`` to the thread
        URL, with offsets 0, ``posts_per_page``, 2 * ``posts_per_page``, ...
        up to (but excluding) the thread's post count, so a trailing
        partially-filled page is requested as well.
        """
        # Only the post-count lookup is expected to raise, so keep the try
        # body limited to it.
        try:
            post_count = count_posts(html.fromstring(response.body))
        except IndexError:
            # occurs when the requested thread doesn't exist or is empty (?)
            return

        thread = response.url
        # Stepping up to post_count (rather than to the last *full* page)
        # covers threads shorter than one page and final partial pages.
        for page_id in range(0, post_count, posts_per_page):
            yield scrapy.Request(
                f"{thread}&start={page_id}",
                callback=self.process_posts,
            )

    def process_posts(self, response):
        """Extract post data from one page of a thread.

        Yields a ``PostItem`` per post returned by ``get_posts``, augmented
        with the page URL and the ``f`` (forum) and ``t`` (thread) query
        parameters of that URL. A parameter that is absent from the URL is
        stored as ``None``.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import parse_qs, urlsplit

        page_link = response.url
        # Parse the query string properly: thread URLs need not contain
        # "f=" or "&t=" (e.g. "viewtopic.php?t=12551&start=0"), in which
        # case substring slicing on str.find() results yields garbage.
        query = parse_qs(urlsplit(page_link).query)
        thread = query.get("t", [None])[0]
        forum = query.get("f", [None])[0]

        for post in get_posts(html.fromstring(response.body)):
            yield PostItem(
                {"pagelink": page_link, "forum": forum, "thread": thread, **post}
            )


if __name__ == "__main__":
    # Start scraping...
    # Run MafiaScumSpider; its custom_settings route every item through
    # JsonWriterPipeline, which writes to posts.jsonl.
    process = CrawlerProcess()
    process.crawl(MafiaScumSpider)
    # NOTE(review): per the Scrapy docs, start() blocks until crawling has
    # finished, so the file should be complete before the download -- confirm.
    process.start()
    # Hand the output file to the user through Colab's download helper.
    files.download('posts.jsonl')


