In [1]:
from scrapy import Spider, Request, Item, Field
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

from twisted.internet import reactor
from tinydb import TinyDB, Query

import time

## Wiki Article Class
We're going to store the name, path (URL), and text of each article; the text is kept in the `info` field.

In [2]:
class WikiArticle(Item):
    """Container for the data scraped from one wiki page."""

    # Heading of the page.
    name = Field()

    # URL the page was fetched from.
    path = Field()

    # Text extracted from the page's content area.
    info = Field()

## Wiki Spider
Next, let's use Scrapy to create a spider that crawls the wiki.

In [3]:
class WikiSpider(Spider):
    """Crawl the wiki from ``start_urls`` and yield one ``WikiArticle`` per page.

    Every anchor on each fetched page is queued for crawling; requests that
    leave ``allowed_domains`` are dropped by Scrapy's offsite filtering, so only
    internal links are actually followed.
    """

    name = "wiki_spider"
    start_urls = ['http://sdsfwiki.benzr.xyz/']  # replace with your wiki site URL
    # Keep the crawl on the wiki itself; without this every external link would
    # be followed too. Keep in sync with the host used in `start_urls`.
    allowed_domains = ['sdsfwiki.benzr.xyz']

    def parse(self, response):
        """Extract the article from ``response`` and schedule the links it contains.

        Yields:
            A ``WikiArticle`` for the current page, followed by one request per
            followable link found on it (each handled by this same method).
        """
        article = WikiArticle()
        article['name'] = response.css('h1::text').get()
        article['path'] = response.url
        article['info'] = response.css('div.content::text').getall()  # replace 'div.content' with the proper CSS selector for your wiki content
        yield article

        # Follow internal links
        for href in response.css('a::attr(href)').getall():  # replace 'a' with the proper CSS selector for your wiki internal links
            try:
                request = response.follow(href, self.parse)
            except ValueError:
                # Hrefs such as "mailto:..." or "javascript:..." cannot be
                # turned into a request; skip them instead of aborting the
                # generator and losing the remaining links on this page.
                continue
            yield request


## Database time!

In [4]:
db = TinyDB('wiki_db.json')


def store_in_db(item):
    """Insert one scraped item into the TinyDB file as a plain dict."""
    db.insert(dict(item))


class StoreInDbPipeline:
    """Scrapy item pipeline that persists every scraped item via ``store_in_db``.

    Scrapy instantiates each ``ITEM_PIPELINES`` entry and calls its
    ``process_item`` method, so a bare function cannot be registered directly.
    """

    def process_item(self, item, spider):
        """Store ``item`` and hand it on unchanged to any later pipeline stage."""
        store_in_db(item)
        return item


configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s', 'LOG_LEVEL': 'DEBUG'})
runner = CrawlerRunner({
    # In a notebook `__name__` is the module holding these definitions, so this
    # dotted path lets Scrapy import the pipeline class defined above.
    'ITEM_PIPELINES': {__name__ + '.StoreInDbPipeline': 1},
})


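And let's try it out! Note that the Twisted reactor can only be started once per kernel, so restart the kernel before re-running this cell.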

In [5]:
# Kick off the crawl; `d` is a Deferred that fires when the spider finishes.
d = runner.crawl(WikiSpider)
# Report crawl failures (e.g. a misconfigured pipeline) instead of letting the
# stop callback below consume them silently.
d.addErrback(lambda failure: failure.printTraceback())
d.addBoth(lambda _: reactor.stop())  # Stop the reactor when the spider finishes
reactor.run()  # Start the reactor (blocks execution)

INFO: Overridden settings:
{}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-06-30 13:40:43 [scrapy.extensions.telnet] INFO: Telnet Password: 87f191d2bcf8e7b5
2023-06-30 13:40:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2023-06-30 13:40:43 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewar