## Scrape Archived Mini Normals from Mafiascum.net

#### Scrapy Structure/Lingo:
**Spiders** extract data **items**, which Scrapy sends one by one to a configured **item pipeline** (if there is one) for post-processing.

## Import relevant packages...

In [1]:
import scrapy
import math
import logging
import json
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field
from scrapy.selector import Selector

## Initial variables...

In [2]:
# Posts per thread page, as assumed by the spider: it divides a thread's post
# count by this value to get the page count and uses multiples of it as the
# `start` offset of each page request.
perpage = 25

class PostItem(Item):
    """A single forum post, as filled in by the spider's page parser."""

    # URL of the thread page the post was scraped from
    pagelink = Field()
    # forum id, read from the `f=` parameter of the thread link
    forum = Field()
    # thread id, read from the `t=` parameter of the thread link
    thread = Field()
    # post number within the thread (string, leading character removed)
    number = Field()
    # posting time as the raw text shown on the page
    timestamp = Field()
    # author's user name, or '<<DELETED_USER>>' when no author link is found
    user = Field()
    # inner HTML of the post's content <div>
    content = Field()

## Define what happens to scrape output...

In [3]:
# Item pipeline that collects every scraped item (from any spider that has it
# enabled) in a single posts.jl file: one JSON-serialized item per line.
class JsonWriterPipeline(object):
    """Write scraped items to ``posts.jl`` in JSON Lines format."""

    def open_spider(self, spider):
        """Open the output file for writing (truncating it) at spider start."""
        self.file = open('posts.jl', 'w')

    def process_item(self, item, spider):
        """Append *item* as one JSON line and hand it on unchanged."""
        self.file.write(json.dumps(dict(item)) + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file once the spider has finished."""
        self.file.close()

## Define spider...

In [4]:
def _load_start_urls(path):
    """Return the first line of every entry in the archive file at *path*.

    Entries are separated by two blank lines, and the first line of each
    entry is taken as the URL of the thread to scrape.
    """
    # `with` guarantees the file is closed, even if reading fails
    with open(path) as archive:
        entries = archive.read().split('\n\n\n')
    # split() keeps the whole first line even when an entry has no newline
    # (slicing at find('\n') == -1 would drop the last character), and blank
    # entries, e.g. from a trailing separator, are skipped.
    first_lines = (entry.split('\n', 1)[0].strip() for entry in entries)
    return [url for url in first_lines if url]


class MafiaScumSpider(scrapy.Spider):
    """Scrape every post of every thread listed in ``archive.txt``.

    ``parse`` reads a thread's post count and requests each of its pages;
    ``parse_page`` yields one ``PostItem`` per post found on a page.
    """

    name = 'mafiascum'

    # define set of threads we're going to scrape from (ie all of them)
    start_urls = _load_start_urls('archive.txt')

    # settings
    custom_settings = {'LOG_LEVEL': logging.WARNING,
                       'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}}

    @staticmethod
    def _first(node, *queries):
        """Return the first result of the first XPath in *queries* that matches.

        Raises IndexError when none of the queries matches anything.
        """
        for query in queries:
            found = node.xpath(query).extract()
            if found:
                return found[0]
        raise IndexError('no match for any of: %r' % (queries,))

    def parse(self, response):
        """Read the thread's post count and request every page of the thread."""
        pagination = Selector(response).xpath(
            '//div[@class="pagination"]/text()').extract()
        if not pagination:
            # no pagination block: the thread probably doesn't exist
            return

        # the count is read from the 5th character up to the first space
        # NOTE(review): assumes the text starts with 4 non-space characters
        # (presumably whitespace) followed by "<count> posts" -- confirm
        summary = pagination[0]
        try:
            postcount = int(summary[4:summary.find(' ')])
        except ValueError:
            self.logger.warning(
                'Could not read the post count on %s', response.url)
            return

        # yield parse for every page of thread
        for page in range(math.ceil(postcount / perpage)):
            yield scrapy.Request(
                response.url + '&start=' + str(page * perpage),
                callback=self.parse_page)

    def parse_page(self, response):
        """Yield a ``PostItem`` for each post on one page of a thread."""
        sel = Selector(response)
        location = sel.xpath('//div[@id="page-body"]/h2/a/@href').extract()[0]

        # forum id sits between 'f=' and '&t='; thread id runs from '&t=' to
        # the next '&' (or the end of the link), whatever else follows
        thread_at = location.find('&t=')
        forum = location[location.find('f=') + 2:thread_at]
        thread = location[thread_at + 3:].partition('&')[0]

        # all "bg1" posts come before all "bg2" posts here, so items are not
        # emitted in page order; `number` identifies each post's position
        posts = (sel.xpath('//div[@class="post bg1"]') +
                 sel.xpath('//div[@class="post bg2"]'))

        for p in posts:
            try:
                post = self._build_post(p, forum, thread, response.url)
            except IndexError:
                # one malformed post should not cost us the rest of the page
                self.logger.warning(
                    'Skipping a post with missing fields on %s', response.url)
                continue
            yield post

    def _build_post(self, p, forum, thread, pagelink):
        """Fill a ``PostItem`` from one post node; IndexError if a field is absent.

        Every field is looked up under the nested layout first and then under
        the layout where "postbody" is a direct child of the post <div>.
        """
        post = PostItem()
        post['forum'] = forum
        post['thread'] = thread
        post['pagelink'] = pagelink

        # [1:] drops the first character (presumably a leading '#')
        post['number'] = self._first(
            p,
            'div/div[@class="postbody"]/p/a[2]/strong/text()',
            'div[@class="postbody"]/p/a[2]/strong/text()')[1:]

        # NOTE(review): [23:-4] trims fixed-width text around the date;
        # the exact surrounding text is not visible here -- confirm
        post['timestamp'] = self._first(
            p,
            'div/div/p/text()[4]',
            'div[@class="postbody"]/p/text()[4]')[23:-4]

        try:
            post['user'] = self._first(p, 'div/div/dl/dt/a/text()')
        except IndexError:
            post['user'] = '<<DELETED_USER>>'

        # [21:-6] strips the enclosing '<div class="content">' and '</div>'
        post['content'] = self._first(
            p,
            'div/div/div[@class="content"]',
            'div[@class="postbody"]/div[@class="content"]')[21:-6]

        return post

## Start scraping...

In [5]:
# Run the spider in-process with a browser-like user agent; start() blocks
# until the crawl has finished.
process = CrawlerProcess(
    dict(USER_AGENT='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'))
process.crawl(MafiaScumSpider)
process.start()

2017-09-25 10:26:08 [scrapy.utils.log] INFO: Scrapy 1.4.0 started (bot: scrapybot)
2017-09-25 10:26:08 [scrapy.utils.log] INFO: Overridden settings: {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


...and the output should be a JSON Lines file (`posts.jl`) in the same directory as this notebook!

## Leftover Code...

In [None]:
# Leftover helper, disabled by default: builds 'archive2.txt' from
# 'archive.txt' by prepending, to every archived game entry, the URL of the
# matching thread in the forum's archive listing (viewforum.php?f=53).
# Games are matched on the first number found in the thread title and in the
# first line of the text entry.
runthis = False

if runthis:
    # relevant packages
    from selenium import webdriver
    from scrapy.selector import Selector
    import re

    # configure browser
    options = webdriver.ChromeOptions()
    options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
    options.add_argument('window-size=800x841')
    # NOTE(review): newer Selenium releases expect `options=` instead of
    # `chrome_options=` -- confirm against the installed version
    driver = webdriver.Chrome(chrome_options=options)

    # get the thread titles and links (four listing pages, 100 topics apart)
    links = []
    titles = []
    try:
        for i in range(0, 400, 100):
            driver.get('https://forum.mafiascum.net/viewforum.php?f=53&start=' + str(i))
            sel = Selector(text=driver.page_source)
            links += sel.xpath('//div[@class="forumbg"]/div/ul[@class="topiclist topics"]/li/dl/dt/a[1]/@href').extract()
            titles += sel.xpath('//div[@class="forumbg"]/div/ul[@class="topiclist topics"]/li/dl/dt/a[1]/text()').extract()
    finally:
        # always shut the browser down, even if a page fails to load
        driver.quit()

    # formatting, excluding needless threads...
    # drop the first listed topic (presumably a pinned thread -- confirm)
    titles = titles[1:]
    links = links[1:]
    # the link must be removed first: its position is looked up in `titles`
    del links[titles.index('Mini Normal Archives')]
    del titles[titles.index('Mini Normal Archives')]
    titles = [re.search(r'\d+', each).group(0) for each in titles]

    # match txt archive game numbers with forum archive game numbers to find links
    with open('archive.txt', 'r') as f:
        txtarchives = f.read().split('\n\n\n')
    numbers = [re.search(r'\d+', each[:each.find('\n')]).group(0) for each in txtarchives]

    # store the result...
    for i, n in enumerate(numbers):
        # [1:] drops the first character of the relative link
        # (presumably the '.' of './viewtopic.php...' -- confirm)
        txtarchives[i] = 'http://forum.mafiascum.net' + links[titles.index(n)][1:] + '\n' + txtarchives[i]
    with open('archive2.txt', 'w') as f:
        f.write('\n\n\n'.join(txtarchives))