In [22]:
import scrapy
import scrapy.crawler as crawler
from multiprocessing import Process, Queue
from twisted.internet import reactor
import logging

In [16]:
import json

class JsonWriterPipeline(object):
    """Item pipeline that stores each item as one JSON document per line.

    Output goes to ``quoteresult.jl`` in the current working directory.
    The file handle lives on ``self.file`` between ``open_spider`` and
    ``close_spider``.
    """

    def open_spider(self, spider):
        """Create (or truncate) the output file; ``spider`` is not used."""
        self.file = open('quoteresult.jl', 'w')

    def close_spider(self, spider):
        """Close the output file; ``spider`` is not used."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and return it unchanged."""
        serialized = json.dumps(dict(item))
        self.file.write(serialized + "\n")
        return item

In [17]:
# Wrapper that lets a spider be run more than once from the same kernel.
def run_spider(spider):
    """Run ``spider`` to completion in a child process.

    The Twisted reactor is started inside a separate ``Process`` rather
    than in the calling process, and the outcome is sent back through a
    ``Queue``.

    Args:
        spider: the spider class handed to ``CrawlerRunner.crawl``.

    Raises:
        Exception: whatever exception the child reported, either raised
            while setting up / running the reactor or carried by a failed
            crawl deferred.
        RuntimeError: if the child process exits without reporting any
            result at all.
    """
    # Local import: only the polling loop below needs it.
    from queue import Empty

    def f(q):
        try:
            crawl_errors = []
            runner = crawler.CrawlerRunner()
            deferred = runner.crawl(spider)
            # Record a failed crawl *before* addBoth; otherwise the lambda
            # below consumes the Failure and the run looks successful.
            deferred.addErrback(
                lambda failure: crawl_errors.append(failure.value))
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(crawl_errors[0] if crawl_errors else None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()

    # Poll instead of blocking forever: if the child dies without putting
    # anything on the queue, a bare q.get() would never return.
    while True:
        try:
            result = q.get(timeout=1)
            break
        except Empty:
            if p.is_alive():
                continue
        # The child has exited; give an in-flight result one last chance.
        try:
            result = q.get(timeout=1)
        except Empty:
            result = RuntimeError(
                'spider process exited with code %s without reporting a result'
                % p.exitcode)
        break
    p.join()

    if result is not None:
        raise result

In [29]:
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes starting from page 1 of quotes.toscrape.com.

    Every ``div.quote`` element becomes a dict with ``text``, ``author`` and
    ``tags`` keys; the ``li.next`` link, when present, is followed and parsed
    by the same callback.
    """

    name = "quotes"
    start_urls = ['http://quotes.toscrape.com/page/1/']
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        # Pipeline 1: the JsonWriterPipeline class defined in this notebook.
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        # Pipeline 2: feed-export settings (format and target file).
        'FEED_FORMAT': 'json',
        'FEED_URI': 'quoteresult.json',
    }

    def parse(self, response):
        """Yield one dict per quote on the page, then a request for the next page."""
        for quote_block in response.css('div.quote'):
            text = quote_block.css('span.text::text').get()
            author = quote_block.css('small.author::text').get()
            tags = quote_block.css('div.tags a.tag::text').getall()
            yield {'text': text, 'author': author, 'tags': tags}

        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            # No "next" link on this page: nothing further to request.
            return
        yield response.follow(next_href, callback=self.parse)

**Note:** `response.follow` only returns a `Request` instance; you still have to `yield` that `Request` for it to be followed.

In [None]:
# NOTE(review): `process` is not defined in any cell shown here, and this cell
# has no execution count (In [None]). It is presumably left over from an
# earlier CrawlerProcess-based attempt — TODO confirm, then define `process`
# or drop the cell so the notebook survives Restart & Run All.
process.crawl(QuotesSpider)
process.start()

In [30]:
# Run the spider once through the run_spider wrapper defined above.
print('first run:')
run_spider(QuotesSpider)

# Second run through the same wrapper, currently disabled.
#print('\nsecond run:')
#run_spider(QuotesSpider)

first run:


In [31]:
# List the files in the current working directory.
!ls

Python-crawler.ipynb quoteresult.json     quotes-2.html
quoteresult.jl       quotes-1.html        tutorial


In [32]:
import pandas as pd
# Load the JSON file named by QuotesSpider's FEED_URI setting into a DataFrame.
qt = pd.read_json('quoteresult.json')

In [40]:
# Preview only the first rows instead of dumping the whole frame.
qt.head(10)

Unnamed: 0,author,tags,text
0,Albert Einstein,"[change, deep-thoughts, thinking, world]",“The world as we have created it is a process ...
1,J.K. Rowling,"[abilities, choices]","“It is our choices, Harry, that show what we t..."
2,Albert Einstein,"[inspirational, life, live, miracle, miracles]",“There are only two ways to live your life. On...
3,Jane Austen,"[aliteracy, books, classic, humor]","“The person, be it gentleman or lady, who has ..."
4,Marilyn Monroe,"[be-yourself, inspirational]","“Imperfection is beauty, madness is genius and..."
5,Albert Einstein,"[adulthood, success, value]",“Try not to become a man of success. Rather be...
6,André Gide,"[life, love]",“It is better to be hated for what you are tha...
7,Thomas A. Edison,"[edison, failure, inspirational, paraphrased]","“I have not failed. I've just found 10,000 way..."
8,Eleanor Roosevelt,[misattributed-eleanor-roosevelt],“A woman is like a tea bag; you never know how...
9,Steve Martin,"[humor, obvious, simile]","“A day without sunshine is like, you know, nig..."
