This section walks through some basic Scrapy usage by scraping a simple website.
- Reference: the official Scrapy documentation
a. Fetch the first page
b. Extract the content
c. Follow the pagination
d. Save the scraped data
which scrapy # shows the path of the scrapy executable
scrapy startproject quotetutorial # create a project; 'quotetutorial' is the project name
cd quotetutorial
ls
scrapy genspider quotes quotes.toscrape.com # generate a spider for the target site
ls
cd spiders # the main crawling code lives here
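These commands generate the standard Scrapy project skeleton. A typical layout looks like the sketch below (file roles noted in comments; minor details vary by Scrapy version):

```
quotetutorial/
├── scrapy.cfg              # deploy configuration
└── quotetutorial/
    ├── __init__.py
    ├── items.py            # item definitions
    ├── middlewares.py      # spider and downloader middlewares
    ├── pipelines.py        # item pipelines
    ├── settings.py         # project settings
    └── spiders/
        ├── __init__.py
        └── quotes.py       # created by genspider
```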
- quotetutorial/spiders/quotes.py
# -*- coding: utf-8 -*-
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # just dump the raw HTML for now; parse() must yield/return
        # items or Requests, not a plain string
        print(response.text)
- quotetutorial/items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy

class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
scrapy crawl quotes
#### 3.2 Extracting the content
- quotetutorial/spiders/quotes.py
# -*- coding: utf-8 -*-
import scrapy
from quotetutorial.items import QuoteItem

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tag = quote.css('.tags .tag::text').extract()  # list: a quote can have several tags
            item['text'] = text
            item['author'] = author
            item['tag'] = tag
            yield item
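For comparison, the same fields can be pulled out with XPath instead of CSS selectors. This is a sketch, with the element and class names assumed from the quotes.toscrape.com markup:

```python
# XPath equivalents of the CSS selectors above (sketch)
for quote in response.xpath('//div[@class="quote"]'):
    text = quote.xpath('.//span[@class="text"]/text()').extract_first()
    author = quote.xpath('.//small[@class="author"]/text()').extract_first()
    tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract()
```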
- quotetutorial/items.py
import scrapy

class QuoteItem(scrapy.Item):
    # define the fields for your item here
    text = scrapy.Field()
    author = scrapy.Field()
    tag = scrapy.Field()
scrapy shell quotes.toscrape.com # interactive shell for trying out selectors
response # the response object fetched by the shell
quotes = response.css('.quote') # a CSS selector; returns a SelectorList
quotes # inspect the selector list
quotes[0] # the first matched element
quotes[0].css('.text')
quotes[0].css('.text::text')
quotes[0].css('.text::text').extract() # returns a list
text = quotes[0].css('.text::text').extract_first() # returns the first element
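Recent Scrapy/parsel versions recommend .get() and .getall() as aliases for extract_first() and extract(); .get() also accepts a default for empty matches:

```python
quotes[0].css('.text::text').get()                   # same as extract_first()
quotes[0].css('.tags .tag::text').getall()           # same as extract()
quotes[0].css('.missing::text').get(default='N/A')   # fallback instead of None
```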
- quotetutorial/spiders/quotes.py
# -*- coding: utf-8 -*-
import scrapy
from quotetutorial.items import QuoteItem

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tag = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tag'] = tag
            yield item
        # follow the "Next" link until there are no more pages
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page:
            url = response.urljoin(next_page)  # resolve the relative href
            yield scrapy.Request(url=url, callback=self.parse)
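As an aside, Scrapy 1.4+ offers response.follow, which accepts the relative href directly and makes the urljoin step unnecessary; a sketch:

```python
# pagination with response.follow (Scrapy >= 1.4); handles relative URLs itself
next_page = response.css('.pager .next a::attr(href)').extract_first()
if next_page:
    yield response.follow(next_page, callback=self.parse)
```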
scrapy crawl quotes -o quotes.json # feed exports also support .csv .jl .marshal .pickle .xml
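On Scrapy 2.1 and later, the same exports can be declared once in settings.py through the FEEDS setting instead of passing -o on every run; a minimal sketch:

```python
# quotetutorial/settings.py (sketch, Scrapy >= 2.1)
FEEDS = {
    'quotes.json': {'format': 'json', 'encoding': 'utf8'},
}
```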
- quotetutorial/pipelines.py
import pymongo

class MongoPipeline(object):
    def __init__(self):
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['quotetutorial']

    def process_item(self, item, spider):
        self.db['quotes'].insert_one(dict(item))  # store the item as a plain dict
        return item

    def close_spider(self, spider):
        self.client.close()
from scrapy.exceptions import DropItem

class TextPipeline(object):
    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            raise DropItem("Missing Text")
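Neither pipeline runs until it is enabled in settings.py; the number sets the order (lower runs first). Assuming both classes live in quotetutorial/pipelines.py as above:

```python
# quotetutorial/settings.py
ITEM_PIPELINES = {
    'quotetutorial.pipelines.TextPipeline': 300,   # truncate long text first
    'quotetutorial.pipelines.MongoPipeline': 400,  # then store in MongoDB
}
```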