# 1 Selectors Using CSS Classes

In [1]:
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import requests

In [2]:
# Browser-like headers sent with every page download in this notebook.
REQUEST_HEADER = {
    'User-agent': 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; iCafeMedia; .NET CLR 2.0.50727; CIBA)',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # Only advertise content codings the HTTP client can decode transparently.
    # The obsolete 'sdch' token was removed: a server honouring it would send a
    # body that cannot be decoded into text.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

# Page scraped in sections 1 and 2 (section 3 rebinds URL to another page).
URL = "http://quotes.toscrape.com/"

In [3]:
# Download the page. The timeout keeps the cell from hanging indefinitely on an
# unresponsive server, and raise_for_status() surfaces HTTP errors here instead
# of letting an error page be parsed silently by the cells below.
http_reply = requests.get(URL, headers=REQUEST_HEADER, timeout=10)
http_reply.raise_for_status()
html = http_reply.text

In [4]:
# Wrap the downloaded HTML in a Scrapy HtmlResponse so the .css() / .xpath()
# selector methods used in the rest of the notebook are available on it.
response = HtmlResponse(url=URL, body=html, encoding='utf-8')

In [5]:
# A bare tag name selects every <title> element; the result is a list of
# Selector objects, not strings.
response.css('title')

[<Selector xpath='descendant-or-self::title' data='<title>Quotes to Scrape</title>'>]

In [6]:
# .extract() serializes each matched element to its HTML string.
response.css('title').extract()

['<title>Quotes to Scrape</title>']

In [7]:
# The ::text pseudo-element selects the text inside the element rather than
# the element itself, so the surrounding tags are dropped.
response.css('title::text').extract()

['Quotes to Scrape']

In [8]:
# A leading dot selects by CSS class: every element whose class attribute
# contains the token 'author', whatever its tag.
response.css('.author')

[<Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' author ')]" data='<small class="author" itemprop="author">'>,
 <Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' author ')]" data='<small class="author" itemprop="author">'>,
 <Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' author ')]" data='<small class="author" itemprop="author">'>,
 <Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' author ')]" data='<small class="author" itemprop="author">'>,
 <Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' author ')]" data='<small class="author" itemprop="author">'>,
 <Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' author ')]" data='<small class="author" itemprop="author">'>

In [9]:
# .css() returns a SelectorList, which is why further selector methods can be
# chained on the result.
type(response.css('.author'))

scrapy.selector.unified.SelectorList

In [10]:
# Serialize every matched author element to its HTML string.
response.css('.author').extract()

['<small class="author" itemprop="author">Albert Einstein</small>',
 '<small class="author" itemprop="author">J.K. Rowling</small>',
 '<small class="author" itemprop="author">Albert Einstein</small>',
 '<small class="author" itemprop="author">Jane Austen</small>',
 '<small class="author" itemprop="author">Marilyn Monroe</small>',
 '<small class="author" itemprop="author">Albert Einstein</small>',
 '<small class="author" itemprop="author">André Gide</small>',
 '<small class="author" itemprop="author">Thomas A. Edison</small>',
 '<small class="author" itemprop="author">Eleanor Roosevelt</small>',
 '<small class="author" itemprop="author">Steve Martin</small>']

In [11]:
# After .extract() the result is an ordinary Python list.
type(response.css('.author').extract())

list

In [12]:
# Plain list indexing picks the first extracted match.
response.css('.author').extract()[0]

'<small class="author" itemprop="author">Albert Einstein</small>'

In [13]:
# extract_first() returns the first match directly; here it gives the same
# value as indexing the extracted list with [0].
response.css('.author').extract_first()

'<small class="author" itemprop="author">Albert Einstein</small>'

In [14]:
# Combine the class selector with ::text to get only the author names.
response.css('.author::text').extract()

['Albert Einstein',
 'J.K. Rowling',
 'Albert Einstein',
 'Jane Austen',
 'Marilyn Monroe',
 'Albert Einstein',
 'André Gide',
 'Thomas A. Edison',
 'Eleanor Roosevelt',
 'Steve Martin']

In [15]:
# Tag + class: only <small> elements carrying the 'author' class. On this page
# the result is identical to the class-only selector.
response.css('small.author::text').extract()

['Albert Einstein',
 'J.K. Rowling',
 'Albert Einstein',
 'Jane Austen',
 'Marilyn Monroe',
 'Albert Einstein',
 'André Gide',
 'Thomas A. Edison',
 'Eleanor Roosevelt',
 'Steve Martin']

In [16]:
# '>' is the child combinator: elements with class 'text' that are direct
# children of an element with class 'quote'.
response.css('.quote > .text').extract()

['<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>',
 '<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>',
 '<span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>',
 '<span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>',
 '<span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it\'s better to be absolutely ridiculous than absolutely boring.”</span>',
 '<span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>',
 '<span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for wha

In [17]:
# Selector copied from the browser developer tools (F12 -> "Copy selector").
# It spells out the full path from <body>, so it matches only the first quote.
response.css('body > div > div:nth-child(2) > div.col-md-8 > div:nth-child(1) > span.text').extract()

['<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>']

In [18]:
# ::attr(name) selects an attribute value instead of the element: here the
# href of every <a> on the page.
response.css('a::attr(href)').extract()

['/',
 '/login',
 '/author/Albert-Einstein',
 '/tag/change/page/1/',
 '/tag/deep-thoughts/page/1/',
 '/tag/thinking/page/1/',
 '/tag/world/page/1/',
 '/author/J-K-Rowling',
 '/tag/abilities/page/1/',
 '/tag/choices/page/1/',
 '/author/Albert-Einstein',
 '/tag/inspirational/page/1/',
 '/tag/life/page/1/',
 '/tag/live/page/1/',
 '/tag/miracle/page/1/',
 '/tag/miracles/page/1/',
 '/author/Jane-Austen',
 '/tag/aliteracy/page/1/',
 '/tag/books/page/1/',
 '/tag/classic/page/1/',
 '/tag/humor/page/1/',
 '/author/Marilyn-Monroe',
 '/tag/be-yourself/page/1/',
 '/tag/inspirational/page/1/',
 '/author/Albert-Einstein',
 '/tag/adulthood/page/1/',
 '/tag/success/page/1/',
 '/tag/value/page/1/',
 '/author/Andre-Gide',
 '/tag/life/page/1/',
 '/tag/love/page/1/',
 '/author/Thomas-A-Edison',
 '/tag/edison/page/1/',
 '/tag/failure/page/1/',
 '/tag/inspirational/page/1/',
 '/tag/paraphrased/page/1/',
 '/author/Eleanor-Roosevelt',
 '/tag/misattributed-eleanor-roosevelt/page/1/',
 '/author/Steve-Martin',
 

# 2 Selectors Using XPath

In [20]:
# Absolute XPath: walk from the document root through <html> and <head> to
# <title>.
response.xpath('/html/head/title')

[<Selector xpath='/html/head/title' data='<title>Quotes to Scrape</title>'>]

In [21]:
# As with .css(), .extract() serializes the matched nodes to HTML strings.
response.xpath('/html/head/title').extract()

['<title>Quotes to Scrape</title>']

In [22]:
# '//' matches <title> at any depth, so the full path does not need to be
# spelled out.
response.xpath('//title').extract()

['<title>Quotes to Scrape</title>']

In [23]:
# Absolute XPath as copied from the browser (presumably the developer tools'
# "Copy XPath" action). The positional predicates [2], [1], ... pin each step
# to one specific element, so only the first author is matched.
response.xpath('/html/body/div/div[2]/div[1]/div[1]/span[2]/small').extract()

['<small class="author" itemprop="author">Albert Einstein</small>']

In [24]:
# Same path without positional predicates: each step now matches every child
# with that tag name, so all author <small> elements are returned.
response.xpath('/html/body/div/div/div/div/span/small').extract()

['<small class="author" itemprop="author">Albert Einstein</small>',
 '<small class="author" itemprop="author">J.K. Rowling</small>',
 '<small class="author" itemprop="author">Albert Einstein</small>',
 '<small class="author" itemprop="author">Jane Austen</small>',
 '<small class="author" itemprop="author">Marilyn Monroe</small>',
 '<small class="author" itemprop="author">Albert Einstein</small>',
 '<small class="author" itemprop="author">André Gide</small>',
 '<small class="author" itemprop="author">Thomas A. Edison</small>',
 '<small class="author" itemprop="author">Eleanor Roosevelt</small>',
 '<small class="author" itemprop="author">Steve Martin</small>']

In [25]:
# A trailing text() step selects the text nodes, the XPath counterpart of the
# CSS ::text pseudo-element.
response.xpath('/html/body/div/div/div/div/span/small/text()').extract()

['Albert Einstein',
 'J.K. Rowling',
 'Albert Einstein',
 'Jane Austen',
 'Marilyn Monroe',
 'Albert Einstein',
 'André Gide',
 'Thomas A. Edison',
 'Eleanor Roosevelt',
 'Steve Martin']

In [27]:
# Attribute predicate: <span> elements whose class attribute is exactly 'text',
# at any depth in the document.
response.xpath("//span[@class='text']").extract()

['<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>',
 '<span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>',
 '<span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>',
 '<span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>',
 '<span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it\'s better to be absolutely ridiculous than absolutely boring.”</span>',
 '<span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>',
 '<span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for wha

In [28]:
# Same predicate plus text(): only the quote text, without the <span> markup.
response.xpath("//span[@class='text']/text()").extract()

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

# 3 Using Regular Expressions with Selectors

In [29]:
# Section 3 works on page 2 of the site. URL, html and response are rebound
# here, so every cell below operates on that page.
URL = "http://quotes.toscrape.com/page/2"
# The timeout prevents an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of letting an error page be parsed silently.
http_reply = requests.get(URL, headers=REQUEST_HEADER, timeout=10)
http_reply.raise_for_status()
html = http_reply.text
response = HtmlResponse(url=URL, body=html, encoding='utf-8')

In [31]:
# contains() filters on a substring: any element whose text contains 'friend'
# (when given a node-set, XPath 1.0 tests the first text node of each element).
response.xpath("//*[contains(text(), 'friend')]/text()").extract()

["“This life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold on, and always, always, always believe in yourself, because if you don't, then who will, sweetie? So keep your head high, keep your chin up, and most importantl

In [32]:
# CSS counterpart of the XPath contains() filter: elements with class 'text'
# whose text contains 'friend', reduced to their text with ::text.
response.css(".text:contains('friend')::text").extract()

["“This life is what you make it. No matter what, you're going to mess up sometimes, it's a universal truth. But the good part is you get to decide how you're going to mess it up. Girls will be your friends - they'll act like it anyway. But just remember, some come, some go. The ones that stay with you through everything - they're your true best friends. Don't let go of them. Also remember, sisters make the best friends in the world. As for lovers, well, they'll come and go too. And baby, I hate to say it, most of them - actually pretty much all of them are going to break your heart, but you can't give up because if you give up, you'll never find your soulmate. You'll never find that half who makes you whole and that goes for everything. Just because you fail once, doesn't mean you're gonna fail at everything. Keep trying, hold on, and always, always, always believe in yourself, because if you don't, then who will, sweetie? So keep your head high, keep your chin up, and most importantl

In [33]:
# .re() runs the regular expression over the selected text and returns the
# matches as plain strings: names starting with 'A' followed by a second word.
# The pattern is a raw string because '\s' and '\w' are regex escapes, not
# valid Python string escapes; a non-raw literal triggers an invalid-escape
# warning on current Python versions.
response.css('.author::text').re(r'A[a-z]*\s\w+')

['Albert Einstein', 'Allen Saunders']

In [35]:
# Regex-free alternative: stack two predicates, class exactly 'author' and
# text beginning with 'A'. On this page it yields the same names as the
# regular-expression version.
response.xpath("//*[@class='author'][starts-with(text(), 'A')]/text()").extract()

['Albert Einstein', 'Allen Saunders']