# 1 Using selectors

### 1.1 Constructing selectors

In [1]:
from pprint import pprint
from urllib import request

from scrapy.http import HtmlResponse
from scrapy.selector import Selector

In [2]:
# HTTP headers attached to the urllib request that downloads the sample page.
# No Cookie header is sent: the sample page is fetched without one.
REQUEST_HEADER = {
    # Identifies the client as a legacy MSIE browser.
    'User-agent': 'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; iCafeMedia; .NET CLR 2.0.50727; CIBA)',
    'Accept': '*/*',
    # Accept-Charset lists character sets. 'gzip, deflate' are content codings
    # (Accept-Encoding values), so they were meaningless here; the body is
    # decoded as UTF-8 after download, so ask for that charset.
    'Accept-Charset': 'utf-8',
    'Accept-Language': 'en'
}

In [3]:
# Build a Selector directly from an HTML string, then extract the <span> text.
body = '<html><body><span>good</span></body></html>'
text_selector = Selector(text=body)
text_selector.xpath('//span/text()').get()

'good'

In [4]:
# Wrap the same markup in an HtmlResponse and build the Selector from the
# response object instead of from raw text.
response = HtmlResponse(
    url='http://example.com',
    body=body,
    encoding='utf-8',
)
response_selector = Selector(response=response)
response_selector.xpath('//span/text()').get()

'good'

In [6]:
# Download the sample page that the remaining cells query.
req = request.Request('https://docs.scrapy.org/en/latest/_static/selectors-sample1.html', headers=REQUEST_HEADER)
# The context manager closes the HTTP connection even if reading or decoding
# fails; the 30-second timeout keeps "Run All" from hanging on a dead server.
with request.urlopen(req, timeout=30) as http_response:
    body = http_response.read().decode("utf-8")
# pprint shows the markup as one quoted string fragment per source line.
pprint(body)

('<html>\n'
 ' <head>\n'
 "  <base href='http://example.com/' />\n"
 '  <title>Example website</title>\n'
 ' </head>\n'
 ' <body>\n'
 "  <div id='images'>\n"
 "   <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' "
 '/></a>\n'
 "   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' "
 '/></a>\n'
 "   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' "
 '/></a>\n'
 "   <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' "
 '/></a>\n'
 "   <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' "
 '/></a>\n'
 '  </div>\n'
 ' </body>\n'
 '</html>\n'
 '\n')


In [7]:
# Rebuild `response` from the downloaded sample page; the cells that follow
# run their queries against this object.
response = HtmlResponse(
    url='https://docs.scrapy.org/en/latest/_static/selectors-sample1.html',
    body=body,
    encoding='utf-8',
)

### 1.2 Using selectors

In [8]:
# .xpath() returns a list-like collection of Selector objects (one per match), not plain strings.
response.xpath('//title/text()')

[<Selector xpath='//title/text()' data='Example website'>]

In [9]:
# .getall() extracts the data of every match as a list of strings.
response.xpath('//title/text()').getall()

['Example website']

In [10]:
# .get() returns a single string rather than a list.
response.xpath('//title/text()').get()

'Example website'

In [11]:
# Same title lookup with a CSS selector; ::text selects the element's text node.
response.css('title::text').get()

'Example website'

In [12]:
# CSS and XPath queries can be chained: select every <img> with CSS, then read its src with XPath.
response.css('img').xpath('@src').getall()

['image1_thumb.jpg',
 'image2_thumb.jpg',
 'image3_thumb.jpg',
 'image4_thumb.jpg',
 'image5_thumb.jpg']

In [13]:
# The page has five <a> elements under #images, but .get() returns only the first matching text node.
response.xpath('//div[@id="images"]/a/text()').get()

'Name: My image 1 '

In [14]:
# When nothing matches, .get() yields None instead of raising.
response.xpath('//div[@id="not-exists"]/text()').get() is None

True

In [15]:
# The result here is None (no element matches), so the cell displays no output.
response.xpath('//div[@id="not-exists"]/text()').get()

In [16]:
# default= supplies the value to return in place of None when nothing matches.
response.xpath('//div[@id="not-exists"]/text()').get(default='not-found')

'not-found'

In [17]:
# css('img') gives one Selector per <img>; each repr reports an equivalent XPath expression.
response.css('img')

[<Selector xpath='descendant-or-self::img' data='<img src="image1_thumb.jpg">'>,
 <Selector xpath='descendant-or-self::img' data='<img src="image2_thumb.jpg">'>,
 <Selector xpath='descendant-or-self::img' data='<img src="image3_thumb.jpg">'>,
 <Selector xpath='descendant-or-self::img' data='<img src="image4_thumb.jpg">'>,
 <Selector xpath='descendant-or-self::img' data='<img src="image5_thumb.jpg">'>]

In [18]:
# Each Selector exposes its element's attributes through the .attrib mapping.
[img.attrib['src'] for img in response.css('img')]

['image1_thumb.jpg',
 'image2_thumb.jpg',
 'image3_thumb.jpg',
 'image4_thumb.jpg',
 'image5_thumb.jpg']

In [19]:
# .attrib on the whole result list reads from the first matched element only.
response.css('img').attrib['src']

'image1_thumb.jpg'

In [20]:
# The page has a single <base> element, so .attrib on the result reads that element's href.
response.css('base').attrib['href']

'http://example.com/'

In [21]:
# XPath form: @href addresses the attribute directly.
response.xpath('//base/@href').get()

'http://example.com/'

In [22]:
# CSS form: the ::attr(name) pseudo-element selects an attribute value.
response.css('base::attr(href)').get()

'http://example.com/'

In [23]:
# Attribute lookup through .attrib; same expression as used earlier in this section.
response.css('base').attrib['href']

'http://example.com/'

In [25]:
# XPath contains() keeps the <a> elements whose href includes the substring "image".
response.xpath('//a[contains(@href, "image")]/@href').getall()

['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']

In [26]:
# CSS counterpart: [href*=image] is a substring match on the attribute value.
response.css('a[href*=image]::attr(href)').getall()

['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']

In [27]:
# Step from the matching links down to their child <img> and read its src.
response.xpath('//a[contains(@href, "image")]/img/@src').getall()

['image1_thumb.jpg',
 'image2_thumb.jpg',
 'image3_thumb.jpg',
 'image4_thumb.jpg',
 'image5_thumb.jpg']

In [28]:
# CSS counterpart: descendant combinator (the space) followed by ::attr(src).
response.css('a[href*=image] img::attr(src)').getall()

['image1_thumb.jpg',
 'image2_thumb.jpg',
 'image3_thumb.jpg',
 'image4_thumb.jpg',
 'image5_thumb.jpg']

### 1.3 Extensions to CSS Selectors

In [30]:
# ::text is one of the CSS extensions covered in this section: it selects the element's text node.
response.css('title::text').get()

'Example website'

In [31]:
# '#images *::text' gathers every text node under the #images element, whitespace-only nodes included.
response.css('#images *::text').getall()

['\n   ',
 'Name: My image 1 ',
 '\n   ',
 'Name: My image 2 ',
 '\n   ',
 'Name: My image 3 ',
 '\n   ',
 'Name: My image 4 ',
 '\n   ',
 'Name: My image 5 ',
 '\n  ']

In [32]:
# The <img> elements contain no text, so the result list is empty.
response.css('img::text').getall()

[]

In [33]:
# With an empty result, .get() returns None, so the cell displays no output.
response.css('img::text').get()

In [34]:
# default='' substitutes an empty string for the missing match.
response.css('img::text').get(default='')

''

In [35]:
# ::attr(href) yields the href value of every <a> element.
response.css('a::attr(href)').getall()

['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']

### 1.4 Nesting selectors

In [37]:
# Keep the matched <a> selectors in `links` so that later cells can run further queries on them.
links = response.xpath('//a[contains(@href, "image")]')
links

[<Selector xpath='//a[contains(@href, "image")]' data='<a href="image1.html">Name: My image 1 <'>,
 <Selector xpath='//a[contains(@href, "image")]' data='<a href="image2.html">Name: My image 2 <'>,
 <Selector xpath='//a[contains(@href, "image")]' data='<a href="image3.html">Name: My image 3 <'>,
 <Selector xpath='//a[contains(@href, "image")]' data='<a href="image4.html">Name: My image 4 <'>,
 <Selector xpath='//a[contains(@href, "image")]' data='<a href="image5.html">Name: My image 5 <'>]

In [38]:
# .getall() on element selectors returns each matched element serialised as an HTML string.
links.getall()

['<a href="image1.html">Name: My image 1 <br><img src="image1_thumb.jpg"></a>',
 '<a href="image2.html">Name: My image 2 <br><img src="image2_thumb.jpg"></a>',
 '<a href="image3.html">Name: My image 3 <br><img src="image3_thumb.jpg"></a>',
 '<a href="image4.html">Name: My image 4 <br><img src="image4_thumb.jpg"></a>',
 '<a href="image5.html">Name: My image 5 <br><img src="image5_thumb.jpg"></a>']

In [39]:
# Selectors nest: each item of `links` is itself a Selector, so relative XPath
# queries ('@href', 'img/@src') are evaluated against that single <a> element.
for position, anchor in enumerate(links):
    target_url = anchor.xpath('@href').get()
    thumbnail = anchor.xpath('img/@src').get()
    print('Link number %d points to url %r and image %r' % (position, target_url, thumbnail))

Link number 0 points to url 'image1.html' and image 'image1_thumb.jpg'
Link number 1 points to url 'image2.html' and image 'image2_thumb.jpg'
Link number 2 points to url 'image3.html' and image 'image3_thumb.jpg'
Link number 3 points to url 'image4.html' and image 'image4_thumb.jpg'
Link number 4 points to url 'image5.html' and image 'image5_thumb.jpg'


### 1.5 Selecting element attributes

In [40]:
# Attribute values via XPath: @href on every <a> element.
response.xpath("//a/@href").getall()

['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']

In [41]:
# Same list obtained through the CSS ::attr() extension.
response.css('a::attr(href)').getall()

['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']

In [42]:
# Selecting the elements themselves gives Selector objects whose attributes can be read afterwards.
response.css('a')

[<Selector xpath='descendant-or-self::a' data='<a href="image1.html">Name: My image 1 <'>,
 <Selector xpath='descendant-or-self::a' data='<a href="image2.html">Name: My image 2 <'>,
 <Selector xpath='descendant-or-self::a' data='<a href="image3.html">Name: My image 3 <'>,
 <Selector xpath='descendant-or-self::a' data='<a href="image4.html">Name: My image 4 <'>,
 <Selector xpath='descendant-or-self::a' data='<a href="image5.html">Name: My image 5 <'>]

In [43]:
# Read href from each Selector's .attrib mapping.
[a.attrib['href'] for a in response.css('a')]

['image1.html', 'image2.html', 'image3.html', 'image4.html', 'image5.html']

In [44]:
# Only one <base> element exists in the page, so the result holds a single Selector.
response.css('base')

[<Selector xpath='descendant-or-self::base' data='<base href="http://example.com/">'>]

In [45]:
# .attrib on the result list exposes the first match's attributes as a dict.
response.css('base').attrib

{'href': 'http://example.com/'}

In [46]:
# Index the attribute mapping to get a single attribute value.
response.css('base').attrib['href']

'http://example.com/'

In [47]:
# When no element matches, .attrib is an empty dict rather than an error.
response.css('foo').attrib

{}

### 1.6 Using selectors with regular expressions