In [1]:
# Try to use code from THINKFUL examples to scrape some data from amazon.
import scrapy
import re
from scrapy.crawler import CrawlerProcess

class AmazonScraper(scrapy.Spider):
    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "AmazonScraper"
    
    # URL(s) to start with.
    start_urls = [
        'https://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dgarden&field-keywords=blender',
    ]

    # Use XPath to parse the response we get.
    def parse(self, response):
        
        # Iterate over every <li> element on the page.
        # Each li contains the product container for each result.
        # // is a shortcut for finding all li regardless of parent level
        for article in response.xpath('//li'):
            
            # Yield a dictionary with the values we want.
            # attempting to get the top results for blenders - 
            # we want blender name, result id, rank, ASIN, and would like link
            yield {
                'name': article.xpath('a/h2/@data-attribute').extract_first(),
                'result_id': article.xpath('@id').extract_first(),
                'ASIN': article.xpath('@data-asin').extract_first(),
                'rank': article.xpath('@data-result-rank').extract_first()
            }
        # Get the URL of the previous page.
        #next_page = response.xpath('//div[@class="nav-previous"]/a/@href').extract_first()
        
        # There are a LOT of pages here.  For our example, we'll just scrape the first 9.
        # This finds the page number. The next segment of code prevents us from going beyond page 9.
        #pagenum = int(re.findall(r'\d+',next_page)[0])
        
        # Recursively call the spider to run on the next page, if it exists.
        #if next_page is not None and pagenum < 10:
            #next_page = response.urljoin(next_page)
            # Request the next page and recursively parse it the same way we did above
            #yield scrapy.Request(next_page, callback=self.parse)

# Tell the script how to run the crawler by passing in settings.
# The new settings have to do with scraping etiquette.          
process = CrawlerProcess({
    'FEED_FORMAT': 'json',          # Store data in JSON format.
    'FEED_URI': 'blender_amazon.json',       # Name our storage file. Doesn't resave over old file? - 
                                             # have to delete and rerun to save new
    'LOG_ENABLED': False,          # Turn off logging for now.
    'ROBOTSTXT_OBEY': True,
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'USER-AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
})

# Start the crawler with our spider.
process.crawl(AmazonScraper)
process.start()
print('Success!')

Success!


In [4]:
import pandas as pd

# Checking whether we got data
Amazondf=pd.read_json('blender_amazon.json', orient='records')
print(Amazondf.shape)
print(Amazondf.head(31))
print(Amazondf.tail())

(186, 4)
          ASIN  name  rank                         result_id
0         None   NaN   NaN                              None
1   B00NGV4506   NaN   0.0                          result_0
2   B00EI7DPI0   NaN   1.0                          result_1
3   B007TIE0GQ   NaN   2.0                          result_2
4   B017TZ9SME   NaN   3.0                          result_3
5   B012T634SM   NaN   4.0                          result_4
6   B0019MLLCO   NaN   5.0                          result_5
7   B00939FV8K   NaN   6.0                          result_6
8   B07CX95VRT   NaN   7.0                          result_7
9   B07BS3H5VF   NaN   8.0                          result_8
10  B0081PTLGU   NaN   9.0                          result_9
11  B008H4SLV6   NaN  10.0                         result_10
12  B00Y2U1QUM   NaN  11.0                         result_11
13  B0764BD7WV   NaN  12.0                         result_12
14  B000GIGZXM   NaN  13.0                         result_13
15  B00DBQ1AIG 

- Should be over 1K results - maybe only got first 184 because only the first 31 blenders (sorted by featured) are shown on the first page.

- Also the ASIN doesn't seem to be working correctly.  The 22 result on the amazon page corresponds to the 15th index in the JSON file.