In [8]:
from datetime import datetime
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging

class FreshThymeSpider(scrapy.Spider):
    """Scrape bacon, egg and heirloom-tomato product pages from Fresh Thyme.

    The crawl has two hops: ``start_requests`` fetches the search-result
    pages, ``cardsParse`` follows every product card found on them, and
    ``itemParse`` appends one row per product to the module-level
    ``baconFrame`` / ``eggFrame`` / ``tomatoFrame`` data frames, which must
    exist by the time the crawl runs.
    """

    name = 'Fresh Thyme Market Spider'

    # Search-result pages to crawl, keyed by the product-type tag that is
    # carried through ``meta`` and later selects the destination data frame.
    # The two URLs per type differ only in the ``rsid`` path segment.
    search_urls = {
        'bacon': [
            'https://ww2.freshthyme.com/sm/planning/rsid/951/results?q=Bacon&take=48&f=Category%3AHot+Dogs%2C+Bacon+%26+Sausage',
            'https://ww2.freshthyme.com/sm/planning/rsid/952/results?q=Bacon&take=48&f=Category%3AHot+Dogs%2C+Bacon+%26+Sausage',
        ],
        'egg': [
            'https://ww2.freshthyme.com/sm/planning/rsid/951/results?q=Eggs&take=48&f=Category%3AEggs',
            'https://ww2.freshthyme.com/sm/planning/rsid/952/results?q=Eggs&take=48&f=Category%3AEggs',
        ],
        'tomato': [
            'https://ww2.freshthyme.com/sm/planning/rsid/951/results?q=heirloom%20tomatoes',
            'https://ww2.freshthyme.com/sm/planning/rsid/952/results?q=heirloom%20tomatoes',
        ],
    }

    def start_requests(self):
        """Yield one request per search-result URL, tagged with its type."""
        for item_type, urls in self.search_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    callback=self.cardsParse,
                    meta={'type': item_type, 'url': url},
                )

    def cardsParse(self, response):
        """Follow every product card linked from a search-result page.

        The store subtitle is read once from the listing page and passed on
        to ``itemParse`` together with the product type and the originating
        search URL.
        """
        try:
            # Store location shown in the page header.
            store = response.xpath(
                '//*[contains(@class,"HeaderSubtitle")]/text()'
            ).extract_first()
            # One href per product card in the listing.
            card_links = response.xpath(
                '//*[contains(@class,"Listing")]/div/a/@href'
            )
            for link in card_links:
                yield response.follow(
                    url=link,
                    callback=self.itemParse,
                    meta={
                        'store': store,
                        'type': response.meta.get('type'),
                        'url': response.meta.get('url'),
                    },
                )
        except AttributeError as exc:
            # Best-effort failsafe: skip the page, but leave a trace in the
            # log instead of dropping it silently.
            self.logger.warning(
                'Skipping listing page %s: %s', response.url, exc
            )

    def itemParse(self, response):
        """Extract one product's details and append them to its data frame.

        The row layout matches the columns of the target frame: the bacon
        frame has a unit-price column, the egg and tomato frames do not.
        """
        # Each variable is named after the CSS class it actually selects;
        # previously the unit-price and previous-price selectors were
        # crossed, so the 'Unit Price' and 'Sale' columns received each
        # other's values.
        nameXpath = '//*[contains(@class, "PdpInfoTitle")]/text()'
        priceXpath = '//*[contains(@class, "PdpMainPrice")]/text()'
        unitPriceXpath = '//*[contains(@class, "PdpUnitPrice")]/text()'
        prevPriceXpath = '//*[contains(@class, "PdpPreviousPrice")]/text()'

        product_name = response.xpath(nameXpath).extract_first()
        current_price = response.xpath(priceXpath).extract_first()
        previous_price = response.xpath(prevPriceXpath).extract_first()
        store = response.meta.get('store')
        source_url = response.meta.get('url')

        itemType = response.meta.get('type')
        if itemType == "bacon":
            unit_price = response.xpath(unitPriceXpath).extract_first()
            baconFrame.loc[len(baconFrame)] = [
                product_name, current_price, unit_price, previous_price,
                store, source_url,
            ]
        elif itemType == "egg":
            eggFrame.loc[len(eggFrame)] = [
                product_name, current_price, previous_price,
                store, source_url,
            ]
        elif itemType == "tomato":
            tomatoFrame.loc[len(tomatoFrame)] = [
                product_name, current_price, previous_price,
                store, source_url,
            ]
        else:
            # No frame exists for this tag; record it rather than dropping
            # the page without a trace.
            self.logger.warning(
                'Ignoring %s: unknown item type %r', response.url, itemType
            )

# Start
# DEBUG switch: any truthy value calls configure_logging() before the crawl
# and prints the scraped frames afterwards.
# NOTE(review): CrawlerProcess appears to set up logging by itself as well,
# so Scrapy log lines may show up even with DEBUG = 0 -- confirm.
DEBUG = 0

# Data frames filled row by row by FreshThymeSpider.itemParse. The column
# order must match the row lists built there.
baconFrame = pd.DataFrame(columns=['Bacon', 'Current Price', 'Unit Price', 'Sale', 'Store Location', 'Url'])
eggFrame = pd.DataFrame(columns=['Egg', 'Current Price', 'Sale', 'Store Location', 'Url'])
tomatoFrame = pd.DataFrame(columns=['Heirloom Tomato', 'Current Price', 'Sale', 'Store Location', 'Url'])

if DEBUG:
    # To see the inner mechanics of the spider
    configure_logging()

# Run the spider. start() blocks until the crawl has finished, so no
# explicit stop() call is needed afterwards.
process = CrawlerProcess()
process.crawl(FreshThymeSpider)
process.start()

if DEBUG:
    # To see the outputs
    print(baconFrame)
    print(eggFrame)
    print(tomatoFrame)

# Date prefix for the output files, e.g. "2023-06-05 " (the trailing space
# separates it from the rest of the file name). It is taken from a single
# clock read so the parts cannot disagree around midnight.
currentDate = datetime.today().strftime("%Y-%m-%d ")
# To CSV files
baconFrame.to_csv(currentDate + "Fresh Thyme Bacon.csv")
eggFrame.to_csv(currentDate + "Fresh Thyme Egg.csv")
tomatoFrame.to_csv(currentDate + "Fresh Thyme Heirloom Tomatoes.csv")


2023-06-05 13:06:18 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: scrapybot)
2023-06-05 13:06:18 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.2.0, Python 3.10.9 | packaged by conda-forge | (main, Jan 11 2023, 15:15:40) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1t  7 Feb 2023), cryptography 37.0.1, Platform Windows-10-10.0.19044-SP0
2023-06-05 13:06:18 [scrapy.crawler] INFO: Overridden settings:
{}
2023-06-05 13:06:18 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-06-05 13:06:18 [scrapy.extensions.telnet] INFO: Telnet Password: 9f0e03d97dd27433
2023-06-05 13:06:18 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2023-06-05 13:06:20 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpaut