In [1]:
from datetime import datetime
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from DSPG_Products import Products #Imports the products to be processed
from DSPG_Cleaner import DataCleaner # This is to handle the cleaning of data
from DSPG_SpiderErrors import DataFormatingError

class FreshThymeSpider(scrapy.Spider):
    """Spider that collects bacon, egg and heirloom tomato listings from Fresh Thyme.

    The crawl has two stages: ``cardsParse`` reads a search-result page and
    follows every product card, and ``itemParse`` reads a single product page,
    runs the values through a ``DataCleaner`` and appends one row to the
    module-level ``baconFrame`` / ``eggFrame`` / ``tomatoFrame`` data frame
    that matches the product type. Those frames must exist at module level
    before the crawl starts.
    """
    name = 'Fresh Thyme Market Spider'

    # Compacted (lower-case, no spaces) store-name fragment -> location fields.
    # Entries are checked in order and the first match wins.
    _STORE_LOCATIONS = (
        ('westdesmoines', (
            ('Address', '2900 University Ave. Suite 240'),
            ('State', 'IA'),
            ('City', 'West Des Moines'),
            ('Zip Code', '50266'),
        )),
        ('davenport', (
            ('Address', '2130 E. Kimberly Rd.'),
            ('State', 'IA'),
            ('City', 'Davenport'),
            ('Zip Code', '52807'),
        )),
    )

    def start_requests(self):
        """Yield one search request per product type and store search URL.

        Each request carries ``meta`` with the product ``type`` ('bacon',
        'egg' or 'tomato') and the originating search ``url`` so that later
        callbacks can tell which product they are handling.
        """
        searches = (
            # Bacon
            ('bacon', (
                'https://ww2.freshthyme.com/sm/planning/rsid/951/results?q=Bacon&take=48&f=Category%3AHot+Dogs%2C+Bacon+%26+Sausage',
                'https://ww2.freshthyme.com/sm/planning/rsid/952/results?q=Bacon&take=48&f=Category%3AHot+Dogs%2C+Bacon+%26+Sausage',
            )),
            # Eggs
            ('egg', (
                'https://ww2.freshthyme.com/sm/planning/rsid/951/results?q=Eggs&take=48&f=Category%3AEggs',
                'https://ww2.freshthyme.com/sm/planning/rsid/952/results?q=Eggs&take=48&f=Category%3AEggs',
            )),
            # Heirloom tomatoes
            ('tomato', (
                'https://ww2.freshthyme.com/sm/planning/rsid/951/results?q=heirloom%20tomatoes',
                'https://ww2.freshthyme.com/sm/planning/rsid/952/results?q=heirloom%20tomatoes',
            )),
            # Add more products here
        )
        for itemType, urls in searches:
            for url in urls:
                yield scrapy.Request(url=url, callback=self.cardsParse,
                                     meta={'type': itemType, 'url': url})

    def cardsParse(self, response):
        """Follow every product card found on a search-result page.

        The store name shown in the page header is forwarded to ``itemParse``
        through ``meta`` together with the product type and search URL.
        An ``AttributeError`` raised while reading the page is treated as a
        bad link: the page is skipped, but the problem is logged instead of
        being silently discarded.
        """
        try:
            # Store location as displayed in the page header (None if absent).
            store = response.xpath('//*[contains(@class,"HeaderSubtitle")]/text()').extract_first()
            itemType = response.meta.get('type')
            searchUrl = response.meta.get('url')
            # Every card in the listing links to a product detail page.
            for link in response.xpath('//*[contains(@class,"Listing")]/div/a/@href'):
                yield response.follow(url=link, callback=self.itemParse,
                                      meta={'store': store, 'type': itemType, 'url': searchUrl})
        except AttributeError as err:
            self.logger.warning('Skipping listing page %s: %s', response.url, err)

    def itemParse(self, response):
        """Extract one product page and append it to the matching data frame.

        Products that are not wanted (non-bacon results, cooked/boiled eggs,
        egg listings whose name does not end in an "each" amount) are skipped.
        Bacon and egg pages with no product title are logged and skipped,
        because filtering needs the name.

        Raises:
            DataFormatingError: if ``meta['type']`` is not a known product type.
        """
        name = response.xpath('//*[contains(@class, "PdpInfoTitle")]/text()').extract_first()
        price = response.xpath('//*[contains(@class, "PdpMainPrice")]/text()').extract_first()
        sale = response.xpath('//*[contains(@class, "PdpPreviousPrice")]/text()').extract_first()
        url = response.meta.get('url')
        itemType = response.meta.get('type')
        clean = DataCleaner()

        compactName = None
        if itemType in ('bacon', 'egg'):
            # extract_first() gives None when the title is missing; the name
            # filters below cannot work without it.
            if name is None:
                self.logger.warning('No product name found on %s; skipping', response.url)
                return
            compactName = name.lower().replace(' ', '')

        if itemType == 'bacon':
            # We only want bacon
            if 'bacon' not in compactName:
                return
            clean.LoadDataSet(0, url)
            self._setListingFields(clean, name, price, sale)
            clean.baconModifications()
        elif itemType == 'egg':
            # The amount is expected after the last '-' of the product name.
            checkString = compactName.split('-')[-1]
            # We only want eggs
            if 'each' not in checkString or 'cooked' in compactName or 'boiled' in compactName:
                return
            clean.LoadDataSet(1, url)
            if '1each' in checkString or '12each' in checkString:
                clean.Data['True Amount'] = '1 dz'
                clean.Data['Amount in dz'] = 1.0
            elif '1.5each' in checkString:
                clean.Data['True Amount'] = '1.5 dz'
                clean.Data['Amount in dz'] = 1.5
            self._setListingFields(clean, name, price, sale)
        elif itemType == 'tomato':
            clean.LoadDataSet(2, url)
            self._setListingFields(clean, name, price, sale)
            clean.heirloomTomatoesModifications(None)
        # Add more products here
        else:
            raise DataFormatingError(itemType)

        clean = self.setLocationalData(clean, response.meta.get('store'))
        clean.determineLocality()
        clean.cleanPricing()

        # The frames are module-level globals; rows are appended by position,
        # so the order of clean.Data must match the frame's column order.
        if itemType == 'bacon':
            frame = baconFrame
        elif itemType == 'egg':
            frame = eggFrame
        elif itemType == 'tomato':
            frame = tomatoFrame
        # Add more products here
        else:
            raise DataFormatingError(itemType)
        frame.loc[len(frame)] = list(clean.Data.values())

    @staticmethod
    def _setListingFields(clean, name, price, sale):
        """Store the scraped name, current price and previous price on the cleaner."""
        clean.Data['Product Type'] = name
        clean.Data['Current Price'] = price
        clean.Data['Orignal Price'] = sale

    def setLocationalData(self, clean, storeLocation):
        """Fill in the address fields of ``clean`` for a recognised store.

        Args:
            clean: cleaner object whose ``Data`` mapping receives the fields.
            storeLocation: store name as scraped from the page header; may be
                ``None`` when the header was not found.

        Returns:
            The same ``clean`` object. It is left untouched when the store is
            missing or not one of the known locations.
        """
        if storeLocation is None:
            self.logger.warning('Store location missing; address fields left unset')
            return clean
        store = storeLocation.lower().replace(' ', '')
        for fragment, fields in self._STORE_LOCATIONS:
            if fragment in store:
                for key, value in fields:
                    clean.Data[key] = value
                break
        return clean


In [None]:
#Start
#Setup
#Data frames
# These frames are module-level on purpose: FreshThymeSpider.itemParse appends
# one row per scraped product to them by name, with values matched to columns
# by position.
baconFrame = pd.DataFrame(columns=['Bacon', 'Current Price', 'Orignal Price', 'Weight in lbs', 'True Weight', 'Brand', 'Local', 'Address', 'State', 'City', 'Zip Code', 'Date Collected', 'Url']) #Bacon Frame
eggFrame = pd.DataFrame(columns=['Eggs', 'Current Price', 'Orignal Price', 'Amount in dz', 'True Amount', 'Brand', 'Local', 'Address', 'State', 'City', 'Zip Code', 'Date Collected', 'Url']) #Egg Frame
tomatoFrame = pd.DataFrame(columns=['Heirloom Tomatoes', 'Current Price', 'Orignal Price', 'Weight in lbs', 'True Weight', 'Brand', 'Organic', 'Local', 'Address', 'State', 'City', 'Zip Code', 'Date Collected', 'Url']) #Heirloom Tomato Frame

#DEBUG Switch
DEBUG = 0
if DEBUG:
    #To see the inner mechanics of the spider
    configure_logging()

#This is to start the spider
process = CrawlerProcess()
process.crawl(FreshThymeSpider)
process.start()
# NOTE(review): start() appears to block until the crawl has finished, so this
# stop() is presumably a no-op - confirm before removing.
process.stop()

#Adds the date that the data was scraped
# Read the clock once so year, month and day always come from the same instant.
# The trailing space separates the date from the rest of the file name.
currentDate = datetime.today().date().isoformat() + " "
#To CSV files
baconFrame.to_csv(currentDate + "Fresh Thyme Bacon.csv", index=False)
eggFrame.to_csv(currentDate + "Fresh Thyme Eggs.csv", index=False)
tomatoFrame.to_csv(currentDate + "Fresh Thyme Heirloom Tomatoes.csv", index=False)
