In [3]:
#Imports
from datetime import datetime
import pandas as pd
from enum import Enum
#Imports for Scraping
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from os import path
import time
import sys


#Creator's Note: Products(Enum) and ProductsLoader are probably the only classes you need to edit,
#unless you need to change the way the data is cleaned, which is handled in the DataCleaner class.

#This class is here so that we can expand to different products more easily, making the spider more dynamic and expandable.
class Products(Enum):
    """Products the spider knows how to scrape.

    Every member's value is a tuple ``(id, urls, xpaths)``: an integer id
    followed by two lists that start empty and are filled through
    :meth:`addToProduct`.
    """
    #Add products like this ProductName = index iteration, [], []
    #the 2 empty lists are filled in afterwards through addToProduct
    Bacon = 0, [], []
    Eggs = 1, [], []
    HeirloomTomatoes = 2, [], []

    def addToProduct(self, items, index):
        """Append ``items`` to one of this product's lists, skipping duplicates.

        Args:
            items: Iterable of hashable entries (url strings or xpath tuples).
            index: Which list to extend. Either a tuple position that holds a
                list (``1`` for urls, ``2`` for xpaths) or one of the
                case-insensitive names ``'url'``, ``'urls'``, ``'xpath'``,
                ``'xpaths'``.

        Raises:
            ValueError: If ``index`` does not select one of the two lists.
        """
        product = None
        if isinstance(index, str):
            key = index.lower()
            if key in ('urls', 'url'):
                product = self.value[1]
            elif key in ('xpaths', 'xpath'):
                product = self.value[2]
        elif isinstance(index, int):
            # Position 0 holds the numeric id, not a list, and positions past
            # the tuple do not exist; both are reported as an invalid index
            # instead of surfacing as TypeError / IndexError.
            try:
                candidate = self.value[index]
            except IndexError:
                candidate = None
            if isinstance(candidate, list):
                product = candidate
        if product is None:
            raise ValueError(f"Invalid index input for ({index}) for input: {items}")
        #Sets are fast at finding dups so we use them for speed
        product_set = set(product)
        for item in items:
            if item not in product_set:
                product.append(item)
                product_set.add(item)

#This class loads the xpaths and urls to the Products Enum and adds dataframes to the spider
class ProductsLoader():
    DataFrames = []
    storeXpaths = []
    def __init__(self):
        """Run every loading step: data frames, store xpaths, urls, then product xpaths."""
        loadingSteps = (
            self.dataFrameAdder,
            self.setStoreXpaths,
            self.urlsAdder,
            self.xpathMaker,
        )
        for step in loadingSteps:
            step()

    #This adds the dataframe to the spider on load
    def dataFrameAdder(self):
        #Dataframes (You can add more here)
        baconFrame = pd.DataFrame(columns=['Bacon', 'Current Price', 'Weight' ,'Sale', 'Store Location', 'Url'])
        eggFrame = pd.DataFrame(columns=['Egg', 'Current Price', 'Weight' ,'Sale', 'Store Location', 'Url'])
        tomatoFrame = pd.DataFrame(columns=['Heirloom Tomato', 'Current Price', 'Weight' ,'Sale', 'Store Location', 'Url'])
        self.DataFrames = [baconFrame,
                           eggFrame,
                           tomatoFrame
                          ]

    def setStoreXpaths(self):
        CoralvilleButtonXpath = '//*[@id="store-locator"]//*[contains(@data-store-id,"2843") and contains(@class,"fp-btn fp-btn-default fp-btn-mystore ") and contains(@role,"button")]'
        IowaCityButtonXpath = '//*[@id="store-locator"]//*[contains(@data-store-id,"2844") and contains(@class,"fp-btn fp-btn-default fp-btn-mystore ") and contains(@role,"button")]'
        CedarRapidsButtonXpath = '//*[@id="store-locator"]//*[contains(@data-store-id,"2845") and contains(@class,"fp-btn fp-btn-default fp-btn-mystore ") and contains(@role,"button")]'
        self.storeXpaths = [CedarRapidsButtonXpath,
                            IowaCityButtonXpath,
                            CoralvilleButtonXpath
                            ]

    #Adding Urls to products
    def urlsAdder(self):
        """Register the product page urls of every product on the Products enum."""
        baconLinks = [
            'https://shop.newpi.coop/shop/meat/bacon/sliced/applegate_natural_hickory_smoked_uncured_sunday_bacon_8_oz/p/19959#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/bacon/beeler_hickory_smoked_bacon/p/7703726#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/bacon/beeler_bacon_ends_and_pieces/p/1564405684703446698#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/bacon/beeler_hickory_smoked_bacon/p/7791059#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/bacon/garrett_valley_pork_bacon_classic_dry_rubbed_uncured/p/7703238#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/bacon/turkey/garrett_valley_turkey_bacon_sugar_free_paleo/p/7703237#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/bacon/beeler_pepper_bacon/p/1564405684702577823#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/bacon/turkey/plainville_farms_turkey_bacon_uncured/p/4750634#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/bacon/garrett_valley_pork_bacon_8_oz/p/6572556#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/refrigerated/meat_alternatives/herbivorous_butcher_hickory_maple_bacon/p/1564405684704334152#!/?department_id=1322093',
            'https://shop.newpi.coop/shop/meat/pork/new_pi_bulk_bacon/p/1564405684704337543#!/?department_id=1322171',
            'https://shop.newpi.coop/shop/meat/pork/niman_ranch_uncured_bacon_12_oz/p/7276#!/?department_id=1322171',
        ]
        eggLinks = [
            'https://shop.newpi.coop/shop/refrigerated/eggs/cage_free/farmers_hen_house_eggs_grade_a_free_range_large_brown_12_ea/p/7110637',
            'https://shop.newpi.coop/shop/refrigerated/eggs/cage_free/farmers_hen_house_eggs_white_cage_free_large/p/7110638',
            'https://shop.newpi.coop/shop/refrigerated/eggs/organic/farmers_hen_house_eggs_large_brown_free_range/p/7613595',
            'https://shop.newpi.coop/shop/refrigerated/eggs/organic/vital_farms_eggs_organic_pasture_raised_large_12_ea/p/5637123',
            'https://shop.newpi.coop/shop/refrigerated/eggs/hotz_eggs_dozen_xtra_large/p/1564405684714084840',
            'https://shop.newpi.coop/shop/refrigerated/eggs/cage_free/farmers_hen_house_eggs_jumbo_brown/p/7613596',
            'https://shop.newpi.coop/shop/refrigerated/eggs/cage_free/cedar_ridge_farm_grade_a_large_eggs/p/1564405684704338616',
            'https://shop.newpi.coop/shop/refrigerated/eggs/organic/farmers_hen_house_eggs_og_pasture_lrg_brwn/p/7613597',
            'https://shop.newpi.coop/shop/refrigerated/eggs/organic/farmers_hen_house_eggs_large_brown_free_range/p/1564405684703497142',
            'https://shop.newpi.coop/shop/refrigerated/eggs/organic/vital_farms_large_pasture_raised_eggs_12_ea/p/5323128',
            'https://shop.newpi.coop/shop/refrigerated/eggs/vital_farms_eggs_pasture_raised_large_18_ea/p/1564405684690018196',
            'https://shop.newpi.coop/shop/refrigerated/eggs/organic/organic_valley_free_range_brown_large_eggs_12_ea/p/48765',
            'https://shop.newpi.coop/shop/refrigerated/eggs/cage_free/cedar_ridge_farm_grade_a_extra_large_eggs/p/1564405684704338617',
            'https://shop.newpi.coop/shop/refrigerated/eggs/steinecke_family_farm_duck_eggs/p/1564405684711593802',
            'https://shop.newpi.coop/shop/refrigerated/eggs/cosgrove_rd_farm_eggs_pasture_raised/p/1564405684710338102',
            'https://shop.newpi.coop/shop/refrigerated/eggs/cage_free/cedar_ridge_farm_grade_a_jumbo_eggs/p/1564405684704338619',
            'https://shop.newpi.coop/shop/refrigerated/eggs/hotz_eggs_large/p/1564405684704684702',
            'https://shop.newpi.coop/shop/refrigerated/eggs/hotz_eggs_medium/p/1564405684713746940',
        ]
        tomatoLinks = [
            'https://shop.newpi.coop/shop/produce/fresh_vegetables/tomatoes/heirloom_tomatoes/p/2311736',
        ]

        # Same registration order as the enum: Bacon, Eggs, HeirloomTomatoes.
        registrations = (
            (Products.Bacon, baconLinks),
            (Products.Eggs, eggLinks),
            (Products.HeirloomTomatoes, tomatoLinks),
        )
        for product, links in registrations:
            product.addToProduct(links, 'urls')

    #This handles the xpaths by adding them to the Products class.
    #Most websites have similar xpaths for each item. You might need to make different xpaths for each item
    #if that is not the case.
    #When assigning xpaths, mark whether they are optional, meaning the element may or may not be present on the page.
    #We do this for speed: if you mark an xpath as non-optional and it is not present, the item will be skipped,
    #which hurts performance.
    #Best practice is to list the optional xpaths last, which reduces the chances of skipping.
    #Note: special cases do happen but they are extremely rare. A good way of finding one
    #is by using the skipHandler method and tracking/watching the logs.
    #IMPORTANT < -!- NOT ALL XPATHS ARE THE SAME FOR EACH PRODUCT -!->
    def xpathMaker(self):
        """Register the field xpaths (name, price, weight, sale) on every product.

        Each entry is ``(xpath, optional)`` or ``(xpath, optional, special)``.
        """
        #Every field lives under the product detail container
        detailRoot = '//*[@id="products"]//*[contains(@class,"fp-item-detail")]'
        priceRoot = detailRoot + '//*[contains(@class,"fp-item-price")]'

        #Add the xpaths here and mark if they are optional
        nameXpath = detailRoot + '//*[contains(@class,"fp-item-name")]' #special because the store can be not carrying the product
        priceXpath = priceRoot + '//span[contains(@class,"fp-item-base-price")]'
        weightXpath = priceRoot + '//span[contains(@class,"fp-item-size")]'
        saleXpath = detailRoot + '//*[contains(@class,"fp-item-sale")]//span[contains(@class,"fp-item-sale-price")]' # optional

        #xpath, Optional, special
        xpathList = [
            (nameXpath, False, True),
            (priceXpath, False),
            (weightXpath, False),
            (saleXpath, True),
        ]

        for product in (Products.Bacon, Products.Eggs, Products.HeirloomTomatoes):
            product.addToProduct(xpathList, 'xpath')


class DataCleaner():
    """Finishes a scraped row by tagging it with the current store and its url."""
    DataArray = []
    storeLocation = ''

    def cleanUp(self, item, url):
        """Append the store location and ``url`` to ``item`` in place and return it."""
        self.DataArray = item
        for extra in (self.storeLocation, url):
            self.DataArray.append(extra)
        return self.DataArray

    def setStore(self, storeIndex):
        """Remember the address for ``storeIndex``; unknown indexes give ``None``."""
        addresses = (
            "3338 Center Point Road Northeast Cedar Rapids, IA 52402",
            "22 South Van Buren Street Iowa City, IA 52240",
            "1101 2nd St Coralville, IA 52241",
        )
        self.storeLocation = dict(enumerate(addresses)).get(storeIndex)

    

class NewPioneerSpider():
    name = "New Pioneer Co-op"  #The store name 
    spiderLogs = []         #The logs 
    skipped = []            #Skipped data 

    #These are methods that are available for your convences
    def log(self, *args):
        self.spiderLogs.append(('Logger:', args))
        if self.LOGGER:
            print('Logger:', *args)

    def debug(self, *args):
        self.spiderLogs.append(('Debug:', args))
        if self.DEBUGGER:
            print('Debug:', *args)
    
    def printer(self, *args):
        self.spiderLogs.append(('Printer:', args))
        print(*args)
    
    def printLogs(self):
        print("\n< --- Printing Logs --- >\n")
        for entry in self.spiderLogs:
            print(*entry)

    def Logs_to_file(self, filename):
        with open(filename, 'w') as file:
            for log_entry in self.spiderLogs:
                file.write('{} {}\n'.format(log_entry[0], log_entry[1]))
    
    def __init__(self):
        """Set the default switches and counters, load the products and start Firefox."""
        # Per-instance containers. The class-level lists of the same name are
        # shared by every instance, so without these a second spider (or a
        # re-run notebook cell) would inherit the previous run's logs and skips.
        self.spiderLogs = [] #The logs of this spider
        self.skipped = [] #Skipped data of this spider
        self.DEBUGGER = False #The debugger switch to see whats going on. The Default is False
        self.LOGGER = False #When you need to see everything that happends. The Default is False
        self.attempts = 3 #The number of attempts the spider can retry if an error occurs. Default is 3
        self.waitTime = 10 #The number of seconds WebDriver will wait. Default is 10
        self.count = 0 #This saves the location of the url we are going through
        self.runTime = 0 #Total time of extractions
        self.totalRecoveries = 0 #Number of recoveries made while running
        self.maxRetryCount = 100 #Number of retrys the javascript can make Defualt is 100
        self.cleaner = DataCleaner() #Loads the cleaner
        self.load = ProductsLoader() #Loads all products
        #Selenium needs a webdriver to work. I chose Firefox however you can do another if you need too
        self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install(), log_path=path.devnull))
        self.log("Driver started")
    
    #This handles the restart in case we run into an error
    def restart(self):
        """Replace the current browser with a fresh one and re-select the store.

        Closing the old driver is best effort: this method is called after
        failures, so the browser may already be gone and ``quit()`` may raise.
        That must not prevent the new driver from starting.
        """
        try:
            self.driver.quit()
        except WebDriverException as error:
            self.debug("Old driver could not be closed cleanly:", error)
        self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install(), log_path=path.devnull))
        self.log("Driver restarted")
        self.setStoreLocation()
    
    #Some stores need to have a location set
    def setStoreLocation(self):
        """Open the store locator and click the button of the store at ``self.storeIndex``."""
        self.driver.get('https://shop.newpi.coop/my-store/store-locator')
        time.sleep(5) #Wait for the page to set
        buttonXpath = self.load.storeXpaths[self.storeIndex]
        waiter = WebDriverWait(
            self.driver,
            self.waitTime,
            ignored_exceptions=(NoSuchElementException, StaleElementReferenceException),
        )
        buttons = waiter.until(EC.presence_of_all_elements_located((By.XPATH, buttonXpath)))
        # Several elements may match; the first one is the one clicked.
        buttons[0].click()
        time.sleep(5) #Wait for the page to set
        self.log("Store location set")


    #This starts the spider
    def start_requests(self):
        """Scrape every product at every store, export the results and close the browser.

        For each store xpath the store is selected and every ``Products``
        member is scraped. Afterwards each data frame is written to
        ``<date> <spider name> <product>.csv``, the logs are written to a text
        file and, if rows were skipped, ``skipHandler`` is run. The browser is
        closed even when the run fails part-way.
        """
        self.runTime = time.time()
        self.log("Loading from ProductsLoader Class")
        self.dataFrames = self.load.DataFrames #Adds all dataframes
        self.debug("Products Loaded and Data Frames Added")
        self.debug('\n < --- Setup runtime is %s seconds --- >' % (time.time() - self.runTime))
        self.totalRecoveries = 0
        try:
            #This sweeps through every inputed store
            for index in range(len(self.load.storeXpaths)):
                self.storeIndex = index
                self.cleaner.setStore(self.storeIndex)
                self.setStoreLocation()
                #Sweeps through all products
                for product in Products:
                    self.requestExtraction(product)
                self.debug("New store location data added")

            #Adds the date that the data was scraped. One clock read, formatted
            #as 'YYYY-MM-DD ' (the trailing space separates it from the name).
            currentDate = datetime.today().strftime('%Y-%m-%d ')
            self.log("Exporting files")
            #Dataframes to CSV files
            for df, product in zip(self.dataFrames, Products):
                df.to_csv(currentDate + self.name +" " + product.name + ".csv")
                self.log('\n', df.to_string())
            self.debug('\n < --- Total runtime took %s seconds with %d recoveries --- >' % (time.time() - self.runTime, self.totalRecoveries))
            if self.skipped:
                self.debug('\n < -!- WARNING SKIPPED (' + str(len(self.skipped)) + ') DATA FOUND --->')
            self.Logs_to_file(currentDate + self.name + ' Spider Logs.txt')
            if self.skipped:
                self.debug(self.skipped)
                self.skipHandler(currentDate)
        finally:
            # Never leave a Firefox process behind, even if the run raised.
            self.driver.quit()

    #This handles the extraction request for the inputed product 
    def requestExtraction(self, product):
        self.count = 0
        errors = 0
        start = time.time()
        self.debug("Starting "+ product.name)    
        for trying in range(self.attempts):
            try:
                self.makeRequest(product)
                self.debug(product.name + " Finished")    
                self.log('\n< --- ' + product.name + ' scrape took %s seconds with %d recoveries --- >\n' % ((time.time() - start), errors))
                self.totalRecoveries += errors
                return self.totalRecoveries
            except Exception as e:
                #Note sometimes the browser will closed unexpectedly and theres not we can do but restart the driver
                errors += 1
                self.debug("An error occurred:", e)
                self.debug("Recovering extraction and continueing")
                self.restart() 
        self.debug(product.name + " Did not Finished after " + str(self.attempts) + " Time wasted: %s seconds" % (time.time() - start))
        self.totalRecoveries += errors
        return self.totalRecoveries

    #This handles the reqests for each url and adds the data to the dataframe
    def makeRequest(self, product):
        productUrls = product.value[1]
        total = len(productUrls)
        while self.count < total:
            url = productUrls[self.count]
            self.driver.get(url)
            self.log("Making a request for: ", url)
            item = []
            time.sleep(1) # marionette Error Fix
            breakout = False
            for xpath in product.value[2]:
                #Retrying the xpath given the number of attempts
                for attempt in range(self.attempts):
                    data = self.javascriptXpath(xpath[0])
                    if data in {'empty', 'skip'}:
                        #speical case
                        if len(xpath) == 3:
                            #the first attempt shouldnt go through in case it when through to fast
                            #this will slow the fuction down however Accuracy > Speed 
                            if xpath[2]:
                                if attempt == 0:
                                    self.debug("Found a missing item in store. Double checking")
                                    continue
                                #example would be when there is actually is a '' in the xpath
                                self.debug("xpath marked as speical")
                                notFoundXpath = '//*[@id="products"]//*[contains(@class,"fp-text-center fp-not-found")]//*[contains(@class,"fp-text-center")]'
                                data = self.javascriptXpath(notFoundXpath)
                                if data in {'empty', 'skip'}:
                                    self.debug("Missing item retrying")
                                else:
                                    self.debug("An Item not in stock for: ", url) 
                                    item.append(data)
                                    df = self.dataFrames[product.value[0]]
                                    num = len(df.columns) - len(item) % len(df.columns)
                                    item += ["None in stock"] * (num - 2)
                                    breakout = True
                                    break
                        if xpath[1] and data == 'empty':    
                            #this is where setting the xpath to optional comes in
                            self.debug("xpath wasnt avaliable")
                            item.append(None)
                            break
                        self.debug("Missing item retrying")
                    else:  #Data found
                        item.append(data)
                        self.log(data + ' was added to the list for: ', url)
                        break
                if breakout:
                    break
                if attempt == self.attempts:
                    data = 'skip'
                if data == 'skip':  #To help clean the data we skip the item with gaps of data 
                    self.debug("An Item has been skipped for: ", url)  
                    item = ['SKIPPED']
                    #Taking the product name  dataframe number and index added as well as the url 
                    #to retry for later 
                    #This could take time to do so we do this at the very end after we made the cvs files
                    self.skipped.append([product, self.count, url])
                    break
            if 'SKIPPED' in item:
                #No point in cleaning skipped items
                items = ['SKIPPED']*(self.dataFrames[product.value[0]].shape[1] - 1)
                items.append(url)
            else:
                #We call the DataCleaner class to handle the cleaning of the data
                #Its best to clean the data before we add it to the data frame
                self.debug('Data cleaning started: ', item)
                items = self.cleaner.cleanUp(item, url)
                self.debug('Data cleaning finished: ', item)
            self.debug('Extracted: ', items)
            self.dataFrames[product.value[0]].loc[len(self.dataFrames[product.value[0]])] = items                    
            self.count += 1
            self.printer(product.name + " item added ", self.count, " of ", total, ":  ", items)

    #Collecting the data from the xpath in JavaScript is faster and results in fewer errors than doing it in python
    #This is where selenium shines because we can both use JavaScript and render JavaScript websites
    #and is the only reason why we use it instead of scrapy
    def javascriptXpath(self, xpath):
        """Return the trimmed text of the first node matching ``xpath``.

        Waits up to ``self.waitTime`` seconds for a matching element, then
        reads its text through JavaScript. While the text is still one of the
        placeholder outputs (the page is still loading) the read is repeated,
        at most ``self.maxRetryCount`` times.

        Returns:
            The element text, ``'empty'`` when the wait timed out, or
            ``'skip'`` when only placeholder text was seen.
        """
        # if the time expires it assumes xpath wasnt found in the page
        try:
            #Waits for page to load
            ignored_exceptions = (NoSuchElementException, StaleElementReferenceException)
            WebDriverWait(self.driver, self.waitTime, ignored_exceptions=ignored_exceptions).until(
                EC.presence_of_all_elements_located((By.XPATH, xpath)))

            # Runs the javascript and collects the text data from the inputed xpath
            # We want to keep repeating if we get any of these outputs because the page is still
            # loading and we dont want to skip or waste time. (for fast computers)
            # NOTE: 'skip' and "$nan" must be separate entries; written without
            # the comma they merge into the single string 'skip$nan'.
            invalidOutputs = {"error", "skip", "$nan", ""}
            text = ''
            for retrycount in range(self.maxRetryCount):
                text = self.driver.execute_script("""
                    const element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                    if (!element) {
                        return 'skip';
                    }
                    return element.textContent.trim();
                """,
                xpath)
                checkText = text.replace(" ", "").lower()
                if checkText not in invalidOutputs:
                    self.log(retrycount, "xpath attempts for (", text, ")")
                    return text
            self.log("xpath attempts count met. Problematic text (" + text + ") for ", xpath)
            return 'skip'
        except TimeoutException:
            self.log('Could not find xpath for: ', xpath)
            return 'empty'

           

    #This is here to hopefully fix skipped data
    #In the best-case scenario this will never be used
    def skipHandler(self, currentDate):
        corrections = 0
        # skipped format
        # [product name, DataFrame number, DataFrame index, url]
        while len(self.skipped) != 0:
            #each skip 
            for index, dataSkip in enumerate(self.skipped):
                product = dataSkip[0]
                #Limiting the Attempts to fix while avoiding bottlenecking the problem
                for attempt in range(self.attempts*2):
                    product = dataSkip[0]
                    url = dataSkip[2]
                    self.driver.get(url)
                    self.log("Making a request for: ", url)
                    item = []
                    breakout = False
                    for xpath in product.value[2]:
                        for attemptIn in range(self.attempts*2):
                            if data in {'empty', 'skip'}:
                                #speical case
                                if len(xpath) == 3:
                                    #the first attempt shouldnt go through in case it when through to fast
                                    #this will slow the fuction down however Accuracy > Speed 
                                    if xpath[2]:
                                        if attempt == 0:
                                            self.debug("Found a missing item in store. Double checking")
                                            continue
                                        #example would be when there is actually is a '' in the xpath
                                        self.debug("xpath marked as speical")
                                        notFoundXpath = '//*[@id="products"]//*[contains(@class,"fp-text-center fp-not-found")]//*[contains(@class,"fp-text-center")]'
                                        data = self.javascriptXpath(notFoundXpath)
                                        if data in {'empty', 'skip'}:
                                            self.debug("Missing item retrying")
                                        else:
                                            self.debug("An Item not in stock for: ", url) 
                                            item.append(data)
                                            df = self.dataFrames[product.value[0]]
                                            num = len(df.columns) - len(item) % len(df.columns)
                                            item += ["None in stock"] * (num - 2)
                                            breakout = True
                                            break
                                if xpath[1] and data == 'empty':    
                                    #this is where setting the xpath to optional comes in
                                    self.debug("xpath wasnt avaliable")
                                    item.append(None)
                                    break
                                self.debug("Missing item retrying")
                            else:  #Data found
                                item.append(data)
                                self.log(data + ' was added to the list for: ', url)
                                break
                        if breakout:
                            break
                    if breakout:
                        break
                    if attemptIn == self.attempts*2:
                        data = 'skip'
                        break
                if data == 'skip':  #To help clean the data we skip the item with gaps of data 
                    self.debug("Item still missing attempting other skipped for now") 
                else:
                    items = self.cleaner.cleanUp(item, url)
                    self.dataFrames[dataSkip[1]].loc[dataSkip[2]] = items                    
                    self.printer("Fixed " + product.name + " item: ", items)
                    #To avoid infinite loops and never saving our data we save the file now
                    self.dataFrames[product.value[0]].to_csv(currentDate + "REPAIRED Gateway Market " + product.name + ".csv")
                    self.debug('\n < --- Total runtime with saving of repairs took %s seconds --- >' % (time.time() - self.runTime))
                    self.Logs_to_file(currentDate + self.name + ' Spider REPAIR Logs.txt')
                    #To avoid fixing fixed items we pop, mark, and break
                    self.skipped.pop(index)
                    corrections += 1
                    break
        self.debug('\n < --- Total runtime with all repairs took %s seconds --- >' % (time.time() - self.runTime))
        self.Logs_to_file(currentDate + self.name + ' spider COMPLETED REPAIR Logs.txt')

In [None]:
# Start
#DEBUG Switch: when True the scraped tables and the logs are printed at the end
SHOW = True

#Spider setup (both console switches on)
spider = NewPioneerSpider()
spider.LOGGER = True
spider.DEBUGGER = True

#Running the spider
spider.start_requests()

if SHOW:
    for dataFrame in spider.dataFrames:
        print(dataFrame)
    spider.printLogs()
