In [30]:
#Imports
from datetime import datetime
import pandas as pd
from enum import Enum
#Imports for Scraping
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from os import path
import time

#This enum exists so the spider can be extended to different products more easily,
#keeping it dynamic and expandable
_ITEM_URL_ROOT = 'https://gatewaymarket.storebyweb.com/s/1000-1/i/INV-1000-'


def _itemUrls(*itemNumbers):
    #Expands inventory numbers into the full product page URLs
    return [_ITEM_URL_ROOT + number for number in itemNumbers]


class Products(Enum):
    #Name = Index, URL list
    Bacon = 1, _itemUrls(
        '18483', '18485', '24190', '18553',
        '33732', '18521', '18548', '18469',
        '33734', '33736', '33731', '29349',
        '18524', '24260', '24163', '18482',
    )

    Eggs = 2, _itemUrls('22775', '22776', '12603')

    HeirloomTomatoes = 3, _itemUrls('11820', '22455', '11896', '11973', '22343')

class GatewaySpider():
    """Selenium spider that scrapes product pages from Gateway Market.

    Each product type in ``Products`` is scraped into its own pandas
    DataFrame (``baconFrame``, ``eggFrame``, ``tomatoFrame``) and exported to
    CSV by ``start_requests``. Every message passed to ``log``, ``debug`` or
    ``printer`` is also kept in ``spiderLogs`` so it can be printed or written
    to a file afterwards.
    """
    name = "Gateway Market Spider"
    #Address appended to every scraped row by DataCleaning
    storeLocation = "2002 Woodland Avenue Des Moines, IA 50312"
    #Seconds to pause before reading a freshly loaded page (marionette Error Fix)
    pageSettleTime = 1
    #Lower-cased values that checkOutput treats as "nothing usable was extracted"
    invalidOutputs = frozenset({"error", 'skip', "$nan", ''})

    #These are methods that are available for your convenience
    def log(self, *args):
        """Record a message; print it only when ``self.LOGGER`` is on."""
        self.spiderLogs.append(('Logger:', args))
        if self.LOGGER:
            print('Logger:', *args)

    def debug(self, *args):
        """Record a message; print it only when ``self.DEBUGGER`` is on."""
        self.spiderLogs.append(('Debug:', args))
        if self.DEBUGGER:
            print('Debug:', *args)

    def printer(self, *args):
        """Record a message and always print it."""
        self.spiderLogs.append(('Printer:', args))
        print(*args)

    def printLogs(self):
        """Print every recorded log entry in the order it was recorded."""
        print("\n< --- Printing Logs --- >\n")
        for entry in self.spiderLogs:
            print(*entry)

    def Logs_to_file(self, filename):
        """Write every recorded log entry to ``filename``, one entry per line."""
        #Explicit UTF-8 so scraped text outside the platform default encoding
        #cannot make the log dump fail
        with open(filename, 'w', encoding='utf-8') as file:
            for log_entry in self.spiderLogs:
                file.write('{} {}\n'.format(log_entry[0], log_entry[1]))

    def __init__(self):
        """Set the default configuration, create empty result frames and start Firefox."""
        self.DEBUGGER = False #The debugger switch to see whats going on. The Default is False
        self.LOGGER = False #When you need to see everything that happens. The Default is False
        self.attempts = 3 #The number of attempts the spider can retry if an error occurs. Default is 3
        self.waitTime = 10 #The number of seconds WebDriver will wait. Default is 10
        self.count = 0 #This saves the location of the url we are going through
        self.runTime = 0 #Total time of extractions
        self.totalRecoveries = 0 #Number of recoveries made while running
        #Results and logs are created per instance so that two spiders never
        #write into the same frames or log list
        sharedColumns = ['Current Price', 'Original Price', 'Brand', 'Location', 'Url']
        self.baconFrame = pd.DataFrame(columns=['Bacon'] + sharedColumns)
        self.eggFrame = pd.DataFrame(columns=['Egg'] + sharedColumns)
        self.tomatoFrame = pd.DataFrame(columns=['Heirloom Tomato'] + sharedColumns)
        self.spiderLogs = []
        self.skipped = []
        #Selenium needs a webdriver to work. Firefox is used here; swap it in _startDriver if needed
        self.driver = self._startDriver()
        self.log("Driver started")

    def _startDriver(self):
        """Create and return a new Firefox WebDriver with the driver log discarded."""
        return webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install(), log_path=path.devnull))

    def _stopDriver(self):
        """Shut the browser down without letting a dead session raise."""
        try:
            #quit() closes every window and ends the session, so no separate close() is needed
            self.driver.quit()
        except Exception as e:
            #Best-effort teardown: the browser may already be gone, which is
            #exactly the situation restart() and forceQuit() are called in
            self.debug("Driver shutdown failed:", e)

    def restart(self):
        """Replace the current browser with a fresh one."""
        self._stopDriver()
        self.driver = self._startDriver()
        self.log("Driver restarted")

    def forceQuit(self):
        """Stop the run early: report, dump the logs to a file and close the browser."""
        self.printer("Browser window was closed by user. Stopping program")
        self.log('\n < --- Total runtime took %s seconds with %d recoveries --- >' % (time.time() - self.runTime, self.totalRecoveries))
        self.Logs_to_file(self.name + ' Logs.txt')
        self._stopDriver()

    def requestExtraction(self, productType):
        """Scrape one product type, retrying up to ``self.attempts`` times.

        ``productType`` is a ``Products`` member; its index selects the
        request method. After a recoverable error the browser is restarted and
        scraping resumes at ``self.count``, the URL that failed.

        Returns the running ``self.totalRecoveries``, or ``None`` when a
        ``WebDriverException`` stopped the whole run.
        """
        self.count = 0
        errors = 0
        extractionType = productType.value[0]
        # Add more products here
        handlers = {1: self.requestBacon,
                    2: self.requestEgg,
                    3: self.requestHeirloomTomatoes}
        start = time.time()
        for trying in range(self.attempts):
            try:
                handler = handlers.get(extractionType)
                if handler is None:
                    self.debug("An error extractionType for " + str(extractionType) + " has occured")
                else:
                    handler()
            except WebDriverException:
                #NOTE(review): every Selenium error derives from WebDriverException,
                #so a page-load timeout also ends up here and is reported as a
                #closed browser - confirm that this is intended
                self.forceQuit()
                return None
            except Exception as e:
                errors += 1
                self.debug("An error occurred:", e)
                self.debug("Recovering extraction and continueing")
                self.restart()
            else:
                self.debug(productType.name + " Finished")
                self.log('\n< --- ' + productType.name + ' scrape took %s seconds with %d recoveries --- >\n' % ((time.time() - start), errors))
                self.totalRecoveries += errors
                return self.totalRecoveries
        self.debug(productType.name + " Did not Finished after " + str(self.attempts) + " Time wasted: %s seconds" % (time.time() - start))
        self.totalRecoveries += errors
        return self.totalRecoveries

    def start_requests( self ):
        """Scrape every product type, then export the frames and logs to files.

        Returns early, without exporting, as soon as one extraction reports
        that the run was stopped (``requestExtraction`` returned ``None``).
        """
        self.runTime = time.time()
        self.totalRecoveries = 0
        for productType in (Products.Bacon, Products.Eggs, Products.HeirloomTomatoes):
            if self.requestExtraction(productType) is None:
                return
        self._stopDriver()
        #Adds the date that the data was scraped; the trailing space separates it from the file name
        currentDate = datetime.today().date().isoformat() + ' '
        self.log("Exporting files")
        #Dataframes to CSV files
        self.baconFrame.to_csv(currentDate + "Gateway Market Bacon.csv")
        self.eggFrame.to_csv(currentDate + "Gateway Market Egg.csv")
        self.tomatoFrame.to_csv(currentDate + "Gateway Market Heirloom Tomatoes.csv")
        self.log('\n', self.baconFrame.to_string())
        self.log('\n', self.eggFrame.to_string())
        self.log('\n', self.tomatoFrame.to_string())
        self.debug('\n < --- Total runtime took %s seconds with %d recoveries --- >' % (time.time() - self.runTime, self.totalRecoveries))
        self.debug('\n < --- Number of skips ' + str(len(self.skipped)) +' --->')
        if self.skipped:
            self.debug(self.skipped)
        self.Logs_to_file(currentDate + self.name + ' Logs.txt')

    #This handles the xpaths
    #Most websites have similar xpaths for each item. You might need to make different xpaths
    #for each item if that is not the case
    #When assigning xpaths mark them as optional if they may or may not be present on the page
    #A missing optional xpath becomes None straight away; a missing non-optional xpath is
    #retried and finally makes the whole item be skipped, which hurts performance
    #Best practice is to list the optional xpaths last
    def xpathMaker(self):
        """Return the list of ``(xpath, optional)`` pairs to extract, in column order."""
        #Add the xpaths here and mark if they are optional
        nameXpath = '//*[@id="item-details"]/h1[contains(@class, "name")]'
        priceXpath = '//*[@id="item-details"]//*[contains(@class, "wc-pricing")]/div[1]'
        prevPriceXpath = '//*[@id="item-details"]//*[contains(@class, "wc-pricing")]/div[2]/s' # optional
        brandXpath = '//*[@id="item-details"]/div[1]' # optional
        #xpath, Optional
        return [(nameXpath, False),
                (priceXpath, False),
                (prevPriceXpath, True),
                (brandXpath, True)]

    #The text is collected by evaluating the xpath in JavaScript inside the page
    def javascriptXpath(self, xpath):
        """Return the trimmed text of the first node matching ``xpath``.

        Waits up to ``self.waitTime`` seconds for the xpath to be present,
        then reads the text up to ``self.attempts`` times, one second apart.
        Returns ``'Empty'`` when the wait times out or no read gave usable text.
        """
        try:
            #Waits for page to load
            ignored_exceptions=(NoSuchElementException,StaleElementReferenceException)
            WebDriverWait(self.driver, self.waitTime, ignored_exceptions=ignored_exceptions).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
            for quickRetry in range(self.attempts): #this is for fast computers
                #Runs the javascript and collects the text data from the inputed xpath
                text = self.driver.execute_script("""
                    const element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                    if (!element) {
                        return 'Skip';
                    }
                    return element.textContent.trim();
                """, 
                xpath)
                if self.checkOutput(text):
                    time.sleep(1)
                else:
                    self.log('found ', text, ' for xpath: ', xpath)
                    return text
        except TimeoutException:
            #This means the xpath wasnt found in the page
            self.log('Could not find xpath for: ', xpath)
        return 'Empty'

    def checkOutput(self, check):
        """Return True when ``check`` is NOT usable data, False when it is.

        ``None`` and, case-insensitively, any value in ``invalidOutputs``
        count as unusable.
        """
        self.log('Validing input')
        if check is None or check.lower() in self.invalidOutputs:
            self.log("Invalid word:", check)
            return True
        self.log("Valid")
        return False

    #This handles the requests
    def makeRequest(self, url):
        """Extract one row from the page currently loaded in the browser.

        Each xpath from ``xpathMaker`` is tried up to ``self.attempts`` times.
        A missing optional field becomes ``None``. A missing required field
        turns the whole row into ``'Skipped'`` placeholders and records ``url``
        once in ``self.skipped``. The row always has one entry per xpath before
        ``DataCleaning`` appends the location and url, so it fits the frame.
        """
        xpathList = self.xpathMaker()
        self.log("xpath list retrieved ", xpathList)
        item = []
        time.sleep(self.pageSettleTime) # marionette Error Fix
        for xpath, optional in xpathList:
            #Retrying the xpath given the number of attempts
            for attempt in range(self.attempts):
                data = self.javascriptXpath(xpath)
                if data == 'Empty':
                    if optional:
                        self.debug("xpath wasnt avaliable")
                        item.append(None)
                        break
                elif not self.checkOutput(data): #Data found
                    item.append(data)
                    self.log(data + ' was added to the list for: ', url)
                    break
                self.debug("Missing item retrying")
            else:
                #Every attempt failed, so nothing was appended for this xpath
                if optional:
                    item.append(None)
                    continue
                #To help clean the data we skip the item with gaps of data.
                #Stop here so later xpaths cannot append to the placeholder row
                self.debug("An Item has been skipped for: ", url)
                item = ['Skipped']*(len(xpathList))
                self.skipped.append(url)
                break
        return self.DataCleaning(item, url)

    def _scrapeProduct(self, urls, frame, label):
        """Load each url from ``self.count`` onwards and append its row to ``frame``."""
        total = len(urls)
        #self.count lives on the instance so a retry after restart() resumes at the failed url
        while self.count < total:
            url = urls[self.count]
            self.driver.get(url)
            self.log("Making a request for: ", url)
            items = self.makeRequest(url)
            self.debug('Extracted: ', items)
            frame.loc[len(frame)] = items
            self.count += 1
            self.printer(label + " item added ", self.count, " of ", total, ":  ", items)

    def requestBacon( self ):
        """Scrape the bacon urls into ``self.baconFrame``."""
        self._scrapeProduct(Products.Bacon.value[1], self.baconFrame, "Bacon")

    def requestEgg(self):
        """Scrape the egg urls into ``self.eggFrame``."""
        self._scrapeProduct(Products.Eggs.value[1], self.eggFrame, "Egg")

    def requestHeirloomTomatoes(self):
        """Scrape the heirloom tomato urls into ``self.tomatoFrame``."""
        self._scrapeProduct(Products.HeirloomTomatoes.value[1], self.tomatoFrame, "Heirloom tomato")

    #This part is a special case for this particular spider; cleaning could be implemented here
    def DataCleaning(self, item, url):
        """Append the store location and ``url`` to ``item`` in place and return it."""
        self.debug('Data cleaning started: ', item)
        item.append(self.storeLocation)
        item.append(url)
        self.debug('Data cleaning finished: ', item)
        return item


In [None]:
# Start
#DEBUG Switch: when SHOW is True the frames and the recorded logs are printed after the run
SHOW = True
spider = GatewaySpider()
# Set spider.LOGGER = True as well to print every log message while scraping
spider.DEBUGGER = True
spider.start_requests()
if SHOW:
    for frame in (spider.baconFrame, spider.eggFrame, spider.tomatoFrame):
        print(frame)
    spider.printLogs()