In [1]:
from datetime import datetime
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging

class IowaFoodHubSpider(scrapy.Spider):
    name = 'Iowa Food Hub'
    currentDate = str(datetime(datetime.today().year, datetime.today().month, datetime.today().day))[:-8]
    def start_requests( self ):

        iowaFoodHubBaconUrl = 'https://iowa-food-hub.myshopify.com/search?q=bacon'
        yield scrapy.Request( url = iowaFoodHubBaconUrl, callback = self.iowaFoodHubSearch, meta={'url': iowaFoodHubBaconUrl, 'type': 'bacon'})

        iowaFoodHubEggsUrl = 'https://iowa-food-hub.myshopify.com/search?q=Egg'
        yield scrapy.Request( url = iowaFoodHubEggsUrl, callback = self.iowaFoodHubSearch, meta={'url': iowaFoodHubEggsUrl, 'type': 'eggs'})

    def iowaFoodHubSearch(self, response):
        #Failsafe for links
        try:
            #grabs all cards from list and saves the link to follow
            xpath = '//*[@id="MainContent"]//a[contains(@class,"list-view-item")]/@href'
            linkList = response.xpath(xpath)
            productType = response.meta.get('type')
            if productType == 'bacon':
                for url in linkList:
                    yield response.follow( url = url, callback = self.iowaFoodHubBacon, meta={'url': response.meta.get('url')}, dont_filter=True )
            elif productType == 'eggs':
                for url in linkList:
                    yield response.follow( url = url, callback = self.iowaFoodHubEggs, meta={'url': response.meta.get('url')}, dont_filter=True )
        except AttributeError:
           pass

    def iowaFoodHubBacon(self, response):
        #validating the name
        nameXpath = '//*[@id="ProductSection-product-template"]//*[contains(@class, "product-single__title")]/text()'
        name = response.xpath(nameXpath).extract_first()
        desiredNames = {"bacon"}
        if not self.containsWord(name, desiredNames):
            return 
        #The other areas we are interested in
        venderXpath = '//*[@id="ProductSection-product-template"]//*[contains(@class, "product-single__vendor")]/text()'
        priceXpath = '//*[@id="ProductPrice-product-template"]/text()'
        
        #getting the product discription
        discXpath = '//*[@id="ProductSection-product-template"]//*[contains(@class, "product-single__description") and @itemprop="description"]/*'
        description = response.xpath(discXpath)
        descriptionText = ''
        for text in description:
            descriptionText += "".join(text.xpath('.//text()').extract_first().strip())
        
        #Adding product to data frame    
        IowaFoodHubBaconDataFrame.loc[len(IowaFoodHubBaconDataFrame)] = [name,
                                                                         response.xpath(venderXpath).extract_first(), 
                                                                         response.xpath(priceXpath).extract_first(), 
                                                                         descriptionText,                                       
                                                                         self.currentDate,
                                                                         response.meta.get('url')
                                                                        ]
                
    def iowaFoodHubEggs(self, response):
        #validating the name
        nameXpath = '//*[@id="ProductSection-product-template"]//*[contains(@class, "product-single__title") and @itemprop="name"]/text()'       
        name = response.xpath(nameXpath).extract_first()
        desiredNames = {"egg"}
        if not self.containsWord(name, desiredNames):
            return 
        #The other areas we are interested in
        venderXpath = '//*[@id="ProductSection-product-template"]//*[contains(@class, "product-single__vendor") and @itemprop="brand"]/text()'
        priceXpath = '//*[@id="ProductPrice-product-template" and @itemprop="price"]/text()'

        #getting the product discription
        discXpath = '//*[@id="ProductSection-product-template"]//*[contains(@class, "product-single__description") and @itemprop="description"]/*'
        description = response.xpath(discXpath)
        descriptionText = ''
        for text in description:
            descriptionText += "".join(text.xpath('.//text()').extract_first()).strip()
        
        #Adding product to data frame
        IowaFoodHubEggDataFrame.loc[len(IowaFoodHubEggDataFrame)] = [name,
                                                                     response.xpath(venderXpath).extract_first(), 
                                                                     response.xpath(priceXpath).extract_first(), 
                                                                     descriptionText,                                       
                                                                     self.currentDate,
                                                                     response.meta.get('url')
                                                                    ]
    def containsWord(self, string, validWords):
        checkText = string.replace(" ", "").lower()
        for word in validWords:
            if word in checkText:
                return True
        return False

DEBUG = False
#Data frames
IowaFoodHubBaconDataFrame = pd.DataFrame(columns=['Bacon', 'Vender', 'Price', 'Weight', 'Extraction Date', 'Url'])
IowaFoodHubEggDataFrame = pd.DataFrame(columns=['Eggs', 'Vender', 'Price', 'Amount', 'Extraction Date', 'Url'])
if(DEBUG):
    #To see the inner mechanics of the spider
    configure_logging()

#This is to start the spider
process = CrawlerProcess()
process.crawl(IowaFoodHubSpider)
process.start()
process.stop()
currentDate = str(datetime(datetime.today().year, datetime.today().month, datetime.today().day))[:-8]

print(IowaFoodHubBaconDataFrame)
print(IowaFoodHubEggDataFrame)
IowaFoodHubBaconDataFrame.to_csv(currentDate + "Iowa Food Hub Bacon.csv")
IowaFoodHubEggDataFrame.to_csv(currentDate + "Iowa Food Hub Eggs.csv")

if(DEBUG):
    #To see the outputs
    print(IowaFoodHubBaconDataFrame)
    print(IowaFoodHubEggDataFrame)



2023-06-14 12:09:14 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: scrapybot)
2023-06-14 12:09:14 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.14, cssselect 1.1.0, parsel 1.6.0, w3lib 1.21.0, Twisted 22.2.0, Python 3.10.9 | packaged by conda-forge | (main, Jan 11 2023, 15:15:40) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1t  7 Feb 2023), cryptography 37.0.1, Platform Windows-10-10.0.19045-SP0
2023-06-14 12:09:14 [scrapy.crawler] INFO: Overridden settings:
{}
2023-06-14 12:09:14 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2023-06-14 12:09:14 [scrapy.extensions.telnet] INFO: Telnet Password: b3a35dc10e59d64a
2023-06-14 12:09:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2023-06-14 12:09:15 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpaut

                                Bacon              Vender  \
0  IFH - Driftless Provisions - Bacon       Iowa Food Hub   
1      HCF Beef Bacon, Hickory Smoked  Hart Country Meats   

                                          Price  \
0  \n                  $12.00\n                   
1  \n                  $15.00\n                   

                                              Weight Extraction Date  \
0  Nitrate free, smoked and ready for you!Sliced ...     2023-06-14    
1  1 pound of delicious beef bacon.  Cured and hi...     2023-06-14    

                                                 Url  
0  https://iowa-food-hub.myshopify.com/search?q=b...  
1  https://iowa-food-hub.myshopify.com/search?q=b...  
                                 Eggs            Vender  \
0  CRF Free Range Eggs, Large - Dozen  Cedar Ridge Farm   

                                         Price  \
0  \n                  $5.00\n                   

                                              Amount Extract