In [1]:
from datetime import datetime
import pandas as pd
import os
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from DSPG_Products import Products #Imports the products to be processed
from DSPG_Cleaner import DataCleaner # This is to handle the cleaning of data
from DSPG_SpiderErrors import ProductFinderError

class JoiaFoodFarmSpider(scrapy.Spider):
    """Spider that collects bacon and egg products from the Joia Food Farm store.

    The crawl has two stages: each category listing page is scanned for
    product links (``JoiaFoodFarmSearch``), then every linked product page is
    parsed by ``JoiaFoodFarmBacon`` or ``JoiaFoodFarmEggs``. Parsed rows are
    appended to the module-level ``DataFrame`` list, and the frame index is
    looked up through the module-level ``product`` object; both globals must
    exist before the crawl starts.
    """

    name = 'Joia Food Farm'
    # Today's date formatted as 'YYYY-MM-DD ' (note the trailing space), taken
    # from a single clock reading.
    currentDate = datetime.today().strftime('%Y-%m-%d ')

    # Category listing pages to crawl, paired with the product type they feed.
    _categoryPages = (
        ('https://www.joiafoodfarm.com/farmstore?category=Pork', 'Bacon'),
        ('https://www.joiafoodfarm.com/farmstore?category=Eggs', 'Eggs'),
    )

    # XPaths shared by every product detail page.
    _nameXpath = '//*[contains(@class, "ProductItem-summary")]//h1[contains(@class, "ProductItem-details-title")]/text()'
    _priceXpath = '//*[contains(@class, "ProductItem-summary")]//*[contains(@class, "product-price")]/text()'
    _descriptionXpath = '//*[contains(@class, "ProductItem-summary")]//*[contains(@class, "ProductItem-details-excerpt")]/descendant-or-self::text()'

    # Brand and address never change for this site, so they are fixed here.
    _storeDetails = {
        'Brand': 'Joia food farm',
        'Address': '2038 March Avenue',
        'State': 'IA',
        'City': 'Charles City',
        'Zip Code': '50616',
    }

    def start_requests(self):
        """Yield one request per category listing page, tagged with its product type."""
        for pageUrl, productType in self._categoryPages:
            yield scrapy.Request(url=pageUrl, callback=self.JoiaFoodFarmSearch, meta={'type': productType})

    def JoiaFoodFarmSearch(self, response):
        """Follow every product link found on a category listing page.

        The product type stored in ``response.meta['type']`` selects both the
        target data frame index and the callback used for the product pages.

        Raises:
            ProductFinderError: propagated from ``dataFrameItemIndex`` when the
                product type is not known to the product list.
        """
        callbacks = {
            'Bacon': self.JoiaFoodFarmBacon,
            'Eggs': self.JoiaFoodFarmEggs,
        }
        #Failsafe for links
        try:
            #grabs all cards from list and saves the link to follow
            xpath = '//main//*[contains(@class, "ProductList-grid")]//*[contains(@class, "ProductList-item-link")]/@href'
            linkList = response.xpath(xpath)
            productType = response.meta.get('type')
            itemIndex = self.dataFrameItemIndex(productType)
            callback = callbacks.get(productType)
            if callback is None:
                # No parser for this product type: nothing to follow.
                return
            for url in linkList:
                yield response.follow(url=url, callback=callback, meta={'DataFrameIndex': itemIndex}, dont_filter=True)
        except AttributeError:
            # Best effort: a listing page that cannot be parsed must not stop
            # the crawl, but it should leave a trace in the log.
            self.logger.warning('Could not read product links from %s', response.url, exc_info=True)

    def JoiaFoodFarmBacon(self, response):
        """Parse a pork product page and store it if it is a bacon product.

        Pages whose title is missing or does not contain "bacon" are skipped.
        The weight is looked for in the description first and in the title as
        a fallback when the description gives a falsy result.
        """
        name = self._productName(response)
        if not name or "bacon" not in name.lower():
            return

        clean = self._startCleaner(0, response, name)

        descriptionText = self._descriptionText(response)
        unit = clean.findWeightUnit(descriptionText)
        if not unit:
            unit = clean.findWeightUnit(clean.Data['Product Type'])
        clean.Data['True Weight'] = unit
        clean.Data['Weight in lbs'] = clean.ozToLb(clean.Data['True Weight'])
        clean.cleanPricing()
        clean = self.setLocationalData(clean)
        self._appendRow(response, clean)

    def JoiaFoodFarmEggs(self, response):
        """Parse an egg product page and store it if it is an egg product.

        Pages whose title is missing or does not contain "egg" are skipped.
        ``EggFinder`` is run on the description first and on the title as a
        fallback when the description gives a falsy result.
        """
        name = self._productName(response)
        if not name or "egg" not in name.lower():
            return

        clean = self._startCleaner(1, response, name)

        descriptionText = self._descriptionText(response)
        if not clean.EggFinder(descriptionText):
            clean.EggFinder(clean.Data['Product Type'])
        clean.cleanPricing()
        clean = self.setLocationalData(clean)
        self._appendRow(response, clean)

    def dataFrameItemIndex(self, string):
        """Return the data frame index registered for the product type ``string``.

        Each entry of ``product.ProductList`` is read as ``(index, type, ...)``.

        Raises:
            ProductFinderError: if no entry matches ``string``.
        """
        for item in product.ProductList:
            if item[1] == string:
                return item[0]
        raise ProductFinderError(string)

    def setLocationalData(self, clean):
        """Fill in the fixed brand and address fields on ``clean`` and return it."""
        for field, value in self._storeDetails.items():
            clean.Data[field] = value
        return clean

    def _productName(self, response):
        """Return the product title text, or None when the page has no title."""
        return response.xpath(self._nameXpath).extract_first()

    def _descriptionText(self, response):
        """Return the product description as one space-separated string."""
        fragments = (text.strip() for text in response.xpath(self._descriptionXpath).getall())
        # Drop fragments that were only whitespace before joining.
        return " ".join(filter(None, fragments))

    def _startCleaner(self, dataSetIndex, response, name):
        """Create a DataCleaner for the given data set and record name and price."""
        #load cleaner template
        clean = DataCleaner()
        clean.LoadDataSet(dataSetIndex, response.url)
        clean.Data['Product Type'] = name
        clean.Data['Current Price'] = response.xpath(self._priceXpath).extract_first()
        return clean

    def _appendRow(self, response, clean):
        """Append the cleaned record as a new row of the frame chosen in the request meta."""
        indexFrame = response.meta.get('DataFrameIndex')
        frame = DataFrame[indexFrame]
        frame.loc[len(frame)] = list(clean.Data.values())
    

In [None]:
#Start
# Driver: run the Joia Food Farm spider, then write every non-empty product
# frame to a dated CSV file.

#Data frames
# `product` and `DataFrame` are read as module-level globals by the spider's
# callbacks, so they must be defined under these names before the crawl runs.
product = Products()
DataFrame = product.ProductDataFrames

DEBUG = False
if DEBUG:
    #To see the inner mechanics of the spider
    configure_logging()

#This is to start the spider
process = CrawlerProcess()
process.crawl(JoiaFoodFarmSpider)
# NOTE(review): per the Scrapy docs start() blocks until the crawl is done;
# the stop() call is kept from the original - confirm whether it is needed.
process.start()
process.stop()

# One clock reading gives 'YYYY-MM-DD ' (trailing space included), so year,
# month and day cannot disagree if the run crosses midnight.
currentDate = datetime.today().strftime('%Y-%m-%d ')
folderPath = currentDate + "Data"
# exist_ok avoids the check-then-create race of testing for the folder first.
os.makedirs(folderPath, exist_ok=True)

for index, frame in enumerate(DataFrame):
    if frame.empty:
        # Nothing was scraped for this product type, so no file is written.
        continue
    fileName = currentDate + "Joia Food Farm " + product.ProductList[index][1] + ".csv"
    frame.to_csv(os.path.join(folderPath, fileName), index=False)

if DEBUG:
    #To see the outputs
    for data in DataFrame:
        print(data)
