In [2]:
import csv
import json
import time

import numpy as np
import pandas as pd
import requests
from lxml import html
from lxml import objectify

In [12]:
class Scraper_BackCountry:
    """Scraper for category listings and product pages on backcountry.com.

    Typical entry points:
      * scrape_BackCountry(categories) -- scrape every product of every category URL
      * getProducts_BackCountry(url)   -- scrape every product of one category URL
      * getSpecs_BackCountry(url)      -- scrape a single product page

    All network access goes through get(), which pauses a random number of
    seconds before each request.
    """

    # Site root; product links collected from listing pages are appended to it.
    BASE_URL = "https://www.backcountry.com"
    # Browser-like headers sent with every request.
    HEADERS = {
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'referer': 'https://www.google.com/',
    }

    def __init__(self, max_delay=20, timeout=30):
        """Create a scraper.

        Args:
            max_delay: exclusive upper bound, in whole seconds, of the random
                pause taken before every request (0 disables the pause).
            timeout: seconds to wait for the server before requests gives up.
        """
        self.max_delay = max_delay
        self.timeout = timeout

    def getSpecs_BackCountry(self, url):
        """Scrape one product page.

        Args:
            url: absolute URL of the product page.

        Returns:
            dict with the keys "URL", "Name", "Brand", "lowPrice", "highPrice",
            "PriceCurrency", "Image", "Rating", "numRatings", plus one entry
            per tech-spec name/value pair found on the page.

        Raises:
            IndexError: when the product image, currency tag or BC.product
                script is missing from the page.
        """
        tree = self.get(url)

        specNames = tree.xpath('//dt[@class="prod-details-accordion__techspec-name js-techspec-name"]/text()')
        specValues = tree.xpath('//dd[@class="prod-details-accordion__techspec-value js-techspec-value"]/text()')

        # Read the lazy-load attribute directly instead of serialising the
        # element and splitting the markup; the leading "//" of the
        # protocol-relative URL is dropped, as before.
        imgElement = tree.xpath('//img[@class="ui-flexslider__img js-flexslider-img qa-flexslider__img"]')[0]
        img = imgElement.get("data-src", "")
        if img.startswith("//"):
            img = img[2:]

        summary = self._extractProductSummary(tree)
        currency = tree.xpath('//meta[@itemprop="priceCurrency"]/@content')[0]

        # Pages without a reviews container are reported like unrated products.
        reviewWraps = tree.xpath('//div[@class="reviews-wrap"]')
        if reviewWraps:
            avgRating, numRatings = self.calculateRating(reviewWraps[0])
        else:
            avgRating, numRatings = -1, 0

        data = {}
        data["URL"] = url
        data["Name"] = summary["displayName"]
        data["Brand"] = summary["brand"]["displayName"]
        data["lowPrice"] = summary["lowListPrice"].split("$")[1]  #maybe add symbols for different currencies
        data["highPrice"] = summary["highListPrice"].split("$")[1]
        data["PriceCurrency"] = currency
        data["Image"] = img
        data['Rating'] = avgRating
        data['numRatings'] = numRatings
        # zip() stops at the shorter list, so a spec value without a matching
        # name can no longer raise IndexError.
        for specName, specValue in zip(specNames, specValues):
            data[specName] = specValue
        return data

    def _extractProductSummary(self, tree):
        """Return the JSON object assigned to ``BC.product`` in an inline script.

        The fourth inline script (index 3) is checked first; the remaining
        scripts are only scanned when the marker is not found there.

        Raises:
            IndexError: when no inline script contains the assignment.
        """
        marker = 'BC.product = '
        scripts = tree.xpath('//script[@type="text/javascript"]/text()')
        candidates = scripts[3:4] + scripts[:3] + scripts[4:]
        for scriptText in candidates:
            if marker in scriptText:
                blob = scriptText.split(marker)[1].split(';BC.product')[0]
                return json.loads(blob)
        raise IndexError("no inline script assigns BC.product on this page")

    def getPages_BackCountry(self, url):
        """List the URL of every result page of a category.

        Args:
            url: category listing URL, with or without a query string.

        Returns:
            list of page URLs numbered from 0; just ``[url]`` when the listing
            shows no pagination links.
        """
        tree = self.get(url)
        numPagesList = tree.xpath('//li[@class="page-link page-number qa-page-link"]/.//a/text()')
        if not numPagesList:
            return [url]

        numPages = int(numPagesList[-1])
        # Extend an existing query string rather than starting a second one.
        pagesStr1 = "&page=" if "?" in url else "?page="
        pagesStr2 = "&pagesize=42"    # for some reason always added when changing page manually on website
        return [url + pagesStr1 + str(i) + pagesStr2 for i in range(numPages)]

    def getProducts_BackCountry(self, urls):
        """Scrape every product reachable from one category listing URL.

        Args:
            urls: category listing URL (a single URL despite the plural name).

        Returns:
            list of product dicts as produced by getSpecs_BackCountry.
        """
        urlPages = self.getPages_BackCountry(urls)  # urls of each page of the website category
        urlProducts = []
        for url in urlPages:  # collect the product links of every page
            urlProducts.extend(self.getProductsFromPage_BackCountry(url))
        products = []
        for prodURL in urlProducts:
            print(prodURL)
            products.append(self.getSpecs_BackCountry(self.BASE_URL + prodURL))
        return products

    def scrape_BackCountry(self, categories):
        """Scrape several categories.

        Args:
            categories: mapping of category name -> iterable of listing URLs.

        Returns:
            dict mapping each category name to a list holding one product list
            per listing URL.
        """
        products = {}
        for category in categories:
            print(category)
            products[category] = []
            for url in categories[category]:
                products[category].append(self.getProducts_BackCountry(url))
        return products

    # calculates product rating from reviews listed on product page
    def calculateRating(self, tree):
        """Average the star ratings of the reviews found under ``tree``.

        Args:
            tree: element to search; only its descendants are counted.

        Returns:
            (average, count) tuple; (-1, 0) when there are no ratings.
        """
        # The leading "." keeps the search inside `tree`; an absolute "//"
        # path would be evaluated against the whole document.
        counts = {
            stars: len(tree.xpath('.//div[@class="rating-value-%d review-header-rating"]' % stars))
            for stars in range(1, 6)
        }
        numSum = sum(counts.values())
        # return -1 if product has no ratings
        if numSum == 0:
            return -1, 0
        weighted = sum(stars * count for stars, count in counts.items())
        return weighted / numSum, numSum

    def getProductsFromPage_BackCountry(self, url):
        """Return the href of every product link on one listing page."""
        tree = self.get(url)
        return tree.xpath('//div[@aria-label="Product"]/.//a/@href')

    # helper function for get requests
    def get(self, url):
        """Fetch ``url`` and return the parsed lxml HTML tree.

        Sleeps a random whole number of seconds in [0, max_delay) before the
        request so requests are not sent at a fixed cadence.
        """
        if self.max_delay > 0:
            time.sleep(np.random.randint(self.max_delay))
        # A timeout keeps a stalled connection from hanging the scrape.
        page = requests.get(url, headers=self.HEADERS, timeout=self.timeout)
        return html.fromstring(page.content)

In [15]:
# Category listing page passed to the scraper in the next cell.
url = "https://www.backcountry.com/3-season-tents"

In [16]:
# Build a scraper and list the paginated URLs for the category page in `url`.
# NOTE(review): getPages_BackCountry fetches the page through get(), so this
# cell performs a live request after a random pause.
scraper = Scraper_BackCountry()
scraper.getPages_BackCountry(url)

['https://www.backcountry.com/3-season-tents?page=0&pagesize=42',
 'https://www.backcountry.com/3-season-tents?page=1&pagesize=42',
 'https://www.backcountry.com/3-season-tents?page=2&pagesize=42',
 'https://www.backcountry.com/3-season-tents?page=3&pagesize=42']

In [15]:
# Browser-like HTTP request headers at module level (same values as the
# HEADERS dict defined inside Scraper_BackCountry).
HEADERS = dict([
    (
        'user-agent',
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.75 Safari/537.36",
    ),
    ('accept-encoding', 'gzip, deflate, br'),
    ('accept-language', 'en-US,en;q=0.9'),
    (
        'accept',
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9',
    ),
    ('referer', 'https://www.google.com/'),
])