In [1]:
import pandas as pd
import requests
import time
import sys
import re
from bs4 import BeautifulSoup as bs
from IPython.display import display, HTML, clear_output
from datetime import datetime
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

In [2]:
opts = FirefoxOptions()
opts.add_argument("--headless")
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}


In [13]:
class scraper:
    opts = FirefoxOptions()
    opts.add_argument("--headless")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

    @staticmethod
    def dic_to_df(dic):
        new = pd.DataFrame(dic)
        new.sort_values('pecentage_off', inplace =True, ascending = False)
        new.reset_index(drop=True, inplace = True)
        new['previous_price'] = new['previous_price'].apply(lambda x: "£{:.2f}".format((x)))
        new['price'] = new['price'].apply(lambda x: "£{:.2f}".format((x)))
        new['pecentage_off'] = new['pecentage_off'].apply(lambda x: "{:.0f}%".format(x))

        return new
        
    @staticmethod
    def display(df, show = 50):

        def path_to_image_html(path):
            return '<img src="'+ path + '" width="150" >'

        def make_clickable(val):
            # target _blank to open new window
            return '<a target="_blank" href="{}">{}</a>'.format(val, val)
    
        if isinstance(df, dict):
            df = scraper.dic_to_df(df)
        
        show = min(show, len(df))
        format_dic = {'img_link':path_to_image_html, 'item_link':make_clickable}
        display(HTML(df.head(show).to_html(escape=False, formatters=format_dic, index = False)))
        
    @staticmethod
    def bananafingers(pages = 99,
                      display = False,
                      dic = {
                          'item_name':[],
                          'price':[],
                          'pecentage_off':[],
                          'previous_price':[],
                          'img_link':[],
                          'item_link':[]}
                     ):
            
        for pg in range(1, pages + 1):
            url = f'https://bananafingers.co.uk/outlet?p={pg}' # loop through each outlet page
            page = requests.get(url, headers = scraper.headers) # call website
            if page.status_code != 200:
                page = requests.get(url, headers = headers) # try again
            if page.status_code != 200:
                page = requests.get(url, headers = headers) # try again again
            if page.status_code != 200:
                break # give up
            soup = bs(page.text, 'html.parser') # parse html into text
            if soup.find('div', class_ ='message info empty'): #stop if the returned html contains an empty warning (ran out of sale items)
                break
            all = soup.findAll('li', class_ = 'item product product-item') # find html class for sale items, gather all classes into list
        
            for i in all: # each sale item, get relevant info from html 
                pecentage_off = float(i.find('span').get_text().strip().replace('%', ''))
                dic['pecentage_off'].append(pecentage_off)
                dic['img_link'].append( i.find('img')['src'])
                dic['item_name'].append(i.find(class_='product-item-link').get_text().strip())
                dic['item_link'].append( i.find(class_='product-item-link')['href'])
                price = float(i.find(class_='price').get_text().strip('£'))
                dic['price'].append(price)
                dic['previous_price'].append(price / (1-(pecentage_off/100)))

        if display:
            scraper.display(dic)
        
        return dic

    @staticmethod
    def rockrun(display = False,
                dic = {
                    'item_name':[],
                    'price':[],
                    'pecentage_off':[],
                    'previous_price':[],
                    'img_link':[],
                    'item_link':[]}
               ):
        
        browser = webdriver.Firefox(options=scraper.opts)
        
        browser.get('https://rockrun.com/collections/climbing-mountaineering-deals') # use selenium (via firefox instance) to connect to rockrun
        time.sleep(1)
        
        body = browser.find_element(By.CSS_SELECTOR, "body") # need to scroll down to access all sale items, so click somewhere that wont change the page, and scroll down
        no_of_pagedowns = 50
        
        while no_of_pagedowns:
            body.send_keys(Keys.PAGE_DOWN) # send pg_down key press to firefox instance
            time.sleep(1) #it loads new thingies so give it a mo
            no_of_pagedowns-=1
        
        soup = bs(browser.page_source) # convert html from selenium to parsed text
        browser.quit()
        
        all = soup.findAll('div', class_='product-wrap') # find html class for sale items, gather all classes into list
        
        for i in all: # each sale item, get relevant info from html 
            dic['item_name'].append(i.find(class_ ='product-thumbnail__title').get_text())
            price = float(i.find(class_ = 'money').get_text().strip().replace('£',''))
            dic['price'].append(price)
            previous_price = price if i.find(class_ = 'product-thumbnail__was-price compare-at-price') is None else float(i.find(class_ = 'product-thumbnail__was-price compare-at-price').get_text().strip().replace('£',''))
            dic['previous_price'].append(previous_price)
            dic['pecentage_off'].append((1 - (price/previous_price))*100)
            dic['item_link'].append(f"https://rockrun.com{i.find('a')['href']}")
            dic['img_link'].append(f"https://{i.find('img')['src'].strip('/')}")
        
        if display:
            scraper.display(dic)
    
        return dic

    def climbers_shop(display = False,
                      dic = {
                          'item_name':[],
                          'price':[],
                          'pecentage_off':[],
                          'previous_price':[],
                          'img_link':[],
                          'item_link':[]}
                     ):

        browser = webdriver.Firefox(options=scraper.opts)
        browser.get('https://www.climbers-shop.com/climbing-equipment/eol/instock')
        time.sleep(1)
        
        body = browser.find_element(By.CSS_SELECTOR, "body")
        no_of_pagedowns = 30
        
        while no_of_pagedowns:
            body.send_keys(Keys.PAGE_DOWN) 
            time.sleep(1)
            no_of_pagedowns-=1
        
        soup = bs(browser.page_source)
        browser.quit()
        
        pattern2 = re.compile(r'item col-facetItem ctrPad16$')
        all = soup.findAll('div', class_ = pattern2)
        
        for i in all:
            if i.find('div', class_ = re.compile(r'col-1 pricing$')).find(id='lblwas'): #some items arnt actually on sale idk, so just skip if i cant return a prev price
                dic['item_name'].append(i.find('a', class_ = re.compile(r'col-1 frItemName$')).get_text())
                dic['price'].append(float(i.find('div', class_ = re.compile(r'col-1 pricing$')).find(id='lblNow').get_text().strip().replace('£','')))
                dic['previous_price'].append(float(i.find('div', class_ = re.compile(r'col-1 pricing$')).find(id='lblwas').get_text().strip().replace('£','')))
                dic['pecentage_off'].append(float(i.find('div', class_ = re.compile(r'col-1 pricing$')).find(class_='percentOff-betterSearch').get_text().split(' ')[1].replace('%', '')))
                dic['item_link'].append(f"https://www.climbers-shop.com{i.find('a', class_ = re.compile(r'col-1 frItemName$'))['href']}")
                dic['img_link'].append(f"https://www.climbers-shop.com{i.find('img')['data-src']}")
        
        if display:
            scraper.display(dic)
    
        return dic
        
    def gooutdoors(pages = 99,
                   display = False,
                   dic = {
                       'item_name':[],
                       'price':[],
                       'pecentage_off':[],
                       'previous_price':[],
                       'img_link':[],
                       'item_link':[]}
                     ):
        browser = webdriver.Firefox(options=scraper.opts)
        pattern = re.compile(r'^product-item')
        
        for pg in range(1,pages + 1):
            url = f'https://www.gooutdoors.co.uk/climbing/sal:view/page{pg}.html'
            browser.get(url)
            time.sleep(1)
            soup = bs(browser.page_source)
            if soup.find(id='noPage'): #stop if the returned html contains an empty warning (ran out of sale items) or pgs >= 100 (just in case)
                break
            all = soup.find('div', class_ = 'productlist_grid').findAll('article', class_ = pattern)
        
            for i in all:
                sale_text = i.find(class_='offer-text').find().get_text()
                if '%' in sale_text:
                    off = float([i for i in sale_text.split(' ') if '%' in i][0].replace('%', ''))/100
                else:
                    off = 0
                dic['item_link'].append(f"https://www.gooutdoors.co.uk{i.find('a')['href']}")
                dic['img_link'].append(i.find('img')['src'])
                dic['item_name'].append(i.find('h2').get_text())
                price = float(i.find(class_='loyalty-price').get_text().partition('£')[2]) * (1-off)
                dic['price'].append(price)
                previous_price = float(i.find(class_='retail-price').get_text().partition('£')[2])
                dic['previous_price'].append(previous_price)
                dic['pecentage_off'].append((1 - (price/previous_price))*100)
            time.sleep(5) # gooutdoors doesnt like being called lots :(
        browser.quit()
        
        if display:
            scraper.display(dic)
    
        return dic


    @staticmethod
    def alpine_trek(pages = 99,
                    display = False,
                    dic = {
                        'item_name':[],
                        'price':[],
                        'pecentage_off':[],
                        'previous_price':[],
                        'img_link':[],
                        'item_link':[]}
                     ):
        browser = webdriver.Firefox(options=scraper.opts)
        url = f'https://www.alpinetrek.co.uk/outlet/climbing/1/'
        browser.get(url)
        time.sleep(1)
        soup = bs(browser.page_source)

        items = int(re.findall('\d+|$', soup.find(class_ ='product-amount inline highlight-2').get_text())[0])
        pgs = items/len(soup.findAll(class_='product-item product-fallback'))
        
        for pg in range(1,min(pages+1,int(pgs)+2)):
            url = f'https://www.alpinetrek.co.uk/outlet/climbing/{pg}/'
            browser.get(url)
            time.sleep(1)
            soup = bs(browser.page_source)
            all = soup.findAll(class_='product-item product-fallback')
        
            for i in all:
                if i.find(class_='product-price').find(class_='uvp'):
                    dic['item_link'].append(i.find(class_='product-link')['href'])
                    dic['img_link'].append(i.find(class_='product-image')['src'])
                    dic['item_name'].append(i.find(class_='manufacturer-title').get_text() + i.find(class_='product-title').get_text().replace('\n',' '))
                    previous_price = float(re.findall(r"\d+\.\d+", i.find(class_='product-price').find(class_='uvp').get_text())[0])
                    price = float(re.findall(r"\d+\.\d+", i.find(class_='product-price').find(class_='price high-light').get_text())[0])
                    dic['price'].append(price)
                    dic['previous_price'].append(previous_price)
                    dic['pecentage_off'].append((1 - (price/previous_price))*100)
        browser.quit()
        
        if display:
            scraper.display(dic)
    
        return dic
        
    @staticmethod
    def scrape(display = True):
        dic = scraper.bananafingers()
        dic = scraper.rockrun(dic = dic)
        dic = scraper.climbers_shop(dic = dic)
        dic = scraper.gooutdoors(dic = dic)
        
        df = scraper.dic_to_df(dic)
        
        if display:
            scraper.display(df)
            
        return df

In [None]:
url = f'https://www.alpinetrek.co.uk/outlet/climbing/1/' # loop through each outlet page
page = requests.get(url, headers = headers)
page.status_code

In [3]:
dic = {
       'item_name':[],
       'price':[],
       'pecentage_off':[],
       'previous_price':[],
       'img_link':[],
       'item_link':[]}

browser = webdriver.Firefox(options=opts)
url = f'https://www.alpinetrek.co.uk/outlet/climbing/1/'
browser.get(url)
time.sleep(1)
soup = bs(browser.page_source)

items = int(re.findall('\d+|$', soup.find(class_ ='product-amount inline highlight-2').get_text())[0])
pages = items/len(soup.findAll(class_='product-item product-fallback'))
for pg in range(1,int(pages)+2):
    url = f'https://www.alpinetrek.co.uk/outlet/climbing/{pg}/'
    browser.get(url)
    time.sleep(1)
    soup = bs(browser.page_source)
    all = soup.findAll(class_='product-item product-fallback')

    for i in all:
        if i.find(class_='product-price').find(class_='uvp'):
            dic['item_link'].append(i.find(class_='product-link')['href'])
            dic['img_link'].append(i.find(class_='product-image')['src'])
            dic['item_name'].append(i.find(class_='manufacturer-title').get_text() + i.find(class_='product-title').get_text().replace('\n',' '))
            previous_price = float(re.findall(r"\d+\.\d+", i.find(class_='product-price').find(class_='uvp').get_text())[0])
            price = float(re.findall(r"\d+\.\d+", i.find(class_='product-price').find(class_='price high-light').get_text())[0])
            dic['price'].append(price)
            dic['previous_price'].append(previous_price)
            dic['pecentage_off'].append((1 - (price/previous_price))*100)



The geckodriver version (0.35.0) detected in PATH at /usr/bin/geckodriver might not be compatible with the detected firefox version (135.0.1); currently, geckodriver 0.36.0 is recommended for firefox 135.*, so it is advised to delete the driver in PATH and retry


In [14]:
scraper.alpine_trek(display = True)

The geckodriver version (0.35.0) detected in PATH at /usr/bin/geckodriver might not be compatible with the detected firefox version (135.0.1); currently, geckodriver 0.36.0 is recommended for firefox 135.*, so it is advised to delete the driver in PATH and retry


item_name,price,pecentage_off,previous_price,img_link,item_link
Red Chili Ventic Air Lace Climbing shoes,£32.78,60%,£81.95,,https://www.alpinetrek.co.uk/red-chili-ventic-air-lace-climbing-shoes/
C.A.M.P. Head Band System for Titan Large,£2.72,55%,£6.05,,https://www.alpinetrek.co.uk/camp-head-band-system-for-titan-large/
C.A.M.P. Head Band System for Titan Small,£2.72,55%,£6.05,,https://www.alpinetrek.co.uk/camp-head-band-system-for-titan-small/
UnParallel Women's Flagship LV Climbing shoes,£71.53,55%,£158.95,,https://www.alpinetrek.co.uk/unparallel-womens-flagship-lv-climbing-shoes/
UnParallel Women's TN Pro LV Climbing shoes,£67.48,55%,£149.95,,https://www.alpinetrek.co.uk/unparallel-womens-tn-pro-lv-climbing-shoes/
UnParallel NewTro VCS Climbing shoes,£67.48,55%,£149.95,,https://www.alpinetrek.co.uk/unparallel-newtro-vcs-climbing-shoes/
UnParallel Shoe L5 UP Approach shoes,£65.68,55%,£145.95,,https://www.alpinetrek.co.uk/unparallel-shoe-l5-up-approach-shoes/
UnParallel Regulus LV Climbing shoes,£63.88,55%,£141.95,,https://www.alpinetrek.co.uk/unparallel-regulus-lv-climbing-shoes/
Arc'teryx Women's C-Quence Harness Climbing harness,£58.03,55%,£128.95,,https://www.alpinetrek.co.uk/arcteryx-womens-c-quence-harness-climbing-harness/
Edelrid Women's Autana Climbing harness,£35.98,55%,£79.95,,https://www.alpinetrek.co.uk/edelrid-womens-autana-climbing-harness-bf/


{'item_name': ['Black Diamond Transition Gloves ',
  'Gibbon Slacklines Giboard Line Slacklining ',
  "La Sportiva Women's Theory Climbing shoes ",
  'UnParallel Up-Rise VCS Climbing shoes ',
  'Black Diamond Hot Forged Alpine Pick Replacement pick ',
  'Gibbon Slacklines Treewear Tree protection ',
  'Edelrid Perfect Alpine II Special Edition 8.3 mm Half rope ',
  'Black Diamond Nylon Runner 18 mm Sewn runner ',
  'Edelrid Spinner Leash Leash ',
  'La Sportiva Mantra Climbing shoes ',
  "Red Chili Kid's Puzzle Climbing shoes ",
  'Black Diamond Capitan Helmet Climbing helmet ',
  'Black Diamond Hotforge Screwgate Carabiner Screwgate carabiner ',
  'Black Diamond Vapor Helmet Climbing helmet ',
  'C.A.M.P. Photon Lock Screwgate carabiner ',
  "Black Diamond Women's Momentum Harness Climbing harness ",
  'UnParallel Sirius Lace Climbing shoes ',
  'UnParallel Vim Climbing shoes ',
  'Metolius Finger Tape 2-Pack Tape ',
  'Edelrid Daku 10.0 Single rope ',
  'Black Diamond Vision Helmet C