In [8]:
import pandas as pd
import requests
import time
import subprocess
import sys
import re
from bs4 import BeautifulSoup as bs
from IPython.display import display, HTML, clear_output
from datetime import datetime
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

# None removes pandas' limit on how many characters of a cell are displayed.
# NOTE(review): presumably needed so the long product/image URLs rendered
# further down are not cut off - confirm.
pd.set_option('display.max_colwidth', None)

def path_to_image_html(path):
    """Return an HTML ``<img>`` tag (150 wide) whose ``src`` is ``path``.

    ``path`` must be a string; it is inserted into the tag verbatim.
    """
    pieces = ['<img src="', path, '" width="150" >']
    return ''.join(pieces)

def make_clickable(val):
    """Return an HTML anchor that shows ``val`` and links to ``val``.

    The anchor carries ``target="_blank"`` so the link opens in a new window.
    """
    return f'<a target="_blank" href="{val}">{val}</a>'

In [None]:
# Install the scraping dependencies with the same interpreter the kernel runs on
# (sys.executable), so the packages land in this kernel's environment.
for package in ('beautifulsoup4', 'selenium'):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

### Getting Firefox, gecko and selenium to work :)

This was genuinely remarkably difficult, so here is how I did it (inside a JupyterLab instance running on Ubuntu). All of the following commands were run in a terminal inside the lab instance (no kernel).

First, install the latest version of Firefox, following the instructions on the [website](https://support.mozilla.org/en-US/kb/install-firefox-linux#w_install-from-your-distribution-package-manager).

```
  253  sudo install -d -m 0755 /etc/apt/keyrings
  254  wget -q https://packages.mozilla.org/apt/repo-signing-key.gpg -O- | sudo tee /etc/apt/keyrings/packages.mozilla.org.asc > /dev/null
  255  gpg -n -q --import --import-options import-show /etc/apt/keyrings/packages.mozilla.org.asc | awk '/pub/{getline; gsub(/^ +| +$/,""); if($0 == "35BAA0B33E9EB396F59CA838C0BA5CE6DC6315A3") print "\nThe key fingerprint matches ("$0").\n"; else print "\nVerification failed: the fingerprint ("$0") does not match the expected one.\n"}'
  256  echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" | sudo tee -a /etc/apt/sources.list.d/mozilla.list > /dev/null
  257  echo '
  258  Package: *
  259  Pin: origin packages.mozilla.org
  260  Pin-Priority: 1000
  261  ' | sudo tee /etc/apt/preferences.d/mozilla 
  262  sudo apt-get update && sudo apt-get install firefox
  263  apt-get install -y libglib2.0-0=2.50.3-2     libnss3=2:3.26.2-1.1+deb9u1     libgconf-2-4=3.2.6-4+b1     libfontconfig1=2.11.0-6.7+b1
  264  sudo apt-get install firefox-geckodriver
  265  apt-get firefox
  266  apt-get install firefox
```

Now we need to install the latest geckodriver for Firefox; this [Stack Overflow thread](https://stackoverflow.com/questions/40867959/installing-geckodriver-only-using-terminal) is helpful. Note that I had to put the driver in `/usr/bin`, not `/usr/local/bin`.

```
# i manually downloaded the driver and moved to active directory from here:  https://github.com/mozilla/geckodriver/releases
# though apparently you can do this: (make sure this is the latest release)
  328  wget https://github.com/mozilla/geckodriver/releases/download/v0.35.0/geckodriver-v0.35.0-linux64.tar.gz
  329  sudo mv geckodriver-v0.35.0-linux64.tar.gz /usr/bin
  330  cd /usr/bin
  332  sudo tar -xvf geckodriver-v0.35.0-linux64.tar.gz 
  333  sudo chmod +x geckodriver
  334  geckodriver
```

Finally migrate to the kernel/virtualenv you want, and install selenium:

```
pip install -U selenium
```

In [2]:
# Shared accumulator for all scraper cells: one list per output column.
# NOTE: the name `dict` shadows the Python builtin; it is kept because the
# scraper cells and the DataFrame cell refer to it by this name.
dict = {
    column: []
    for column in (
        'item_name',
        'price',
        'pecentage_off',
        'previous_price',
        'img_link',
        'item_link',
    )
}

# Firefox options used for every selenium session: headless, so no browser window pops up.
opts = FirefoxOptions()
opts.add_argument("--headless")

# User-Agent header sent with the plain `requests` calls.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
}

### Banana Fingers

In [3]:
# Scrape the Banana Fingers outlet page by page and append every sale item to
# the shared `dict` column lists. Stops at the first page that cannot be
# fetched, at the site's "empty" marker, or after 99 pages.
for pg in range(1, 100):
    url = f'https://bananafingers.co.uk/outlet?p={pg}' # loop through each outlet page

    # Up to three attempts per page. The timeout matters: requests waits
    # forever by default, so one stalled connection would hang the whole cell.
    for _attempt in range(3):
        page = requests.get(url, headers = headers, timeout = 30)
        if page.status_code == 200:
            break
    if page.status_code != 200:
        break # give up

    soup = bs(page.text, 'html.parser') # parse html into text
    if soup.find('div', class_ ='message info empty'): # stop if the returned html contains an empty warning (ran out of sale items)
        break
    products = soup.find_all('li', class_ = 'item product product-item') # one <li> per sale item

    for product in products:
        # Parse every field first and append afterwards, so a parsing error on
        # one item cannot leave the column lists with different lengths.
        pecentage_off = float(product.find('span').get_text().strip().replace('%', ''))
        link = product.find(class_='product-item-link')
        price = float(product.find(class_='price').get_text().strip().replace('£', ''))

        dict['pecentage_off'].append(pecentage_off)
        dict['img_link'].append(product.find('img')['src'])
        dict['item_name'].append(link.get_text().strip())
        dict['item_link'].append(link['href'])
        dict['price'].append(price)
        # The page only gives the current price and the discount, so the
        # original price is reconstructed from those two.
        dict['previous_price'].append(price / (1-(pecentage_off/100)))



### Rockrun

In [4]:
# Scrape the Rockrun deals page. The page loads more items as it is scrolled,
# so it is driven through a headless Firefox session and paged down repeatedly
# before the HTML is captured.
browser = webdriver.Firefox(options=opts)
try:
    browser.get('https://rockrun.com/collections/climbing-mountaineering-deals') # use selenium (via firefox instance) to connect to rockrun
    time.sleep(1)

    # Key presses are sent to <body> so that nothing on the page is activated.
    body = browser.find_element(By.CSS_SELECTOR, "body")
    no_of_pagedowns = 50

    for _ in range(no_of_pagedowns):
        body.send_keys(Keys.PAGE_DOWN) # send pg_down key press to firefox instance
        time.sleep(1) # it loads new items, so give it a moment

    # Name the parser explicitly (same one as the Banana Fingers cell) so the
    # result does not depend on which optional parsers happen to be installed.
    soup = bs(browser.page_source, 'html.parser')
finally:
    # Always shut Firefox down, even if loading or scrolling raised.
    browser.quit()

products = soup.find_all('div', class_='product-wrap') # one wrapper div per sale item

for product in products:
    # Parse every field first and append afterwards, so a parsing error on one
    # item cannot leave the column lists with different lengths.
    item_name = product.find(class_ ='product-thumbnail__title').get_text()
    price = float(product.find(class_ = 'money').get_text().strip().replace('£',''))

    # Items without a "was" price are treated as not discounted.
    was_tag = product.find(class_ = 'product-thumbnail__was-price compare-at-price')
    previous_price = price if was_tag is None else float(was_tag.get_text().strip().replace('£',''))

    # Guard the division: a zero previous price would otherwise raise.
    pecentage_off = (1 - (price/previous_price))*100 if previous_price else 0.0

    item_link = f"https://rockrun.com{product.find('a')['href']}"
    img_link = f"https://{product.find('img')['src'].strip('/')}"

    dict['item_name'].append(item_name)
    dict['price'].append(price)
    dict['previous_price'].append(previous_price)
    dict['pecentage_off'].append(pecentage_off)
    dict['item_link'].append(item_link)
    dict['img_link'].append(img_link)

The geckodriver version (0.35.0) detected in PATH at /usr/bin/geckodriver might not be compatible with the detected firefox version (135.0.1); currently, geckodriver 0.36.0 is recommended for firefox 135.*, so it is advised to delete the driver in PATH and retry


### GoOutdoors

In [5]:
# Scrape the GoOutdoors climbing sale, one results page at a time, through a
# headless Firefox session. Stops at the site's "noPage" marker or after 99 pages.
browser = webdriver.Firefox(options=opts)
pattern = re.compile(r'^product-item')

try:
    for pg in range(1,100):
        url = f'https://www.gooutdoors.co.uk/climbing/sal:view/page{pg}.html'
        browser.get(url)
        time.sleep(1)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = bs(browser.page_source, 'html.parser')
        if soup.find(id='noPage'): # stop if the returned html contains an empty warning (ran out of sale items)
            break
        products = soup.find('div', class_ = 'productlist_grid').find_all('article', class_ = pattern)

        for product in products:
            # Extra discount advertised in the offer text (e.g. "... 20% ..."),
            # as a fraction; 0 when the text contains no percentage.
            sale_text = product.find(class_='offer-text').find().get_text()
            if '%' in sale_text:
                percent_token = next(token for token in sale_text.split(' ') if '%' in token)
                off = float(percent_token.replace('%', ''))/100
            else:
                off = 0

            # Parse every field first and append afterwards, so a parsing error
            # on one item cannot leave the column lists with different lengths.
            item_link = f"https://www.gooutdoors.co.uk{product.find('a')['href']}"
            img_link = product.find('img')['src']
            item_name = product.find('h2').get_text()
            price = float(product.find(class_='loyalty-price').get_text().partition('£')[2]) * (1-off)
            previous_price = float(product.find(class_='retail-price').get_text().partition('£')[2])
            pecentage_off = (1 - (price/previous_price))*100

            dict['item_link'].append(item_link)
            dict['img_link'].append(img_link)
            dict['item_name'].append(item_name)
            dict['price'].append(price)
            dict['previous_price'].append(previous_price)
            dict['pecentage_off'].append(pecentage_off)
        time.sleep(5) # gooutdoors doesnt like being called lots :(
finally:
    # The session was previously never closed, leaving a Firefox process
    # running after every execution of this cell.
    browser.quit()


The geckodriver version (0.35.0) detected in PATH at /usr/bin/geckodriver might not be compatible with the detected firefox version (135.0.1); currently, geckodriver 0.36.0 is recommended for firefox 135.*, so it is advised to delete the driver in PATH and retry


### Climbers Shop (same as rockrun really)

In [6]:
# Scrape the Climbers Shop end-of-line page. As with Rockrun, the page is
# scrolled in a headless Firefox session before the HTML is captured.
browser = webdriver.Firefox(options=opts)
try:
    browser.get('https://www.climbers-shop.com/climbing-equipment/eol/instock')
    time.sleep(1)

    body = browser.find_element(By.CSS_SELECTOR, "body")
    no_of_pagedowns = 30

    for _ in range(no_of_pagedowns):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs(browser.page_source, 'html.parser')
finally:
    # Always shut Firefox down, even if loading or scrolling raised.
    browser.quit()

# Class patterns are compiled once instead of once per lookup per item.
pattern2 = re.compile(r'item col-facetItem ctrPad16$')
pricing_pattern = re.compile(r'col-1 pricing$')
name_pattern = re.compile(r'col-1 frItemName$')
products = soup.find_all('div', class_ = pattern2)

for product in products:
    pricing = product.find('div', class_ = pricing_pattern)
    was_tag = pricing.find(id='lblwas')
    if was_tag is None: # some items aren't actually on sale, so skip any without a previous price
        continue

    # Parse every field first and append afterwards, so a parsing error on one
    # item cannot leave the column lists with different lengths.
    name_link = product.find('a', class_ = name_pattern)
    item_name = name_link.get_text()
    price = float(pricing.find(id='lblNow').get_text().strip().replace('£',''))
    previous_price = float(was_tag.get_text().strip().replace('£',''))
    # The second space-separated token of the badge text holds the percentage.
    pecentage_off = float(pricing.find(class_='percentOff-betterSearch').get_text().split(' ')[1].replace('%', ''))
    item_link = f"https://www.climbers-shop.com{name_link['href']}"
    img_link = f"https://www.climbers-shop.com{product.find('img')['data-src']}"

    dict['item_name'].append(item_name)
    dict['price'].append(price)
    dict['previous_price'].append(previous_price)
    dict['pecentage_off'].append(pecentage_off)
    dict['item_link'].append(item_link)
    dict['img_link'].append(img_link)

The geckodriver version (0.35.0) detected in PATH at /usr/bin/geckodriver might not be compatible with the detected firefox version (135.0.1); currently, geckodriver 0.36.0 is recommended for firefox 135.*, so it is advised to delete the driver in PATH and retry


### Output Dataframe

In [7]:
# Combine everything scraped so far into one table, biggest discount first,
# with a fresh 0..n-1 index.
new = (
    pd.DataFrame(dict)
    .sort_values('pecentage_off', ascending = False)
    .reset_index(drop = True)
)

# Turn the numeric columns into display strings: prices as "£12.34",
# discount as a whole-number percentage.
for money_col in ('previous_price', 'price'):
    new[money_col] = new[money_col].apply("£{:.2f}".format)
new['pecentage_off'] = new['pecentage_off'].apply("{:.0f}%".format)

In [57]:
# Render the top rows (at most 50) as an HTML table, with image links shown as
# pictures and item links as clickable anchors.
format_dict = {
    'img_link': path_to_image_html,
    'item_link': make_clickable,
}
show = len(new) if len(new) < 50 else 50
table_html = new.head(show).to_html(index = False, escape = False, formatters = format_dict)
display(HTML(table_html))

item_name,price,pecentage_off,previous_price,img_link,item_link
Exped Schnozzel Pumpbag UL L,£9.95,73%,£37.50,,https://rockrun.com/collections/climbing-mountaineering-deals/products/exped-schnozzel-pumpbag-ul-l
Moore's Wall Bouldering (North Carolina),£11.40,70%,£38.00,,https://rockrun.com/collections/climbing-mountaineering-deals/products/moores-wall-bouldering
Evolv Zenist Women's,£49.00,67%,£148.48,,https://bananafingers.co.uk/evolv-zenist-women-s
Moon Lyra Long Sleeve Top - Women's,£14.95,67%,£45.00,,https://rockrun.com/collections/climbing-mountaineering-deals/products/moon-lyra-long-sleeve-top-womens
Evolv Geshido Women's,£55.00,66%,£161.76,,https://bananafingers.co.uk/evolv-geshido-women-s
Evolv Geshido,£55.00,66%,£161.76,,https://bananafingers.co.uk/evolv-geshido
CampKnife Blade Rock Piton 90mm,£5.97,63%,£16.00,,https://www.gooutdoors.co.uk/15898800/camp-knife-blade-rock-piton-90mm-15898800
Moon Sigma Tank Top - Women's,£14.95,63%,£40.00,,https://rockrun.com/collections/climbing-mountaineering-deals/products/moon-sigma-tank-top
EB Strange,£50.00,62%,£131.58,,https://bananafingers.co.uk/eb-strange
Looking For Wild Bavella Shorts - Womens (Navy Peony),£24.95,62%,£65.00,,https://rockrun.com/collections/climbing-mountaineering-deals/products/looking-for-wild-bavella-shorts-womens-navy-peony


## Filtering - Classification

I don't want to recommend clothes (I'm not a sweat), so I will get rid of all that shite, plus some other items I'd not be interested in.
Filter based on image or text, either aggregating or stacking after convolution, idk.

    step 1- Generate target, will do this manually :( - binary if interested or not
    step 2- tokenise label into usable format
    step 3- go from images link to actual image (should all be 150 by 150 to make things easier)
    step 4- build models ??
        idea 1 - two models, agg results
        idea 2 - one model, stack text data after convolution??
        idea 3 - towered approach
            do convolution -> some dense layers
            do text -> some dense layers.
            stack outputs arrs from there on?
                    

### Step 1

Let's get some targets. To make it easier, I'll randomly order the items, present them one at a time (just title and photo), then label whatever is presented. Each label is appended to an array; I stop whenever I get bored.

In [55]:
# Manual labelling: show the items one by one in random order and record a
# '0' or '1' typed by the user; any other input ends the session.
arr = []
df = new.sample(frac = 1)[['item_name','img_link']] # shuffle them all
for row_pos in range(len(df)):
    row_html = df.iloc[[row_pos]].to_html(escape=False, formatters=format_dict, index = False)
    display(HTML(row_html)) # show the current item
    inp = input() # wait for the label
    if inp in ('1', '0'):
        arr.append(inp) # labels are stored as the typed strings
        clear_output(wait = True) # clear output for next item
        continue
    # anything else means "stop labelling"
    clear_output(wait = True)
    print(f'Not 0 or 1, ending labelling at {len(arr)} items')
    break

# Keep only the rows that actually received a label and attach the labels.
df = df.iloc[:len(arr)].reset_index( drop = True)
df['label'] = arr

Not 0 or 1, ending labelling at 44 items


In [56]:
# Display the labelled sample: item_name, img_link and the 'label' column.
df

Unnamed: 0,item_name,img_link,label
0,Edelrid Women's Highball T-Shirt IV,https://bananafingers.co.uk/media/catalog/product/cache/183e1672aae0fbc68f03bfd4fd2e0cd0/4/9/49242_167a.webp_1.png,1
1,Evolv Shaman Lace LV,https://rockrun.com/cdn/shop/products/evolv-shaman-lv-lace_1600x.png?v=1671191825,0
2,Looking For Wild Cinto T-Shirt - Last Season's,https://bananafingers.co.uk/media/catalog/product/cache/183e1672aae0fbc68f03bfd4fd2e0cd0/b/d/bd-211021-lfw-portes0437_2_1.jpg,1
3,Evolv Elektra Womens,https://rockrun.com/cdn/shop/products/MERGRA_1600x.jpg?v=1589461852,0
4,Boreal Satori Women's - Last Season's,https://bananafingers.co.uk/media/catalog/product/cache/183e1672aae0fbc68f03bfd4fd2e0cd0/9/b/9b808353-5adc-495f-af9e-087e9d895943-4_2_1.jpg,0
5,Picture CC Bamaga Tee,https://bananafingers.co.uk/media/catalog/product/cache/183e1672aae0fbc68f03bfd4fd2e0cd0/m/t/mts1059_a_f.png,1
6,Black Diamond Treeline Rain Shell - Men's,https://bananafingers.co.uk/media/catalog/product/cache/183e1672aae0fbc68f03bfd4fd2e0cd0/d/a/daffe87a63ec026f09707fccbbe282cc55596c4e_71337.jpg,1
7,Five-Ten Niad Lace - Women's,https://bananafingers.co.uk/media/catalog/product/cache/183e1672aae0fbc68f03bfd4fd2e0cd0/f/w/fw2874_ftw_photo_side-lateral-center_white_4.jpg,0
8,Black Diamond Big Gun,https://bananafingers.co.uk/media/catalog/product/cache/183e1672aae0fbc68f03bfd4fd2e0cd0/7/3/7354_source_1612062222_1.jpg,0
9,Black Diamond Women's Rays Pullover Hoody,https://bananafingers.co.uk/media/catalog/product/cache/183e1672aae0fbc68f03bfd4fd2e0cd0/7/3/730066_1008_w_bd_rays_pullover_hoody_light_gray_heather_04__1_1.png,1
