In [None]:
#!pip install selenium
#!pip install piapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import time
from selenium.webdriver import Keys, ActionChains
from selenium.webdriver.common.actions.action_builder import ActionBuilder
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import numpy as np
import pandas as pd
from selenium.webdriver.chrome.options import Options
import os
import pandas as pd
import pickle as pkl
import openai
from numpy import dot
from numpy.linalg import norm
from piapy import PiaVpn
from datetime import datetime
import json
from selenium.webdriver.firefox.options import Options

In [2]:
use_pia = True #set to True to use if using private internet access for scraping, this will automate restarting proxies after failures

### Batching Tesco Scraping

In [3]:
def reset_driver(driver, pia, browser='firefox'):
    """
    Reset the driver and vpn, if using PIA
    """
    if pia: 
        !piactl disconnect
    driver.quit()
    time.sleep(10)

    if browser=='firefox':
        caps = DesiredCapabilities().FIREFOX
        caps["pageLoadStrategy"] = "eager"  #  interactive
        driver = webdriver.Firefox(desired_capabilities=caps) 
    elif browser=='chrome':
        options = chrome.options.Options()
        options.add_argument("--headless=new")
        options.page_load_strategy = "eager"
        driver = webdriver.Chrome(options=options)
    if pia:
        !piactl connect
        pass
    return driver


    

In [4]:
def get_products(driver, url=None):
    """
    Args:
        driver: webdriver
    Returns:
        list of products
    Gets all products from the tesco page and returns them as a list of dictionaries
    """
    count = 0
    products = []
    a_s = driver.find_elements(By.TAG_NAME, "a")
    for a in a_s:
        # check if a has a href attribute
        if(a.get_attribute('href') != None and'/products/' in a.get_attribute('href')):
            product = {}
            count += 1
            #print(f"{count}:{a.get_attribute('href')}")
            try:
                span_child = a.find_element(By.TAG_NAME, "span")
                product = {'name': span_child.text, 'link': a.get_attribute('href'), 'source_page': url, 'scrape_time': time.time() }
                gp = a.find_element(By.XPATH, '..')
                gp = gp.find_element(By.XPATH, '..')
                p_elements = gp.find_elements(By.TAG_NAME, "p")
                prices = []
                for p in p_elements:
                    if '£' in p.text:
                        prices.append(p.text)
                product['prices'] = prices
                products.append(product)
            except Exception as e:
                continue
    return products

        


In [10]:
categories = [
    {"name": "fresh-food", "page_count":77},
    {"name": "bakery", "page_count": 15}, 
    {"name": "frozen-food", "page_count": 22}, 
    {"name": "treats-and-snacks", "page_count": 39},
    {"name": "food-cupboard", "page_count": 124},
    {"name": "drinks", "page_count": 66},
    {"name": "household", "page_count": 26},
    {"name": "home-and-ents", "page_count": 87},
    {"name": "health-and-beauty", "page_count": 80},
    {"name": "baby-and-toddler", "page_count": 19},
    {"name": "pets", "page_count": 16}
]

[cat['name'] for cat in categories[:5]]   
'fresh-food'  'bakery'  'frozen-food'  'treats-and-snacks'  'food-cupboard'  'drinks' 

'fresh-foodbakeryfrozen-foodtreats-and-snacksfood-cupboarddrinks'

In [2]:
categories = [
    {"name": "fresh-food", "page_count":77},
    {"name": "bakery", "page_count": 15}, 
    {"name": "frozen-food", "page_count": 22}, 
    {"name": "treats-and-snacks", "page_count": 39},
    {"name": "food-cupboard", "page_count": 124},
    {"name": "drinks", "page_count": 66},
    {"name": "household", "page_count": 26},
    {"name": "home-and-ents", "page_count": 87},
    {"name": "health-and-beauty", "page_count": 80},
    {"name": "baby-and-toddler", "page_count": 19},
    {"name": "pets", "page_count": 16}
]

# build up list of pages to scrape
urls = []
for category in categories:
    for i in range(1, category['page_count']+1):
        if i == 1:
            urls.append(f"https://www.tesco.com/groceries/en-GB/shop/{category['name']}/all/?count=48")
        else:
            urls.append(f"https://www.tesco.com/groceries/en-GB/shop/{category['name']}/all/?page={i}&count=48")

failed_urls = []
success_urls = []
products = []


In [9]:
caps = DesiredCapabilities().FIREFOX
caps["pageLoadStrategy"] = "eager"  #  interactive
driver = webdriver.Firefox(desired_capabilities=caps) 

""" options = chrome.options.Options()
options.add_argument("--headless=new")
options.page_load_strategy = "eager"
 """

failure_count = 0

start_time = time.time()

success_log = open("success_log.txt", "a")
success_log.write("url, product_count, scrape_time \n")
failure_log = open("failure_log.txt", "a")
failure_log.write("url, product_count, scrape_time, error \n")

if use_pia:
    print("Connecting to PIA")
    !piactl connect
    time.sleep(10)

for i, url in enumerate([url for url in urls if url not in success_urls]):
    try:
        if i % 10 == 0:
            print(f"Scraped {i-1} pages in {time.time()-start_time} seconds, avg: {np.round((time.time()-start_time)/(i-1), 2)} seconds per page")
            # save products to json
            with open(f'products_{start_time}.json', 'w') as f:
                json.dump(products, f)

        driver.get(url)
        WebDriverWait(driver, timeout=np.random.normal(3, 1.2))
        page_products = get_products(driver, url)
        products.extend(page_products)

        if len(page_products) < 5:
            failed_urls.append({ "url": url, "error": "Not enough products found", "attempts": 1})
            failure_count += 1

            failure_log.write(f"'{url}', {len(page_products)}, {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')}, ")
            failure_log.write(' \n')

            if failure_count > 10:
                if use_pia & (failure_count < 20):
                    print("Too many failures, restarting VPN")
                    driver = reset_driver(driver, use_pia)
                else:
                    print("Too many failures, exiting")
                    break
            continue
        else:
            success_urls.append(url)
            success_log.write(f"'{url}', {len(page_products)}, {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')}")
            success_log.write(' \n')
            failure_count = 0
            
    except Exception as e:
        print(f"Failed to scrape {url}")
        failed_urls.append({ "url": url, "error": e, "attempts": 1})
        failure_log.write(f"'{url}', None, {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')}, '{e}' \n ")
        failure_log.write(' \n')
        failure_count += 1

        if failure_count > 10:
            if use_pia & (failure_count < 20):
                print("Too many failures, restarting VPN")
                driver = reset_driver(driver, use_pia)
            else:
                print("Too many failures, exiting")
                break
            break

        continue


for failure in failed_urls:
    try:
        driver.get(failure['url'])
        products.extend(get_products(driver))
        success_urls.append(failure['url'])
    except Exception as e:
        print(f"Failed to scrape {failure['url']}")
        failure['error'] = e
        failure['attempts'] += 1
        continue


  driver = webdriver.Firefox(desired_capabilities=caps)


Connecting to PIA
Scraped -1 pages in 10.570011854171753 seconds, avg: -10.57 seconds per page
Scraped 9 pages in 121.92822313308716 seconds, avg: 13.55 seconds per page
Scraped 19 pages in 229.99709916114807 seconds, avg: 12.11 seconds per page
Scraped 29 pages in 355.5164740085602 seconds, avg: 12.26 seconds per page
Scraped 39 pages in 471.85629391670227 seconds, avg: 12.1 seconds per page
Scraped 49 pages in 591.3232898712158 seconds, avg: 12.07 seconds per page
Scraped 59 pages in 704.9118368625641 seconds, avg: 11.95 seconds per page
Scraped 69 pages in 813.4494049549103 seconds, avg: 11.79 seconds per page
Scraped 79 pages in 923.4006190299988 seconds, avg: 11.69 seconds per page
Failed to scrape https://www.tesco.com/groceries/en-GB/shop/food-cupboard/all/?page=117&count=48
Scraped 89 pages in 1315.4062399864197 seconds, avg: 14.78 seconds per page
Scraped 99 pages in 1427.485995054245 seconds, avg: 14.42 seconds per page
Scraped 109 pages in 1548.1717839241028 seconds, avg: 14

### Retrying the failed ones

In [8]:
len(failed_urls)

20

In [33]:
#driver.close()
#time.sleep(10)
driver = webdriver.Firefox(desired_capabilities=caps)
driver = reset_driver(driver, use_pia)

while len([failed_url for failed_url in failed_urls if failed_url['attempts'] <= 3]):
    for failure in [url for url in failed_urls if url['attempts'] <= 3]:
        try:
            driver.get(failure['url'])
            products.extend(get_products(driver))
            success_urls.append(failure['url'])
            print(f"Successfully scraped {failure['url']}")
            # remove from failed_urls
            failed_urls = [url for url in failed_urls if url['url'] != failure['url']]
        except Exception as e:
            print(f"Failed to scrape {failure['url']}")
            failure['error'] = e
            failure['attempts'] += 1
            continue

  driver = webdriver.Firefox(desired_capabilities=caps)
  driver = webdriver.Firefox(desired_capabilities=caps)


Successfully scraped https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=52&count=48
Successfully scraped https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=77&count=48
Successfully scraped https://www.tesco.com/groceries/en-GB/shop/food-cupboard/all/?page=124&count=48
Successfully scraped https://www.tesco.com/groceries/en-GB/shop/drinks/all/?page=66&count=48
Successfully scraped https://www.tesco.com/groceries/en-GB/shop/household/all/?page=26&count=48
Successfully scraped https://www.tesco.com/groceries/en-GB/shop/home-and-ents/all/?page=86&count=48
Successfully scraped https://www.tesco.com/groceries/en-GB/shop/home-and-ents/all/?page=87&count=48
Successfully scraped https://www.tesco.com/groceries/en-GB/shop/pets/all/?page=16&count=48


In [35]:
len(failed_urls)

0

In [25]:
22*50

1100

### Saving

In [9]:
tesco_prices_df = pd.DataFrame(products)
tesco_prices_df = tesco_prices_df[tesco_prices_df['prices'].str.len() > 0]
tesco_prices_df['price'] = tesco_prices_df['prices'].apply(lambda x: (x[0].replace("£", "") if len(x)>0 else np.nan))
tesco_prices_df['relative_price'] = tesco_prices_df['prices'].apply(lambda x: x[1] if len(x)>1 else np.nan)
# prices to numeric
tesco_prices_df.loc[:, 'price']= pd.to_numeric(tesco_prices_df['price'], errors='coerce' )
tesco_prices_df = tesco_prices_df.dropna(subset=['price'])
tesco_prices_df.sort_values(by=['price'], ascending=False)
tesco_prices_df.to_pickle("/Users/finn/Documents/GitHub/FM-ds.github.io/EconOb/Prices/scrapes/2023_07_13_tesco.pkl")


In [32]:
tesco_prices_df.iloc[0].scrape_time

1687876275.744428

In [121]:
old_df = pd.read_pickle("/Users/finn/Documents/GitHub/FM-ds.github.io/EconOb/Prices/tesco_prices_with_embeddings.pkl")

In [127]:
# select all columns except embeddings and bread_distance
old_df.drop(columns=["embeddings", "bread_distance"]).to_pickle("/Users/finn/Documents/GitHub/FM-ds.github.io/EconOb/Prices/scrapes/2023_06_21_tesco.pkl")

In [None]:
tesco_prices_df.to_


In [113]:
tesco_prices_df

Unnamed: 0,name,link,source_page,scrape_time,prices,price,relative_price
1,Tesco Carrots Loose,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£0.09, £0.55/kg]",0.09,£0.55/kg
2,Hearty Food Co. Garlic Baguette 170G,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£0.37, £0.22/100g]",0.37,£0.22/100g
3,Jaffa Clementine Or Sweet Easy Peeler 600G,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£2.00, £3.33/kg]",2.00,£3.33/kg
4,Tesco British Pork Wafer Thin Honey Roast Ham ...,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£2.25, £1.80/100g]",2.25,£1.80/100g
5,Tesco Unsmked Back Bacon*10 300G Promo,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£2.10, £7.00/kg]",2.10,£7.00/kg
...,...,...,...,...,...,...,...
20936,Yankee Pink Sands Aromatherapy Oil 10Ml,https://www.tesco.com/groceries/en-GB/products...,,1.687642e+09,"[£6.00, £600.00/litre]",6.00,£600.00/litre
20937,Tesco Push Top Food Storage White 1.5L,https://www.tesco.com/groceries/en-GB/products...,,1.687642e+09,"[£4.75, £4.75/each]",4.75,£4.75/each
20938,Tesco Rotating Organiser,https://www.tesco.com/groceries/en-GB/products...,,1.687642e+09,"[£8.00, £8.00/each]",8.00,£8.00/each
20939,Airwick Active Fresh Refill Eucalyptus & Frees...,https://www.tesco.com/groceries/en-GB/products...,,1.687642e+09,"[£6.00, £26.32/litre]",6.00,£26.32/litre


In [None]:
tesco_prices_df.iloc[0]['prices']

In [111]:
# find rows with prices len > 0
tesco_prices_df = tesco_prices_df[tesco_prices_df['prices'].str.len() > 0]

Unnamed: 0,name,link,source_page,scrape_time,prices
1,Tesco Carrots Loose,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£0.09, £0.55/kg]"
2,Hearty Food Co. Garlic Baguette 170G,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£0.37, £0.22/100g]"
3,Jaffa Clementine Or Sweet Easy Peeler 600G,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£2.00, £3.33/kg]"
4,Tesco British Pork Wafer Thin Honey Roast Ham ...,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£2.25, £1.80/100g]"
5,Tesco Unsmked Back Bacon*10 300G Promo,https://www.tesco.com/groceries/en-GB/products...,,1.687614e+09,"[£2.10, £7.00/kg]"
...,...,...,...,...,...
20936,Yankee Pink Sands Aromatherapy Oil 10Ml,https://www.tesco.com/groceries/en-GB/products...,,1.687642e+09,"[£6.00, £600.00/litre]"
20937,Tesco Push Top Food Storage White 1.5L,https://www.tesco.com/groceries/en-GB/products...,,1.687642e+09,"[£4.75, £4.75/each]"
20938,Tesco Rotating Organiser,https://www.tesco.com/groceries/en-GB/products...,,1.687642e+09,"[£8.00, £8.00/each]"
20939,Airwick Active Fresh Refill Eucalyptus & Frees...,https://www.tesco.com/groceries/en-GB/products...,,1.687642e+09,"[£6.00, £26.32/litre]"


In [107]:
[urls for urls in urls if url not in success_urls]

[]

In [104]:
len(urls)

571

In [103]:
len(products)

20941

In [86]:
webdriver.Firefox.quit(webdriver)


AttributeError: module 'selenium.webdriver' has no attribute 'service'

In [80]:
len(products)

19164

In [83]:
len(urls)

571

In [82]:
len(success_urls)

430

In [81]:
len(failed_urls)

51

In [77]:
products[len(products)-1]

{'name': 'Tesco Wooden Hangers 5 Pack',
 'link': 'https://www.tesco.com/groceries/en-GB/products/312618674',
 'source_page': None,
 'scrape_time': 1687613140.069751,
 'prices': ['£3.50', '£0.70/each']}

In [59]:
print(f"hello \n world \n {2+2}")

hello 
 world 
 4


### Misc

In [None]:
caps = DesiredCapabilities().FIREFOX
caps["pageLoadStrategy"] = "eager"  #  interactive
driver = webdriver.Firefox(desired_capabilities=caps)

driver.get(urls[50])
products = get_products(driver)
print(len(products))

In [6]:
vpn = PiaVpn()
vpn.status()

SystemError: No command specified


In [22]:
len(failed_urls)

571

In [26]:
success_urls

['https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=2&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=3&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=4&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=5&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=6&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=7&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=8&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=9&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=10&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=11&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/all/?page=12&count=48',
 'https://www.tesco.com/groceries/en-GB/shop/fresh-food/

In [25]:
len(success_urls)

1142

In [24]:
len(urls)

571