In [1]:
!pip install undetected-chromedriver

Collecting undetected-chromedriver
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting selenium>=4.9.0 (from undetected-chromedriver)
  Obtaining dependency information for selenium>=4.9.0 from https://files.pythonhosted.org/packages/3f/fd/c2e7bb547b5b96c7bd536b4a80c4564b7ce5cd38d10095fbba8648996ab9/selenium-4.18.1-py3-none-any.whl.metadata
  Downloading selenium-4.18.1-py3-none-any.whl.metadata (6.9 kB)
Collecting websockets (from undetected-chromedriver)
  Obtaining dependency information for websockets from https://files.pythonhosted.org/packages/95/aa/75fa3b893142d6d98a48cb461169bd268141f2da8bfca97392d6462a02eb/websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Using cached websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting trio~=0.17 (from selenium>=

In [7]:
# !wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb

In [8]:
# !dpkg -i google-chrome-stable_current_amd64.deb

In [10]:
from pathlib import Path
import time
from typing import List, Optional

from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc

In [14]:
# Parallel accumulators filled by parse_product(): position i in every list
# describes the same product. An entry is None when that field could not be
# extracted from the product tile, hence Optional[str].
product_titles: List[Optional[str]] = []
product_prices: List[Optional[str]] = []
product_image_srcs: List[Optional[str]] = []
product_links: List[Optional[str]] = []

In [2]:
# Start the undetected-chromedriver Chrome session and open the George
# school-uniform landing page.
landing_page_url = 'https://direct.asda.com/george/school-uniform/D10,default,sc.html'
driver = uc.Chrome(use_subprocess=False)
driver.get(landing_page_url)

True

In [16]:
# Category listing pages to scrape: boys' uniform, girls' uniform, and
# school shoes/trainers. Each one is passed to scrape_asda_from_url().
urls: List[str] = [
    "https://direct.asda.com/george/school/all-boys-school-uniform/D10M1G2C1,default,sc.html",
    "https://direct.asda.com/george/school/all-girls-school-uniform/D10M2G2C1,default,sc.html",
    "https://direct.asda.com/george/school/all-school-shoes-trainers/D10M20G1C1,default,sc.html"
]

In [15]:
def scrape_asda_from_url(url: str) -> None:
    """Load a listing page, scroll it to the bottom, and parse every product tile.

    Uses the module-level ``driver``. Each element matching
    ``div.product-mini-outer-container`` is handed to ``parse_product``,
    which records its fields in the module-level product lists.

    Args:
        url: Address of the listing page to scrape.
    """
    driver.get(url)
    # Scroll first so the page has stopped growing before tiles are collected.
    scroll_to_bottom(driver)
    for tile in driver.find_elements(By.CSS_SELECTOR, "div.product-mini-outer-container"):
        parse_product(tile)

def scroll_to_bottom(driver, pause_time: float = 0.5, max_scrolls: Optional[int] = None) -> None:
    """Scroll the page down repeatedly until its height stops increasing.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        pause_time: Seconds to wait after each scroll before re-measuring the
            page height. Defaults to the previously hard-coded 0.5.
        max_scrolls: Optional upper bound on the number of scrolls, so a page
            that keeps growing cannot loop forever. ``None`` (the default)
            keeps the original unbounded behaviour.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        # Wait before measuring so the page has a chance to grow.
        time.sleep(pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: treat the page as fully loaded.
            break
        last_height = new_height

def parse_product(product) -> None:
    """Extract one product tile's fields and append them to the module-level lists.

    The tile's ``innerHTML`` is parsed with BeautifulSoup. If it contains no
    ``div.product-mini`` element, nothing is recorded. Otherwise the title,
    price, image source and link are appended to ``product_titles``,
    ``product_prices``, ``product_image_srcs`` and ``product_links``.

    Args:
        product: Element exposing ``get_attribute("innerHTML")``.
    """
    soup = BeautifulSoup(product.get_attribute("innerHTML"), "html.parser")
    product_container = soup.find("div", class_="product-mini")
    if product_container is None:
        return

    # Extract every field before touching the accumulators: if any extraction
    # raises, no list has been appended to, so the four parallel lists cannot
    # end up with different lengths.
    product_title = get_product_info(product_container, "a", "title")
    product_price = get_product_info(product_container, "span", "product__price-value")
    product_image_src = get_product_image(product_container)
    product_link = get_product_info(product_container, "a", "title", "href")

    product_titles.append(product_title)
    product_prices.append(product_price)
    product_image_srcs.append(product_image_src)
    product_links.append(product_link)

def get_product_info(container, tag, class_name, attribute=None) -> Optional[str]:
    """Return stripped text, or a stripped attribute value, of a matching element.

    Args:
        container: Object exposing ``find(tag, class_=...)`` (a BeautifulSoup tag).
        tag: Tag name to search for.
        class_name: CSS class the element must carry.
        attribute: If given, return this attribute's value instead of the
            element's text.

    Returns:
        The stripped string, or ``None`` when the element is not found, the
        attribute is absent, or the value is not a string.
    """
    try:
        element = container.find(tag, class_=class_name)
        value = element[attribute] if attribute else element.text
        return value.strip()
    except (AttributeError, TypeError, KeyError):
        # AttributeError: no element on the text path, or a non-string value.
        # TypeError: subscripting None when no element matched.
        # KeyError: the element exists but lacks the requested attribute.
        return None

def get_product_image(container) -> Optional[str]:
    """Return the ``src`` of the container's ``img.primary-image`` element.

    Args:
        container: Object exposing ``find(tag, class_=...)`` (a BeautifulSoup tag).

    Returns:
        The ``src`` attribute value, or ``None`` when no such image exists or
        the image has no ``src`` attribute.
    """
    try:
        return container.find("img", class_="primary-image")["src"]
    except (AttributeError, TypeError, KeyError):
        # TypeError: find() returned None and None is not subscriptable.
        # KeyError: the image exists but has no ``src`` attribute.
        return None

In [17]:
# Empty the accumulators first so that re-running this cell rebuilds the
# results instead of appending a duplicate copy of every product.
for accumulator in (product_titles, product_prices, product_image_srcs, product_links):
    accumulator.clear()

for url in urls:
    scrape_asda_from_url(url)

In [27]:
# Assemble the scraped fields into one table: one column per accumulator
# list, one row per product.
product_df = pd.DataFrame.from_dict(
    {
        "product_title": product_titles,
        "product_price": product_prices,
        "product_image_src": product_image_srcs,
        "product_link": product_links,
    }
)

In [28]:
# Keep only the rows that have a price; the original row labels are retained.
product_df = product_df.dropna(subset=["product_price"])

In [30]:
# Preview only the first few records; dumping the whole frame bloats the
# saved notebook output.
product_df.head().to_dict(orient="records")

[{'product_title': 'White School Polo Shirt 5 Pack',
  'product_price': '£7.50',
  'product_image_src': 'https://asda.scene7.com/is/image/Asda/5059194006713?hei=600&wid=450&qlt=85&fmt=pjpg&resmode=sharp2&op_usm=1.1,0.5,0,0&defaultimage=default_details_George_rd',
  'product_link': '/george/school/boys-school-polo-shirts/white-school-polo-shirt-5-pack/GEM1027123-23619,default,pd.html?clr=White'},
 {'product_title': 'White Crew Neck School T-Shirt 2 Pack',
  'product_price': '£2',
  'product_image_src': 'https://asda.scene7.com/is/image/Asda/5059188972390?hei=600&wid=450&qlt=85&fmt=pjpg&resmode=sharp2&op_usm=1.1,0.5,0,0&defaultimage=default_details_George_rd',
  'product_link': '/george/school/sports-swimwear/white-crew-neck-school-t-shirt-2-pack/GEM895449-23619,default,pd.html?clr=White'},
 {'product_title': 'Navy School Sweatshirt 2 Pack',
  'product_price': '£5',
  'product_image_src': 'https://asda.scene7.com/is/image/Asda/5059190684335?hei=600&wid=450&qlt=85&fmt=pjpg&resmode=sharp2&

In [23]:
# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
output_path = Path("../data/asda_product.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
product_df.to_csv(output_path, index=False)

In [6]:
# Positional lookup: product_df was filtered without resetting its index, so
# label 0 is not guaranteed to exist, whereas position 0 is the first
# remaining row.
product_df["product_link"].iloc[0]

'/george/school/boys-school-polo-shirts/white-school-polo-shirt-5-pack/GEM1027123-23619,default,pd.html?clr=White'

In [None]:
# Shut down the browser session started earlier in the notebook.
driver.quit()