In [41]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
import warnings
warnings.filterwarnings("ignore")


In [42]:
# Create an empty DataFrame
result = pd.DataFrame()

for page in range(1, 6): # Change 6 to the number of pages you want to scrape
    url = f"https://shop.supervalu.ie/shopping/search/allaisles?q=milk&page={page}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    product_divs = soup.find_all("div", {"data-product": True})
    for product_div in product_divs:
        product_data = json.loads(product_div["data-product"])
        
        # Extract the product link
        product_link = f"https://shop.supervalu.ie{product_div.find('a')['href']}"
        
        # Append the product link to the product data dictionary
        product_data['product_link'] = product_link
        
        # Append the product data to the DataFrame
        result = result.append(pd.json_normalize(product_data), ignore_index=True)

# Print the DataFrame
result


Unnamed: 0,id,name,price,brand,category,product_link
0,1005787000,Avonmore Fresh Milk (2 L),2.49,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...
1,1009043000,SuperValu Fresh Milk (3 L),3.20,SuperValu,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...
2,1018392000,Avonmore Low Fat Super Milk (1 L),1.65,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...
3,1025460000,SuperValu Fresh Milk (2 L),2.29,SuperValu,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...
4,1020370000,Avonmore Light 1% Fat Milk (2 L),2.49,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...
...,...,...,...,...,...,...
445,1447498002,Koh Coconut Milk (1 L),3.47,Koh Coconut,Drinks,https://shop.supervalu.iehttps://shop.superval...
446,1018670000,Dairygold Original (454 g),3.25,Dairygold,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...
447,1391764000,Cadbury Dairy Milk Chocolate Buttons Treat Siz...,2.50,Cadbury,Food Cupboard,https://shop.supervalu.iehttps://shop.superval...
448,1211941001,Cadbury Dairy Milk Freddo Bars 5 Pack (90 g),1.55,Cadbury,Food Cupboard,https://shop.supervalu.iehttps://shop.superval...


In [43]:

# Split the name column into two columns
result[['Product', 'Quantity']] = result['name'].str.split('(', n=1, expand=True)

# Remove the closing bracket from the quantity column
result['Quantity'] = result['Quantity'].str.replace(')', '')

# Remove any leading or trailing whitespace from the product and quantity columns
result['Product'] = result['Product'].str.strip()
result['Quantity'] = result['Quantity'].str.strip()

# Drop the original "name" column
result = result.drop('name', axis=1)

# Print the updated dataframe
result


Unnamed: 0,id,price,brand,category,product_link,Product,Quantity
0,1005787000,2.49,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Fresh Milk,2 L
1,1009043000,3.20,SuperValu,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,SuperValu Fresh Milk,3 L
2,1018392000,1.65,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Low Fat Super Milk,1 L
3,1025460000,2.29,SuperValu,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,SuperValu Fresh Milk,2 L
4,1020370000,2.49,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Light 1% Fat Milk,2 L
...,...,...,...,...,...,...,...
445,1447498002,3.47,Koh Coconut,Drinks,https://shop.supervalu.iehttps://shop.superval...,Koh Coconut Milk,1 L
446,1018670000,3.25,Dairygold,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Dairygold Original,454 g
447,1391764000,2.50,Cadbury,Food Cupboard,https://shop.supervalu.iehttps://shop.superval...,Cadbury Dairy Milk Chocolate Buttons Treat Siz...,170 g
448,1211941001,1.55,Cadbury,Food Cupboard,https://shop.supervalu.iehttps://shop.superval...,Cadbury Dairy Milk Freddo Bars 5 Pack,90 g


In [58]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

# Set up the Selenium driver
service = Service("path/to/chromedriver")
driver = webdriver.Chrome(service=service)

# Navigate to the Tesco grocery search page
driver.get("https://www.tesco.ie/groceries/en-IE/search?query=milk")

# Wait for the page to load and get the HTML
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//li[@class='product-list--list-item']")))
html = driver.page_source

# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(html, "html.parser")
# Create an empty DataFrame
df = pd.DataFrame(columns=['Product Title', 'Price','product_link'])
# Find all product containers
product_containers = soup.find_all("div", {"class": "product-details--wrapper"})
for container in product_containers:
    # Get the product title
    product_title = container.find("span").text.strip()
    # Extract the product link
    product_link = f"https://www.tesco.ie{container.find('a')['href']}"
    # Extract the product price
    product_price = container.find("form").text.split('€')[1]
    # Add the product title and price to the DataFrame
    df = df.append({'Product Title': product_title, 'Price': product_price,'product_link' :product_link}, ignore_index=True)


# extract the product name and quantity using regular expressions
df['Product'] = df['Product Title'].apply(lambda x: re.findall(r'^[^\d]+', x)[0].strip())
df['Quantity'] = df['Product Title'].apply(lambda x: re.findall(r'\d+\s*\w+', x)[0].strip())
# drop the original Product_Title column
df = df.drop('Product Title', axis=1)

    

Unnamed: 0,Product Title,Price,product_link
0,Tesco Fresh Milk1ltr,1.15,https://www.tesco.ie/groceries/en-IE/products/...
1,Tesco Low Fat Milk 1Ltr,1.15,https://www.tesco.ie/groceries/en-IE/products/...
2,Tesco Fresh Milk 2 Litre,2.29,https://www.tesco.ie/groceries/en-IE/products/...
3,Tesco Full Fat Milk 3Ltr,3.2,https://www.tesco.ie/groceries/en-IE/products/...
4,Tesco Low F Fortified 1L 10,1.35,https://www.tesco.ie/groceries/en-IE/products/...
5,Tesco Low Fat Milk 2 Litre,2.29,https://www.tesco.ie/groceries/en-IE/products/...
6,Tesco Low Fat Milk 3L,3.2,https://www.tesco.ie/groceries/en-IE/products/...
7,Tesco Skim Milk 1Litre,1.15,https://www.tesco.ie/groceries/en-IE/products/...
8,Avonmore Whole Super Milk 1.75L,2.49,https://www.tesco.ie/groceries/en-IE/products/...
9,Avonmore Milk 2 Litre,2.49,https://www.tesco.ie/groceries/en-IE/products/...


In [63]:

# extract the product name and quantity using regular expressions
df['Product'] = df['Product Title'].apply(lambda x: re.findall(r'^[^\d]+', x)[0].strip())
df['Quantity'] = df['Product Title'].apply(lambda x: re.findall(r'\d+\s*\w+', x)[0].strip())
# drop the original Product_Title column
df = df.drop('Product Title', axis=1)

df

Unnamed: 0,Price,product_link,Product,Quantity
0,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Fresh Milk,1ltr
1,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,1Ltr
2,2.29,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Fresh Milk,2 Litre
3,3.2,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Full Fat Milk,3Ltr
4,1.35,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low F Fortified,1L
5,2.29,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,2 Litre
6,3.2,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,3L
7,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Skim Milk,1Litre
8,2.49,https://www.tesco.ie/groceries/en-IE/products/...,Avonmore Whole Super Milk,75L
9,2.49,https://www.tesco.ie/groceries/en-IE/products/...,Avonmore Milk,2 Litre


### References Taken:
1) Class Notebooks for BeautifulSoup
2) Youtube Video for Selenium: https://www.youtube.com/watch?v=UOsRrxMKJYk