In [2]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import warnings
warnings.filterwarnings("ignore")


### Scaping data using BeautifulSoup for SuperValu

In [9]:
url = f"https://shop.supervalu.ie/shopping/search/allaisles?q=milk"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
product_info = soup.find("div", {"class": "product-count-info"}).text.strip().split()

In [12]:
product_info

['Showing', '1-30', 'of', '925', 'items']

In [14]:
product_info[1].split('-')[1]

'30'

In [16]:
total_page_number = round(int(product_info[3])/int(product_info[1].split('-')[1]))

In [17]:
# Create an empty DataFrame
result = pd.DataFrame()

for page in range(1, total_page_number): 
    url = f"https://shop.supervalu.ie/shopping/search/allaisles?q=milk&page={page}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    product_divs = soup.find_all("div", {"data-product": True})
    for product_div in product_divs:
        product_data = json.loads(product_div["data-product"])
        
        # Extract the product link
        product_link = f"https://shop.supervalu.ie{product_div.find('a')['href']}"
        
        # Append the product link to the product data dictionary
        product_data['product_link'] = product_link
        
        # Append the product data to the DataFrame
        result = result.append(pd.json_normalize(product_data), ignore_index=True)

# Split the name column into two columns
result[['Product', 'Quantity']] = result['name'].str.split('(', n=1, expand=True)

# Remove the closing bracket from the quantity column
result['Quantity'] = result['Quantity'].str.replace(')', '')

# Remove any leading or trailing whitespace from the product and quantity columns
result['Product'] = result['Product'].str.strip()
result['Quantity'] = result['Quantity'].str.strip()

# Drop the original "name" column
result = result.drop('name', axis=1)

# Print the updated dataframe
result


Unnamed: 0,id,price,brand,category,product_link,Product,Quantity
0,1005787000,2.49,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Fresh Milk,2 L
1,1009043000,3.20,SuperValu,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,SuperValu Fresh Milk,3 L
2,1018392000,1.65,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Low Fat Super Milk,1 L
3,1025460000,2.29,SuperValu,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,SuperValu Fresh Milk,2 L
4,1020370000,2.49,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Light 1% Fat Milk,2 L
...,...,...,...,...,...,...,...
13945,1661275000,2.09,Cultured Dairy Co.,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,The Cultured Dairy Co. Strawberry Yogurt,1 L
13946,1865051000,3.59,Fage,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Total Fage Fruits Coconut,380 g
13947,1865051001,3.59,Fage,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Total Fage Fruits Mango Passionfruit,380 g
13948,1865051002,3.59,Fage,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Total Fage Fruits Strawberry,380 g


### Scraping data using selenium and BeautifulSoup for tesco 

In [64]:
# Set up the Selenium driver
service = Service("path/to/chromedriver")
driver = webdriver.Chrome(service=service)

# Navigate to the Tesco grocery search page
driver.get("https://www.tesco.ie/groceries/en-IE/search?query=milk")

# Wait for the page to load and get the HTML
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//li[@class='product-list--list-item']")))
html = driver.page_source

# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(html, "html.parser")
# Create an empty DataFrame
df = pd.DataFrame(columns=['Product Title', 'Price','product_link'])
# Find all product containers
product_containers = soup.find_all("div", {"class": "product-details--wrapper"})
for container in product_containers:
    # Get the product title
    product_title = container.find("span").text.strip()
    # Extract the product link
    product_link = f"https://www.tesco.ie{container.find('a')['href']}"
    # Extract the product price
    product_price = container.find("form").text.split('€')[1]
    # Add the product title and price to the DataFrame
    df = df.append({'Product Title': product_title, 'Price': product_price,'product_link' :product_link}, ignore_index=True)


# extract the product name and quantity using regular expressions
df['Product'] = df['Product Title'].apply(lambda x: re.findall(r'^[^\d]+', x)[0].strip())
df['Quantity'] = df['Product Title'].apply(lambda x: re.findall(r'\d+\s*\w+', x)[0].strip())
# drop the original Product_Title column
df = df.drop('Product Title', axis=1)

df

Unnamed: 0,Price,product_link,Product,Quantity
0,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Fresh Milk,1ltr
1,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,1Ltr
2,2.29,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Fresh Milk,2 Litre
3,3.2,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Full Fat Milk,3Ltr
4,1.35,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low F Fortified,1L
5,2.29,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,2 Litre
6,3.2,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,3L
7,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Skim Milk,1Litre
8,2.49,https://www.tesco.ie/groceries/en-IE/products/...,Avonmore Whole Super Milk,75L
9,2.49,https://www.tesco.ie/groceries/en-IE/products/...,Avonmore Milk,2 Litre


### Comparing the price in both shops

### References Taken:
1) Class Notebooks for BeautifulSoup
2) Youtube Video for Selenium: https://www.youtube.com/watch?v=UOsRrxMKJYk