In [7]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import warnings
warnings.filterwarnings("ignore")


### Scaping data using BeautifulSoup for SuperValu

In [33]:
# Get the total number of page numbers
url = f"https://shop.supervalu.ie/shopping/search/allaisles?q=milk"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
product_info = soup.find("div", {"class": "product-count-info"}).text.strip().split()
total_page_number = round(int(product_info[3])/int(product_info[1].split('-')[1]))

In [34]:
# Create an empty DataFrame
supervalu_df = pd.DataFrame()

for page in range(1, total_page_number+1): 
    url = f"https://shop.supervalu.ie/shopping/search/allaisles?q=milk&page={page}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    product_divs = soup.find_all("div", {"data-product": True})
    for product_div in product_divs:
        product_data = json.loads(product_div["data-product"])
        
        # Extract the product link
        product_link = f"https://shop.supervalu.ie{product_div.find('a')['href']}"
        
        # Append the product link to the product data dictionary
        product_data['SuperValu_product_link'] = product_link
        
        # Append the product data to the DataFrame
        supervalu_df = supervalu_df.append(pd.json_normalize(product_data), ignore_index=True)

# Split the name column into two columns
supervalu_df[['Product', 'Quantity']] = supervalu_df['name'].str.split('(', n=1, expand=True)

# Remove the closing bracket from the quantity column
supervalu_df['Quantity'] = supervalu_df['Quantity'].str.replace(')', '')

# Remove any leading or trailing whitespace from the product and quantity columns
supervalu_df['Product'] = supervalu_df['Product'].str.strip()
supervalu_df['Quantity'] = supervalu_df['Quantity'].str.strip()
supervalu_df.rename(columns={'price': 'SuperValu_price'}, inplace=True)
# Print the updated dataframe
supervalu_df


Unnamed: 0,id,name,SuperValu_price,brand,category,SuperValu_product_link,Product,Quantity
0,1005787000,Avonmore Fresh Milk (2 L),2.49,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Fresh Milk,2 L
1,1009043000,SuperValu Fresh Milk (3 L),3.20,SuperValu,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,SuperValu Fresh Milk,3 L
2,1018392000,Avonmore Low Fat Super Milk (1 L),1.65,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Low Fat Super Milk,1 L
3,1025460000,SuperValu Fresh Milk (2 L),2.29,SuperValu,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,SuperValu Fresh Milk,2 L
4,1020370000,Avonmore Light 1% Fat Milk (2 L),2.49,Avonmore,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Avonmore Light 1% Fat Milk,2 L
...,...,...,...,...,...,...,...,...
13945,1701135002,Yop Strawberry & Banana Yogurt Drink 4 Pack (1...,2.99,Yoplait,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Yop Strawberry & Banana Yogurt Drink 4 Pack,180 g
13946,1701135001,Yop Strawberry Yogurt Drink 4 Pack (180 g),2.99,Yoplait,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Yop Strawberry Yogurt Drink 4 Pack,180 g
13947,1686593000,Yoplait 0% Fat Peach Yogurt 4 Pack (500 g),2.99,Yoplait,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Yoplait 0% Fat Peach Yogurt 4 Pack,500 g
13948,1595686001,Yoplait 0% Pineapple Strained Yogurt 4 Pack (5...,2.99,Yoplait,"Milk, Butter & Eggs",https://shop.supervalu.iehttps://shop.superval...,Yoplait 0% Pineapple Strained Yogurt 4 Pack,500 g


### Scraping data using selenium and BeautifulSoup for tesco 

In [26]:
# Set up the Selenium driver
service = Service("path/to/chromedriver")
driver = webdriver.Chrome(service=service)
tesco_df = pd.DataFrame()

# Get the total number of page numbers
driver.get("https://www.tesco.ie/groceries/en-IE/search?query=milk")
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
page_info = soup.find("div", {"class": "pagination__items-displayed"}).text.strip().split()
total_page_number = round(int(page_info[5])/int(page_info[3]))

for page in range(1, total_page_number+1): 
    # Navigate to the Tesco grocery search page
    driver.get(f"https://www.tesco.ie/groceries/en-IE/search?query=milk&page={page}")

    # Wait for the page to load and get the HTML
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.XPATH, "//li[@class='product-list--list-item']")))
    html = driver.page_source

    # Parse the HTML with Beautiful Soup
    soup = BeautifulSoup(html, "html.parser")
    # Create an empty DataFrame
    df = pd.DataFrame(columns=['Product Title', 'Tesco_Price','Tesco_product_link'])
    # Find all product containers
    product_containers = soup.find_all("div", {"class": "product-details--wrapper"})
    for container in product_containers:
        # Get the product title
        product_title = container.find("span").text.strip()
        # Extract the product link
        product_link = f"https://www.tesco.ie{container.find('a')['href']}"
        # Extract the product price if it exists
        product_price = None
        if container.find("form"):
            product_price = container.find("form").text.split('€')[1]
        # Add the product title and price to the DataFrame
        df = df.append({'Product Title': product_title, 'Tesco_Price': product_price,'Tesco_product_link' :product_link}, ignore_index=True)


    # extract the product name and quantity using regular expressions
    df['Product'] = df['Product Title'].apply(lambda x: re.findall(r'^[^\d]+', x)[0].strip())
    df['Quantity'] = df['Product Title'].apply(lambda x: re.findall(r'\d+(?:\.\d+)?\s*\w+', x)[0].strip() if re.findall(r'\d+(?:\.\d+)?\s*\w+', x) else None)

     # Append the product data to the DataFrame
    tesco_df = tesco_df.append(df, ignore_index=True)

tesco_df

Unnamed: 0,Product Title,Tesco_Price,Tesco_product_link,Product,Quantity
0,Tesco Fresh Milk1ltr,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Fresh Milk,1ltr
1,Tesco Low Fat Milk 1Ltr,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,1Ltr
2,Tesco Fresh Milk 2 Litre,,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Fresh Milk,2 Litre
3,Tesco Full Fat Milk 3Ltr,3.20,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Full Fat Milk,3Ltr
4,Tesco Low F Fortified 1L 10,1.35,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low F Fortified,1L
...,...,...,...,...,...
331,Kendamil Organic Stage 2 Follow On Milk 800G,17.49,https://www.tesco.ie/groceries/en-IE/products/...,Kendamil Organic Stage,2 Follow
332,Tonys Chocolonely Milk Chocolate Bar 50G,2.15,https://www.tesco.ie/groceries/en-IE/products/...,Tonys Chocolonely Milk Chocolate Bar,50G
333,Cadbury Dairy Milk Doughnuts,2.50,https://www.tesco.ie/groceries/en-IE/products/...,Cadbury Dairy Milk Doughnuts,
334,Kendamil Organic Stage 1 First Infant Milk 800G,17.49,https://www.tesco.ie/groceries/en-IE/products/...,Kendamil Organic Stage,1 First


In [28]:
tesco_df['Quantity'] = tesco_df['Quantity'].str.replace(r'(\d)([a-zA-Z])', r'\1 \2')
tesco_df.dropna(subset=['Tesco_Price','Quantity'],inplace = True)

In [31]:
tesco_df

Unnamed: 0,Product Title,Tesco_Price,Tesco_product_link,Product,Quantity
0,Tesco Fresh Milk1ltr,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Fresh Milk,1 ltr
1,Tesco Low Fat Milk 1Ltr,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,1 Ltr
3,Tesco Full Fat Milk 3Ltr,3.20,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Full Fat Milk,3 Ltr
4,Tesco Low F Fortified 1L 10,1.35,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low F Fortified,1 L
5,Tesco Low Fat Milk 2 Litre,2.29,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,2 Litre
...,...,...,...,...,...
330,Kendamil Follow On Milk 2 From 6 To 12 Months ...,15.49,https://www.tesco.ie/groceries/en-IE/products/...,Kendamil Follow On Milk,2 From
331,Kendamil Organic Stage 2 Follow On Milk 800G,17.49,https://www.tesco.ie/groceries/en-IE/products/...,Kendamil Organic Stage,2 Follow
332,Tonys Chocolonely Milk Chocolate Bar 50G,2.15,https://www.tesco.ie/groceries/en-IE/products/...,Tonys Chocolonely Milk Chocolate Bar,50 G
334,Kendamil Organic Stage 1 First Infant Milk 800G,17.49,https://www.tesco.ie/groceries/en-IE/products/...,Kendamil Organic Stage,1 First


In [32]:
# Define the function to convert units
def convert_units(unit):
    unit = unit.replace('Ltr', 'L')
    unit = unit.replace('Litre', 'L')
    unit = unit.replace('L', 'L')
    unit = unit.replace('Lt', 'L')
    unit = unit.replace('ltr', 'L')
    unit = unit.replace('Liter', 'L')
    unit = unit.replace('litre', 'L')
    unit = unit.replace('ml', 'ml')
    unit = unit.replace('Ml', 'ml')
    unit = unit.replace('gm', 'g')
    unit = unit.replace('GM', 'g')
    unit = unit.replace('Gm', 'g')
    unit = unit.replace('G', 'g')
    return unit

# Apply the function to the Quantity column
tesco_df['Quantity'] = tesco_df['Quantity'].apply(convert_units)
# Print the updated DataFrame
tesco_df


Unnamed: 0,Product Title,Tesco_Price,Tesco_product_link,Product,Quantity
0,Tesco Fresh Milk1ltr,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Fresh Milk,1 L
1,Tesco Low Fat Milk 1Ltr,1.15,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,1 L
3,Tesco Full Fat Milk 3Ltr,3.20,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Full Fat Milk,3 L
4,Tesco Low F Fortified 1L 10,1.35,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low F Fortified,1 L
5,Tesco Low Fat Milk 2 Litre,2.29,https://www.tesco.ie/groceries/en-IE/products/...,Tesco Low Fat Milk,2 L
...,...,...,...,...,...
330,Kendamil Follow On Milk 2 From 6 To 12 Months ...,15.49,https://www.tesco.ie/groceries/en-IE/products/...,Kendamil Follow On Milk,2 From
331,Kendamil Organic Stage 2 Follow On Milk 800G,17.49,https://www.tesco.ie/groceries/en-IE/products/...,Kendamil Organic Stage,2 Follow
332,Tonys Chocolonely Milk Chocolate Bar 50G,2.15,https://www.tesco.ie/groceries/en-IE/products/...,Tonys Chocolonely Milk Chocolate Bar,50 g
334,Kendamil Organic Stage 1 First Infant Milk 800G,17.49,https://www.tesco.ie/groceries/en-IE/products/...,Kendamil Organic Stage,1 First


### Comparing the price in both shops

In [38]:
common_df = pd.merge(supervalu_df, tesco_df, on=['Product', 'Quantity'], how='inner')
common_df[['Product','Quantity','SuperValu_price','Tesco_Price','SuperValu_product_link','Tesco_product_link']].drop_duplicates()


Unnamed: 0,Product,Quantity,SuperValu_price,Tesco_Price,SuperValu_product_link,Tesco_product_link
0,Avonmore Whole Super Milk,1.75 L,2.49,2.49,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
30,Avonmore Whole Super Milk,1 L,1.65,1.65,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
60,Avonmore Fresh Milk,2.75 L,2.99,2.99,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
90,Premier Milk,1 L,1.49,1.49,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
120,Glenisk Fresh Goats Milk,1 L,2.85,2.85,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
150,Avonmore Lactose Free Milk,1 L,2.09,2.09,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
180,Premier Low Fat Milk,1 L,1.49,1.49,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
210,Premier Low Fat Milk,2 L,2.49,2.49,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
240,Avonmore Slimline Milk,500 ml,0.95,0.95,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...
269,Avonmore Protein Milk,1 L,2.29,2.29,https://shop.supervalu.iehttps://shop.superval...,https://www.tesco.ie/groceries/en-IE/products/...


### References Taken:
1) Class Notebooks for BeautifulSoup
2) Youtube Video for Selenium: https://www.youtube.com/watch?v=UOsRrxMKJYk