In [2]:
import csv
import os
import time
import configparser
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs

# Load the scraper settings from the INI file in the working directory
config = configparser.ConfigParser()
config.read('configuration.ini')

# Output folder plus the IGA settings: page delay and the categories to skip
folderpath = "/Users/sharu/Documents/Deakin/2024ⓣ⓵/SIT764/DiscountMate/Australia_GroceriesScraper"
delay = int(config.get('IGA', 'delayseconds'))
category_ignore = str(config.get('IGA', 'ignoredcategories'))

# Start from a clean IGA.csv: delete any export left behind by an earlier run
filename = "IGA.csv"
filepath = os.path.join(folderpath, filename)
if os.path.exists(filepath):
    os.remove(filepath)

print(f"Saving to {filepath}")

# Column titles, written once as the first row of the export
header_row = [
    "Product Code",
    "Category",
    "Item Name",
    "Best Price",
    "Best Unit Price",
    "Item Price",
    "Unit Price",
    "Price Was",
    "Special Text",
    "Complex Promo Text",
    "Link",
]
with open(filepath, "a", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header_row)

# Base address of the store; also the target of the Edge "app" window
url = "https://www.igashop.com.au"

# Edge options: open the site via --app and exclude the 'enable-logging' switch
options = webdriver.EdgeOptions()
options.add_argument(f"--app={url}")
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# Launch the browser, load the landing page and wait for it to settle
print("Starting IGA...")
driver = webdriver.Edge(options=options)
driver.get(url)
time.sleep(delay)

# Function to close the dialogue box
def close_dialogue_box():
    """Dismiss the pop-up dialogue on the current page, if one is showing.

    Looks up the button at a fixed XPath through the module-level ``driver``
    and clicks it. The call is best-effort: when the browser reports that the
    button cannot be found or clicked, a message is printed and the caller
    carries on.
    """
    # Imported locally so this function is self-contained; selenium is
    # already a dependency of this script.
    from selenium.common.exceptions import WebDriverException

    try:
        close_button = driver.find_element(By.XPATH, "/html/body/div[3]/div/div/button")
        close_button.click()
    except WebDriverException:
        # Only browser-side failures are ignored. The previous bare
        # ``except:`` also swallowed Ctrl+C and programming errors.
        print("Close button is not found")

# Set Click & Collect location (if necessary)
# Dismiss the landing-page pop-up, then snapshot the page HTML
close_dialogue_box()
page_contents = bs(driver.page_source, "html.parser")

# (display name, path relative to the site root) for every department
category_table = (
    ("Specials", "/categories/specials"),
    ("Fruit and Vegetable", "/categories/fruit-and-vegetable/vegetables/1"),
    ("Pantry", "/categories/pantry"),
    ("Meat, Seafood and Deli", "/categories/meat-seafood-deli"),
    ("Dairy, Eggs and Fridge", "/categories/dairy-eggs-fridge"),
    ("Bakery", "/categories/bakery"),
    ("Drinks", "/categories/drinks"),
    ("Frozen", "/categories/frozen"),
    ("Health and Beauty", "/categories/health-and-beauty"),
    ("Pet", "/categories/pet"),
    ("Baby", "/categories/baby"),
    ("Liquor", "/categories/liquor"),
    ("Household", "/categories/household"),
    ("Other", "/categories/other"),
    ("Front of House", "/categories/front-of-house"),
)
categories = [{"name": label, "link": path} for label, path in category_table]

# Keep only the categories whose name does not occur in the configured
# ignore string (a substring test against the raw config value)
categories = [entry for entry in categories if entry["name"] not in category_ignore]

# Tell the user what is about to be scraped
print("Categories to Scrape:")
for category in categories:
    print(category["name"])
# Iterate through each category and follow the link to get the products.
#
# Every category is scraped in its own Edge session. Each product tile that
# has both a name and an item price becomes one row appended to the CSV at
# `filepath`.
#
# NOTE(review): the recorded run found 0 products in every category. Beautiful
# Soup matches a multi-class `class_` string only against the exact class
# attribute value, so these selectors may need to become CSS selectors.
# Confirm against the live page.

# The landing-page session is no longer needed. Quit it here: the loop below
# rebinds `driver`, which previously left this browser running.
driver.quit()

for category in categories:
    driver = webdriver.Edge(options=options)
    try:
        # Get the link to the category's page
        category_link = url + category["link"]
        category_name = category["name"]

        print("Loading Category: " + category_name)

        # Follow the link to the category page
        driver.get(category_link)
        time.sleep(delay)

        # Close dialogue box if it appears on the category page
        close_dialogue_box()

        # Read the page count from the category page itself (previously the
        # landing-page soup was searched, so the count was always 1).
        soup = bs(driver.page_source, "html.parser")
        try:
            pagination = soup.find("ul", class_="flex flex-row items")
            pages = pagination.find_all("li")
            total_pages = int(pages[-2].text.strip())
        except (AttributeError, IndexError, ValueError):
            # No pager, too few entries, or a non-numeric label: single page
            total_pages = 1

        for page in range(1, total_pages + 1):
            soup = bs(driver.page_source, "html.parser")

            # Find all products on the page
            products = soup.find_all("div", class_="overflow-hidden rounded border")
            print(category_name + ": Page " + str(page) + " of " + str(total_pages) + " | Products on this page: " + str(len(products)))

            rows = []
            for product in products:
                name_tag = product.find("div", class_="flex max-w-[85%]")
                price_tag = product.find("span", class_="font-bold leading-none")
                if name_tag is None or price_tag is None:
                    # Tiles without a name or a price are not exported
                    continue

                name = name_tag.text.strip()
                itemprice = price_tag.text.strip()
                best_price = itemprice

                # The product link is optional: a missing anchor leaves the
                # code and link columns empty instead of raising.
                link_tag = product.find("a", class_="relative justify-center")
                href = link_tag.get("href") if link_tag is not None else None
                productcode = href.split("/")[-1] if href else None
                link = url + href if href else None

                # Unit Price and Was Price. The text may look like
                # "<unit price> | was $<old price>"; both parts are optional.
                unitprice = None
                price_was = None
                unit_tag = product.find("span", class_="leading-none")
                if unit_tag is not None:
                    unit_text = unit_tag.text.strip().lower()
                    before, marker, after = unit_text.partition("was $")
                    if marker:
                        price_was = "$" + after.strip()
                        unit_text = before.strip().rstrip("|").strip()
                    unitprice = unit_text or None
                best_unitprice = unitprice

                # Special Text
                special_tag = product.find("div", class_="relative inline-flex w-fit shrink-0 items-center rounded px-3 py-1 font-sans text-sm font-bold bg-primary")
                specialtext = special_tag.text.strip() if special_tag is not None else None
                if specialtext == "1/2":
                    specialtext = "50%"

                # Complex Promo ("Pick any N for $X" / "Buy N for $X").
                # TODO(review): no selector for the multi-buy element is known
                # for IGA. `complexpromo` used to be read without ever being
                # assigned (NameError); it now defaults to None until
                # `promo_tag` is pointed at the right element.
                promo_tag = None
                complexpromo = promo_tag.text.strip() if promo_tag is not None else None
                if complexpromo and ("Pick any " in complexpromo or "Buy " in complexpromo):
                    try:
                        complexpromo = complexpromo.replace("Pick any ", "")
                        complexpromo = complexpromo.replace("Buy ", "")
                        complex_itemcount = int(complexpromo[:complexpromo.find(" for")])
                        complex_cost = float(complexpromo[complexpromo.find("$") + 1:])
                        best_price = "$" + str(round(complex_cost / complex_itemcount, 2))
                    except (ValueError, ZeroDivisionError):
                        best_price = itemprice

                rows.append([productcode, category_name, name, best_price, best_unitprice, itemprice, unitprice, price_was, specialtext, complexpromo, link])

            # Write this page's rows in one go
            if rows:
                with open(filepath, "a", newline="") as f:
                    csv.writer(f).writerows(rows)

            # Navigate to the next page, if there is one
            if page < total_pages:
                # Restart browser every 50 pages
                if page % 50 == 0:
                    print("Restarting Browser...")
                    driver.quit()
                    driver = webdriver.Edge(options=options)

                driver.get(f"{category_link}?page={page + 1}")
                time.sleep(delay)
                close_dialogue_box()  # Close the dialogue box on the next page
    finally:
        # quit() (not close()) so the browser and its driver process both end,
        # even when scraping this category raised.
        driver.quit()

print("Finished")


Saving to /Users/sharu/Documents/Deakin/2024ⓣ⓵/SIT764/DiscountMate/Australia_GroceriesScraper/IGA.csv
Starting IGA...
Categories to Scrape:
Specials
Fruit and Vegetable
Pantry
Meat, Seafood and Deli
Dairy, Eggs and Fridge
Bakery
Drinks
Frozen
Health and Beauty
Pet
Baby
Liquor
Household
Other
Front of House
Loading Category: Specials
Specials: Page 1 of 1 | Products on this page: 0
Loading Category: Fruit and Vegetable
Fruit and Vegetable: Page 1 of 1 | Products on this page: 0
Loading Category: Pantry
Pantry: Page 1 of 1 | Products on this page: 0
Loading Category: Meat, Seafood and Deli
Close button is not found


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: MicrosoftEdge=128.0.2739.42)
Stacktrace:
0   msedgedriver                        0x0000000104d79944 msedgedriver + 4675908
1   msedgedriver                        0x0000000104d7154c msedgedriver + 4642124
2   msedgedriver                        0x000000010494176c msedgedriver + 251756
3   msedgedriver                        0x0000000104923298 msedgedriver + 127640
4   msedgedriver                        0x00000001049a3e60 msedgedriver + 654944
5   msedgedriver                        0x00000001049b480c msedgedriver + 722956
6   msedgedriver                        0x00000001049759a4 msedgedriver + 465316
7   msedgedriver                        0x0000000104976210 msedgedriver + 467472
8   msedgedriver                        0x0000000104d3b78c msedgedriver + 4421516
9   msedgedriver                        0x0000000104d4069c msedgedriver + 4441756
10  msedgedriver                        0x0000000104d1e7d8 msedgedriver + 4302808
11  msedgedriver                        0x0000000104d40d34 msedgedriver + 4443444
12  msedgedriver                        0x0000000104d0fd60 msedgedriver + 4242784
13  msedgedriver                        0x0000000104d60610 msedgedriver + 4572688
14  msedgedriver                        0x0000000104d60760 msedgedriver + 4573024
15  msedgedriver                        0x0000000104d710b0 msedgedriver + 4640944
16  libsystem_pthread.dylib             0x00000001a5debfa8 _pthread_start + 148
17  libsystem_pthread.dylib             0x00000001a5de6da0 thread_start + 8


In [2]:
import csv
import os
import time
import configparser
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs

# Load the scraper settings from the INI file in the working directory
config = configparser.ConfigParser()
config.read('configuration.ini')

# Output folder plus the IGA settings: page delay and the categories to skip
folderpath = "/Users/sharu/Documents/Deakin/2024ⓣ⓵/SIT764/DiscountMate/Australia_GroceriesScraper"
delay = int(config.get('IGA', 'delayseconds'))
category_ignore = str(config.get('IGA', 'ignoredcategories'))

# Start from a clean IGA.csv: delete any export left behind by an earlier run
filename = "IGA.csv"
filepath = os.path.join(folderpath, filename)
if os.path.exists(filepath):
    os.remove(filepath)

print(f"Saving to {filepath}")

# Column titles, written once as the first row of the export
header_row = [
    "Product Code",
    "Category",
    "Item Name",
    "Best Price",
    "Best Unit Price",
    "Item Price",
    "Unit Price",
    "Price Was",
    "Special Text",
    "Complex Promo Text",
    "Link",
]
with open(filepath, "a", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header_row)

# Base address of the store; also the target of the Edge "app" window
url = "https://www.igashop.com.au"

# Edge options: open the site via --app and exclude the 'enable-logging' switch
options = webdriver.EdgeOptions()
options.add_argument(f"--app={url}")
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# Launch the browser, load the landing page and wait for it to settle
print("Starting IGA...")
driver = webdriver.Edge(options=options)
driver.get(url)
time.sleep(delay)

# Set Click & Collect location (if necessary)
# Adjust this based on how IGA sets locations
#
# find_elements() returns an empty list when nothing matches, whereas
# find_element() raises NoSuchElementException and aborted the whole run
# whenever the dialogue was not shown. `element` is None in that case.
dialog_buttons = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div/button")
element = dialog_buttons[0] if dialog_buttons else None

# Parse the page content
page_contents = bs(driver.page_source, "html.parser")

# Display name and site-relative path of every category that can be scraped
categories = [
    {"name": "Specials", "link": "/categories/specials"},
    {"name": "Fruit and Vegetable", "link": "/categories/fruit-and-vegetable/vegetables/1"},
    {"name": "Pantry", "link": "/categories/pantry"},
    {"name": "Meat, Seafood and Deli", "link": "/categories/meat-seafood-deli"},
    {"name": "Dairy, Eggs and Fridge", "link": "/categories/dairy-eggs-fridge"},
    {"name": "Bakery", "link": "/categories/bakery"},
    {"name": "Drinks", "link": "/categories/drinks"},
    {"name": "Frozen", "link": "/categories/frozen"},
    {"name": "Health and Beauty", "link": "/categories/health-and-beauty"},
    {"name": "Pet", "link": "/categories/pet"},
    {"name": "Baby", "link": "/categories/baby"},
    {"name": "Liquor", "link": "/categories/liquor"},
    {"name": "Household", "link": "/categories/household"},
    {"name": "Other", "link": "/categories/other"},
    {"name": "Front of House", "link": "/categories/front-of-house"}
]

# Remove categories ignored in the config file (substring test against the
# raw config value)
categories = [cat for cat in categories if cat["name"] not in category_ignore]

# Show the user the categories to scrape
print("Categories to Scrape:")
for category in categories:
    print(category["name"])

# Iterate through each category and follow the link to get the products.
#
# Every category is scraped in its own Edge session. Each product tile that
# has both a name and an item price becomes one row appended to the CSV at
# `filepath`.
#
# NOTE(review): the recorded run found 0 products in every category. Beautiful
# Soup matches a multi-class `class_` string only against the exact class
# attribute value, so these selectors may need to become CSS selectors.
# Confirm against the live page.

# The landing-page session is no longer needed. Quit it here: the loop below
# rebinds `driver`, which previously left this browser running.
driver.quit()

for category in categories:
    driver = webdriver.Edge(options=options)
    try:
        # Get the link to the category's page
        category_link = url + category["link"]
        category_name = category["name"]

        print("Loading Category: " + category_name)

        # Follow the link to the category page
        driver.get(category_link)
        time.sleep(delay)

        # Read the page count from the category page itself (previously the
        # landing-page soup was searched, so the count was always 1).
        soup = bs(driver.page_source, "html.parser")
        try:
            pagination = soup.find("ul", class_="flex flex-row items")
            pages = pagination.find_all("li")
            total_pages = int(pages[-2].text.strip())
        except (AttributeError, IndexError, ValueError):
            # No pager, too few entries, or a non-numeric label: single page
            total_pages = 1

        for page in range(1, total_pages + 1):
            # This cell imports BeautifulSoup under the alias `bs`; the bare
            # name `BeautifulSoup` used here before is undefined in a fresh
            # kernel.
            soup = bs(driver.page_source, "html.parser")

            # Find all products on the page
            products = soup.find_all("div", class_="overflow-hidden rounded border")
            print(category_name + ": Page " + str(page) + " of " + str(total_pages) + " | Products on this page: " + str(len(products)))

            rows = []
            for product in products:
                name_tag = product.find("div", class_="flex max-w-[85%]")
                price_tag = product.find("span", class_="font-bold leading-none")
                if name_tag is None or price_tag is None:
                    # Tiles without a name or a price are not exported
                    continue

                name = name_tag.text.strip()
                itemprice = price_tag.text.strip()
                best_price = itemprice

                # The product link is optional: a missing anchor leaves the
                # code and link columns empty instead of raising.
                link_tag = product.find("a", class_="relative justify-center")
                href = link_tag.get("href") if link_tag is not None else None
                productcode = href.split("/")[-1] if href else None
                link = url + href if href else None

                # Unit Price and Was Price. The text may look like
                # "<unit price> | was $<old price>"; both parts are optional.
                unitprice = None
                price_was = None
                unit_tag = product.find("span", class_="leading-none")
                if unit_tag is not None:
                    unit_text = unit_tag.text.strip().lower()
                    before, marker, after = unit_text.partition("was $")
                    if marker:
                        price_was = "$" + after.strip()
                        unit_text = before.strip().rstrip("|").strip()
                    unitprice = unit_text or None
                best_unitprice = unitprice

                # Special Text
                special_tag = product.find("div", class_="relative inline-flex w-fit shrink-0 items-center rounded px-3 py-1 font-sans text-sm font-bold bg-primary")
                specialtext = special_tag.text.strip() if special_tag is not None else None
                if specialtext == "1/2":
                    specialtext = "50%"

                # Complex Promo ("Pick any N for $X" / "Buy N for $X").
                # TODO(review): no selector for the multi-buy element is known
                # for IGA. `complexpromo` used to be read without ever being
                # assigned (NameError); it now defaults to None until
                # `promo_tag` is pointed at the right element.
                promo_tag = None
                complexpromo = promo_tag.text.strip() if promo_tag is not None else None
                if complexpromo and ("Pick any " in complexpromo or "Buy " in complexpromo):
                    try:
                        complexpromo = complexpromo.replace("Pick any ", "")
                        complexpromo = complexpromo.replace("Buy ", "")
                        complex_itemcount = int(complexpromo[:complexpromo.find(" for")])
                        complex_cost = float(complexpromo[complexpromo.find("$") + 1:])
                        best_price = "$" + str(round(complex_cost / complex_itemcount, 2))
                    except (ValueError, ZeroDivisionError):
                        best_price = itemprice

                rows.append([productcode, category_name, name, best_price, best_unitprice, itemprice, unitprice, price_was, specialtext, complexpromo, link])

            # Write this page's rows in one go
            if rows:
                with open(filepath, "a", newline="") as f:
                    csv.writer(f).writerows(rows)

            # Navigate to the next page, if there is one
            if page < total_pages:
                # Restart browser every 50 pages
                if page % 50 == 0:
                    print("Restarting Browser...")
                    driver.quit()
                    driver = webdriver.Edge(options=options)

                driver.get(f"{category_link}?page={page + 1}")
                time.sleep(delay)
    finally:
        # quit() (not close()) so the browser and its driver process both end,
        # even when scraping this category raised.
        driver.quit()

print("Finished")


Saving to /Users/sharu/Documents/Deakin/2024ⓣ⓵/SIT764/DiscountMate/Australia_GroceriesScraper/IGA.csv
Starting IGA...
Categories to Scrape:
Specials
Fruit and Vegetable
Pantry
Meat, Seafood and Deli
Dairy, Eggs and Fridge
Bakery
Drinks
Frozen
Health and Beauty
Pet
Baby
Liquor
Household
Other
Front of House
Loading Category: Specials
Specials: Page 1 of 1 | Products on this page: 0
Loading Category: Fruit and Vegetable
Fruit and Vegetable: Page 1 of 1 | Products on this page: 0
Loading Category: Pantry
Pantry: Page 1 of 1 | Products on this page: 0
Loading Category: Meat, Seafood and Deli
Meat, Seafood and Deli: Page 1 of 1 | Products on this page: 0
Loading Category: Dairy, Eggs and Fridge
Dairy, Eggs and Fridge: Page 1 of 1 | Products on this page: 0
Loading Category: Bakery
Bakery: Page 1 of 1 | Products on this page: 0
Loading Category: Drinks
Drinks: Page 1 of 1 | Products on this page: 0
Loading Category: Frozen
Frozen: Page 1 of 1 | Products on this page: 0
Loading Category: Heal

In [None]:
from bs4 import BeautifulSoup as bs
from operator import itemgetter
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
import datetime
import requests
import time
import math
import sys
import re
import os

# Make the first sys.path entry the working directory
os.chdir(sys.path[0])

# Date stamp used in the export filename: year-month-day, no zero padding
now = datetime.datetime.now()
time_string = f"{now.year}-{now.month}-{now.day}"

# # #   M A K E   C S V   # # #
# Menu labels shown to the user; positions line up with
# categoriesUrlsExtensions below.
categoriesText = [
    "All Categories", "Beverages", "Bulk Foods",
    "Commercial Bakery", "Deli and Cheese", "Frozen",
    "Grocery", "Health and Beauty", "Health Care",
    "Meal Replacement", "Bakery", "Meat",
    "Produce", "Refrigerated Grocery", "Seafood",
]

# URL path suffix for each menu entry, in the same order as categoriesText
categoriesUrlsExtensions = [
    "in-promotion",
    "Beverages/in-promotion",
    "Bulk%20Foods/in-promotion",
    "Commercial%20Bakery/in-promotion",
    "Deli%20and%20Cheese/in-promotion",
    "Frozen/in-promotion",
    "Grocery/in-promotion",
    "Health%20%26%20Beauty/in-promotion",
    "Health%20Care/in-promotion",
    "Home%20Meal%20Replacement/in-promotion",
    "Instore%20Bakery/in-promotion",
    "Meat/in-promotion",
    "Produce/in-promotion",
    "Refrigerated%20Grocery/in-promotion",
    "Seafood/in-promotion",
]

def clean(istr):
    """Return *istr* with newline characters and dollar signs removed.

    Used on scraped price text so the result can be passed to ``float``.
    """
    # The previous code replaced r"\n" (a backslash followed by "n"), which
    # never matched the newline character its own guard tested for.
    return istr.replace("\n", "").replace("$", "")

# Show the numbered category menu
print("Please enter desired category number: ")
for i, cat in enumerate(categoriesText, 1):
    print("{}: {}".format(i, cat))

# Anything that is not a number from the menu falls back to entry 1
# ("All Categories"). Non-numeric input used to raise ValueError.
try:
    userCategory = int(input("Choose your category: "))
except ValueError:
    userCategory = 1
if not 1 <= userCategory <= len(categoriesText):
    userCategory = 1
# down one to account for lists
userCategory -= 1

# Create the export file and write the header row. The handle is not called
# `csv` so it cannot shadow the csv module in a shared notebook namespace.
cv_name = "igaSalesExport {}-{}.csv".format(categoriesText[userCategory], time_string)
with open(cv_name, "w") as csv_file:
    csv_file.write("Item Name,Item Category/Brand,Regular Price,Sale Price,Total Sale Discount,Discount Ratio")

# # #   G E T   P A G E S   # # #
# init driver
# NOTE(review): executable_path looks like a folder rather than the
# geckodriver binary itself - confirm the path before running.
service = Service(executable_path=r"/Users/sharu/Documents/Deakin/2024ⓣ⓵/SIT764/DiscountMate/Australia_GroceriesScraper")
driver = webdriver.Firefox(service=service)

# Fetch the category landing page. The timeout keeps an unresponsive server
# from blocking the script indefinitely.
urlDefaultLanding = r"https://www.iga.net/en/online_grocery/browse/{}".format(categoriesUrlsExtensions[userCategory])
pagesScrape = requests.get(urlDefaultLanding, timeout=30)
pagesHtmlDocument = pagesScrape.content
soup = bs(pagesHtmlDocument, "html.parser")

# # #   C R E A T E   U R L S   # # #

# Listing URLs for pages 1-19 of the chosen category, 200 items per page
urls = [
    "https://www.iga.net/en/online_grocery/browse/{}?pageSize=200&page={}".format(categoriesUrlsExtensions[userCategory], p)
    for p in range(1, 20)
]

# # #   P A R S E   U R L S   # # #

# One entry per discounted product:
# [name, category/brand, regular price, sale price, discount, discount ratio]
itemCatalog = []

try:
    driver.get("https://www.iga.net/en/online_grocery/browse/in-promotion")

    for url in urls:

        print(f"Parsing {url}")
        driver.get(url)

        htmlDocument = driver.page_source

        soup = bs(htmlDocument, "html.parser")
        counter = 0  # products added to the catalog from this page
        found = 0    # product tiles seen on this page
        for productGridItem in soup.find_all("div", "item-product__content push--top"):
            found += 1
            ### P R I C E ###
            try:
                productSalePrice = clean(productGridItem.find("span", {"class": "price text--strong"}).text)
                productPrice = clean(productGridItem.find("span", "price-amount").text)
                regularPrice = float(productPrice)
                salePrice = float(productSalePrice)
            except (AttributeError, ValueError):
                # Missing price element or text that is not a number
                continue
            if regularPrice == 0:
                # A zero regular price would divide by zero below
                continue
            ### D E T A I L S ###
            try:
                productCategory = productGridItem.find("div", "item-product__brand push--top").text
                productCategory = re.findall(r'[^\S][A-Za-z]+', productCategory, flags=re.DOTALL)
                productCategory = "".join(productCategory)
            except AttributeError:
                productCategory = "Misc"
            nameTag = productGridItem.find("a", "js-ga-productname")
            if nameTag is None:
                # No name element: nothing sensible to export for this tile
                continue
            # Drop a single leading space; startswith() is safe on empty text
            productName = nameTag.text
            if productName.startswith(" "):
                productName = productName[1:]
            ### M A T H ###
            productDiscount = "{:2.2f}".format(regularPrice - salePrice)
            productSaleRatio = float(productDiscount) / regularPrice
            productSaleRatio = "{0:.0%}".format(productSaleRatio)
            productMetadataGroup = [
                productName.replace(",", "-"),
                productCategory.replace(",", "-"),
                productPrice,
                productSalePrice,
                productDiscount,
                productSaleRatio
            ]
            itemCatalog.append(productMetadataGroup)
            counter += 1
            print(productMetadataGroup)
            print([found, counter])
finally:
    # End the browser session even when a page fails to load or parse
    driver.quit()

# Largest discount ratio first. The ratio is stored as text such as "25%", so
# compare its numeric value; comparing the strings ranked "9%" above "50%".
itemCatalog = sorted(itemCatalog, key=lambda row: float(row[5].rstrip("%")), reverse=True)

# # #   W R I T E   T O   C S V   # # #

# Append one line per catalog entry after the header. Each line starts with
# its own newline, and the three price fields are prefixed with "$".
row_template = "\n{},{},${},${},${},{}"
with open(cv_name, "a") as csv:
    for product in itemCatalog:
        csv.write(row_template.format(*product[:6]))

print("Finished!")


Please enter desired category number: 
1: All Categories
2: Beverages
3: Bulk Foods
4: Commercial Bakery
5: Deli and Cheese
6: Frozen
7: Grocery
8: Health and Beauty
9: Health Care
10: Meal Replacement
11: Bakery
12: Meat
13: Produce
14: Refrigerated Grocery
15: Seafood


In [1]:
import csv
import os
import time
import configparser
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Retrieve configuration values
config = configparser.ConfigParser()
config.read('configuration.ini')

# Folder path and other configurations.
# The earlier read of the 'Coles' IgnoredCategories option was removed: its
# value was overwritten straight away by the IGA setting below, and it raised
# NoSectionError whenever the file had no [Coles] section.
folderpath = "/Users/sharu/Documents/Deakin/2024ⓣ⓵/SIT764/DiscountMate/Australia_GroceriesScraper"
delay = int(config.get('IGA', 'delayseconds'))
#ccsuburb = str(config.get('IGA', 'clickandcollectsuburb'))
category_ignore = str(config.get('IGA', 'ignoredcategories'))

# Create a new csv file for IGA, replacing any export from a previous run
filename = "IGA.csv"
filepath = os.path.join(folderpath, filename)
if os.path.exists(filepath):
    os.remove(filepath)

print("Saving to " + filepath)

# Write the header
with open(filepath, "a", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Product Code", "Category", "Item Name", "Best Price", "Best Unit Price", "Item Price", "Unit Price", "Price Was", "Special Text", "Complex Promo Text", "Link"])

# Configure options: open the site via --app and exclude the 'enable-logging'
# switch
options = webdriver.EdgeOptions()
options.add_argument("--app=https://www.igashop.com.au")
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# Start EdgeDriver
print("Starting IGA...")
driver = webdriver.Edge(options=options)

# Navigate to the IGA website and give the page time to load
url = "https://www.igashop.com.au"
driver.get(url)
time.sleep(delay)

# Set Click & Collect location (if necessary)
# Note: Update this section based on how IGA sets locations
#
# find_elements() returns an empty list when nothing matches, whereas
# find_element() raises NoSuchElementException and aborted the whole run
# whenever the dialogue was not shown. `element` is None in that case.
dialog_buttons = driver.find_elements(By.XPATH, "/html/body/div[3]/div/div/button")
element = dialog_buttons[0] if dialog_buttons else None

# Parse the page content
page_contents = BeautifulSoup(driver.page_source, "html.parser")

# Find all product categories on the page
category_items = page_contents.find_all("li", class_="flex h-full shrink-0 cursor-pointer items-center justify-start font-bold text-foreground-inverted transition-all hover")

# Keep the categories that have a link and are not ignored in the config file.
# NOTE(review): the selector targets <li> elements, which presumably carry no
# href themselves, so the first nested <a href> is used when the <li> has
# none - confirm against the live page. Calling .replace() on the missing
# href used to raise AttributeError.
categories = []
for item in category_items:
    category = item if item.get("href") else item.find("a", href=True)
    if category is None:
        print("Skipping category without a link: " + item.text.strip())
        continue
    category_endpoint = category.get("href").replace("/browse/", "").replace("/", "")
    if category_endpoint in category_ignore:
        continue
    categories.append(category)

# Show the user the categories to scrape
print("Categories to Scrape:")
for category in categories:
    print(category.text)

# Iterate through each category and follow the link to get the products.
#
# Every category is scraped in its own Edge session. Each product tile that
# has both a name and an item price becomes one row appended to the CSV at
# `filepath`.
#
# NOTE(review): Beautiful Soup matches a multi-class `class_` string only
# against the exact class attribute value, so these selectors may need to
# become CSS selectors. Confirm against the live page.

# The landing-page session is no longer needed. Quit it here: the loop below
# rebinds `driver`, which previously left this browser running.
driver.quit()

for category in categories:
    driver = webdriver.Edge(options=options)
    try:
        # Get the link to the category's page
        category_link = url + category.get("href")
        category_name = category.text.strip()

        print("Loading Category: " + category_name)

        # Follow the link to the category page
        driver.get(category_link)
        time.sleep(delay)

        # Parse page content
        page_contents = BeautifulSoup(driver.page_source, "html.parser")

        # Get the number of pages in this category
        try:
            pagination = page_contents.find("ul", class_="flex flex-row items")
            pages = pagination.find_all("li")
            total_pages = int(pages[-2].text.strip())
        except (AttributeError, IndexError, ValueError):
            # No pager, too few entries, or a non-numeric label: single page
            total_pages = 1

        for page in range(1, total_pages + 1):
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Find all products on the page
            products = soup.find_all("div", class_="overflow-hidden rounded border")
            print(category_name + ": Page " + str(page) + " of " + str(total_pages) + " | Products on this page: " + str(len(products)))

            rows = []
            for product in products:
                name_tag = product.find("div", class_="flex max-w-[85%]")
                price_tag = product.find("span", class_="font-bold leading-none")
                if name_tag is None or price_tag is None:
                    # Tiles without a name or a price are not exported
                    continue

                name = name_tag.text.strip()
                itemprice = price_tag.text.strip()
                best_price = itemprice

                # The product link is optional: a missing anchor leaves the
                # code and link columns empty instead of raising.
                link_tag = product.find("a", class_="relative justify-center")
                href = link_tag.get("href") if link_tag is not None else None
                productcode = href.split("/")[-1] if href else None
                link = url + href if href else None

                # Unit Price and Was Price. The text may look like
                # "<unit price> | was $<old price>"; both parts are optional.
                # Both names are always bound now; `best_unitprice` and
                # `price_was` used to be undefined for some tiles.
                unitprice = None
                price_was = None
                unit_tag = product.find("span", class_="leading-none")
                if unit_tag is not None:
                    unit_text = unit_tag.text.strip().lower()
                    before, marker, after = unit_text.partition("was $")
                    if marker:
                        price_was = "$" + after.strip()
                        unit_text = before.strip().rstrip("|").strip()
                    unitprice = unit_text or None
                best_unitprice = unitprice

                # Special Text
                special_tag = product.find("div", class_="relative inline-flex w-fit shrink-0 items-center rounded px-3 py-1 font-sans text-sm font-bold bg-primary")
                specialtext = special_tag.text.strip() if special_tag is not None else None
                if specialtext == "1/2":
                    specialtext = "50%"

                # Complex Promo ("Pick any N for $X" / "Buy N for $X").
                # TODO(review): no selector for the multi-buy element is known
                # for IGA. `complexpromo` used to be read without ever being
                # assigned (NameError); it now defaults to None until
                # `promo_tag` is pointed at the right element.
                promo_tag = None
                complexpromo = promo_tag.text.strip() if promo_tag is not None else None
                if complexpromo and ("Pick any " in complexpromo or "Buy " in complexpromo):
                    try:
                        complexpromo = complexpromo.replace("Pick any ", "")
                        complexpromo = complexpromo.replace("Buy ", "")
                        complex_itemcount = int(complexpromo[:complexpromo.find(" for")])
                        complex_cost = float(complexpromo[complexpromo.find("$") + 1:])
                        best_price = "$" + str(round(complex_cost / complex_itemcount, 2))
                    except (ValueError, ZeroDivisionError):
                        best_price = itemprice

                rows.append([productcode, category_name, name, best_price, best_unitprice, itemprice, unitprice, price_was, specialtext, complexpromo, link])

            # Write this page's rows in one go
            if rows:
                with open(filepath, "a", newline="") as f:
                    csv.writer(f).writerows(rows)

            # Navigate to the next page, if there is one
            if page < total_pages:
                # Restart browser every 50 pages
                if page % 50 == 0:
                    print("Restarting Browser...")
                    driver.quit()
                    driver = webdriver.Edge(options=options)

                driver.get(f"{category_link}?page={page + 1}")
                time.sleep(delay)
    finally:
        # quit() (not close()) so the browser and its driver process both end,
        # even when scraping this category raised.
        driver.quit()

print("Finished")


Saving to /Users/sharu/Documents/Deakin/2024ⓣ⓵/SIT764/DiscountMate/Australia_GroceriesScraper/IGA.csv
Starting IGA...
Categories to Scrape:
Finished
