In [1]:
import httpx
from http.cookies import SimpleCookie
import pandas as pd
import math
import re
from datetime import datetime
from datetime import date

# Timestamp of this run, used to stamp every scraped row.
now = datetime.now()
# Derived from `now` (instead of a second clock read) so the date used in the
# CSV filename can never disagree with the date written into the rows, even
# when the script starts right at midnight.
today = now.date()



# Matches one markup tag: "<", at least one character that is not ">", then ">".
TAG_RE = re.compile(r"<[^>]+>")


def remove_tags(text):
    """Return ``text`` with every match of ``TAG_RE`` replaced by one space."""
    cleaned = re.sub(TAG_RE, " ", text)
    return cleaned


class WoolsWorthScraper:
    """Scrape the Woolworths pet categories and export the products to CSV.

    The class attributes hold the request configuration: HTTP headers, one
    JSON payload per category, and the endpoint URLs.  Scraped rows are
    collected per instance in ``all_info`` and written out by ``to_csv``.
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        "Content-Type": "application/json",
        "Request-Id": "|ff65d6a2b5ef40deba161436fc928041.6fdc6ae6448243a2",
        "Request-Context": "appId=cid-v1:4601595d-64c0-46e0-be60-45622438acb3",
        "traceparent": "00-ff65d6a2b5ef40deba161436fc928041-6fdc6ae6448243a2-01",
        "Origin": "https://www.woolworths.com.au",
        "Connection": "keep-alive",
        "Referer": "https://www.woolworths.com.au/shop/browse/pet/dog-puppy?pageNumber=2",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "Sec-GPC": "1",
    }

    # Request payload template for the "Dog & Puppy" category.
    dog_json_data = {
        "categoryId": "1_EF205FA",
        "pageNumber": 1,
        "pageSize": 36,
        "sortType": "TraderRelevance",
        "url": "/shop/browse/pet/dog-puppy?pageNumber=1",
        "location": "/shop/browse/pet/dog-puppy?pageNumber=1",
        "formatObject": '{"name":"Dog & Puppy"}',
        "isSpecial": False,
        "isBundle": False,
        "isMobile": True,
        "filters": [],
        "token": "",
        "enableGp": False,
        "isHideUnavailableProducts": False,
    }

    # Request payload template for the "Cat & Kitten" category.
    cat_json_data = {
        "categoryId": "1_1969229",
        "pageNumber": 2,
        "pageSize": 36,
        "sortType": "TraderRelevance",
        "url": "/shop/browse/pet/cat-kitten?pageNumber=2",
        "location": "/shop/browse/pet/cat-kitten?pageNumber=2",
        "formatObject": '{"name":"Cat & Kitten"}',
        "isSpecial": False,
        "isBundle": False,
        "isMobile": True,
        "filters": [],
        "token": "",
        "enableGp": False,
        "isHideUnavailableProducts": False,
    }

    # Categories walked by pagination(); fetch() defaults to the first entry.
    list_json_data = [dog_json_data, cat_json_data]

    base_url = "https://www.woolworths.com.au/apis/ui/browse/category"

    initial_url = "https://www.woolworths.com.au/shop/browse/pet/dog-puppy"

    # Timeout, in seconds, passed to every category POST request.
    request_timeout = 40

    def __init__(self):
        # Result rows live on the instance: a class-level list would be shared
        # by every scraper object and keep accumulating rows between them.
        self.all_info = []

    @staticmethod
    def _total_pages(total_items, per_page):
        """Return how many pages are needed for ``total_items`` items.

        Returns 0 when either value is missing or zero, so an empty first
        page yields "nothing to fetch" instead of a ZeroDivisionError.
        """
        if not total_items or not per_page:
            return 0
        return math.ceil(total_items / per_page)

    @staticmethod
    def _unwrap_json_list(raw):
        """Strip the surrounding ``[``/``]`` and ``"`` from a JSON-list string.

        ``'["Dog Food"]'`` becomes ``'Dog Food'``.  Only the ends are stripped,
        so a multi-valued list keeps its inner quotes and commas.  Non-string
        input (e.g. ``None``) yields ``None``, which ``to_csv`` later renders
        as "N/A".
        """
        if not isinstance(raw, str):
            return None
        return raw.strip("][").strip('"')

    def _post_page(self, client, url, json_data, page_no):
        """POST ``json_data`` with ``pageNumber`` set to ``page_no``.

        A copy of the payload is sent so the class-level templates are never
        mutated; otherwise a later run would start from whatever page the
        previous run finished on.
        """
        payload = {**json_data, "pageNumber": page_no}
        return client.post(url, json=payload, timeout=self.request_timeout)

    def fetch(self, url, cookies, json_data=None):
        """Request the first page of one category and return the response.

        Args:
            url: Endpoint to POST to (``run`` passes ``base_url``).
            cookies: Mapping of cookie name to value to send with the request.
            json_data: Category payload; defaults to the first entry of
                ``list_json_data``.

        Returns:
            The ``httpx`` response.  Its status code is printed but not
            checked here.
        """
        payload = self.list_json_data[0] if json_data is None else json_data
        print(f"HTTP POST request to URL: {url}", end="\n")
        # Cookies are set on the client: httpx deprecates per-request cookies.
        with httpx.Client(headers=self.headers, cookies=cookies) as client:
            resp = self._post_page(client, url, payload, 1)
        print(f" | Status Code: {resp.status_code}")
        return resp

    def pagination(self, response, cookies):
        """Walk every page of every category and parse each one.

        Args:
            response: First-page response for ``list_json_data[0]``, as
                returned by ``fetch``.
            cookies: Mapping of cookie name to value to send with the requests.

        Each category gets its own page count, computed from the
        ``TotalRecordCount`` and the number of bundles on that category's
        first page.
        """
        # One client (and connection pool) is reused for all page requests.
        with httpx.Client(headers=self.headers, cookies=cookies) as client:
            for index, json_data in enumerate(self.list_json_data):
                if index == 0:
                    first_page = response
                else:
                    first_page = self._post_page(
                        client, self.base_url, json_data, 1
                    )
                json_blob = first_page.json()
                total_pages = self._total_pages(
                    json_blob.get("TotalRecordCount"),
                    len(json_blob.get("Bundles") or []),
                )
                for page_no in range(1, total_pages + 1):
                    print(
                        f"HTTP POST request page {page_no}",
                        end="\n",
                    )
                    if page_no == 1:
                        # Already downloaded to learn the page count.
                        resp = first_page
                    else:
                        resp = self._post_page(
                            client, self.base_url, json_data, page_no
                        )
                    self.parse(resp)

    def parse(self, response):
        """Append one row to ``all_info`` per bundle in ``response``.

        Only the first entry of each bundle's ``Products`` list is used;
        bundles without products are skipped.  Missing fields become ``None``
        (rendered as "N/A" by ``to_csv``) instead of raising.
        """
        products = response.json().get("Bundles") or []
        # Formatted once per page; the time no longer carries the leading
        # space that splitting "%m/%d/%Y, %H:%M:%S" on "," used to leave.
        scraped_date = now.strftime("%m/%d/%Y")
        scraped_time = now.strftime("%H:%M:%S")
        for prod in products:
            variants = prod.get("Products") or []
            if not variants:
                continue
            product = variants[0]
            attrs = product.get("AdditionalAttributes") or {}

            description = attrs.get("description")
            if isinstance(description, str):
                description = remove_tags(
                    description.replace("\r", "").replace("\n", "").strip()
                )
            else:
                description = "N/A"

            item = {
                "Scraped_Date": scraped_date,
                "Scraped_Time": scraped_time,
                "Stock_Code": product.get("Stockcode"),
                "Product_Name": product.get("Name"),
                "Product_Category": self._unwrap_json_list(
                    attrs.get("piescategorynamesjson")
                ),
                "Sub_Category": self._unwrap_json_list(
                    attrs.get("piessubcategorynamesjson")
                ),
                "Brand": product.get("Brand"),
                "Price/100g": product.get("CupPrice"),
                "Price": product.get("Price"),
                "Was_Price": product.get("WasPrice"),
                "Save": product.get("SavingsAmount"),
                "Size": product.get("PackageSize"),
                "Description": description,
                "Ingredients": attrs.get("ingredients"),
                "Availability": (
                    "InStock" if product.get("IsAvailable") else "Out of Stock"
                ),
                "Image": product.get("LargeImageFile"),
            }
            self.all_info.append(item)

    def to_csv(self):
        """Write ``all_info`` to ``woolsworth_<today>.csv``, blanks as "N/A"."""
        filename = f"woolsworth_{today}.csv"
        df = pd.DataFrame(self.all_info).fillna("N/A")

        df.to_csv(filename, index=False)

        # Report the file that was actually written.
        print(f'Stored results to "{filename}"')

    def run(self):
        """Collect session cookies, scrape every category, and export to CSV."""
        res = httpx.get(self.initial_url, headers=self.headers)
        # Pass the httpx cookie jar through SimpleCookie to obtain a plain
        # name -> value dict for the follow-up requests.
        cookie = SimpleCookie()
        cookie.load(res.cookies)
        cookies = {key: morsel.value for key, morsel in cookie.items()}

        init_response = self.fetch(self.base_url, cookies)

        self.pagination(init_response, cookies)
        self.to_csv()


if __name__ == "__main__":
    # Script entry point: build a scraper and run the full scrape-and-export.
    WoolsWorthScraper().run()


HTTP POST request to URL: https://www.woolworths.com.au/apis/ui/browse/category
 | Status Code: 200
HTTP POST request page 1
HTTP POST request page 2
HTTP POST request page 3
HTTP POST request page 4
HTTP POST request page 5
HTTP POST request page 6
HTTP POST request page 7
HTTP POST request page 8
HTTP POST request page 9
HTTP POST request page 10
HTTP POST request page 11
HTTP POST request page 12
HTTP POST request page 13
HTTP POST request page 14
HTTP POST request page 15
HTTP POST request page 16
HTTP POST request page 17
HTTP POST request page 18
HTTP POST request page 19
HTTP POST request page 20
HTTP POST request page 21
HTTP POST request page 22
HTTP POST request page 23
HTTP POST request page 24
HTTP POST request page 25
HTTP POST request page 26
HTTP POST request page 27
HTTP POST request page 28
HTTP POST request page 29
HTTP POST request page 30
HTTP POST request page 31
HTTP POST request page 32
HTTP POST request page 33
HTTP POST request page 34
HTTP POST request page 35