In [17]:
import json
import queue
import random
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError

In [18]:
# Site root; used to turn relative product hrefs into absolute URLs.
BASE_URL = "https://www.walmart.com"
# File that main() writes the scraped product records to, one JSON object per line.
OUTPUT_FILE = "product_info.jsonl"
# File that the fetch helpers append failed search/product URLs to.
FAILED_LOG = "failed_urls.txt"

In [26]:
# Pool of User-Agent header values; get_headers() picks one at random per call.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0/byA6CjeQHYiZ0Se",
    "Mozilla/5.0 (Macintosh; PPC Mac OS X 10.4.4; rv:60.5.0) Gecko/20100101 Firefox/60.5.0",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0/KYtdkVMAZUtmong-89",
    "Mozilla/5.0 (X11; Linux; en-AU; rv:135.0) Gecko/20161700 Firefox/135.0",
    "Mozilla/5.0 (iPad; CPU iPad OS 16_7_7 like Mac OS X) AppleWebKit/532.1 (KHTML, like Gecko) FxiOS/11.1o2152.0 Mobile/76S605 Safari/532.1",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; x64; rv:121.0) Gecko/20100101 Firefox/121.0/CetKrEFQ34G7OAO-68",
]


In [27]:
# Search terms to crawl. Each entry is inserted as-is into the search URL's
# `q=` parameter, which is why multi-word terms are written with "+".
search_queries = [
    "computers", "laptops", "desktops", "monitors", "printers", "hard+drives", "usb", "cords", "cameras",
    "mouse", "keyboard", "microphones", "speakers", "radio", "tablets", "android", "apple", "watch", "smart+watch",
    "fridge", "airconditioning", "wifi", "router", "modem", "desk", "xbox", "playstation", "nintendo"
]

In [28]:
# FIFO of product URLs that main() has discovered but not yet scraped.
product_queue = queue.Queue()
# Product URLs already queued; consulted so the same URL is not scraped twice.
seen_urls = set()

In [29]:
def get_headers():
    """Build the HTTP headers for a single outgoing request.

    Returns:
        dict: Fixed ``accept``, ``accept-language`` and ``accept-encoding``
        values plus a ``user-agent`` drawn at random from ``USER_AGENTS``,
        so successive calls may carry different user agents.
    """
    # NOTE(review): the accept-encoding value advertises br and zstd; confirm
    # the installed HTTP stack can decode both, otherwise bodies may arrive
    # undecoded.
    chosen_agent = random.choice(USER_AGENTS)
    return {
        "accept": "application/json",
        "accept-language": "en-US",
        "accept-encoding": "gzip, deflate, br, zstd",
        "user-agent": chosen_agent,
    }

In [30]:
def get_product_links_from_search_page(query, page_number):
    """Fetch one search-results page and return its not-yet-seen product URLs.

    The page is requested with fresh headers from ``get_headers()``. HTTP
    errors (other than 412) and transient network failures are retried with
    exponential backoff; a 412 response or any unexpected error ends the
    lookup immediately.

    Args:
        query: Search term, inserted as-is into the ``q=`` URL parameter.
        page_number: Results page number to request.

    Returns:
        list[str]: Absolute URLs of links whose href contains ``/ip/`` and
        that are not already in ``seen_urls``, without duplicates, in page
        order. An empty list when nothing new was found or the page could not
        be fetched.

    Side effects:
        Sleeps between requests, prints progress/diagnostics, and appends a
        ``SEARCH_FAILED`` line to ``FAILED_LOG`` on unexpected errors.
    """
    search_url = f"{BASE_URL}/search?q={query}&page={page_number}"
    max_retries = 5
    backoff_factor = 3
    # (connect, read) seconds; without a timeout a stalled connection would
    # block the crawl forever.
    request_timeout = (10, 30)

    for attempt in range(max_retries):
        try:
            headers = get_headers()
            response = requests.get(search_url, headers=headers, timeout=request_timeout)
            time.sleep(random.uniform(1, 3))
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            product_links = []
            links_on_page = set()
            found = False
            for a_tag in soup.find_all('a', href=True):
                href = a_tag['href']
                if '/ip/' not in href:
                    continue
                found = True
                # urljoin leaves absolute URLs alone and resolves relative or
                # protocol-relative hrefs against the site root.
                full_url = urljoin(BASE_URL, href)
                if full_url in seen_urls or full_url in links_on_page:
                    continue
                links_on_page.add(full_url)
                product_links.append(full_url)

            if not found:
                print("\n\n\nSOUP WHEN NOT FOUND", soup)

            return product_links

        except HTTPError as e:
            if e.response is not None and e.response.status_code == 412:
                print(f"Precondition Failed (412): {e}. Skipping URL.")
                return []
            problem = f"HTTP error: {e}."
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            # Dropped connections and timeouts are transient; retry them too.
            problem = f"Network error: {e}."
        except Exception as e:
            print(f"Failed to get product links for query: {query} on page: {page_number}. Error: {e}")
            with open(FAILED_LOG, "a") as log:
                log.write(f"SEARCH_FAILED: {search_url}\n")
            return []

        # Only retryable failures reach this point (success returned above).
        if attempt < max_retries - 1:
            wait_time = backoff_factor ** attempt
            print(f"{problem} Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        else:
            # No further attempt follows, so do not sleep before giving up.
            print(problem)

    print(f"Skipping query after {max_retries} retries: {query} on page: {page_number}")
    return []


In [31]:
def _build_product_record(next_data):
    """Pick the stored product fields out of a decoded ``__NEXT_DATA__`` payload."""
    initial_data = next_data["props"]["pageProps"]["initialData"]["data"]
    product_data = initial_data["product"]
    # Treat a missing or null "reviews" entry as empty so the defaults apply.
    reviews_data = initial_data.get("reviews") or {}

    return {
        "price": product_data["priceInfo"]["currentPrice"]["price"],
        "review_count": reviews_data.get("totalReviewCount", 0),
        "item_id": product_data["usItemId"],
        "avg_rating": reviews_data.get("averageOverallRating", 0),
        "product_name": product_data["name"],
        "brand": product_data.get("brand", ""),
        "availability": product_data["availabilityStatus"],
        "image_url": product_data["imageInfo"]["thumbnailUrl"],
        "short_description": product_data.get("shortDescription", "")
    }


def extract_product_info(product_url):
    """Download a product page and extract a flat record from its embedded JSON.

    The page's ``<script id="__NEXT_DATA__">`` element is parsed as JSON and
    reduced to a dict of product fields. HTTP errors (other than 412) and
    transient network failures are retried with exponential backoff; a 412
    response or any unexpected error (including a missing required field)
    ends the attempt immediately.

    Args:
        product_url: Absolute URL of the product page.

    Returns:
        dict | None: The product record, or ``None`` when the page has no
        ``__NEXT_DATA__`` script or could not be fetched/parsed.

    Side effects:
        Sleeps between requests, prints progress/diagnostics, and appends a
        ``PRODUCT_FAILED`` line to ``FAILED_LOG`` on unexpected errors.
    """
    print("Processing URL", product_url)
    max_retries = 5
    backoff_factor = 3
    # (connect, read) seconds; without a timeout a stalled connection would
    # block the crawl forever.
    request_timeout = (10, 30)

    for attempt in range(max_retries):
        try:
            headers = get_headers()
            response = requests.get(product_url, headers=headers, timeout=request_timeout)
            time.sleep(random.uniform(1, 3))
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            script_tag = soup.find('script', id='__NEXT_DATA__')

            if script_tag is None:
                return None

            return _build_product_record(json.loads(script_tag.string))

        except HTTPError as e:
            if e.response is not None and e.response.status_code == 412:
                print(f"Precondition Failed (412): {e}. Skipping URL.")
                return None
            problem = f"HTTP error: {e}."
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            # Dropped connections and timeouts are transient; retry them too.
            problem = f"Network error: {e}."
        except Exception as e:
            print(f"Failed to process URL: {product_url}. Error: {e}")
            with open(FAILED_LOG, "a") as log:
                log.write(f"PRODUCT_FAILED: {product_url}\n")
            return None

        # Only retryable failures reach this point (success returned above).
        if attempt < max_retries - 1:
            wait_time = backoff_factor ** attempt
            print(f"{problem} Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
        else:
            # No further attempt follows, so do not sleep before giving up.
            print(problem)

    print(f"Skipping URL after {max_retries} retries: {product_url}")
    return None


In [32]:
def main():
    """Crawl every query in ``search_queries`` and write product records to OUTPUT_FILE.

    For each query, result pages are fetched in order until a page yields no
    new product links or the page limit is reached. Every newly discovered
    product URL is scraped and, when a record is returned, written as one
    JSON line.

    The output file is rewritten from scratch on every call, so the shared
    ``seen_urls`` / ``product_queue`` state is reset first and
    ``search_queries`` is left unmodified; calling ``main()`` again in the
    same kernel therefore repeats the full crawl instead of producing an
    empty file.
    """
    max_pages = 99

    # Start from a clean slate to match the freshly truncated output file.
    seen_urls.clear()
    while not product_queue.empty():
        product_queue.get_nowait()

    with open(OUTPUT_FILE, 'w', encoding="utf-8") as file:
        # Iterate over a snapshot instead of popping, so the module-level
        # list survives for later runs.
        for current_query in list(search_queries):
            print("\n\nCURRENT QUERY", current_query, "\n\n")

            for page_number in range(1, max_pages + 1):
                product_links = get_product_links_from_search_page(current_query, page_number)
                if not product_links:
                    print(f"No products found on page {page_number}. Ending search for this query.")
                    break

                for link in product_links:
                    if link not in seen_urls:
                        product_queue.put(link)
                        seen_urls.add(link)

                while not product_queue.empty():
                    product_url = product_queue.get()
                    product_info = extract_product_info(product_url)
                    if product_info:
                        file.write(json.dumps(product_info) + "\n")
                        # Flush per record so an interrupted crawl keeps
                        # everything scraped so far.
                        file.flush()

                print("Next Page:", page_number + 1)
            else:
                # Loop ended without a break: the page cap was hit.
                print(f"Reached page limit ({max_pages}). Ending search for this query.")

if __name__ == "__main__":
    main()



CURRENT QUERY computers 


Precondition Failed (412): 412 Client Error: Precondition Failed for url: https://www.walmart.com/search?q=computers&page=1. Skipping URL.
Skipping query after 5 retries: computers on page: 1
No products found on page 1. Ending search for this query.


CURRENT QUERY laptops 


Precondition Failed (412): 412 Client Error: Precondition Failed for url: https://www.walmart.com/search?q=laptops&page=1. Skipping URL.
Skipping query after 5 retries: laptops on page: 1
No products found on page 1. Ending search for this query.


CURRENT QUERY desktops 


Precondition Failed (412): 412 Client Error: Precondition Failed for url: https://www.walmart.com/search?q=desktops&page=1. Skipping URL.
Skipping query after 5 retries: desktops on page: 1
No products found on page 1. Ending search for this query.


CURRENT QUERY monitors 


Precondition Failed (412): 412 Client Error: Precondition Failed for url: https://www.walmart.com/search?q=monitors&page=1. Skipping URL.
Ski