In [None]:
import requests
import json
import re
from bs4 import BeautifulSoup


# Listing URL prefixes, one per product category to scrape.
# Each entry deliberately ends with "page=": fetch_all_pages() appends the
# page number to it. The category slug shown in the output is the value of
# the "categories[0]" query parameter (see extract_category()).
CATEGORY_URLS = [
    "https://www.anhoch.com/products?query=&categories[0]=matichni-plochi&inStockOnly=1&sort=latest&perPage=20&page=",
    "https://www.anhoch.com/products?query=&categories[0]=grafichki-karti&inStockOnly=1&sort=latest&perPage=20&page=",
    "https://www.anhoch.com/products?query=&categories[0]=desktop-ram-memorii&inStockOnly=1&sort=latest&perPage=20&page=",
    "https://www.anhoch.com/products?query=&categories[0]=procesori&inStockOnly=1&sort=latest&perPage=20&page=",
    "https://www.anhoch.com/products?query=&categories[0]=kukjishta-i-napojuvanja&inStockOnly=1&sort=latest&perPage=20&page=",
    "https://www.anhoch.com/products?query=&categories[0]=ventilatori-i-ladilnici&inStockOnly=1&sort=latest&perPage=20&page=",
    "https://www.anhoch.com/products?query=&categories[0]=diskovi-i-skladiranje&inStockOnly=1&sort=latest&perPage=20&page="
]


# HTTP headers sent with every request in this script: both the category
# listing calls (parsed as JSON) and the product page calls (parsed as HTML).
# NOTE(review): "Accept: application/json" is presumably what makes the
# listing endpoint answer with JSON; it is also sent for the HTML product
# pages - confirm the site still serves HTML there.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Accept": "application/json"
}

def extract_category(url):
    """Return the category slug embedded in a listing URL.

    The slug is the value of the ``categories[0]`` query parameter, i.e. the
    non-empty run of characters after ``categories[0]=`` up to the next
    ``&``. If the URL has no such parameter (or its value is empty), the
    string ``"Unknown"`` is returned.
    """
    found = re.search(r"categories\[0\]=([^&]+)", url)
    if found is None:
        return "Unknown"
    return found.group(1)

def fetch_description(product_url, timeout=15):
    """Fetch a product page and return its description as plain text.

    The description is the text of the ``<div id="description">`` element,
    one line per text fragment, with surrounding whitespace and blank lines
    removed.

    This function never raises: every failure is reported through the
    returned string so that a single bad product cannot abort a scrape.

    Args:
        product_url: Absolute http(s) URL of the product page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The description text, or one of:
        ``"No description available"`` (no usable URL, or the page has no
        description element),
        ``"Failed to fetch description (Status: <code>)"`` (non-200 reply),
        ``"Error: <message>"`` (network or parsing failure).
    """
    # Placeholder links such as "N/A" are not fetchable; answer directly
    # instead of storing an "Invalid URL" error message as the description.
    if not isinstance(product_url, str) or not product_url.startswith(("http://", "https://")):
        return "No description available"

    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(product_url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            return f"Failed to fetch description (Status: {response.status_code})"

        soup = BeautifulSoup(response.text, "html.parser")
        description_div = soup.find("div", {"id": "description"})
        if description_div is None:
            return "No description available"

        # get_text("\n") already puts a newline between the text fragments on
        # either side of a <br>, so the tags need no special treatment.
        # Blank lines are dropped so a line break yields exactly one newline.
        raw_text = description_div.get_text("\n")
        stripped_lines = (line.strip() for line in raw_text.splitlines())
        return "\n".join(line for line in stripped_lines if line)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as text.
        return f"Error: {str(e)}"

def _dig(record, *keys, default="N/A"):
    """Follow ``keys`` through nested dicts; return ``default`` on any miss.

    Unlike chained ``.get()`` calls this also copes with an intermediate
    value that is present but not a dict (e.g. ``null`` or ``[]``).
    """
    current = record
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


def fetch_all_pages(base_url, timeout=15):
    """Fetch product data from all available pages in a category.

    Pages are requested as ``base_url + "1"``, ``base_url + "2"``, ... until
    a page contains no products, a request fails, or the reply is not valid
    JSON. Whatever was collected up to that point is returned, so a failure
    part-way through a category does not discard earlier pages.

    Args:
        base_url: Listing URL ending in ``page=`` (the page number is
            appended to it).
        timeout: Seconds to wait for each listing request.

    Returns:
        A list of dicts with the keys ``category``, ``name``, ``price``,
        ``link``, ``image``, ``availability`` and ``description``.
    """
    all_products = []
    page = 1
    category_name = extract_category(base_url)

    while True:
        print(f"📡 Fetching page {page} for category '{category_name}'...")
        try:
            response = requests.get(base_url + str(page), headers=HEADERS, timeout=timeout)
        except requests.RequestException as exc:
            print(f"❌ Failed to fetch page {page} for '{category_name}'. Error: {exc}")
            break

        if response.status_code != 200:
            print(f"❌ Failed to fetch page {page} for '{category_name}'. Status Code: {response.status_code}")
            break

        try:
            data = response.json()
        except ValueError:
            # Raised when the body is not JSON (e.g. an HTML error page).
            print(f"❌ Page {page} for '{category_name}' did not return valid JSON. Stopping.")
            break

        products = _dig(data, "products", "data", default=[])
        if not products:
            print(f"🚫 No more products found for '{category_name}'. Stopping.")
            break

        for item in products:
            if not isinstance(item, dict):
                continue  # malformed entry; nothing to extract

            slug = item.get("slug", "")
            if slug:
                product_url = f"https://www.anhoch.com/products/{slug}"
                description = fetch_description(product_url)
            else:
                # No product page to visit, so skip the pointless request.
                product_url = "N/A"
                description = "No description available"

            all_products.append({
                "category": category_name,
                "name": item.get("name", "N/A"),
                "price": _dig(item, "price", "inCurrentCurrency", "amount"),
                "link": product_url,
                # base_image may be missing, null or an empty list.
                "image": _dig(item, "base_image", "path"),
                "availability": "In Stock" if item.get("is_in_stock", False) else "Out of Stock",
                "description": description,
            })

        page += 1

    return all_products

def scrape_all_categories():
    """Scrape every URL in CATEGORY_URLS and return the combined products.

    Categories are processed in list order; the result is one flat list
    holding the products of all categories, in the order they were fetched.
    """
    collected = []
    for listing_url in CATEGORY_URLS:
        print(f"\n🛒 Processing category URL: {listing_url}")
        collected += fetch_all_pages(listing_url)
    return collected

def save_to_json(data, filename="data/products.json"):
    """Save extracted data to a JSON file.

    The file is written as UTF-8 with non-ASCII characters kept readable
    (``ensure_ascii=False``) and a 4-space indent. Missing parent
    directories of ``filename`` are created first, so the default
    ``data/`` folder does not have to exist beforehand.

    Args:
        data: Any JSON-serialisable object.
        filename: Destination path of the JSON file.
    """
    import os  # local import: the file's import block does not provide os

    # Creating the folder up front avoids losing a finished scrape to a
    # FileNotFoundError at the very last step.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"\n✅ Data saved to {filename}")

def main():
    """Run the full scrape and write the result to the default JSON file.

    Nothing is written when the scrape yields no products; a message is
    printed instead.
    """
    print("🚀 Starting multi-category web scraping...")
    scraped_data = scrape_all_categories()

    # Guard clause: an empty result means there is nothing worth saving.
    if not scraped_data:
        print("\n❌ No products scraped.")
        return

    save_to_json(scraped_data)
    print("\n🎯 Scraping complete!")


# Run the scraper only when executed directly, not when imported.
if __name__ == "__main__":
    main()


🚀 Starting multi-category web scraping...

🛒 Processing category URL: https://www.anhoch.com/products?query=&categories[0]=matichni-plochi&inStockOnly=1&sort=latest&perPage=20&page=
📡 Fetching page 1 for category 'matichni-plochi'...
📡 Fetching page 2 for category 'matichni-plochi'...
📡 Fetching page 3 for category 'matichni-plochi'...
📡 Fetching page 4 for category 'matichni-plochi'...
🚫 No more products found for 'matichni-plochi'. Stopping.

🛒 Processing category URL: https://www.anhoch.com/products?query=&categories[0]=grafichki-karti&inStockOnly=1&sort=latest&perPage=20&page=
📡 Fetching page 1 for category 'grafichki-karti'...
📡 Fetching page 2 for category 'grafichki-karti'...
🚫 No more products found for 'grafichki-karti'. Stopping.

🛒 Processing category URL: https://www.anhoch.com/products?query=&categories[0]=desktop-ram-memorii&inStockOnly=1&sort=latest&perPage=20&page=
📡 Fetching page 1 for category 'desktop-ram-memorii'...
📡 Fetching page 2 for category 'desktop-ram-memori