In [None]:
import subprocess
import sys
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import pkg_resources
from packaging import version

def _parse_latest_version(pip_output):
    """Return the newest version named in ``pip index versions`` output.

    pip lists the candidates newest-first on one line of the form
    ``Available versions: 2.32.3, 2.32.2, ...``; the first entry after the
    colon is therefore the latest release. Returns ``None`` when no such
    line (or no version on it) is present.
    """
    marker = 'Available versions:'
    for line in pip_output.splitlines():
        if marker in line:
            listed = line.split(marker, 1)[1].split(',')
            newest = listed[0].strip()
            return newest or None
    return None


def get_latest_version(package):
    """Ask pip for the latest published version of ``package``.

    Args:
        package: Requirement name; extras in brackets (``name[extra]``) are
            ignored for the lookup.

    Returns:
        The latest version as a string, or ``None`` when pip exits with an
        error or its output contains no version list.
    """
    package_name = package.split('[')[0]
    try:
        raw_output = subprocess.check_output(
            [sys.executable, '-m', 'pip', 'index', 'versions', package_name]
        )
    except subprocess.CalledProcessError:
        return None
    return _parse_latest_version(raw_output.decode('utf-8', errors='replace'))

# This function is run from several worker threads at once. The version
# lookups may overlap, but the pip processes that modify the environment are
# serialized through this lock so they never write to it concurrently.
_PIP_INSTALL_LOCK = threading.Lock()


def check_and_install_package(package):
    """Install ``package`` if missing, or upgrade it when a newer release exists.

    Args:
        package: Requirement string, optionally with extras
            (e.g. ``'httpx[http2]'``). Extras are passed to pip but ignored
            when looking up the installed and latest versions.

    Raises:
        subprocess.CalledProcessError: if the pip install/upgrade command fails.
    """
    package_name = package.split('[')[0]  # extras are not part of the distribution name

    try:
        installed_version = pkg_resources.get_distribution(package_name).version
    except pkg_resources.DistributionNotFound:
        # Not installed yet: plain install.
        print(f"Instalando {package}...")
        with _PIP_INSTALL_LOCK:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        return

    print(f"{package} já está instalado na versão {installed_version}. Verificando atualizações...")

    latest_version = get_latest_version(package_name)
    needs_upgrade = bool(latest_version) and (
        version.parse(installed_version) < version.parse(latest_version)
    )
    if needs_upgrade:
        print(f"Atualizando {package} para a versão {latest_version}...")
        with _PIP_INSTALL_LOCK:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--upgrade', package])
    else:
        # Also reached when the latest version could not be determined.
        print(f"{package} já está atualizado.")

def install_and_update_packages(packages):
    """Run check_and_install_package for every entry of ``packages``.

    The work is spread over a pool of four threads. Each result is collected
    as its task finishes, so an exception raised by a task is re-raised here.
    """
    pool = ThreadPoolExecutor(max_workers=4)
    with pool:
        pending = []
        for requirement in packages:
            pending.append(pool.submit(check_and_install_package, requirement))
        for finished in as_completed(pending):
            finished.result()

# List of required packages. An entry may carry extras in brackets
# (e.g. 'httpx[http2]'); check_and_install_package strips them when it looks
# up the installed version.
required_packages = [
    'beautifulsoup4',
    'requests',
    'httpx[http2]',
    'selenium',
    'webdriver-manager',
    'undetected-chromedriver',
    'setuptools',
    'rich',
    'IPython',
    'ipywidgets',
    'selenium-wire',
    'blinker'
]

# Runs as soon as this cell executes: installs missing packages and upgrades
# outdated ones.
install_and_update_packages(required_packages)

In [None]:
import json
import os
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse



def get_html_with_requests(url):
    """Download ``url`` with browser-like headers and return its HTML.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text when the server answers HTTP 200.
        ``None`` on any other status code or when the request itself fails
        (connection error, timeout, ...); a short message is printed in both
        failure cases.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    scheme = parsed_url.scheme or 'https'

    # HTTP/2 pseudo-headers (":authority", ":path", ...) are not listed:
    # requests cannot send them. "Accept-Encoding" is left to requests so that
    # only encodings it is able to decode are advertised.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7,ru;q=0.6,es;q=0.5',
        # NOTE(review): hard-coded cookie captured from a browser session; it
        # is probably stale and would be better supplied via configuration.
        # The value is one logical string written as two adjacent literals.
        'Cookie': (
            'ajs_anonymous_id=a03ba2e0-6658-4732-bca0-5c82a729610d; blueID=158f8c36-f884-4eeb-9dd2-41ddb36954f2; _fbp=fb.2.1719565497536.945089496421286543; _tt_enable_cookie=1; _ttp=yDIqAjbcrTkStMgfYKWUtQqV7qI; _scid=fe8f87fb-9164-4128-931f-6db8349a5902; _pin_unauth=dWlkPU16QmxZV015TVRndE16QmhOUzAwWmpRekxUZzROakF0Wm1VeFlUSTROREE1WXpJeg; _hjSessionUser_257734=eyJpZCI6IjZlYmVkNTczLWRlNDItNTljNy1iYjBkLWRjMzQxYzg1ZThkZiIsImNyZWF0ZWQiOjE3MTk1NjU0OTc1ODksImV4aXN0aW5nIjp0cnVlfQ==; __utmz=other; blueULC=blue; OptanonAlertBoxClosed=2024-06-28T10:07:39.896Z; _gac_UA-58792402-1=1.1719578340.CjwKCAjwvvmzBhA2EiwAtHVrbx5PuZ-Y6ZmDRgxSQAE3bhfdMkfQ6qO_h_InopnIh9t314DXR2UOdhoCJPMQAvD_BwE; origem=adwords; _rtbhouse_source_=AdWords; anonymousClient=true; _gcl_aw=GCL.1719608537.CjwKCAjwvvmzBhA2EiwAtHVrb7a4Gf1uUaMKzAcQ623DOKsXA0ynRueURMbfhKPxXrTvmq46s3W5ZBoCXV8QAvD_BwE; _gcl_gs=2.1.k1$i1719608535; _gid=GA1.3.1821486782.1720470235; _ScCbts=%5B%5D; petzCarrinho=true; JSESSIONID=9E56A4F6251336BC2EEC9640328DF1CE; _hjSession_257734=eyJpZCI6ImRmYjFhNzkxLTcwMDAtNDc5ZC1hZDA2LTE4NGYwZGMwOTQ3MSIsImMiOjE3MjA2MzEyODk1MzEsInMiOjAsInIiOjAsInNiIjowLCJzciI6MCwic2UiOjAsImZzIjowLCJzcCI6MH0=; OptanonConsent=isGpcEnabled=0&datestamp=Wed+Jul+10+2024+14%3A33%3A28+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=202404.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=5d44d058-e0ca-4634-af6a-6ae970855f35&interactionCount=2&isAnonUser=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1&AwaitingReconsent=false&intType=1&geolocation=BR%3BRJ; AWSALB=hZab3iIPYsD0MIUknEJjbESldllV2XbXHeW6ABubXDqQC1w+fB9ShoCYGJIR2z0ndrcA5n/bdu5qP2SU+sm1uOdvT/LotUC6UCm2meSI5Mm4DIVUR9N7D5oqkEID; AWSALBCORS=hZab3iIPYsD0MIUknEJjbESldllV2XbXHeW6ABubXDqQC1w+fB9ShoCYGJIR2z0ndrcA5n/bdu5qP2SU+sm1uOdvT/LotUC6UCm2meSI5Mm4DIVUR9N7D5oqkEID; _ga_JB2JHD7FCJ=GS1.1.1720631306.23.1.1720632809.57.0.0; _ga=GA1.1.604824160.1719565115; _uetsid=006ad4a03d6811ef82e9db6f4cebf44a; '
            '_uetvid=7e097870352d11ef8996917c57f63d87; _scid_r=fe8f87fb-9164-4128-931f-6db8349a5902; _derived_epik=dj0yJnU9OVpXTXl0T0xlTmRISjQ3aGwtVFQ1SWZjVmJuT0JiaFkmbj1zc3Z6NkNFQXNmdnZHWmFDdEpIX093Jm09MSZ0PUFBQUFBR2FPeGVzJnJtPTEmcnQ9QUFBQUFHYU94ZXMmc3A9Mg; cto_bundle=Eb4Xml9POEdtRUtaTE12dTVMYjVqT2xtMDIxaDE5blJXQWRHRkZXVEJlcUdNOTdFRGNwdGtMeWslMkZGVnBENzMwM0JYMDVmQkdIeVQ4NW1qMW9laEslMkZRQnl3dzElMkZPWFUlMkZmTFJxNGdLeXUlMkZTMTRqZyUyRndXSFZ4dEsxS1hyMk1ZeGpOQVU4bmU1bkFZY1lwcHp2M3NVZ1FWQWNmYmRYS0RIc1N1ZzFBcWRZcWNlOVcxRkVaeVZGd0YzQUxFMWglMkZheXhJa0cyJTJGdUg0NiUyQk92VCUyRnVZZmtxQWN6QVAlMkZzdyUzRCUzRA; ab.storage.sessionId.8160f757-f57f-4d17-9cfb-44c82d5e8f64=%7B%22g%22%3A%221c387921-d350-d759-9bb8-04a9e46b2359%22%2C%22e%22%3A1720632840493%2C%22c%22%3A1720632809379%2C%22l%22%3A1720632810493%7D; RT="z=1&dm=www.petz.com.br&si=92ca62e5-ee3d-405f-ab43-07f1cffe3709&ss=lyg3fxm8&sl=6&tt=cia&obo=2&rl=1&ld=zeiw&r=ke1m2ztx&ul=zeix"'
        ),
        'Priority': 'u=0, i',
        # A Referer must be an absolute URL, so the scheme is included.
        'Referer': f"{scheme}://{domain}/marcas",
        'Sec-Ch-Ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Erro ao obter HTML com requests: {e}")
        return None

    if response.status_code == 200:
        return response.text
    print(f"Erro ao obter HTML com requests: {response.status_code}")
    return None




def _find_text(parent, *args, **kwargs):
    """Return the stripped text of ``parent.find(*args, **kwargs)``, or None.

    ``None`` is returned when ``parent`` itself is ``None`` or when nothing
    matches, so lookups can be chained without guarding every step.
    """
    if parent is None:
        return None
    match = parent.find(*args, **kwargs)
    if match is None:
        return None
    return match.text.strip()


def extract_product_details(product_url, last_sku):
    """Scrape one product page and return its details plus the updated SKU counter.

    Args:
        product_url: Absolute URL of the product page.
        last_sku: Highest SKU number handed out so far. It is incremented
            once for every variation (or for the single default price) found.

    Returns:
        A ``(product_details, last_sku)`` tuple. ``product_details`` is
        ``None`` when the page could not be downloaded; otherwise it is a
        dict with the keys ``categories``, ``barcode``, ``brand``,
        ``description``, ``specifications`` and ``variations``.
        ``categories`` is a dict when breadcrumb names were found and an
        empty list otherwise.
    """
    # Local import: only urlparse is imported at file level.
    from urllib.parse import urljoin

    print(f"Extraindo detalhes do produto da URL: {product_url}")
    html_content = get_html_with_requests(product_url)
    if not html_content:
        print(f"Falha ao obter HTML para o produto: {product_url}")
        return None, last_sku

    soup = BeautifulSoup(html_content, 'html.parser')

    # Create the image folder if it does not exist yet.
    if not os.path.exists('imagens'):
        os.makedirs('imagens', exist_ok=True)
        print("Pasta 'imagens' criada.")

    # Main category and subcategories, read from the breadcrumb.
    categories = []
    breadcrumb = soup.find('nav', {'aria-label': 'breadcrumb'})
    if breadcrumb:
        names = []
        for item in breadcrumb.find_all('span', class_='breadcrumb-item'):
            meta = item.find('meta', itemprop='name')
            # Items without a usable <meta itemprop="name"> are skipped.
            if meta is not None and meta.get('content') is not None:
                names.append(meta['content'])
        if names:
            categories = {
                "name": names[0],
                "link": "",  # link not available on the page
                "subcategories": [{"name": sub, "link": ""} for sub in names[1:]]
            }
            print(f"Categorias encontradas: {categories}")
        else:
            print("Nenhuma categoria encontrada no breadcrumb.")
    else:
        print("Breadcrumb não encontrado.")

    # Barcode.
    barcode_element = soup.find('span', {'id': 'product-code'})
    if barcode_element:
        barcode = barcode_element.text.replace('Código: ', '').strip()
        print(f"Código de barras encontrado: {barcode}")
    else:
        barcode = None
        print("Código de barras não encontrado.")

    # Brand and sub-brand.
    brand = _find_text(soup.find('a', {'itemprop': 'brand'}), 'span', {'itemprop': 'name'})
    if brand is not None:
        print(f"Marca encontrada: {brand}")
    else:
        print("Marca não encontrada.")

    subbrand = _find_text(soup.find('a', {'href': '/4-groomer'}), 'span', class_='blue')
    if subbrand is not None:
        print(f"Submarca encontrada: {subbrand}")
    else:
        print("Submarca não encontrada.")

    # Description.
    description = _find_text(
        soup.find('section', {'class': 'description', 'id': 'description'}),
        'div', class_='spec-content'
    )
    if description is not None:
        print(f"Descrição encontrada: {description}")
    else:
        print("Descrição não encontrada.")

    # Specifications: key/value pairs inside the first div.container.
    specifications = []
    specifications_container = soup.find('div', class_='container')
    if specifications_container:
        for item in specifications_container.find_all('li', class_='specifications'):
            spec_key = _find_text(item, 'span', class_='spec-key')
            spec_value = _find_text(item, 'span', class_='spec-value')
            if spec_key is None or spec_value is None:
                continue  # malformed entry: skip it instead of aborting the product
            specifications.append({
                'key': spec_key,
                'value': spec_value
            })
        print(f"Especificações encontradas: {specifications}")
    else:
        print("Especificações não encontradas.")

    # Variations.
    variations = []
    variations_container = soup.find('div', {'id': 'popupVariacoes'})
    if variations_container:
        print("Container de variações encontrado.")
        print(f"HTML das variações:\n{variations_container.prettify()}\n")
        for variation in variations_container.find_all('div', class_='variacao-item'):
            variation_name = _find_text(variation, 'div', class_='item-name')
            price_text = _find_text(variation.find('div', class_='modal-item-price'), 'b')
            barcode_variation = variation.get('data-code')
            variation_path = variation.get('data-urlvariacao')
            fields = (variation_name, price_text, barcode_variation, variation_path)
            if any(value is None for value in fields):
                print("Variação incompleta ignorada.")
                continue

            price = re.sub(r'[^\d]', '', price_text)  # keep digits only
            # urljoin yields an absolute URL including the scheme.
            link_variation = urljoin(product_url, variation_path)
            last_sku += 1
            print(f"Extraindo detalhes da variação: {variation_name}, Preço: {price}, Código de barras: {barcode_variation}, Link: {link_variation}, SKU: {last_sku}")

            variations.append({
                'name': variation_name,
                'price': price,
                'barcode': barcode_variation,
                'link': link_variation,
                'sku': last_sku
            })
        print(f"Variações encontradas: {variations}")
    else:
        print("Nenhuma variação encontrada.")
        # No variation popup: look for a price directly on the page.
        price_text = _find_text(soup.find('div', class_='current-price-left'), 'strong')
        if price_text is not None:
            price = re.sub(r'[^\d]', '', price_text)  # keep digits only
            last_sku += 1
            print(f"Preço encontrado: {price}, SKU: {last_sku}")

            variations.append({
                'name': 'default',
                'price': price,
                'barcode': barcode,
                'link': product_url,
                'sku': last_sku
            })
        else:
            print("Preço não encontrado.")

    product_details = {
        'categories': categories,
        'barcode': barcode,
        'brand': {
            'name': brand,
            'subbrand': {
                'name': subbrand
            } if subbrand else None
        },
        'description': description,
        'specifications': specifications,
        'variations': variations
    }

    print(f"Detalhes do produto extraídos: {product_details}")
    return product_details, last_sku

def _append_product_to_json(product_data, marca_name, marca_link, json_path='produtos.json'):
    """Append ``product_data`` under its brand in the JSON file at ``json_path``.

    The file is re-read and rewritten on every call. The brand entry is
    created when no entry with ``marca_name`` exists yet.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        produtos_data = json.load(f)

    existing_brand = next((m for m in produtos_data["marcas"] if m["name"] == marca_name), None)
    if existing_brand:
        existing_brand["produtos"].append(product_data)
    else:
        produtos_data["marcas"].append({
            "name": marca_name,
            "link": marca_link,
            "produtos": [product_data]
        })

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(produtos_data, f, ensure_ascii=False, indent=4)


def extract_products(html_content, marca_name, marca_link, last_sku):
    """Extract every product card from a brand page and persist each one.

    Args:
        html_content: HTML of the brand listing page.
        marca_name: Brand name under which products are stored.
        marca_link: Brand URL, stored when the brand entry has to be created.
        last_sku: Highest SKU number handed out so far.

    Returns:
        ``(products, last_sku)``: the list of product dicts that were
        processed and the SKU counter after processing them. Cards that lack
        the embedded JSON, the URL or a name are reported and skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    product_elements = soup.select('li.card-product.card-product-showcase[itemtype="http://schema.org/Product"]')

    print(f"Encontrados {len(product_elements)} elementos de produto para a marca '{marca_name}'")

    products = []
    for product in product_elements:
        print(f"HTML do produto:\n{product.prettify()}\n")

        json_holder = product.find('textarea', class_='jsonGa')
        url_meta = product.find('meta', itemprop='url')
        if json_holder is None or url_meta is None or not url_meta.get('content'):
            print("Produto ignorado: dados JSON ou URL ausentes.")
            continue

        try:
            product_info = json.loads(json_holder.text.strip())
        except json.JSONDecodeError as e:
            print(f"Produto ignorado: JSON inválido ({e}).")
            continue

        name = product_info.get('name')
        if name is None:
            print("Produto ignorado: nome ausente.")
            continue

        # The card stores the URL without a scheme; add one only when missing.
        link = url_meta['content']
        if link.startswith('//'):
            link = 'https:' + link
        elif not link.startswith(('http://', 'https://')):
            link = 'https://' + link
        print(f"Processando produto: {name}, URL: {link}")

        product_data = {
            "name": name,
            "link": link,
            "price": product_info.get('price'),
            "id": product_info.get('id'),
            "category": product_info.get('category'),
            "brand": product_info.get('brand'),
            "priceForSubs": product_info.get('priceForSubs'),
            "hideSubscriberDiscountPrice": product_info.get('hideSubscriberDiscountPrice')
        }

        # Merge in the details scraped from the product's own page; keys
        # present in both dicts (e.g. "brand") take the detail-page value.
        product_details, last_sku = extract_product_details(link, last_sku)
        if product_details:
            product_data.update(product_details)

        products.append(product_data)
        _append_product_to_json(product_data, marca_name, marca_link)

        print(f"Produto '{name}' adicionado à marca '{marca_name}'")

    return products, last_sku

def process_brand(marca, last_sku):
    """Fetch a brand page and process every product listed on it.

    ``marca`` is a dict with ``'name'`` and ``'link'`` keys. Returns
    ``(ok, last_sku)`` where ``ok`` is ``False`` when the brand page could
    not be obtained and ``last_sku`` is the SKU counter after processing.
    """
    url = marca['link']
    brand_name = marca['name']
    print(f"Processando marca: {brand_name} - URL: {url}")

    try:
        html_content = get_html_with_requests(url)
    except Exception as e:
        print(f"Erro ao obter HTML para a marca {brand_name} - URL: {url} - Erro: {e}")
        return False, last_sku

    # Guard clause: nothing to parse when the download produced no HTML.
    if not html_content:
        print(f"Falha ao obter HTML para a marca {brand_name} - URL: {url}")
        return False, last_sku

    print(f"HTML obtido para a marca {brand_name}:\n{html_content}\n")
    products, last_sku = extract_products(html_content, brand_name, url, last_sku)
    print(f"{len(products)} produtos processados para a marca '{brand_name}'")
    return True, last_sku

def main():
    """Crawl every brand listed in marcas.json and store products in produtos.json.

    Steps: load marcas.json, make sure produtos.json exists, register every
    brand that is not in it yet, find the highest SKU already stored, then
    process the brands one by one, carrying the SKU counter along.
    """
    print("Iniciando o processo principal")

    with open('marcas.json', 'r', encoding='utf-8') as brands_file:
        data = json.load(brands_file)
        print("Arquivo marcas.json carregado")

    if 'subcategories' not in data:
        print("Erro: A chave 'subcategories' não está presente no arquivo marcas.json")
        return

    # Create produtos.json with an empty brand list when it is missing.
    if not os.path.exists('produtos.json'):
        with open('produtos.json', 'w', encoding='utf-8') as new_file:
            json.dump({"marcas": []}, new_file, ensure_ascii=False, indent=4)
        print("Arquivo produtos.json criado")

    with open('produtos.json', 'r', encoding='utf-8') as stored_file:
        produtos_data = json.load(stored_file)

    # Register brands before any product is processed.
    registered = produtos_data["marcas"]
    for marca in data['subcategories']:
        already_registered = False
        for entry in registered:
            if entry["name"] == marca["name"]:
                already_registered = True
                break
        if already_registered:
            continue
        registered.append({
            "name": marca["name"],
            "link": marca["link"],
            "produtos": []
        })
        print(f"Marca '{marca['name']}' adicionada ao arquivo produtos.json")

    with open('produtos.json', 'w', encoding='utf-8') as stored_file:
        json.dump(produtos_data, stored_file, ensure_ascii=False, indent=4)

    # Highest SKU used so far (0 when nothing is stored yet).
    stored_skus = [
        variacao["sku"]
        for marca in produtos_data["marcas"]
        for produto in marca["produtos"]
        for variacao in produto.get("variations", [])
    ]
    last_sku = max([0] + stored_skus)

    # Process every brand, threading the SKU counter through the calls.
    for marca in data['subcategories']:
        last_sku = process_brand(marca, last_sku)[1]

    print("Processo concluído")

if __name__ == "__main__":
    main()

In [None]:
import json
import os
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from urllib.parse import urlparse






def get_html_with_requests(url):
    """Download ``url`` with browser-like headers and return its HTML.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text when the server answers HTTP 200.
        ``None`` on any other status code or when the request itself fails
        (connection error, timeout, ...); a short message is printed in both
        failure cases.
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc
    scheme = parsed_url.scheme or 'https'

    # HTTP/2 pseudo-headers (":authority", ":path", ...) are not listed:
    # requests cannot send them. "Accept-Encoding" is left to requests so that
    # only encodings it is able to decode are advertised.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7,ru;q=0.6,es;q=0.5',
        # NOTE(review): hard-coded cookie captured from a browser session; it
        # is probably stale and would be better supplied via configuration.
        # The value is one logical string written as two adjacent literals.
        'Cookie': (
            'ajs_anonymous_id=a03ba2e0-6658-4732-bca0-5c82a729610d; blueID=158f8c36-f884-4eeb-9dd2-41ddb36954f2; _fbp=fb.2.1719565497536.945089496421286543; _tt_enable_cookie=1; _ttp=yDIqAjbcrTkStMgfYKWUtQqV7qI; _scid=fe8f87fb-9164-4128-931f-6db8349a5902; _pin_unauth=dWlkPU16QmxZV015TVRndE16QmhOUzAwWmpRekxUZzROakF0Wm1VeFlUSTROREE1WXpJeg; _hjSessionUser_257734=eyJpZCI6IjZlYmVkNTczLWRlNDItNTljNy1iYjBkLWRjMzQxYzg1ZThkZiIsImNyZWF0ZWQiOjE3MTk1NjU0OTc1ODksImV4aXN0aW5nIjp0cnVlfQ==; __utmz=other; blueULC=blue; OptanonAlertBoxClosed=2024-06-28T10:07:39.896Z; _gac_UA-58792402-1=1.1719578340.CjwKCAjwvvmzBhA2EiwAtHVrbx5PuZ-Y6ZmDRgxSQAE3bhfdMkfQ6qO_h_InopnIh9t314DXR2UOdhoCJPMQAvD_BwE; origem=adwords; _rtbhouse_source_=AdWords; anonymousClient=true; _gcl_aw=GCL.1719608537.CjwKCAjwvvmzBhA2EiwAtHVrb7a4Gf1uUaMKzAcQ623DOKsXA0ynRueURMbfhKPxXrTvmq46s3W5ZBoCXV8QAvD_BwE; _gcl_gs=2.1.k1$i1719608535; _gid=GA1.3.1821486782.1720470235; _ScCbts=%5B%5D; petzCarrinho=true; JSESSIONID=9E56A4F6251336BC2EEC9640328DF1CE; _hjSession_257734=eyJpZCI6ImRmYjFhNzkxLTcwMDAtNDc5ZC1hZDA2LTE4NGYwZGMwOTQ3MSIsImMiOjE3MjA2MzEyODk1MzEsInMiOjAsInIiOjAsInNiIjowLCJzciI6MCwic2UiOjAsImZzIjowLCJzcCI6MH0=; OptanonConsent=isGpcEnabled=0&datestamp=Wed+Jul+10+2024+14%3A33%3A28+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=202404.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=5d44d058-e0ca-4634-af6a-6ae970855f35&interactionCount=2&isAnonUser=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1&AwaitingReconsent=false&intType=1&geolocation=BR%3BRJ; AWSALB=hZab3iIPYsD0MIUknEJjbESldllV2XbXHeW6ABubXDqQC1w+fB9ShoCYGJIR2z0ndrcA5n/bdu5qP2SU+sm1uOdvT/LotUC6UCm2meSI5Mm4DIVUR9N7D5oqkEID; AWSALBCORS=hZab3iIPYsD0MIUknEJjbESldllV2XbXHeW6ABubXDqQC1w+fB9ShoCYGJIR2z0ndrcA5n/bdu5qP2SU+sm1uOdvT/LotUC6UCm2meSI5Mm4DIVUR9N7D5oqkEID; _ga_JB2JHD7FCJ=GS1.1.1720631306.23.1.1720632809.57.0.0; _ga=GA1.1.604824160.1719565115; _uetsid=006ad4a03d6811ef82e9db6f4cebf44a; '
            '_uetvid=7e097870352d11ef8996917c57f63d87; _scid_r=fe8f87fb-9164-4128-931f-6db8349a5902; _derived_epik=dj0yJnU9OVpXTXl0T0xlTmRISjQ3aGwtVFQ1SWZjVmJuT0JiaFkmbj1zc3Z6NkNFQXNmdnZHWmFDdEpIX093Jm09MSZ0PUFBQUFBR2FPeGVzJnJtPTEmcnQ9QUFBQUFHYU94ZXMmc3A9Mg; cto_bundle=Eb4Xml9POEdtRUtaTE12dTVMYjVqT2xtMDIxaDE5blJXQWRHRkZXVEJlcUdNOTdFRGNwdGtMeWslMkZGVnBENzMwM0JYMDVmQkdIeVQ4NW1qMW9laEslMkZRQnl3dzElMkZPWFUlMkZmTFJxNGdLeXUlMkZTMTRqZyUyRndXSFZ4dEsxS1hyMk1ZeGpOQVU4bmU1bkFZY1lwcHp2M3NVZ1FWQWNmYmRYS0RIc1N1ZzFBcWRZcWNlOVcxRkVaeVZGd0YzQUxFMWglMkZheXhJa0cyJTJGdUg0NiUyQk92VCUyRnVZZmtxQWN6QVAlMkZzdyUzRCUzRA; ab.storage.sessionId.8160f757-f57f-4d17-9cfb-44c82d5e8f64=%7B%22g%22%3A%221c387921-d350-d759-9bb8-04a9e46b2359%22%2C%22e%22%3A1720632840493%2C%22c%22%3A1720632809379%2C%22l%22%3A1720632810493%7D; RT="z=1&dm=www.petz.com.br&si=92ca62e5-ee3d-405f-ab43-07f1cffe3709&ss=lyg3fxm8&sl=6&tt=cia&obo=2&rl=1&ld=zeiw&r=ke1m2ztx&ul=zeix"'
        ),
        'Priority': 'u=0, i',
        # A Referer must be an absolute URL, so the scheme is included.
        'Referer': f"{scheme}://{domain}/marcas",
        'Sec-Ch-Ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Erro ao obter HTML com requests: {e}")
        return None

    if response.status_code == 200:
        return response.text
    print(f"Erro ao obter HTML com requests: {response.status_code}")
    return None






def extract_images(html_content, sku, variations):
    """Collect image URLs for a product page and for each of its variations.

    Args:
        html_content: HTML of the product page.
        sku: Not used by this function; kept so existing calls keep working.
        variations: Iterable of dicts, each with a ``'barcode'`` key.

    Returns:
        ``(standard_images, variation_images)``. ``standard_images`` is a
        list of thumbnail URLs from the first ``swiper-wrapper`` div, in page
        order, without duplicates and without any URL that belongs to a
        variation. ``variation_images`` maps each barcode to the set of URLs
        found for it. ``([], {})`` is returned when anything goes wrong.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        # A dict keeps insertion order, so duplicates are dropped without
        # shuffling the thumbnails (a set would make the later file
        # numbering differ from run to run).
        standard_images = {}
        swiper_div = soup.find('div', {'class': 'swiper-wrapper'})
        if swiper_div:
            for img_tag in swiper_div.find_all('img', {'class': 'swiper-thumbnail-image'}):
                img_url = img_tag.get('src')
                if img_url:
                    standard_images[img_url] = None
        else:
            print("Div 'swiper-wrapper' não encontrada.")

        # Images of each variation.
        # NOTE: the 'variation-<barcode>' class name is a placeholder; adjust
        # it to the page's real markup as needed.
        variation_images = {}
        for variation in variations:
            barcode = variation['barcode']
            found = set()
            for img_tag in soup.find_all('img', {'class': f'variation-{barcode}'}):
                img_url = img_tag.get('src')
                if img_url:
                    found.add(img_url)
            variation_images[barcode] = found

        # Standard images must not repeat a picture owned by a variation.
        claimed = set().union(*variation_images.values())
        unique_standard_images = [url for url in standard_images if url not in claimed]

        return unique_standard_images, variation_images
    except Exception as e:
        # Best effort: report and return empty results so the caller goes on.
        print(f"Erro ao extrair imagens: {e}")
        return [], {}

def _image_file_names(base_name, count):
    """Return the file names download_images gives ``count`` images of ``base_name``.

    download_images appends an index only when there is more than one image;
    a single image is saved as ``<base_name>.jpg``.
    """
    if count == 1:
        return [f"{base_name}.jpg"]
    return [f"{base_name}_{idx}.jpg" for idx in range(count)]


def format_images(sku, standard_images, variation_images):
    """Build the image metadata stored with a product.

    Args:
        sku: Base name used for the product's default images.
        standard_images: Sized collection of default image URLs.
        variation_images: Mapping of barcode to a sized collection of URLs.

    Returns:
        A dict ``{"imagesDefault": [...], "variations": [{"name", "images"}]}``
        whose file names match the ones download_images writes to disk, or
        ``{}`` when the arguments do not have the expected shape.
    """
    try:
        return {
            "imagesDefault": _image_file_names(sku, len(standard_images)),
            "variations": [
                {"name": barcode, "images": _image_file_names(barcode, len(images))}
                for barcode, images in variation_images.items()
            ]
        }
    except (TypeError, AttributeError) as e:
        print(f"Erro ao formatar imagens: {e}")
        return {}

def download_images(image_urls, base_name, folder='imagens',referer="dd"):
    """Download ``image_urls`` into ``folder`` as ``.jpg`` files.

    Args:
        image_urls: Collection of image URLs. A set is sorted first so that
            the numbering of the files is the same on every run.
        base_name: File name stem. With more than one image the files are
            named ``<base_name>_<idx>.jpg``; a single image is saved as
            ``<base_name>.jpg``.
        folder: Target directory; created when missing.
        referer: Value for the Referer header. A value without a scheme
            (e.g. a bare domain) is turned into ``https://<value>/``.

    Failures (HTTP errors, timeouts, empty bodies) are printed and the
    remaining images are still attempted.
    """
    os.makedirs(folder, exist_ok=True)

    if isinstance(image_urls, (set, frozenset)):
        image_urls = sorted(image_urls)

    referer_header = referer if '://' in referer else f"https://{referer}/"

    # The headers do not depend on the image, so they are built once.
    # "Host" is left to requests (a fixed value would be re-sent unchanged to
    # a different host after a redirect) and so is "Accept-Encoding", so that
    # only encodings requests can decode are advertised.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7,ru;q=0.6,es;q=0.5',
        'Cache-Control': 'max-age=0',
        'Referer': referer_header,
        'Sec-Ch-Ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        # NOTE(review): hard-coded cookie captured from a browser session; it
        # is probably stale and would be better supplied via configuration.
        # The value is one logical string written as two adjacent literals.
        'Cookie': (
            'ajs_anonymous_id=a03ba2e0-6658-4732-bca0-5c82a729610d; blueID=158f8c36-f884-4eeb-9dd2-41ddb36954f2; _fbp=fb.2.1719565497536.945089496421286543; _tt_enable_cookie=1; _ttp=yDIqAjbcrTkStMgfYKWUtQqV7qI; _scid=fe8f87fb-9164-4128-931f-6db8349a5902; _pin_unauth=dWlkPU16QmxZV015TVRndE16QmhOUzAwWmpRekxUZzROakF0Wm1VeFlUSTROREE1WXpJeg; _hjSessionUser_257734=eyJpZCI6IjZlYmVkNTczLWRlNDItNTljNy1iYjBkLWRjMzQxYzg1ZThkZiIsImNyZWF0ZWQiOjE3MTk1NjU0OTc1ODksImV4aXN0aW5nIjp0cnVlfQ==; blueULC=blue; OptanonAlertBoxClosed=2024-06-28T10:07:39.896Z; _gac_UA-58792402-1=1.1719578340.CjwKCAjwvvmzBhA2EiwAtHVrbx5PuZ-Y6ZmDRgxSQAE3bhfdMkfQ6qO_h_InopnIh9t314DXR2UOdhoCJPMQAvD_BwE; origem=adwords; _rtbhouse_source_=AdWords; anonymousClient=true; _gcl_aw=GCL.1719608537.CjwKCAjwvvmzBhA2EiwAtHVrb7a4Gf1uUaMKzAcQ623DOKsXA0ynRueURMbfhKPxXrTvmq46s3W5ZBoCXV8QAvD_BwE; _gcl_gs=2.1.k1$i1719608535; _gid=GA1.3.1821486782.1720470235; _ScCbts=%5B%5D; petzCarrinho=true; _hjSession_257734=eyJpZCI6IjAxN2UxZjI1LWU1OTUtNDVmYS1hNmUxLWJjYjlmZjQxOWQ1NSIsImMiOjE3MjA2OTYxODM2ODUsInMiOjAsInIiOjAsInNiIjowLCJzciI6MCwic2UiOjAsImZzIjowLCJzcCI6MX0=; OptanonConsent=isGpcEnabled=0&datestamp=Thu+Jul+11+2024+08%3A13%3A33+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=202404.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&consentId=5d44d058-e0ca-4634-af6a-6ae970855f35&interactionCount=2&isAnonUser=1&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1&AwaitingReconsent=false&intType=1&geolocation=BR%3BRJ; _gat=1; _ga_JB2JHD7FCJ=GS1.1.1720696191.1.1.1720696414.59.0.0; _ga=GA1.1.604824160.1719565115; _uetsid=006ad4a03d6811ef82e9db6f4cebf44a; _uetvid=7e097870352d11ef8996917c57f63d87; _scid_r=fe8f87fb-9164-4128-931f-6db8349a5902; _derived_epik=dj0yJnU9SHFvU3E2eWRKWGNSMTJIdUJUV2xrWUdhOUxCdGtIZGYmbj1mUTRSTnZ3d0NFWm9fQW0taEVGUGJnJm09MSZ0PUFBQUFBR2FQdm1FJnJtPTEmcnQ9QUFBQUFHYVB2bUUmc3A9Mg; '
            'cto_bundle=41bqJl9POEdtRUtaTE12dTVMYjVqT2xtMDI0bHIlMkZyRjJKcFlzc2JmN0R6R2xhMVF4ZWE1WmwlMkZwY0s5dDRMZjg2bGw3Y3EzSTlzVnc3MGo5MWN6N3Y0TnpqWHpZY1VWbXhaZGZhQ29FTDZob1RIVHFBVjhGQkNuRXZUJTJCVDBlc0MxVXdXZGo2M3Z6JTJGSUVTUjVHYU5wM09BRUVKbyUyQjU1djBQZnYwa1I3Z0pzcVAyTUQwWm8xMSUyRlBiUFBISDQ4JTJGM1c5S0xCOXZ5OTk5RCUyRnRQa2JObmtvSUZCdGNyUSUzRCUzRA; ab.storage.sessionId.8160f757-f57f-4d17-9cfb-44c82d5e8f64=%7B%22g%22%3A%22c7472c1b-0387-fd36-9939-a2bb242a337d%22%2C%22e%22%3A1720696445818%2C%22c%22%3A1720696414673%2C%22l%22%3A1720696415818%7D'
        )
    }

    for idx, image_url in enumerate(image_urls):
        image_name = f"{folder}/{base_name}_{idx}.jpg" if len(image_urls) > 1 else f"{folder}/{base_name}.jpg"
        print(f"Baixando imagem de {image_url} com o nome {image_name}")

        try:
            # Without a timeout a stalled download would block the whole run.
            response = requests.get(image_url, headers=headers, timeout=30)
            response.raise_for_status()  # raises for HTTP 4xx/5xx status codes

            # An empty body is not a usable image.
            if not response.content:
                print(f"Falha ao baixar a imagem {image_name}. O conteúdo está vazio.")
                continue

            with open(image_name, 'wb') as file:
                file.write(response.content)
            print(f"Imagem {image_name} baixada com sucesso.")
        except requests.exceptions.RequestException as e:
            print(f"Falha ao baixar a imagem {image_name}. Erro: {e}")




def _product_sku(produto, variations):
    """Pick the identifier used to name a product's default images.

    NOTE(review): products written by extract_products carry SKUs only on
    their variations, so the product-level 'sku' is usually absent. The
    fallback order (first variation's SKU, then the product 'id') is an
    assumption to confirm.
    """
    sku = produto.get('sku')
    if sku is None and variations:
        sku = variations[0].get('sku')
    if sku is None:
        sku = produto.get('id')
    return sku


def save_images_and_update_json(json_file_path):
    """Download the images of every product in the JSON file and record them.

    Args:
        json_file_path: Path of the products JSON file. When it does not
            exist, it is created with an empty ``{"marcas": []}`` structure.

    For each product the page is fetched again, its images are extracted and
    downloaded, and the resulting file names are stored under the product's
    ``'images'`` key. The JSON file is rewritten once, after all products
    were processed.
    """
    print("Iniciando a função save_images_and_update_json")

    # Create the JSON file with a basic structure when it does not exist.
    if not os.path.exists(json_file_path):
        parent_dir = os.path.dirname(json_file_path)
        if parent_dir:
            # A bare file name has no directory part; os.makedirs('') raises.
            os.makedirs(parent_dir, exist_ok=True)
        with open(json_file_path, 'w', encoding='utf-8') as file:
            json.dump({"marcas": []}, file, ensure_ascii=False, indent=4)
        print(f"Arquivo {json_file_path} criado.")
    else:
        print(f"Arquivo {json_file_path} encontrado.")

    with open(json_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
        print(f"Arquivo {json_file_path} carregado com sucesso.")

    for marca in data['marcas']:
        print(f"Processando marca: {marca['name']}")
        for produto in marca['produtos']:
            # Products whose detail page could not be scraped have no variations.
            variations = produto.get('variations', [])
            sku = _product_sku(produto, variations)
            print(f"Processando produto: {produto['name']} (SKU: {sku})")
            link = produto['link']

            # Fetch the product page.
            html_content = get_html_with_requests(link)
            if not html_content:
                print(f"Falha ao obter HTML para o produto: {produto['name']} (SKU: {sku})")
                continue

            # Extract images.
            standard_images, variation_images = extract_images(html_content, sku, variations)
            print(f"Imagens extraídas para o produto: {produto['name']} (SKU: {sku})")

            # Format images.
            formatted_images = format_images(sku, standard_images, variation_images)
            print(f"Imagens formatadas para o produto: {produto['name']} (SKU: {sku})")

            # Download default images; the product page itself is the referer.
            download_images(standard_images, sku, referer=link)
            print(f"Imagens padrão baixadas para o produto: {produto['name']} (SKU: {sku})")

            # Download the images of each variation.
            for barcode, images in variation_images.items():
                download_images(images, barcode, referer=link)
                print(f"Imagens da variação {barcode} baixadas para o produto: {produto['name']} (SKU: {sku})")

            # Record the image information on the product.
            produto['images'] = formatted_images
            print(f"Informações de imagens atualizadas para o produto: {produto['name']} (SKU: {sku})")

    # Persist the updates.
    with open(json_file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
        print(f"Arquivo {json_file_path} atualizado com sucesso.")

# Call the function to save the images and update the JSON file. This runs
# as soon as the cell executes.
save_images_and_update_json('produtos.json')





