# Bibliotecas

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json
import time
import re
import csv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Scrapping Drogaria Pacheco

## Funções Comuns

In [3]:
def acessa_site(url, headers, max_tentativas=3, intervalo_tentativas=10):
    for tentativa in range(1, max_tentativas, 1):
        try:
            r = requests.get(url, headers=headers)
            if r.status_code==200 or r.status_code==206:
                #print(f'Solicitação bem-sucedida para acessar a url: {url}')
                return r
            else:
                print(f"Falha na solicitação. Código de status: {r.status_code}")
        except Exception as e:
            print(f"Erro durante a solicitação: {e}")

        if tentativa < max_tentativas:
            print(f"Aguardando {intervalo_tentativas} segundos antes da próxima tentativa...")
            time.sleep(intervalo_tentativas)
            
    print(f"Até {max_tentativas} tentativas foram feitas, mas a solicitação não foi bem-sucedida.")
    return None

In [4]:
def url_catalogo(skuIds):
    url_catalogo = 'https://www.drogariasaopaulo.com.br/api/catalog_system/pub/products/search?_from=0&_to=49'
    valores_formatados = []
    # Loop para formatar os valores no padrão &fq=skuId:<valor>
    for skuId in skuIds:
        formatacao = f'&fq=skuId:{skuId}'
        valores_formatados.append(formatacao)
    sufixo_busca = ''.join(valores_formatados)
    return url_catalogo + sufixo_busca

In [5]:
def cria_header(referer, cookies):
    header = {
    'authority': 'www.drogariasaopaulo.com.br',
    'accept': '*/*',
    'accept-language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'cookie': cookies,
    'if-none-match': '624CDF92BB674EDAAF34D2BC6E267B82',
    'referer': referer,
    'sec-ch-ua': '"Opera GX";v="105", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
    }
    return header

In [6]:
def extrai_skuid(url_api, headers, page_inicial=1, page_final=None):
    skuId = []
    page = 1
    done = False
    while not done:
        url = url_api + f'&PageNumber={page}'
        r = acessa_site(url, headers=headers)
        page += 1
        soup = BeautifulSoup(r.text, 'html.parser')
        sku_produtos = soup.findAll('input', class_='product-sku')
        if len(sku_produtos) != 0:
            sku_pagina = []
            for produto in sku_produtos:
                sku = produto['value']
                sku_pagina.append(sku)
            skuId.extend(sku_pagina)
        else:
            done = True
        if page_final is not None and page >= page_final:
            done = True
    return skuId

In [7]:
def exporta_skuId(skuId_list, categoria, indent=2):
    json_data = json.dumps(skuId_list, indent=indent)
    file_name = f'ids_produtos/' + categoria + '_ids.json'
    with open(file_name, 'w') as _:
        _.write(json_data)
    print(f'Arquivo {file_name} exportado com sucesso')

In [8]:
def exporta_scrapping(lista_produtos, categoria, indent=2):
    json_data = json.dumps(lista_produtos, indent=indent)
    file_name = categoria + f'_scrapping.json'
    with open(file_name, 'w') as _:
        _.write(json_data)

In [9]:
def quantidade_produtos_na_pagina(url_principal, extensao_categoria, cookie):
    header = cria_header(url_principal+extensao_categoria, cookie)
    r = acessa_site(url_principal+extensao_categoria, header)
    soup = BeautifulSoup(r.text, 'html.parser')
    qtd = soup.find('p', class_='searchResultsTime').find('span', class_='value').text
    return int(qtd)

In [10]:
def exportar_para_csv(lista_de_dicionarios, categoria):
    nome_arquivo = f'scrapping_produtos_por_categoria/' + categoria + f'_scrapping.csv'
    # Extrai as chaves do primeiro dicionário na lista
    chaves = lista_de_dicionarios[0].keys() if lista_de_dicionarios else []

    with open(nome_arquivo, 'w', newline='', encoding='utf-8') as arquivo_csv:
        # Cria um escritor CSV com as chaves como cabeçalho
        escritor_csv = csv.DictWriter(arquivo_csv, fieldnames=chaves)

        # Escreve o cabeçalho no arquivo
        escritor_csv.writeheader()

        # Escreve cada dicionário como uma linha no arquivo
        for dicionario in lista_de_dicionarios:
            escritor_csv.writerow(dicionario)

In [11]:
def pipeline_extracao_categoria_ANTIGO(url_principal, extensao_categoria, url_api_busca, cookie):
    inicio = time.time()
    categoria = re.search(r'/(.*)', extensao_categoria).group(1)
    print(f'Extraindo dados da categoria {categoria}')
    headers_categoria = cria_header(referer=url_api_busca,
                                    cookies=cookie)
    skuId_categoria = extrai_skuid(url_api_busca, headers=headers_categoria)
    exporta_skuId(skuId_categoria, categoria)
    print(f'Há {len(skuId_categoria)} produtos para serem extraídos')

    todos_produtos = []
    passo = 50
    for i in range(0, len(skuId_categoria), passo):
        sku_produtos = skuId_categoria[i:i+passo]
        url = url_catalogo(sku_produtos)
        r = acessa_site(url, headers_categoria)
        soup = BeautifulSoup(r.text, 'html.parser')
        produtos = soup.text.split('{"productId":"')[1:]

        produtos_da_pagina = []
        for produto in produtos:
            pattern1 = re.compile(r'(\d+)","productName":"([^"]*)","brand":"([^"]*)","brandId":(\d+),"brandImageUrl":([^,]*),' \
                                r'"linkText":"([^"]*)","productReference":"([^"]*)","productReferenceCode":([^,]*),"categoryId":"([^"]*)",' \
                                r'"productTitle":"([^"]*)","metaTagDescription":"([^"]*)","releaseDate":"([^"]*)T[^"]*Z"')
            
            pattern2 = re.compile(r'"Price":(?P<Price>\d+\.\d+),"ListPrice":(?P<ListPrice>\d+\.\d+),"PriceWithoutDiscount":(?P<PriceWithoutDiscount>\d+\.\d+),' \
                                r'"RewardValue":(?P<RewardValue>\d+\.\d+),"PriceValidUntil":"(?P<PriceValidUntil>[^"]*)","AvailableQuantity":(?P<AvailableQuantity>\d+),' \
                                r'"IsAvailable":(?P<IsAvailable>true|false),"Tax":(?P<Tax>\d+\.\d+)')
            matches1 = pattern1.findall(produto)
            matches2 = pattern2.findall(produto)

            produto_info = []
            for match in matches1:
                productId, productName, brand, brandId, brandImageUrl, linkText, productReference, productReferenceCode, categoryId, productTitle, metaTagDescription, releaseDate = match
                produto_info.append({
                    "productId": productId,
                    "productName": productName,
                    "brand": brand,
                    "brandId": brandId,
                    "brandImageUrl": brandImageUrl,
                    "linkText": linkText,
                    "productReference": productReference,
                    "productReferenceCode": productReferenceCode,
                    "categoryId": categoryId,
                    "productTitle": productTitle,
                    "metaTagDescription": metaTagDescription,
                    "releaseDate": releaseDate
                })

            for match in matches2:
                Price, ListPrice, PriceWithoutDiscount, RewardValue, PriceValidUntil, AvailableQuantity, IsAvailable, Tax = match
                produto_info.append({
                    "Price": Price,
                    "ListPrice": ListPrice,
                    "PriceWithoutDiscount": PriceWithoutDiscount,
                    "RewardValue": RewardValue,
                    "PriceValidUntil": PriceValidUntil,
                    "AvailableQuantity": AvailableQuantity,
                    "IsAvailable": IsAvailable,
                    "Tax": Tax
                })

            produtos_da_pagina.append(produto_info)
        todos_produtos.append(produtos_da_pagina)
    lista_de_produtos_unificada = [produto for lista in todos_produtos for produto in lista]
    exporta_scrapping(lista_de_produtos_unificada, categoria)
    fim = time.time()
    print(f'Extração finalizada - tempo de extração: {fim-inicio} segundos')

In [12]:
def pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_busca, cookie):
    inicio = time.time()
    categoria = re.search(r'/(.*)', extensao_categoria).group(1)
    print(f'Extraindo dados da categoria {categoria}')
    qtd_produtos = quantidade_produtos_na_pagina(url_principal, extensao_categoria, cookie)
    print(f'Há {qtd_produtos} produtos na página')
    headers_categoria = cria_header(referer=url_api_busca,
                                    cookies=cookie)
    
    filters = ['', 'O=OrderByNameASC&', 'O=OrderByNameDESC&', 'O=OrderByPriceDESC&', 'O=OrderByPriceASC&', 'O=OrderByTopSaleDESC&']
    skuId_all = []
    ampersand_index = url_api_busca.find('&')
    for filter_str in filters:
        modified_url = url_api_busca[:ampersand_index] + filter_str + url_api_busca[ampersand_index:]
        print(f'Extraindo ids de {modified_url}')
        skuId_categoria = extrai_skuid(modified_url, headers=headers_categoria)
        print(f'Extraiu {len(skuId_categoria)} ids')
        skuId_all.extend(skuId_categoria)
    # Remove duplicates from skuId_categoria_all
    skuId_all = list(set(skuId_all))


    exporta_skuId(skuId_all, categoria)
    print(f'Há {len(skuId_all)} skuIDs de produtos para serem usados na extração')

    todos_produtos = []
    passo = 50
    for i in range(0, len(skuId_all), passo):
        sku_produtos = skuId_all[i:i+passo]
        url = url_catalogo(sku_produtos)
        r = acessa_site(url, headers_categoria)
        soup = BeautifulSoup(r.text, 'html.parser')
        produtos = soup.text.split('{"productId":"')[1:]

        produtos_da_pagina = []
        for produto in produtos:
            padrao1 = re.compile(
                r'"(?:productName|brand|link|description|itemId|ean)":"?([^"]*)"?')
                
            padrao2 = re.compile(r'"(?:Price|ListPrice|PriceWithoutDiscount|IsAvailable)":([^",]*)')
            corresp1 = padrao1.findall(produto)
            corresp2 = padrao2.findall(produto)
            correspondencias = corresp1[:6]+corresp2[:4]
            if correspondencias:
                # Agora, correspondencias é uma lista de tuplas com os valores capturados para cada campo
                campos = [
                    "productName",
                    "brand",
                    "link",
                    "description",
                    "skuId",
                    "ean",
                    "Price",
                    "ListPrice",
                    "PriceWithoutDiscount",
                    "IsAvailable",
                ]
                infos_produto = dict(zip(campos, correspondencias))
            else:
                print("Nenhuma correspondência encontrada.")

            # Adiciona os dicionarios com as infos dos produtos na lista da pagina
            produtos_da_pagina.append(infos_produto)
        # Unifica as listas das paginas em uma lista maior
        todos_produtos.append(produtos_da_pagina)
    # Transforma em uma única lista
    todos_produtos = [produtos for pagina in todos_produtos for produtos in pagina]
    exportar_para_csv(todos_produtos, categoria)

    qtd_extraida = len(todos_produtos)
    qtd_faltante = qtd_produtos - qtd_extraida
    fim = time.time()
    print(f'Extração de {qtd_extraida} produtos finalizada - Faltaram {qtd_faltante} produtos - Tempo de Extração: {fim-inicio} segundos')

## Saúde e Bem-Estar

In [12]:
url_principal = 'https://www.drogariaspacheco.com.br'
extensao_categoria = f'/saude-bem-estar'
url_api_saude = f'https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13606&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=077e06e2-5b38-40d9-bcfb-55976c11db6e; _gcl_au=1.1.1049783792.1705502337; _DPSP_GA=GA1.3.766143064.1705502337; vitrioLastClickPaidSource=google; origem=adwords; _pm_id=796501705502337144; lmd_cj=google_anuncio; lmd_orig=google_paid; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; _ga=GA1.1.766143064.1705502337; checkout.vtex.com=__ofid=82a755c0c9ef47bb978e338c6edd40a9; CheckoutOrderFormOwnership=; nav_id=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; _fbp=fb.2.1705502337489.1847261636; _tt_enable_cookie=1; _ttp=eJuRcjVOqT_O98h0GyZqIpoKbYh; legacy_p=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; chaordic_browserId=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_c=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_s=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6IkU3QkQ1QzZCNUQ5MDEwNUU0MUIzQkEzMDVEMTU3NTE2NDZGNDU3MUYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNTdmOWJjOGItYzlmYi00MzBkLWI1NjQtMzA3NTU1YTRlZWY4IiwiaWQiOiJhMmExM2YwZi02YTE2LTRkOGQtODI0MS04OWVkN2YwODQ0ZWMiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYxOTM1MzYsImlhdCI6MTcwNTUwMjMzNiwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6IjBmNjUwOWFmLWIyNWMtNDlmNi05MTMxLWYxMjYyYzVhODE0OSJ9.3gjyeUPesAcsnAEEWFeadWR0e7pL3KEWyXIxNKDDYrxBmBusvnGbPMyhHqWvYRU9bq_NGxTbKnWTW2JfroS4EA; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; analytic_id=1705502340577555; _vt_shop=2346; _vt_user=2561364766504499_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A39%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=0&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _hjSessionUser_3397161=eyJpZCI6IjI2MTNmYjUyLTVjMGEtNTA2My1hYWFjLWYzODc0NzQ5MTYwZSIsImNyZWF0ZWQiOjE3MDU1MDIzMzc1MjEsImV4aXN0aW5nIjp0cnVlfQ==; _gcl_aw=GCL.1705603635.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; _DPSP_GA_gid=GA1.3.443097878.1705603636; _gac_UA-31155422-1=1.1705603636.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; lmd_traf=google_paid-1705502337156&google_paid-1705603635596; VTEXSC=sc=1; _uetsid=011762b0b63211ee827cc3fd2f2735a9; _uetvid=01186410b63211eea64dcd7e5cb73803; IPS=Midia=0&Campanha=0&Parceiro=0; CYB_ID=2561364766504499; CYB_AB=1; cybSessionID=1; SGTS=0994DAA03BCAC015953D68B12F34B6F7; _dc_gtm_UA-31155422-1=1; _ga_HDQHBSR58M=GS1.1.1705619217.3.1.1705619305.60.0.0; _ga_X010SFPX09=GS1.1.1705619224.3.1.1705619305.60.0.0; urlLastSearch=http://www.drogariaspacheco.com.br/saude-bem-estarhttp://www.drogariaspacheco.com.br/saude-bem-estar; _vss=089865D9D0097EDA66C965D543EDF92107A6DA0952E95CDEFD829C44742B967C; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+20%3A08%3A27+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&geolocation=BR%3BSP&AwaitingReconsent=false; OptanonAlertBoxClosed=2024-01-18T23:08:27.087Z'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_saude, cookie)

Extraindo dados da categoria saude-bem-estar


Há 5188 produtos na página
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13606&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13606O=OrderByNameASC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13606O=OrderByNameDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13606O=OrderByPriceDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13606O=OrderByPriceASC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13606O=OrderByTopSaleDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583

## Mamãe e bebê

In [13]:
url_principal = 'https://www.drogariaspacheco.com.br'
extensao_categoria = f'/mamae-e-bebe'
url_api_mamaebebe = f'https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13602&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=077e06e2-5b38-40d9-bcfb-55976c11db6e; _gcl_au=1.1.1049783792.1705502337; _DPSP_GA=GA1.3.766143064.1705502337; vitrioLastClickPaidSource=google; origem=adwords; _pm_id=796501705502337144; lmd_cj=google_anuncio; lmd_orig=google_paid; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; _ga=GA1.1.766143064.1705502337; checkout.vtex.com=__ofid=82a755c0c9ef47bb978e338c6edd40a9; CheckoutOrderFormOwnership=; nav_id=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; _fbp=fb.2.1705502337489.1847261636; _tt_enable_cookie=1; _ttp=eJuRcjVOqT_O98h0GyZqIpoKbYh; legacy_p=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; chaordic_browserId=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_c=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_s=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6IkU3QkQ1QzZCNUQ5MDEwNUU0MUIzQkEzMDVEMTU3NTE2NDZGNDU3MUYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNTdmOWJjOGItYzlmYi00MzBkLWI1NjQtMzA3NTU1YTRlZWY4IiwiaWQiOiJhMmExM2YwZi02YTE2LTRkOGQtODI0MS04OWVkN2YwODQ0ZWMiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYxOTM1MzYsImlhdCI6MTcwNTUwMjMzNiwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6IjBmNjUwOWFmLWIyNWMtNDlmNi05MTMxLWYxMjYyYzVhODE0OSJ9.3gjyeUPesAcsnAEEWFeadWR0e7pL3KEWyXIxNKDDYrxBmBusvnGbPMyhHqWvYRU9bq_NGxTbKnWTW2JfroS4EA; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; analytic_id=1705502340577555; _vt_shop=2346; _vt_user=2561364766504499_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A39%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=0&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _hjSessionUser_3397161=eyJpZCI6IjI2MTNmYjUyLTVjMGEtNTA2My1hYWFjLWYzODc0NzQ5MTYwZSIsImNyZWF0ZWQiOjE3MDU1MDIzMzc1MjEsImV4aXN0aW5nIjp0cnVlfQ==; _gcl_aw=GCL.1705603635.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; _DPSP_GA_gid=GA1.3.443097878.1705603636; _gac_UA-31155422-1=1.1705603636.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; lmd_traf=google_paid-1705502337156&google_paid-1705603635596; VTEXSC=sc=1; _uetsid=011762b0b63211ee827cc3fd2f2735a9; _uetvid=01186410b63211eea64dcd7e5cb73803; IPS=Midia=0&Campanha=0&Parceiro=0; CYB_ID=2561364766504499; CYB_AB=1; cybSessionID=1; SGTS=0994DAA03BCAC015953D68B12F34B6F7; janus_sid=51c6fd94-32ff-46bc-84fb-ff6c83b8734b; _ga_HDQHBSR58M=GS1.1.1705619217.3.1.1705619669.60.0.0; _ga_X010SFPX09=GS1.1.1705619224.3.1.1705619670.59.0.0; urlLastSearch=http://www.drogariaspacheco.com.br/mamae-e-bebehttp://www.drogariaspacheco.com.br/mamae-e-bebe; _vss=37AB6CBF1B51C602B813CAAA93C95731CCA44136B8A8DB23ECC26283FC2C9EC5; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+20%3A14%3A35+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&geolocation=BR%3BSP&AwaitingReconsent=false; OptanonAlertBoxClosed=2024-01-18T23:14:35.278Z'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_mamaebebe, cookie)

Extraindo dados da categoria mamae-e-bebe
Há 797 produtos na página
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13602&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 797 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13602O=OrderByNameASC&&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13602O=OrderByNameDESC&&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13602O=OrderByPriceDESC&&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0


## Beleza

In [13]:
url_principal = 'https://www.drogariaspacheco.com.br'
extensao_categoria = f'/beleza'
url_api_beleza = f'https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13605&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=077e06e2-5b38-40d9-bcfb-55976c11db6e; _gcl_au=1.1.1049783792.1705502337; _DPSP_GA=GA1.3.766143064.1705502337; vitrioLastClickPaidSource=google; origem=adwords; _pm_id=796501705502337144; lmd_cj=google_anuncio; lmd_orig=google_paid; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; _ga=GA1.1.766143064.1705502337; checkout.vtex.com=__ofid=82a755c0c9ef47bb978e338c6edd40a9; CheckoutOrderFormOwnership=; nav_id=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; _fbp=fb.2.1705502337489.1847261636; _tt_enable_cookie=1; _ttp=eJuRcjVOqT_O98h0GyZqIpoKbYh; legacy_p=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; chaordic_browserId=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_c=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_s=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6IkU3QkQ1QzZCNUQ5MDEwNUU0MUIzQkEzMDVEMTU3NTE2NDZGNDU3MUYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNTdmOWJjOGItYzlmYi00MzBkLWI1NjQtMzA3NTU1YTRlZWY4IiwiaWQiOiJhMmExM2YwZi02YTE2LTRkOGQtODI0MS04OWVkN2YwODQ0ZWMiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYxOTM1MzYsImlhdCI6MTcwNTUwMjMzNiwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6IjBmNjUwOWFmLWIyNWMtNDlmNi05MTMxLWYxMjYyYzVhODE0OSJ9.3gjyeUPesAcsnAEEWFeadWR0e7pL3KEWyXIxNKDDYrxBmBusvnGbPMyhHqWvYRU9bq_NGxTbKnWTW2JfroS4EA; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; analytic_id=1705502340577555; _vt_shop=2346; _vt_user=2561364766504499_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A39%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=0&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _hjSessionUser_3397161=eyJpZCI6IjI2MTNmYjUyLTVjMGEtNTA2My1hYWFjLWYzODc0NzQ5MTYwZSIsImNyZWF0ZWQiOjE3MDU1MDIzMzc1MjEsImV4aXN0aW5nIjp0cnVlfQ==; _gcl_aw=GCL.1705603635.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; _DPSP_GA_gid=GA1.3.443097878.1705603636; _gac_UA-31155422-1=1.1705603636.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; lmd_traf=google_paid-1705502337156&google_paid-1705603635596; VTEXSC=sc=1; _uetsid=011762b0b63211ee827cc3fd2f2735a9; _uetvid=01186410b63211eea64dcd7e5cb73803; IPS=Midia=0&Campanha=0&Parceiro=0; CYB_ID=2561364766504499; CYB_AB=1; cybSessionID=1; SGTS=0994DAA03BCAC015953D68B12F34B6F7; _dc_gtm_UA-31155422-1=1; urlLastSearch=http://www.drogariaspacheco.com.br/belezahttp://www.drogariaspacheco.com.br/beleza; janus_sid=2df1f9b2-a832-44b9-b585-04e03dfae636; _vss=FC631026100F83C1D3BCCB7A95657EF0AD3264595B1885CE13B6CF9BB9AE818D; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+20%3A24%3A19+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&geolocation=BR%3BSP&AwaitingReconsent=false; OptanonAlertBoxClosed=2024-01-18T23:24:19.308Z; _ga_HDQHBSR58M=GS1.1.1705619217.3.1.1705620261.51.0.0; _ga_X010SFPX09=GS1.1.1705619224.3.1.1705620261.51.0.0'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_beleza, cookie)

Extraindo dados da categoria beleza
Há 7009 produtos na página
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13605&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13605O=OrderByNameASC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13605O=OrderByNameDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13605O=OrderByPriceDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13605O=OrderByPriceASC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13605O=OrderByTopSaleDESC&&PS=48&

## Cabelos

In [14]:
url_principal = 'https://www.drogariaspacheco.com.br'
extensao_categoria = f'/cuidados-para-cabelos'
url_api_cabelos = f'https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13910&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=077e06e2-5b38-40d9-bcfb-55976c11db6e; _gcl_au=1.1.1049783792.1705502337; _DPSP_GA=GA1.3.766143064.1705502337; vitrioLastClickPaidSource=google; origem=adwords; _pm_id=796501705502337144; lmd_cj=google_anuncio; lmd_orig=google_paid; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; _ga=GA1.1.766143064.1705502337; checkout.vtex.com=__ofid=82a755c0c9ef47bb978e338c6edd40a9; CheckoutOrderFormOwnership=; nav_id=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; _fbp=fb.2.1705502337489.1847261636; _tt_enable_cookie=1; _ttp=eJuRcjVOqT_O98h0GyZqIpoKbYh; legacy_p=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; chaordic_browserId=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_c=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_s=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6IkU3QkQ1QzZCNUQ5MDEwNUU0MUIzQkEzMDVEMTU3NTE2NDZGNDU3MUYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNTdmOWJjOGItYzlmYi00MzBkLWI1NjQtMzA3NTU1YTRlZWY4IiwiaWQiOiJhMmExM2YwZi02YTE2LTRkOGQtODI0MS04OWVkN2YwODQ0ZWMiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYxOTM1MzYsImlhdCI6MTcwNTUwMjMzNiwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6IjBmNjUwOWFmLWIyNWMtNDlmNi05MTMxLWYxMjYyYzVhODE0OSJ9.3gjyeUPesAcsnAEEWFeadWR0e7pL3KEWyXIxNKDDYrxBmBusvnGbPMyhHqWvYRU9bq_NGxTbKnWTW2JfroS4EA; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; analytic_id=1705502340577555; _vt_shop=2346; _vt_user=2561364766504499_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A39%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=0&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _hjSessionUser_3397161=eyJpZCI6IjI2MTNmYjUyLTVjMGEtNTA2My1hYWFjLWYzODc0NzQ5MTYwZSIsImNyZWF0ZWQiOjE3MDU1MDIzMzc1MjEsImV4aXN0aW5nIjp0cnVlfQ==; _gcl_aw=GCL.1705603635.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; _DPSP_GA_gid=GA1.3.443097878.1705603636; _gac_UA-31155422-1=1.1705603636.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; lmd_traf=google_paid-1705502337156&google_paid-1705603635596; VTEXSC=sc=1; _uetsid=011762b0b63211ee827cc3fd2f2735a9; _uetvid=01186410b63211eea64dcd7e5cb73803; IPS=Midia=0&Campanha=0&Parceiro=0; CYB_ID=2561364766504499; CYB_AB=1; cybSessionID=1; SGTS=0994DAA03BCAC015953D68B12F34B6F7; janus_sid=51c6fd94-32ff-46bc-84fb-ff6c83b8734b; _dc_gtm_UA-31155422-1=1; _ga_HDQHBSR58M=GS1.1.1705619217.3.1.1705619879.56.0.0; _ga_X010SFPX09=GS1.1.1705619224.3.1.1705619879.56.0.0; urlLastSearch=http://www.drogariaspacheco.com.br/cuidados-para-cabeloshttp://www.drogariaspacheco.com.br/cuidados-para-cabelos; _vss=B4343A689896ADBB2D240D48981B9CA4F33B388CE69D7BB55F95DF3D82F4A622; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+20%3A18%3A02+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&geolocation=BR%3BSP&AwaitingReconsent=false; OptanonAlertBoxClosed=2024-01-18T23:18:02.601Z'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_cabelos, cookie)

Extraindo dados da categoria cuidados-para-cabelos
Há 7346 produtos na página
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13910&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13910O=OrderByNameASC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13910O=OrderByNameDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13910O=OrderByPriceDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13910O=OrderByPriceASC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13910O=OrderByTopS

## Higiene Pessoal

In [15]:
url_principal = 'https://www.drogariaspacheco.com.br'
extensao_categoria = f'/higiene-pessoal'
url_api_higiene = f'https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13604&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=077e06e2-5b38-40d9-bcfb-55976c11db6e; _gcl_au=1.1.1049783792.1705502337; _DPSP_GA=GA1.3.766143064.1705502337; vitrioLastClickPaidSource=google; origem=adwords; _pm_id=796501705502337144; lmd_cj=google_anuncio; lmd_orig=google_paid; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; _ga=GA1.1.766143064.1705502337; checkout.vtex.com=__ofid=82a755c0c9ef47bb978e338c6edd40a9; CheckoutOrderFormOwnership=; nav_id=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; _fbp=fb.2.1705502337489.1847261636; _tt_enable_cookie=1; _ttp=eJuRcjVOqT_O98h0GyZqIpoKbYh; legacy_p=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; chaordic_browserId=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_c=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_s=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6IkU3QkQ1QzZCNUQ5MDEwNUU0MUIzQkEzMDVEMTU3NTE2NDZGNDU3MUYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNTdmOWJjOGItYzlmYi00MzBkLWI1NjQtMzA3NTU1YTRlZWY4IiwiaWQiOiJhMmExM2YwZi02YTE2LTRkOGQtODI0MS04OWVkN2YwODQ0ZWMiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYxOTM1MzYsImlhdCI6MTcwNTUwMjMzNiwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6IjBmNjUwOWFmLWIyNWMtNDlmNi05MTMxLWYxMjYyYzVhODE0OSJ9.3gjyeUPesAcsnAEEWFeadWR0e7pL3KEWyXIxNKDDYrxBmBusvnGbPMyhHqWvYRU9bq_NGxTbKnWTW2JfroS4EA; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; analytic_id=1705502340577555; _vt_shop=2346; _vt_user=2561364766504499_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A39%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=0&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _hjSessionUser_3397161=eyJpZCI6IjI2MTNmYjUyLTVjMGEtNTA2My1hYWFjLWYzODc0NzQ5MTYwZSIsImNyZWF0ZWQiOjE3MDU1MDIzMzc1MjEsImV4aXN0aW5nIjp0cnVlfQ==; _gcl_aw=GCL.1705603635.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; _DPSP_GA_gid=GA1.3.443097878.1705603636; _gac_UA-31155422-1=1.1705603636.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; lmd_traf=google_paid-1705502337156&google_paid-1705603635596; VTEXSC=sc=1; _uetsid=011762b0b63211ee827cc3fd2f2735a9; _uetvid=01186410b63211eea64dcd7e5cb73803; IPS=Midia=0&Campanha=0&Parceiro=0; CYB_ID=2561364766504499; CYB_AB=1; cybSessionID=1; SGTS=0994DAA03BCAC015953D68B12F34B6F7; janus_sid=2df1f9b2-a832-44b9-b585-04e03dfae636; _dc_gtm_UA-31155422-1=1; urlLastSearch=http://www.drogariaspacheco.com.br/higiene-pessoalhttp://www.drogariaspacheco.com.br/higiene-pessoal; _vss=AA21188D512EB658B1D141BDFB50F7BB1B2FFCECD7984D067FF2D2831D1831E8; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+20%3A25%3A33+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&geolocation=BR%3BSP&AwaitingReconsent=false; OptanonAlertBoxClosed=2024-01-18T23:25:33.260Z; _ga_HDQHBSR58M=GS1.1.1705619217.3.1.1705620337.53.0.0; _ga_X010SFPX09=GS1.1.1705619224.3.1.1705620337.53.0.0'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_higiene, cookie)

Extraindo dados da categoria higiene-pessoal
Há 15381 produtos na página
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13604&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13604O=OrderByNameASC&&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13604O=OrderByNameDESC&&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13604O=OrderByPriceDESC&&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a13604O=OrderByPriceASC&&O=OrderByTopSaleDESC&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu

## Lojas Parceiras

In [16]:
url_principal = 'https://www.drogariaspacheco.com.br'
extensao_categoria = f'/lojas-parceiras'
url_api_lojas_parceiras = f'https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a10752&O=reviewRate+desc%2c+score+desc&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=077e06e2-5b38-40d9-bcfb-55976c11db6e; _gcl_au=1.1.1049783792.1705502337; _DPSP_GA=GA1.3.766143064.1705502337; vitrioLastClickPaidSource=google; origem=adwords; _pm_id=796501705502337144; lmd_cj=google_anuncio; lmd_orig=google_paid; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; _ga=GA1.1.766143064.1705502337; checkout.vtex.com=__ofid=82a755c0c9ef47bb978e338c6edd40a9; CheckoutOrderFormOwnership=; nav_id=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; _fbp=fb.2.1705502337489.1847261636; _tt_enable_cookie=1; _ttp=eJuRcjVOqT_O98h0GyZqIpoKbYh; legacy_p=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; chaordic_browserId=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_c=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_s=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6IkU3QkQ1QzZCNUQ5MDEwNUU0MUIzQkEzMDVEMTU3NTE2NDZGNDU3MUYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNTdmOWJjOGItYzlmYi00MzBkLWI1NjQtMzA3NTU1YTRlZWY4IiwiaWQiOiJhMmExM2YwZi02YTE2LTRkOGQtODI0MS04OWVkN2YwODQ0ZWMiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYxOTM1MzYsImlhdCI6MTcwNTUwMjMzNiwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6IjBmNjUwOWFmLWIyNWMtNDlmNi05MTMxLWYxMjYyYzVhODE0OSJ9.3gjyeUPesAcsnAEEWFeadWR0e7pL3KEWyXIxNKDDYrxBmBusvnGbPMyhHqWvYRU9bq_NGxTbKnWTW2JfroS4EA; analytic_id=1705502340577555; _vt_shop=2346; _vt_user=2561364766504499_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A39%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=0&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _hjSessionUser_3397161=eyJpZCI6IjI2MTNmYjUyLTVjMGEtNTA2My1hYWFjLWYzODc0NzQ5MTYwZSIsImNyZWF0ZWQiOjE3MDU1MDIzMzc1MjEsImV4aXN0aW5nIjp0cnVlfQ==; _gcl_aw=GCL.1705603635.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; _DPSP_GA_gid=GA1.3.443097878.1705603636; _gac_UA-31155422-1=1.1705603636.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; lmd_traf=google_paid-1705502337156&google_paid-1705603635596; VTEXSC=sc=1; _uetsid=011762b0b63211ee827cc3fd2f2735a9; _uetvid=01186410b63211eea64dcd7e5cb73803; IPS=Midia=0&Campanha=0&Parceiro=0; CYB_ID=2561364766504499; CYB_AB=1; cybSessionID=1; _dc_gtm_UA-31155422-1=1; janus_sid=ed0385f7-0bbe-49cc-ad9b-fda4b7410e95; ISICI=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; ISS=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; SGTS=C725567B04AA8AD000002B4C1ED850EA; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOiJkcHNwLWF0aXZhX3BoaWxpcHNfMDgtMTItMjMtbWFyY2EtbWVudS0xMjB4MTIwIiwiY3VycmVuY3lDb2RlIjoiQlJMIiwiY3VycmVuY3lTeW1ib2wiOiJSJCIsImNvdW50cnlDb2RlIjoiQlJBIiwiY3VsdHVyZUluZm8iOiJwdC1CUiIsImNoYW5uZWxQcml2YWN5IjoicHVibGljIn0; _ga_HDQHBSR58M=deleted; urlLastSearch=http://www.drogariaspacheco.com.br/lojas-parceirashttp://www.drogariaspacheco.com.br/lojas-parceiras; _vss=3BEC7FA0D15C9082B0F7F9085EC31636F15A504F01C6D5EE2C52A529F0DF7D2A; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+20%3A26%3A43+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&geolocation=BR%3BSP&AwaitingReconsent=false; OptanonAlertBoxClosed=2024-01-18T23:26:43.915Z; _ga_HDQHBSR58M=GS1.1.1705619217.3.1.1705620407.45.0.0; _ga_X010SFPX09=GS1.1.1705619224.3.1.1705620407.45.0.0'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_lojas_parceiras, cookie)

Extraindo dados da categoria lojas-parceiras
Há 672 produtos na página
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a10752&O=reviewRate+desc%2c+score+desc&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 672 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a10752O=OrderByNameASC&&O=reviewRate+desc%2c+score+desc&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a10752O=OrderByNameDESC&&O=reviewRate+desc%2c+score+desc&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a10752O=OrderByPriceDESC&&O=reviewRate+desc%2c+score+desc&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=H%3a10752O=OrderByPriceASC&&O=reviewRate+desc%2c+score+desc&PS=48&sl=bb3

## Medicamentos

### Método API

In [17]:
'''
url_principal = 'https://www.drogariaspacheco.com.br'
extensao_categoria = f'/medicamentos'
url_api_medicamentos = f'https://www.drogariaspacheco.com.br/buscapagina?fq=C%3a%2f800%2f&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=077e06e2-5b38-40d9-bcfb-55976c11db6e; _gcl_au=1.1.1049783792.1705502337; _DPSP_GA=GA1.3.766143064.1705502337; vitrioLastClickPaidSource=google; origem=adwords; _pm_id=796501705502337144; lmd_cj=google_anuncio; lmd_orig=google_paid; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; _ga=GA1.1.766143064.1705502337; checkout.vtex.com=__ofid=82a755c0c9ef47bb978e338c6edd40a9; CheckoutOrderFormOwnership=; nav_id=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; _fbp=fb.2.1705502337489.1847261636; _tt_enable_cookie=1; _ttp=eJuRcjVOqT_O98h0GyZqIpoKbYh; legacy_p=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; chaordic_browserId=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_c=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_s=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6IkU3QkQ1QzZCNUQ5MDEwNUU0MUIzQkEzMDVEMTU3NTE2NDZGNDU3MUYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNTdmOWJjOGItYzlmYi00MzBkLWI1NjQtMzA3NTU1YTRlZWY4IiwiaWQiOiJhMmExM2YwZi02YTE2LTRkOGQtODI0MS04OWVkN2YwODQ0ZWMiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYxOTM1MzYsImlhdCI6MTcwNTUwMjMzNiwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6IjBmNjUwOWFmLWIyNWMtNDlmNi05MTMxLWYxMjYyYzVhODE0OSJ9.3gjyeUPesAcsnAEEWFeadWR0e7pL3KEWyXIxNKDDYrxBmBusvnGbPMyhHqWvYRU9bq_NGxTbKnWTW2JfroS4EA; analytic_id=1705502340577555; _vt_shop=2346; _vt_user=2561364766504499_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A39%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=0&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _hjSessionUser_3397161=eyJpZCI6IjI2MTNmYjUyLTVjMGEtNTA2My1hYWFjLWYzODc0NzQ5MTYwZSIsImNyZWF0ZWQiOjE3MDU1MDIzMzc1MjEsImV4aXN0aW5nIjp0cnVlfQ==; _gcl_aw=GCL.1705603635.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; _DPSP_GA_gid=GA1.3.443097878.1705603636; _gac_UA-31155422-1=1.1705603636.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; lmd_traf=google_paid-1705502337156&google_paid-1705603635596; VTEXSC=sc=1; _uetsid=011762b0b63211ee827cc3fd2f2735a9; _uetvid=01186410b63211eea64dcd7e5cb73803; IPS=Midia=0&Campanha=0&Parceiro=0; CYB_ID=2561364766504499; CYB_AB=1; cybSessionID=1; janus_sid=ed0385f7-0bbe-49cc-ad9b-fda4b7410e95; ISICI=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; ISS=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; SGTS=C725567B04AA8AD000002B4C1ED850EA; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOiJkcHNwLWF0aXZhX3BoaWxpcHNfMDgtMTItMjMtbWFyY2EtbWVudS0xMjB4MTIwIiwiY3VycmVuY3lDb2RlIjoiQlJMIiwiY3VycmVuY3lTeW1ib2wiOiJSJCIsImNvdW50cnlDb2RlIjoiQlJBIiwiY3VsdHVyZUluZm8iOiJwdC1CUiIsImNoYW5uZWxQcml2YWN5IjoicHVibGljIn0; _ga_HDQHBSR58M=deleted; _dc_gtm_UA-31155422-1=1; _ga_HDQHBSR58M=GS1.1.1705619217.3.1.1705620672.60.0.0; _ga_X010SFPX09=GS1.1.1705619224.3.1.1705620672.60.0.0; urlLastSearch=http://www.drogariaspacheco.com.br/medicamentoshttp://www.drogariaspacheco.com.br/medicamentos; _vss=EDC3639ABF394DE554F31FE6A9A31061B35019BBBCD05F123E4E529273B7B57C; OptanonAlertBoxClosed=2024-01-18T23:31:19.949Z; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+20%3A31%3A20+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&geolocation=BR%3BSP&AwaitingReconsent=false'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_medicamentos, cookie)
'''

Extraindo dados da categoria medicamentos
Há 10217 produtos na página
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=C%3a%2f800%2f&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=C%3a%2f800%2fO=OrderByNameASC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=C%3a%2f800%2fO=OrderByNameDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=C%3a%2f800%2fO=OrderByPriceDESC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=0
Falha na solicitação. Código de status: 500
Aguardando 10 segundos antes da próxima tentativa...
Extraiu 2400 ids
Extraindo ids de https://www.drogariaspacheco.com.br/buscapagina?fq=C%3a%2f800%2fO=OrderByPriceASC&&PS=48&sl=bb3695b9-ec9e-49c5-9e6a-715118583877&cc=48&sm=

### Método iteração de páginas e script

In [19]:
def extrai_skuid_medicamento(url_busca, headers, initial_page=1):
    """
    Extracts the shelfProductIds (skuIDs) for medicinal products from a paginated website.

    Parameters:
    - url_busca (str): The base URL for the search with pagination, e.g., 'http://www.example.com/medicamentos'.
    - headers (dict): HTTP headers to be included in the request.
    - initial_page (int): The starting page number for pagination. Default is 1.

    Returns:
    - list: A list of skuIDs extracted from the website.

    Note:
    The function iterates through paginated pages, extracts the skuIDs from the JSON data embedded
    in the page's script tags, and continues until no more skuIDs are found or an error occurs.

    Example:
    ```python
    url = 'http://www.example.com/medicamentos'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    skuIDs = extrai_skuid_medicamento(url, headers)
    print(skuIDs)
    ```
    """
    s = requests.Session()
    page = initial_page
    done = False
    skuID_meds = []
    while not done:
        url = url_busca + f'?PageNumber={page}'
        try:
            r = s.get(url, headers=headers)
            page += 1
            try:
                soup = BeautifulSoup(r.text, 'html.parser')
                json_object = json.loads(soup.find_all('script')[26].text.split('(')[-1].split(')')[0])
                if len(json_object['shelfProductIds']) != 0:
                    skuID_meds.extend(json_object['shelfProductIds'])
                else:
                    done = True
            except Exception as e:
                if e == "'NoneType' object has no attribute 'text'":
                    done = True
                print(f'Error page {page-1} ---> {e}')
        except:
            print('dormiu')
            time.sleep(5)
    return skuID_meds

In [20]:
def pipeline_medicamento(url_principal, extensao_categoria, cookie, initial_page):
    """
    Performs a data extraction pipeline for medicinal products from a paginated website.

    Parameters:
    - url_principal (str): The main URL of the website.
    - extensao_categoria (str): The category extension for medicinal products, e.g., '/medicamentos'.
    - cookie (str): The cookie information for the HTTP request.
    - initial_page (int): The starting page number for pagination.

    Returns:
    - None

    Note:
    The function extracts product information, including SKU IDs, from the specified category on the website.
    It performs pagination, accesses individual product pages, and exports the collected data to a CSV file.

    Example:
    ```python
    url = 'http://www.example.com'
    category_extension = '/medicamentos'
    cookie_info = 'your_cookie_information_here'
    initial_page_number = 1
    pipeline_medicamento(url, category_extension, cookie_info, initial_page_number)
    ```
    """
    inicio = time.time()
    categoria = re.search(r'/(.*)', extensao_categoria).group(1)
    print(f'Extraindo dados da categoria {categoria}')
    qtd_produtos = quantidade_produtos_na_pagina(url_principal, extensao_categoria, cookie)
    print(f'Há {qtd_produtos} produtos na página')
    headers_categoria = cria_header(referer=url_principal+extensao_categoria,cookies=cookie)
    url_busca = url_principal+extensao_categoria
    skuId_categoria = extrai_skuid_medicamento(url_busca, headers_categoria, initial_page)
    exporta_skuId(skuId_categoria, categoria)
    print(f'Há {len(skuId_categoria)} skuIDs de produtos para serem usados na extração')

    todos_produtos = []
    passo = 50
    for i in range(0, len(skuId_categoria), passo):
        sku_produtos = skuId_categoria[i:i+passo]
        url = url_catalogo(sku_produtos)
        r = acessa_site(url, headers_categoria)
        soup = BeautifulSoup(r.text, 'html.parser')
        produtos = soup.text.split('{"productId":"')[1:]

        produtos_da_pagina = []
        for produto in produtos:
            padrao1 = re.compile(
                r'"(?:productName|brand|link|description|itemId|ean)":"?([^"]*)"?')
                
            padrao2 = re.compile(r'"(?:Price|ListPrice|PriceWithoutDiscount|IsAvailable)":([^",]*)')
            corresp1 = padrao1.findall(produto)
            corresp2 = padrao2.findall(produto)
            correspondencias = corresp1[:6]+corresp2[:4]
            if correspondencias:
                # Agora, correspondencias é uma lista de tuplas com os valores capturados para cada campo
                campos = [
                    "productName",
                    "brand",
                    "link",
                    "description",
                    "skuId",
                    "ean",
                    "Price",
                    "ListPrice",
                    "PriceWithoutDiscount",
                    "IsAvailable",
                ]
                infos_produto = dict(zip(campos, correspondencias))
            else:
                print("Nenhuma correspondência encontrada.")

            # Adiciona os dicionarios com as infos dos produtos na lista da pagina
            produtos_da_pagina.append(infos_produto)
        # Unifica as listas das paginas em uma lista maior
        todos_produtos.append(produtos_da_pagina)
    # Transforma em uma única lista
    todos_produtos = [produtos for pagina in todos_produtos for produtos in pagina]
    exportar_para_csv(todos_produtos, categoria)

    qtd_extraida = len(todos_produtos)
    qtd_faltante = qtd_produtos - qtd_extraida
    fim = time.time()
    print(f'Extração de {qtd_extraida} produtos finalizada - Faltaram {qtd_faltante} produtos - Tempo de Extração: {fim-inicio} segundos')

In [21]:
url_principal = 'https://www.drogariaspacheco.com.br'
extensao_categoria = f'/medicamentos'
cookie = f'VtexRCMacIdv7=077e06e2-5b38-40d9-bcfb-55976c11db6e; _gcl_au=1.1.1049783792.1705502337; _DPSP_GA=GA1.3.766143064.1705502337; vitrioLastClickPaidSource=google; origem=adwords; _pm_id=796501705502337144; lmd_cj=google_anuncio; lmd_orig=google_paid; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; _ga=GA1.1.766143064.1705502337; checkout.vtex.com=__ofid=82a755c0c9ef47bb978e338c6edd40a9; CheckoutOrderFormOwnership=; nav_id=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; _fbp=fb.2.1705502337489.1847261636; _tt_enable_cookie=1; _ttp=eJuRcjVOqT_O98h0GyZqIpoKbYh; legacy_p=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; chaordic_browserId=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_c=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; legacy_s=d9ee7205-c56e-4e42-8fb4-d3a22f05d086; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6IkU3QkQ1QzZCNUQ5MDEwNUU0MUIzQkEzMDVEMTU3NTE2NDZGNDU3MUYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNTdmOWJjOGItYzlmYi00MzBkLWI1NjQtMzA3NTU1YTRlZWY4IiwiaWQiOiJhMmExM2YwZi02YTE2LTRkOGQtODI0MS04OWVkN2YwODQ0ZWMiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYxOTM1MzYsImlhdCI6MTcwNTUwMjMzNiwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6IjBmNjUwOWFmLWIyNWMtNDlmNi05MTMxLWYxMjYyYzVhODE0OSJ9.3gjyeUPesAcsnAEEWFeadWR0e7pL3KEWyXIxNKDDYrxBmBusvnGbPMyhHqWvYRU9bq_NGxTbKnWTW2JfroS4EA; analytic_id=1705502340577555; _vt_shop=2346; _vt_user=2561364766504499_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A39%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=0&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _hjSessionUser_3397161=eyJpZCI6IjI2MTNmYjUyLTVjMGEtNTA2My1hYWFjLWYzODc0NzQ5MTYwZSIsImNyZWF0ZWQiOjE3MDU1MDIzMzc1MjEsImV4aXN0aW5nIjp0cnVlfQ==; _gcl_aw=GCL.1705603635.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; _DPSP_GA_gid=GA1.3.443097878.1705603636; _gac_UA-31155422-1=1.1705603636.Cj0KCQiAtaOtBhCwARIsAN_x-3Jzb2rjrMzotETTIt620SxXffCvMCnkvdHfd1FY2sIm8-KuOP8pkdwaApHREALw_wcB; lmd_traf=google_paid-1705502337156&google_paid-1705603635596; VTEXSC=sc=1; _uetsid=011762b0b63211ee827cc3fd2f2735a9; _uetvid=01186410b63211eea64dcd7e5cb73803; IPS=Midia=0&Campanha=0&Parceiro=0; CYB_ID=2561364766504499; CYB_AB=1; cybSessionID=1; janus_sid=ed0385f7-0bbe-49cc-ad9b-fda4b7410e95; ISICI=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; ISS=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; SGTS=C725567B04AA8AD000002B4C1ED850EA; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOiJkcHNwLWF0aXZhX3BoaWxpcHNfMDgtMTItMjMtbWFyY2EtbWVudS0xMjB4MTIwIiwiY3VycmVuY3lDb2RlIjoiQlJMIiwiY3VycmVuY3lTeW1ib2wiOiJSJCIsImNvdW50cnlDb2RlIjoiQlJBIiwiY3VsdHVyZUluZm8iOiJwdC1CUiIsImNoYW5uZWxQcml2YWN5IjoicHVibGljIn0; _ga_HDQHBSR58M=deleted; _dc_gtm_UA-31155422-1=1; _ga_HDQHBSR58M=GS1.1.1705619217.3.1.1705620672.60.0.0; _ga_X010SFPX09=GS1.1.1705619224.3.1.1705620672.60.0.0; urlLastSearch=http://www.drogariaspacheco.com.br/medicamentoshttp://www.drogariaspacheco.com.br/medicamentos; _vss=EDC3639ABF394DE554F31FE6A9A31061B35019BBBCD05F123E4E529273B7B57C; OptanonAlertBoxClosed=2024-01-18T23:31:19.949Z; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+20%3A31%3A20+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=c19d81aa-ba05-4b6a-a3f8-2e14ebf96a7c&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&geolocation=BR%3BSP&AwaitingReconsent=false'
pipeline_medicamento(url_principal, extensao_categoria, cookie, 1)

Extraindo dados da categoria medicamentos
Há 10206 produtos na página
Arquivo medicamentos_ids.json exportado com sucesso
Há 5934 skuIDs de produtos para serem usados na extração
Extração de 5895 produtos finalizada - Faltaram 4311 produtos - Tempo de Extração: 570.9710371494293 segundos


### Unificação dos arquivos para Entrega Final

In [5]:
arquivos = [
    'beleza_scrapping.csv',
    'cuidados-para-cabelos_scrapping.csv',
    'higiene-pessoal_scrapping.csv',
    'lojas-parceiras_scrapping.csv',
    'mamae-e-bebe_scrapping.csv',
    'medicamentos_scrapping.csv',
    'saude-bem-estar_scrapping.csv'
]

columns_rename = {'productName': 'produto',
                  'brand': 'marca',
                  'description': 'descricao',
                  'Price': 'preco com desconto',
                  'ListPrice': 'preco sem desconto'}

ordem_colunas = ['ean', 'produto', 'marca', 'farmacia', 'preco com desconto', 'preco sem desconto', '% desconto', 'descricao']

df = pd.DataFrame()
for arquivo in arquivos:
    categoria = arquivo.split('_scrapping')[0]
    df_temp = pd.read_csv(f'scrapping_produtos_por_categoria/'+arquivo)
    df_temp['categoria'] = categoria
    df = pd.concat([df, df_temp], axis=0)

df = df.reset_index(drop=True)
df = df[~df.duplicated()]
df.rename(columns=columns_rename, inplace=True)
df['farmacia'] = 'Drogaria Pacheco'
df['% desconto'] = (1 - (df['preco com desconto'] / df['preco sem desconto'])) * 100
df = df[ordem_colunas]

df.to_csv('arquivo_final.csv')