# Bibliotecas

In [1]:
# Suppress DeprecationWarning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
import time
import re
import csv

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Scrapping Drogaria São Paulo

## Informações Gerais sobre o processo
---
Basicamente o sistema de obtenção de dados nesse site é idêntico ao da Drogaria Pacheco, a qual o Pedro Remus (aka Remin, o homem Inovação) é responsável pela extração e já fez parte do trabalho.

Curiosidade: os sistemas são idênticos porque as drogarias pertencem ao mesmo grupo, que se chama DPSP

---

O scrapping será divido nas diferentes seções que o site apresenta: 
- Medicamentos ( */medicamentos* );
- Saúde e Bem-estar ( */saude-bem-estar* );
- Mamãe e Bebê ( */mamae-e-bebe* );
- Beleza ( */beleza* );
- Cabelos ( */cuidados-para-cabelos* );
- Higiene Pessoal ( */higiene-pessoal* );
- Lojas Parceiras ( */lojas-parceiras* );
- Serviços ( */servicos* );
- Marca Própria ( */marca/propria* ).

Para conseguir extrair os dados, antes de tudo é necessário ter acesso ao skuID dos produtos exibidos na página. Isso porque a API que fornece as informações dos produtos precisa desses ID's. 

Dessa forma, o trabalho pode ser resumido em:
- Obter skudID's dos produtos na *API[1]*
- Buscar produtos usando esses ID's na *API[2]*



## Funções Comuns

In [8]:
def acessa_site(url, headers, max_tentativas=3, intervalo_tentativas=10):
    for tentativa in range(1, max_tentativas, 1):
        try:
            r = requests.get(url, headers=headers)
            if r.status_code==200 or r.status_code==206:
                #print(f'Solicitação bem-sucedida para acessar a url: {url}')
                return r
            else:
                print(f"Falha na solicitação. Código de status: {r.status_code}")
        except Exception as e:
            print(f"Erro durante a solicitação: {e}")

        if tentativa < max_tentativas:
            print(f"Aguardando {intervalo_tentativas} segundos antes da próxima tentativa...")
            time.sleep(intervalo_tentativas)
            
    print(f"Até {max_tentativas} tentativas foram feitas, mas a solicitação não foi bem-sucedida.")
    return None

In [9]:
def url_catalogo(skuIds):
    url_catalogo = 'https://www.drogariasaopaulo.com.br/api/catalog_system/pub/products/search?_from=0&_to=49'
    valores_formatados = []
    # Loop para formatar os valores no padrão &fq=skuId:<valor>
    for skuId in skuIds:
        formatacao = f'&fq=skuId:{skuId}'
        valores_formatados.append(formatacao)
    sufixo_busca = ''.join(valores_formatados)
    return url_catalogo + sufixo_busca

In [10]:
def cria_header(referer, cookies):
    header = {
    'authority': 'www.drogariasaopaulo.com.br',
    'accept': '*/*',
    'accept-language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'cookie': cookies,
    'if-none-match': '624CDF92BB674EDAAF34D2BC6E267B82',
    'referer': referer,
    'sec-ch-ua': '"Opera GX";v="105", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
    }
    return header

In [11]:
def extrai_skuid(url_api, headers, page_final=None):
    skuId = []
    page = 1
    done = False
    while not done:
        url = url_api + f'&PageNumber={page}'
        r = acessa_site(url, headers=headers)
        page += 1
        soup = BeautifulSoup(r.text, 'html.parser')
        sku_produtos = soup.findAll('input', class_='product-sku')
        if len(sku_produtos) != 0:
            sku_pagina = []
            for produto in sku_produtos:
                sku = produto['value']
                sku_pagina.append(sku)
            skuId.extend(sku_pagina)
        else:
            done = True
        if page_final is not None and page >= page_final:
            done = True
    return skuId

In [12]:
def exporta_skuId(skuId_list, categoria, indent=2):
    json_data = json.dumps(skuId_list, indent=indent)
    file_name = f'ids_produtos/' + categoria + '_ids.json'
    with open(file_name, 'w') as _:
        _.write(json_data)
    print(f'Arquivo {file_name} exportado com sucesso')

In [13]:
def exporta_scrapping(lista_produtos, categoria, indent=2):
    json_data = json.dumps(lista_produtos, indent=indent)
    file_name = categoria + f'_scrapping.json'
    with open(file_name, 'w') as _:
        _.write(json_data)

In [14]:
def quantidade_produtos_na_pagina(url_principal, extensao_categoria, cookie):
    header = cria_header(url_principal+extensao_categoria, cookie)
    r = acessa_site(url_principal+extensao_categoria, header)
    soup = BeautifulSoup(r.text, 'html.parser')
    qtd = soup.find('p', class_='searchResultsTime').find('span', class_='value').text
    return int(qtd)

In [15]:
def exportar_para_csv(lista_de_dicionarios, categoria):
    nome_arquivo = f'scrapping_produtos_por_categoria/' + categoria + f'_scrapping.csv'
    # Extrai as chaves do primeiro dicionário na lista
    chaves = lista_de_dicionarios[0].keys() if lista_de_dicionarios else []

    with open(nome_arquivo, 'w', newline='', encoding='utf-8') as arquivo_csv:
        # Cria um escritor CSV com as chaves como cabeçalho
        escritor_csv = csv.DictWriter(arquivo_csv, fieldnames=chaves)

        # Escreve o cabeçalho no arquivo
        escritor_csv.writeheader()

        # Escreve cada dicionário como uma linha no arquivo
        for dicionario in lista_de_dicionarios:
            escritor_csv.writerow(dicionario)

In [16]:
def tabela_scrapping(arquivo):
    try:
        # Lê o arquivo CSV e coloca-o em um DataFrame
        df = pd.read_csv(arquivo)

        # Imprime as primeiras linhas do DataFrame
        print("Primeiras linhas do DataFrame:")
        print(df.head())

        # Imprime informações básicas sobre o DataFrame
        print("\nInformações sobre o DataFrame:")
        print(df.info())

        # Imprime estatísticas descritivas do DataFrame
        print("\nEstatísticas descritivas:")
        print(df.describe())

    except FileNotFoundError:
        print(f"Erro: O arquivo {arquivo} não foi encontrado.")
    except Exception as e:
        print(f"Erro ao processar o arquivo: {e}")

In [17]:
def pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_busca, cookie):
    """
    Performs a data extraction pipeline for a specified category on a paginated website using an API endpoint.

    Parameters:
    - url_principal (str): The main URL of the website.
    - extensao_categoria (str): The category extension, e.g., '/categoria'.
    - url_api_busca (str): The API endpoint URL for category-specific product search.
    - cookie (str): The cookie information for the HTTP request.

    Returns:
    - None

    Note:
    The function extracts product information, including SKU IDs, from the specified category using an API endpoint.
    It performs pagination, accesses individual product pages, and exports the collected data to a CSV file.

    Example:
    ```python
    url = 'http://www.example.com'
    category_extension = '/categoria'
    api_search_url = 'http://api.example.com/search'
    cookie_info = 'your_cookie_information_here'
    pipeline_extracao_categoria(url, category_extension, api_search_url, cookie_info)
    ```
    """
    inicio = time.time()
    categoria = re.search(r'/(.*)', extensao_categoria).group(1)
    print(f'Extraindo dados da categoria {categoria}')
    qtd_produtos = quantidade_produtos_na_pagina(url_principal, extensao_categoria, cookie)
    print(f'Há {qtd_produtos} produtos na página')
    headers_categoria = cria_header(referer=url_api_busca,cookies=cookie)

    filters = ['', 'O=OrderByNameASC&', 'O=OrderByNameDESC&', 'O=OrderByPriceDESC&', 'O=OrderByPriceASC&', 'O=OrderByTopSaleDESC&']
    skuId_all = []
    ampersand_index = url_api_busca.find('&')
    for filter_str in filters:
        modified_url = url_api_busca[:ampersand_index] + filter_str + url_api_busca[ampersand_index:]
        print(f'Extraindo ids de {modified_url}')
        skuId_categoria = extrai_skuid(modified_url, headers=headers_categoria)
        print(f'Extraiu {len(skuId_categoria)} ids')
        skuId_all.extend(skuId_categoria)
    # Remove duplicates from skuId_categoria_all
    skuId_all = list(set(skuId_all))
    
    exporta_skuId(skuId_all, categoria)
    print(f'Há {len(skuId_all)} skuIDs únicos de produtos para serem usados na extração')

    todos_produtos = []
    passo = 50
    for i in range(0, len(skuId_all), passo):
        sku_produtos = skuId_all[i:i+passo]
        url = url_catalogo(sku_produtos)
        r = acessa_site(url, headers_categoria)
        soup = BeautifulSoup(r.text, 'html.parser')
        produtos = soup.text.split('{"productId":"')[1:]

        produtos_da_pagina = []
        for produto in produtos:
            padrao1 = re.compile(
                r'"(?:productName|brand|link|description|itemId|ean)":"?([^"]*)"?')
                
            padrao2 = re.compile(r'"(?:Price|ListPrice|PriceWithoutDiscount|IsAvailable)":([^",]*)')
            corresp1 = padrao1.findall(produto)
            corresp2 = padrao2.findall(produto)
            correspondencias = corresp1[:6]+corresp2[:4]
            if correspondencias:
                # Agora, correspondencias é uma lista de tuplas com os valores capturados para cada campo
                campos = [
                    "productName",
                    "brand",
                    "link",
                    "description",
                    "skuId",
                    "ean",
                    "Price",
                    "ListPrice",
                    "PriceWithoutDiscount",
                    "IsAvailable",
                ]
                infos_produto = dict(zip(campos, correspondencias))
            else:
                print("Nenhuma correspondência encontrada.")

            # Adiciona os dicionarios com as infos dos produtos na lista da pagina
            produtos_da_pagina.append(infos_produto)
        # Unifica as listas das paginas em uma lista maior
        todos_produtos.append(produtos_da_pagina)
    # Transforma em uma única lista
    todos_produtos = [produtos for pagina in todos_produtos for produtos in pagina]
    exportar_para_csv(todos_produtos, categoria)

    qtd_extraida = len(todos_produtos)
    qtd_faltante = qtd_produtos - qtd_extraida
    fim = time.time()
    print(f'Extração de {qtd_extraida} produtos finalizada - Faltaram {qtd_faltante} produtos - Tempo de Extração: {fim-inicio} segundos')

## Medicamentos

In [26]:
def extrai_skuid_medicamento(url_busca, headers, initial_page=1):
    """
    Extracts the shelfProductIds (skuIDs) for medicinal products from a paginated website.

    Parameters:
    - url_busca (str): The base URL for the search with pagination, e.g., 'http://www.example.com/medicamentos'.
    - headers (dict): HTTP headers to be included in the request.
    - initial_page (int): The starting page number for pagination. Default is 1.

    Returns:
    - list: A list of skuIDs extracted from the website.

    Note:
    The function iterates through paginated pages, extracts the skuIDs from the JSON data embedded
    in the page's script tags, and continues until no more skuIDs are found or an error occurs.

    Example:
    ```python
    url = 'http://www.example.com/medicamentos'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    skuIDs = extrai_skuid_medicamento(url, headers)
    print(skuIDs)
    ```
    """
    s = requests.Session()
    page = initial_page
    done = False
    skuID_meds = []
    while not done:
        url = url_busca + f'?PageNumber={page}'
        try:
            r = s.get(url, headers=headers)
            page += 1
            try:
                soup = BeautifulSoup(r.text, 'html.parser')
                json_object = json.loads(soup.find_all('script')[26].text.split('(')[-1].split(')')[0])
                if len(json_object['shelfProductIds']) != 0:
                    skuID_meds.extend(json_object['shelfProductIds'])
                else:
                    done = True
            except Exception as e:
                if e == "'NoneType' object has no attribute 'text'":
                    done = True
                print(f'Error page {page-1} ---> {e}')
        except:
            print('dormiu')
            time.sleep(5)
    return skuID_meds

In [27]:
def pipeline_medicamento(url_principal, extensao_categoria, cookie, initial_page):
    """
    Performs a data extraction pipeline for medicinal products from a paginated website.

    Parameters:
    - url_principal (str): The main URL of the website.
    - extensao_categoria (str): The category extension for medicinal products, e.g., '/medicamentos'.
    - cookie (str): The cookie information for the HTTP request.
    - initial_page (int): The starting page number for pagination.

    Returns:
    - None

    Note:
    The function extracts product information, including SKU IDs, from the specified category on the website.
    It performs pagination, accesses individual product pages, and exports the collected data to a CSV file.

    Example:
    ```python
    url = 'http://www.example.com'
    category_extension = '/medicamentos'
    cookie_info = 'your_cookie_information_here'
    initial_page_number = 1
    pipeline_medicamento(url, category_extension, cookie_info, initial_page_number)
    ```
    """
    inicio = time.time()
    categoria = re.search(r'/(.*)', extensao_categoria).group(1)
    print(f'Extraindo dados da categoria {categoria}')
    qtd_produtos = quantidade_produtos_na_pagina(url_principal, extensao_categoria, cookie)
    print(f'Há {qtd_produtos} produtos na página')
    headers_categoria = cria_header(referer=url_principal+extensao_categoria,cookies=cookie)
    url_busca = url_principal+extensao_categoria

    skuId_categoria = extrai_skuid_medicamento(url_busca, headers_categoria, initial_page)
    
    exporta_skuId(skuId_categoria, categoria)
    print(f'Há {len(skuId_categoria)} skuIDs de produtos para serem usados na extração')

    todos_produtos = []
    passo = 50
    for i in range(0, len(skuId_categoria), passo):
        sku_produtos = skuId_categoria[i:i+passo]
        url = url_catalogo(sku_produtos)
        r = acessa_site(url, headers_categoria)
        soup = BeautifulSoup(r.text, 'html.parser')
        produtos = soup.text.split('{"productId":"')[1:]

        produtos_da_pagina = []
        for produto in produtos:
            padrao1 = re.compile(
                r'"(?:productName|brand|link|description|itemId|ean)":"?([^"]*)"?')
                
            padrao2 = re.compile(r'"(?:Price|ListPrice|PriceWithoutDiscount|IsAvailable)":([^",]*)')
            corresp1 = padrao1.findall(produto)
            corresp2 = padrao2.findall(produto)
            correspondencias = corresp1[:6]+corresp2[:4]
            if correspondencias:
                # Agora, correspondencias é uma lista de tuplas com os valores capturados para cada campo
                campos = [
                    "productName",
                    "brand",
                    "link",
                    "description",
                    "skuId",
                    "ean",
                    "Price",
                    "ListPrice",
                    "PriceWithoutDiscount",
                    "IsAvailable",
                ]
                infos_produto = dict(zip(campos, correspondencias))
            else:
                print("Nenhuma correspondência encontrada.")

            # Adiciona os dicionarios com as infos dos produtos na lista da pagina
            produtos_da_pagina.append(infos_produto)
        # Unifica as listas das paginas em uma lista maior
        todos_produtos.append(produtos_da_pagina)
    # Transforma em uma única lista
    todos_produtos = [produtos for pagina in todos_produtos for produtos in pagina]
    exportar_para_csv(todos_produtos, categoria)

    qtd_extraida = len(todos_produtos)
    qtd_faltante = qtd_produtos - qtd_extraida
    fim = time.time()
    print(f'Extração de {qtd_extraida} produtos finalizada - Faltaram {qtd_faltante} produtos - Tempo de Extração: {fim-inicio} segundos')

In [28]:
'''
url_principal = 'https://www.drogariasaopaulo.com.br'
extensao_categoria = f'/medicamentos'
url_api_medicamentos = f'https://www.drogariasaopaulo.com.br/buscapagina?fq=C%3a%2f800%2f&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=3b0b86ac-b8ae-4052-bd9f-64025695b229; checkout.vtex.com=__ofid=bc9c71585fac410f96b210f24c259e87; CheckoutOrderFormOwnership=; _gcl_aw=GCL.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _gcl_au=1.1.1537744002.1705172455; vitrioLastClickPaidSource=google; origem=adwords; _DPSP_GA=GA1.3.549971006.1705172455; _gac_UA-21096705-11=1.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _ga=GA1.1.549971006.1705172455; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; nav_id=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_p=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; chaordic_browserId=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_c=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_s=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; _fbp=fb.2.1705172456301.954212052; _pm_id=633901705172456686; _tt_enable_cookie=1; _ttp=S5wcerEBpiuWWywCPGgvd_9AIb5; lmd_cj=google_anuncio; lmd_orig=google_paid; lmd_traf=google_paid-1705172457230; _hjSessionUser_3478270=eyJpZCI6IjRhZGU5NTk2LWUzYjQtNWQ3OS1hM2Q1LWJkOWU1ZjlmZDQxMCIsImNyZWF0ZWQiOjE3MDUxNzI0NTc3NTIsImV4aXN0aW5nIjpmYWxzZX0=; analytic_id=1705172462367658; _vt_shop=2347; _vt_user=5330046838616593_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Sat+Jan+13+2024+16%3A01%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=https%3A%2F%2Fwww.drogariasaopaulo.com.br%2F%3Fgad_source%3D1%26gclid%3DCj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _DPSP_GA_gid=GA1.3.1389298824.1705364098; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6Ijc1RDFEOEQ4ODdDRTdGRjlBODg0RDc3QjYzM0M3NEE5NjI5QUZDQkYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNGY5NDgwMzUtMjRhNC00Y2E5LTk2NWYtZGQzMTVjODExYTM2IiwiaWQiOiJhZWIyODcwYy01YWJiLTQxNGMtYjljZS1mODE5ODI3NTc5ZDIiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYwNTUyOTksImlhdCI6MTcwNTM2NDA5OSwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6ImYwYzY1NzUzLWEyNGUtNDVhZS1iNjMyLWM3NGUzYjkzMWU2YSJ9.0Fmqr3NkbFlHV4qgVh-uYChJXa8iX1uwJXW2eKRMBUnn-xWYvI-k1BG5WgkDWR90VoVaNgCNmugvgm06MzxbHQ; IPS=Midia=0&Campanha=0&Parceiro=0; VTEXSC=sc=1; SGTS=22FF14762A97F1122F9C24FA0AE43FDD; _ga_S3V8W6KJ9D=GS1.1.1705502934.5.1.1705503420.60.0.0; _ga_WV12XD1259=GS1.1.1705502934.5.1.1705503420.60.0.0; urlLastSearch=http://www.drogariasaopaulo.com.br/medicamentoshttp://www.drogariasaopaulo.com.br/medicamentos; janus_sid=ee6d7d79-9869-4ed4-a5da-5a655c44e540; _vss=A13749831100AF6EF19F668593CC382C5182FE019F5069C6B713B0F21613BB3C; OptanonConsent=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A57%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&AwaitingReconsent=false&geolocation=BR%3BSP; OptanonAlertBoxClosed=2024-01-17T14:57:05.783Z'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_medicamentos, cookie)
'''

Extraindo dados da categoria medicamentos
Há 10192 produtos na página
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=C%3a%2f800%2f&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=C%3a%2f800%2fO=OrderByNameASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=C%3a%2f800%2fO=OrderByNameDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=C%3a%2f800%2fO=OrderByPriceDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=C%3a%2f800%2fO=OrderByPriceASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=C%3a%2f800

Observação importante: no site indica que há 10129 produtos, porém por alguma razão desconhecida não está sendo adquirido todos os skuIds

### Testando outro método de obtenção de skuId


In [29]:
url_principal = 'https://www.drogariasaopaulo.com.br'
extensao_categoria = f'/medicamentos'
url_api_medicamentos = f'https://www.drogariasaopaulo.com.br/buscapagina?fq=C%3a%2f800%2f&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=3b0b86ac-b8ae-4052-bd9f-64025695b229; checkout.vtex.com=__ofid=bc9c71585fac410f96b210f24c259e87; CheckoutOrderFormOwnership=; _gcl_aw=GCL.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _gcl_au=1.1.1537744002.1705172455; vitrioLastClickPaidSource=google; origem=adwords; _DPSP_GA=GA1.3.549971006.1705172455; _gac_UA-21096705-11=1.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _ga=GA1.1.549971006.1705172455; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; nav_id=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_p=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; chaordic_browserId=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_c=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_s=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; _fbp=fb.2.1705172456301.954212052; _pm_id=633901705172456686; _tt_enable_cookie=1; _ttp=S5wcerEBpiuWWywCPGgvd_9AIb5; lmd_cj=google_anuncio; lmd_orig=google_paid; lmd_traf=google_paid-1705172457230; _hjSessionUser_3478270=eyJpZCI6IjRhZGU5NTk2LWUzYjQtNWQ3OS1hM2Q1LWJkOWU1ZjlmZDQxMCIsImNyZWF0ZWQiOjE3MDUxNzI0NTc3NTIsImV4aXN0aW5nIjpmYWxzZX0=; analytic_id=1705172462367658; _vt_shop=2347; _vt_user=5330046838616593_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Sat+Jan+13+2024+16%3A01%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=https%3A%2F%2Fwww.drogariasaopaulo.com.br%2F%3Fgad_source%3D1%26gclid%3DCj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _DPSP_GA_gid=GA1.3.1389298824.1705364098; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6Ijc1RDFEOEQ4ODdDRTdGRjlBODg0RDc3QjYzM0M3NEE5NjI5QUZDQkYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNGY5NDgwMzUtMjRhNC00Y2E5LTk2NWYtZGQzMTVjODExYTM2IiwiaWQiOiJhZWIyODcwYy01YWJiLTQxNGMtYjljZS1mODE5ODI3NTc5ZDIiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYwNTUyOTksImlhdCI6MTcwNTM2NDA5OSwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6ImYwYzY1NzUzLWEyNGUtNDVhZS1iNjMyLWM3NGUzYjkzMWU2YSJ9.0Fmqr3NkbFlHV4qgVh-uYChJXa8iX1uwJXW2eKRMBUnn-xWYvI-k1BG5WgkDWR90VoVaNgCNmugvgm06MzxbHQ; IPS=Midia=0&Campanha=0&Parceiro=0; VTEXSC=sc=1; SGTS=22FF14762A97F1122F9C24FA0AE43FDD; _ga_S3V8W6KJ9D=GS1.1.1705502934.5.1.1705503420.60.0.0; _ga_WV12XD1259=GS1.1.1705502934.5.1.1705503420.60.0.0; urlLastSearch=http://www.drogariasaopaulo.com.br/medicamentoshttp://www.drogariasaopaulo.com.br/medicamentos; janus_sid=ee6d7d79-9869-4ed4-a5da-5a655c44e540; _vss=A13749831100AF6EF19F668593CC382C5182FE019F5069C6B713B0F21613BB3C; OptanonConsent=isIABGlobal=false&datestamp=Wed+Jan+17+2024+11%3A57%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&AwaitingReconsent=false&geolocation=BR%3BSP; OptanonAlertBoxClosed=2024-01-17T14:57:05.783Z'

pipeline_medicamento(url_principal, extensao_categoria, cookie, 1)

Extraindo dados da categoria medicamentos
Há 10192 produtos na página
dormiu
Arquivo ids_produtos/medicamentos_ids.json exportado com sucesso
Há 6728 skuIDs de produtos para serem usados na extração
Falha na solicitação. Código de status: 500
Aguardando 10 segundos antes da próxima tentativa...
Extração de 6716 produtos finalizada - Faltaram 3476 produtos - Tempo de Extração: 662.5717089176178 segundos


## Saúde e Bem-estar

Diferentemente dos medicamentos, para obter os skuIds dessa categoria será necessário iterar pela API de busca página e ler os skuIds usando BeautifulSoup.

Essa diferença ocorre porque não é possível obter os skuIds através do script usados nos medicamentos, já que, por algum motivo desconhecido, independente da páginas buscada os skuIds não mudam e são sempre o da primeira página.

In [18]:
url_principal = 'https://www.drogariasaopaulo.com.br'
extensao_categoria = f'/saude-bem-estar'
url_api_saude = f'https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15070&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=3b0b86ac-b8ae-4052-bd9f-64025695b229; checkout.vtex.com=__ofid=bc9c71585fac410f96b210f24c259e87; _gcl_aw=GCL.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _gcl_au=1.1.1537744002.1705172455; vitrioLastClickPaidSource=google; origem=adwords; _gac_UA-21096705-11=1.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _ga=GA1.1.549971006.1705172455; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; nav_id=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_p=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; chaordic_browserId=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_c=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_s=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; _fbp=fb.2.1705172456301.954212052; _pm_id=633901705172456686; _tt_enable_cookie=1; _ttp=S5wcerEBpiuWWywCPGgvd_9AIb5; lmd_cj=google_anuncio; lmd_orig=google_paid; lmd_traf=google_paid-1705172457230; _hjSessionUser_3478270=eyJpZCI6IjRhZGU5NTk2LWUzYjQtNWQ3OS1hM2Q1LWJkOWU1ZjlmZDQxMCIsImNyZWF0ZWQiOjE3MDUxNzI0NTc3NTIsImV4aXN0aW5nIjpmYWxzZX0=; analytic_id=1705172462367658; _vt_shop=2347; _vt_user=5330046838616593_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Sat+Jan+13+2024+16%3A01%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=https%3A%2F%2Fwww.drogariasaopaulo.com.br%2F%3Fgad_source%3D1%26gclid%3DCj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _DPSP_GA_gid=GA1.3.1389298824.1705364098; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6Ijc1RDFEOEQ4ODdDRTdGRjlBODg0RDc3QjYzM0M3NEE5NjI5QUZDQkYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNGY5NDgwMzUtMjRhNC00Y2E5LTk2NWYtZGQzMTVjODExYTM2IiwiaWQiOiJhZWIyODcwYy01YWJiLTQxNGMtYjljZS1mODE5ODI3NTc5ZDIiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYwNTUyOTksImlhdCI6MTcwNTM2NDA5OSwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6ImYwYzY1NzUzLWEyNGUtNDVhZS1iNjMyLWM3NGUzYjkzMWU2YSJ9.0Fmqr3NkbFlHV4qgVh-uYChJXa8iX1uwJXW2eKRMBUnn-xWYvI-k1BG5WgkDWR90VoVaNgCNmugvgm06MzxbHQ; IPS=Midia=0&Campanha=0&Parceiro=0; ISICI=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; ISS=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOiJkcHNwLWF0aXZhX3BoaWxpcHNfMDgtMTItMjMtbWFyY2EtbWVudS0xMjB4MTIwIiwiY3VycmVuY3lDb2RlIjoiQlJMIiwiY3VycmVuY3lTeW1ib2wiOiJSJCIsImNvdW50cnlDb2RlIjoiQlJBIiwiY3VsdHVyZUluZm8iOiJwdC1CUiIsImNoYW5uZWxQcml2YWN5IjoicHVibGljIn0; VTEXSC=sc=1; _DPSP_GA=GA1.3.1687758385.1705521184; CheckoutOrderFormOwnership=WjaAYgxCCkdauKsBqT4Q9VEHSOJ730JBGrMEAOrugg%2FEHnAxl6TIePYPZcPzCDrI; _ga_WV12XD1259=GS1.1.1705537303.9.0.1705537303.60.0.0; SGTS=40DFA1C07D15F7A1BD3A3A8761D76F47; _ga_S3V8W6KJ9D=GS1.1.1705573214.10.0.1705573214.60.0.0; urlLastSearch=http://www.drogariasaopaulo.com.br/saude-bem-estarhttp://www.drogariasaopaulo.com.br/saude-bem-estar; _vss=4CD85C060E4EA45A9BDD9B3ACC224AB3EB489E568CAD3D1082935A645BFFA04F; _dc_gtm_UA-21096705-11=1; janus_sid=bdbe4d73-5c0e-4dd5-a1ce-0558f5cc7c9a; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+07%3A27%3A18+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&AwaitingReconsent=false&geolocation=BR%3BSP; OptanonAlertBoxClosed=2024-01-18T10:27:18.910Z'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_saude, cookie)

Extraindo dados da categoria saude-bem-estar
Há 3633 produtos na página
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15070&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15070O=OrderByNameASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15070O=OrderByNameDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15070O=OrderByPriceDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15070O=OrderByPriceASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15070O=OrderByTopSaleDES

## Mamãe e Bebê

In [19]:
url_principal = 'https://www.drogariasaopaulo.com.br'
extensao_categoria = f'/mamae-e-bebe'
url_api_mamaebebe = f'https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15071&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=3b0b86ac-b8ae-4052-bd9f-64025695b229; checkout.vtex.com=__ofid=bc9c71585fac410f96b210f24c259e87; _gcl_aw=GCL.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _gcl_au=1.1.1537744002.1705172455; vitrioLastClickPaidSource=google; origem=adwords; _gac_UA-21096705-11=1.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _ga=GA1.1.549971006.1705172455; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; nav_id=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_p=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; chaordic_browserId=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_c=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_s=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; _fbp=fb.2.1705172456301.954212052; _pm_id=633901705172456686; _tt_enable_cookie=1; _ttp=S5wcerEBpiuWWywCPGgvd_9AIb5; lmd_cj=google_anuncio; lmd_orig=google_paid; lmd_traf=google_paid-1705172457230; _hjSessionUser_3478270=eyJpZCI6IjRhZGU5NTk2LWUzYjQtNWQ3OS1hM2Q1LWJkOWU1ZjlmZDQxMCIsImNyZWF0ZWQiOjE3MDUxNzI0NTc3NTIsImV4aXN0aW5nIjpmYWxzZX0=; analytic_id=1705172462367658; _vt_shop=2347; _vt_user=5330046838616593_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Sat+Jan+13+2024+16%3A01%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=https%3A%2F%2Fwww.drogariasaopaulo.com.br%2F%3Fgad_source%3D1%26gclid%3DCj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _DPSP_GA_gid=GA1.3.1389298824.1705364098; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6Ijc1RDFEOEQ4ODdDRTdGRjlBODg0RDc3QjYzM0M3NEE5NjI5QUZDQkYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNGY5NDgwMzUtMjRhNC00Y2E5LTk2NWYtZGQzMTVjODExYTM2IiwiaWQiOiJhZWIyODcwYy01YWJiLTQxNGMtYjljZS1mODE5ODI3NTc5ZDIiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYwNTUyOTksImlhdCI6MTcwNTM2NDA5OSwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6ImYwYzY1NzUzLWEyNGUtNDVhZS1iNjMyLWM3NGUzYjkzMWU2YSJ9.0Fmqr3NkbFlHV4qgVh-uYChJXa8iX1uwJXW2eKRMBUnn-xWYvI-k1BG5WgkDWR90VoVaNgCNmugvgm06MzxbHQ; IPS=Midia=0&Campanha=0&Parceiro=0; _DPSP_GA=GA1.3.1687758385.1705521184; VTEXSC=sc=1; ISICI=InternalPage=_mamae-e-bebe&InternalPart=Refino+Full+Text&InternalCampaign=; SGTS=22FF14762A97F1122F9C24FA0AE43FDD; ISS=InternalPage=_mamae-e-bebe&InternalPart=Refino%2bFull%2bText&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; CheckoutOrderFormOwnership=adFXpRLvXaGanRdQ7Hk8E%2F79OhMV58fZh5A%2BKNJd743mLYhFy3iJiPttbUExVOnB%2FCJAgBm2%2BX7pnw2aMRLRShoyR%2Bwuvfea6h7YiR3qQEOO%2FXoXFEch8qPFZ6JaicCR; janus_sid=b3165109-f944-4d34-8e6c-92a3ac8d2979; _ga_S3V8W6KJ9D=GS1.1.1705605995.13.1.1705606381.45.0.0; _ga_WV12XD1259=GS1.1.1705605995.13.1.1705606381.45.0.0; urlLastSearch=http://www.drogariasaopaulo.com.br/mamae-e-bebehttp://www.drogariasaopaulo.com.br/mamae-e-bebe; _vss=5476C08E4AAD1346D8C4C7F253AA2DBCFFB3D9C1CF40BDCD14C5DE04DA5291F6; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+16%3A33%3A03+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&AwaitingReconsent=false&geolocation=BR%3BSP; OptanonAlertBoxClosed=2024-01-18T19:33:03.006Z'

pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_mamaebebe, cookie)


Extraindo dados da categoria mamae-e-bebe
Há 574 produtos na página
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15071&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 574 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15071O=OrderByNameASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15071O=OrderByNameDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15071O=OrderByPriceDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15071O=OrderByPriceASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15071O=OrderByTopSaleDESC&&PS

## Beleza

In [20]:
url_principal = 'https://www.drogariasaopaulo.com.br'
extensao_categoria = f'/beleza'
url_api_beleza = f'https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15069&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=3b0b86ac-b8ae-4052-bd9f-64025695b229; checkout.vtex.com=__ofid=bc9c71585fac410f96b210f24c259e87; _gcl_aw=GCL.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _gcl_au=1.1.1537744002.1705172455; vitrioLastClickPaidSource=google; origem=adwords; _gac_UA-21096705-11=1.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _ga=GA1.1.549971006.1705172455; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; nav_id=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_p=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; chaordic_browserId=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_c=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_s=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; _fbp=fb.2.1705172456301.954212052; _pm_id=633901705172456686; _tt_enable_cookie=1; _ttp=S5wcerEBpiuWWywCPGgvd_9AIb5; lmd_cj=google_anuncio; lmd_orig=google_paid; lmd_traf=google_paid-1705172457230; _hjSessionUser_3478270=eyJpZCI6IjRhZGU5NTk2LWUzYjQtNWQ3OS1hM2Q1LWJkOWU1ZjlmZDQxMCIsImNyZWF0ZWQiOjE3MDUxNzI0NTc3NTIsImV4aXN0aW5nIjpmYWxzZX0=; analytic_id=1705172462367658; _vt_shop=2347; _vt_user=5330046838616593_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Sat+Jan+13+2024+16%3A01%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=https%3A%2F%2Fwww.drogariasaopaulo.com.br%2F%3Fgad_source%3D1%26gclid%3DCj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _DPSP_GA_gid=GA1.3.1389298824.1705364098; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6Ijc1RDFEOEQ4ODdDRTdGRjlBODg0RDc3QjYzM0M3NEE5NjI5QUZDQkYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNGY5NDgwMzUtMjRhNC00Y2E5LTk2NWYtZGQzMTVjODExYTM2IiwiaWQiOiJhZWIyODcwYy01YWJiLTQxNGMtYjljZS1mODE5ODI3NTc5ZDIiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYwNTUyOTksImlhdCI6MTcwNTM2NDA5OSwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6ImYwYzY1NzUzLWEyNGUtNDVhZS1iNjMyLWM3NGUzYjkzMWU2YSJ9.0Fmqr3NkbFlHV4qgVh-uYChJXa8iX1uwJXW2eKRMBUnn-xWYvI-k1BG5WgkDWR90VoVaNgCNmugvgm06MzxbHQ; IPS=Midia=0&Campanha=0&Parceiro=0; _DPSP_GA=GA1.3.1687758385.1705521184; VTEXSC=sc=1; ISICI=InternalPage=_mamae-e-bebe&InternalPart=Refino+Full+Text&InternalCampaign=; SGTS=22FF14762A97F1122F9C24FA0AE43FDD; ISS=InternalPage=_mamae-e-bebe&InternalPart=Refino%2bFull%2bText&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; CheckoutOrderFormOwnership=adFXpRLvXaGanRdQ7Hk8E%2F79OhMV58fZh5A%2BKNJd743mLYhFy3iJiPttbUExVOnB%2FCJAgBm2%2BX7pnw2aMRLRShoyR%2Bwuvfea6h7YiR3qQEOO%2FXoXFEch8qPFZ6JaicCR; _ga_S3V8W6KJ9D=GS1.1.1705605995.13.1.1705606381.45.0.0; _ga_WV12XD1259=GS1.1.1705605995.13.1.1705606381.45.0.0; janus_sid=d548788f-ed08-4f5a-b8aa-f6dab7f9f851; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+17%3A10%3A27+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&AwaitingReconsent=false&geolocation=BR%3BSP; OptanonAlertBoxClosed=2024-01-18T20:10:27.307Z; _dc_gtm_UA-21096705-11=1; urlLastSearch=http://www.drogariasaopaulo.com.br/belezahttp://www.drogariasaopaulo.com.br/beleza; _vss=363A0D021571FBE6DA1B121ACDD6412174B156AF099EFBB7932CA1DDABE16957'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_beleza, cookie)

Extraindo dados da categoria beleza
Há 7032 produtos na página
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15069&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15069O=OrderByNameASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15069O=OrderByNameDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15069O=OrderByPriceDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15069O=OrderByPriceASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15069O=OrderByTopSaleDESC&&PS=48&

Travou em 2400, como as demais categorias que usam essa API

## Cabelo

In [22]:
url_principal = 'https://www.drogariasaopaulo.com.br'
extensao_categoria = f'/cuidados-para-cabelos'
url_api_cabelo = f'https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15381&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=3b0b86ac-b8ae-4052-bd9f-64025695b229; checkout.vtex.com=__ofid=bc9c71585fac410f96b210f24c259e87; _gcl_aw=GCL.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _gcl_au=1.1.1537744002.1705172455; vitrioLastClickPaidSource=google; origem=adwords; _gac_UA-21096705-11=1.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _ga=GA1.1.549971006.1705172455; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; nav_id=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_p=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; chaordic_browserId=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_c=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_s=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; _fbp=fb.2.1705172456301.954212052; _pm_id=633901705172456686; _tt_enable_cookie=1; _ttp=S5wcerEBpiuWWywCPGgvd_9AIb5; lmd_cj=google_anuncio; lmd_orig=google_paid; lmd_traf=google_paid-1705172457230; _hjSessionUser_3478270=eyJpZCI6IjRhZGU5NTk2LWUzYjQtNWQ3OS1hM2Q1LWJkOWU1ZjlmZDQxMCIsImNyZWF0ZWQiOjE3MDUxNzI0NTc3NTIsImV4aXN0aW5nIjpmYWxzZX0=; analytic_id=1705172462367658; _vt_shop=2347; _vt_user=5330046838616593_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Sat+Jan+13+2024+16%3A01%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=https%3A%2F%2Fwww.drogariasaopaulo.com.br%2F%3Fgad_source%3D1%26gclid%3DCj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _DPSP_GA_gid=GA1.3.1389298824.1705364098; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6Ijc1RDFEOEQ4ODdDRTdGRjlBODg0RDc3QjYzM0M3NEE5NjI5QUZDQkYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNGY5NDgwMzUtMjRhNC00Y2E5LTk2NWYtZGQzMTVjODExYTM2IiwiaWQiOiJhZWIyODcwYy01YWJiLTQxNGMtYjljZS1mODE5ODI3NTc5ZDIiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYwNTUyOTksImlhdCI6MTcwNTM2NDA5OSwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6ImYwYzY1NzUzLWEyNGUtNDVhZS1iNjMyLWM3NGUzYjkzMWU2YSJ9.0Fmqr3NkbFlHV4qgVh-uYChJXa8iX1uwJXW2eKRMBUnn-xWYvI-k1BG5WgkDWR90VoVaNgCNmugvgm06MzxbHQ; IPS=Midia=0&Campanha=0&Parceiro=0; _DPSP_GA=GA1.3.1687758385.1705521184; VTEXSC=sc=1; ISICI=InternalPage=_mamae-e-bebe&InternalPart=Refino+Full+Text&InternalCampaign=; SGTS=22FF14762A97F1122F9C24FA0AE43FDD; ISS=InternalPage=_mamae-e-bebe&InternalPart=Refino%2bFull%2bText&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; CheckoutOrderFormOwnership=adFXpRLvXaGanRdQ7Hk8E%2F79OhMV58fZh5A%2BKNJd743mLYhFy3iJiPttbUExVOnB%2FCJAgBm2%2BX7pnw2aMRLRShoyR%2Bwuvfea6h7YiR3qQEOO%2FXoXFEch8qPFZ6JaicCR; _dc_gtm_UA-21096705-11=1; janus_sid=5eb38371-e4b9-4e11-b737-7be27da1d1f5; _ga_S3V8W6KJ9D=GS1.1.1705612678.14.0.1705612678.60.0.0; _ga_WV12XD1259=GS1.1.1705612678.14.0.1705612678.60.0.0; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+18%3A17%3A58+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&AwaitingReconsent=false&geolocation=BR%3BSP; OptanonAlertBoxClosed=2024-01-18T21:17:58.689Z; urlLastSearch=http://www.drogariasaopaulo.com.br/cuidados-para-cabeloshttp://www.drogariasaopaulo.com.br/cuidados-para-cabelos; _vss=EAF0C889A747E90607D5B46E3D22A8B89D2268FC2A64C714906275E146F669DC'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_cabelo, cookie)

Extraindo dados da categoria cuidados-para-cabelos
Há 7164 produtos na página
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15381&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15381O=OrderByNameASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15381O=OrderByNameDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15381O=OrderByPriceDESC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15381O=OrderByPriceASC&&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15381O=OrderByTopS

## Higiene Pessoal

In [23]:
url_principal = 'https://www.drogariasaopaulo.com.br'
extensao_categoria = f'/higiene-pessoal'
url_api_higiene = f'https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15068&O=OrderByTopSaleDESC&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=3b0b86ac-b8ae-4052-bd9f-64025695b229; checkout.vtex.com=__ofid=bc9c71585fac410f96b210f24c259e87; _gcl_aw=GCL.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _gcl_au=1.1.1537744002.1705172455; vitrioLastClickPaidSource=google; origem=adwords; _gac_UA-21096705-11=1.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _ga=GA1.1.549971006.1705172455; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; nav_id=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_p=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; chaordic_browserId=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_c=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_s=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; _fbp=fb.2.1705172456301.954212052; _pm_id=633901705172456686; _tt_enable_cookie=1; _ttp=S5wcerEBpiuWWywCPGgvd_9AIb5; lmd_cj=google_anuncio; lmd_orig=google_paid; lmd_traf=google_paid-1705172457230; _hjSessionUser_3478270=eyJpZCI6IjRhZGU5NTk2LWUzYjQtNWQ3OS1hM2Q1LWJkOWU1ZjlmZDQxMCIsImNyZWF0ZWQiOjE3MDUxNzI0NTc3NTIsImV4aXN0aW5nIjpmYWxzZX0=; analytic_id=1705172462367658; _vt_shop=2347; _vt_user=5330046838616593_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Sat+Jan+13+2024+16%3A01%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=https%3A%2F%2Fwww.drogariasaopaulo.com.br%2F%3Fgad_source%3D1%26gclid%3DCj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _DPSP_GA_gid=GA1.3.1389298824.1705364098; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6Ijc1RDFEOEQ4ODdDRTdGRjlBODg0RDc3QjYzM0M3NEE5NjI5QUZDQkYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNGY5NDgwMzUtMjRhNC00Y2E5LTk2NWYtZGQzMTVjODExYTM2IiwiaWQiOiJhZWIyODcwYy01YWJiLTQxNGMtYjljZS1mODE5ODI3NTc5ZDIiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYwNTUyOTksImlhdCI6MTcwNTM2NDA5OSwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6ImYwYzY1NzUzLWEyNGUtNDVhZS1iNjMyLWM3NGUzYjkzMWU2YSJ9.0Fmqr3NkbFlHV4qgVh-uYChJXa8iX1uwJXW2eKRMBUnn-xWYvI-k1BG5WgkDWR90VoVaNgCNmugvgm06MzxbHQ; IPS=Midia=0&Campanha=0&Parceiro=0; _DPSP_GA=GA1.3.1687758385.1705521184; VTEXSC=sc=1; ISICI=InternalPage=_mamae-e-bebe&InternalPart=Refino+Full+Text&InternalCampaign=; SGTS=22FF14762A97F1122F9C24FA0AE43FDD; ISS=InternalPage=_mamae-e-bebe&InternalPart=Refino%2bFull%2bText&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOm51bGwsImN1cnJlbmN5Q29kZSI6IkJSTCIsImN1cnJlbmN5U3ltYm9sIjoiUiQiLCJjb3VudHJ5Q29kZSI6IkJSQSIsImN1bHR1cmVJbmZvIjoicHQtQlIiLCJjaGFubmVsUHJpdmFjeSI6InB1YmxpYyJ9; CheckoutOrderFormOwnership=adFXpRLvXaGanRdQ7Hk8E%2F79OhMV58fZh5A%2BKNJd743mLYhFy3iJiPttbUExVOnB%2FCJAgBm2%2BX7pnw2aMRLRShoyR%2Bwuvfea6h7YiR3qQEOO%2FXoXFEch8qPFZ6JaicCR; _dc_gtm_UA-21096705-11=1; _ga_S3V8W6KJ9D=GS1.1.1705612678.14.1.1705612754.60.0.0; _ga_WV12XD1259=GS1.1.1705612678.14.1.1705612754.60.0.0; urlLastSearch=http://www.drogariasaopaulo.com.br/higiene-pessoalhttp://www.drogariasaopaulo.com.br/higiene-pessoal; _vss=5657781E79789DFFEBA4E4CF511973F4E42D22592550C2D42F1621299013649B; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+18%3A19%3A15+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&AwaitingReconsent=false&geolocation=BR%3BSP; OptanonAlertBoxClosed=2024-01-18T21:19:15.562Z; janus_sid=b75c7916-86c6-4371-b003-202d4ed34409'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_higiene, cookie)

Extraindo dados da categoria higiene-pessoal
Há 13655 produtos na página
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15068&O=OrderByTopSaleDESC&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15068O=OrderByNameASC&&O=OrderByTopSaleDESC&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15068O=OrderByNameDESC&&O=OrderByTopSaleDESC&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15068O=OrderByPriceDESC&&O=OrderByTopSaleDESC&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a15068O=OrderByPriceASC&&O=OrderByTopSaleDESC&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu

## Lojas Parceiras

In [24]:
url_principal = 'https://www.drogariasaopaulo.com.br'
extensao_categoria = f'/lojas-parceiras'
url_api_parceiras = f'https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a12086&O=reviewRate+desc%2c+score+desc&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0'
cookie = f'VtexRCMacIdv7=3b0b86ac-b8ae-4052-bd9f-64025695b229; checkout.vtex.com=__ofid=bc9c71585fac410f96b210f24c259e87; _gcl_aw=GCL.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _gcl_au=1.1.1537744002.1705172455; vitrioLastClickPaidSource=google; origem=adwords; _gac_UA-21096705-11=1.1705172455.Cj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB; _ga=GA1.1.549971006.1705172455; IPI=UrlReferrer=https%3a%2f%2fwww.google.com%2f; ISSMB=ScreenMedia=0&UserAcceptMobile=False; nav_id=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_p=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; chaordic_browserId=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_c=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; legacy_s=2f7e3c5c-480c-45ac-8f8f-a8e3674bfa0c; _fbp=fb.2.1705172456301.954212052; _pm_id=633901705172456686; _tt_enable_cookie=1; _ttp=S5wcerEBpiuWWywCPGgvd_9AIb5; lmd_cj=google_anuncio; lmd_orig=google_paid; lmd_traf=google_paid-1705172457230; _hjSessionUser_3478270=eyJpZCI6IjRhZGU5NTk2LWUzYjQtNWQ3OS1hM2Q1LWJkOWU1ZjlmZDQxMCIsImNyZWF0ZWQiOjE3MDUxNzI0NTc3NTIsImV4aXN0aW5nIjpmYWxzZX0=; analytic_id=1705172462367658; _vt_shop=2347; _vt_user=5330046838616593_1_false_false; _DPSP_LGPD-Consentimento_Cookie=isIABGlobal=false&datestamp=Sat+Jan+13+2024+16%3A01%3A05+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=https%3A%2F%2Fwww.drogariasaopaulo.com.br%2F%3Fgad_source%3D1%26gclid%3DCj0KCQiAhomtBhDgARIsABcaYymEQZ37tYI0uC4tZ-wMJu5njC0KSrb1-WFV-Re9EHwoR_g1rbdHfe8aAp4iEALw_wcB&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1; _DPSP_LGPD-Consentimento_CookiePublicidade=granted; _DPSP_LGPD-Consentimento_CookieAnalytics=granted; _DPSP_GA_gid=GA1.3.1389298824.1705364098; vtex_session=eyJhbGciOiJFUzI1NiIsImtpZCI6Ijc1RDFEOEQ4ODdDRTdGRjlBODg0RDc3QjYzM0M3NEE5NjI5QUZDQkYiLCJ0eXAiOiJqd3QifQ.eyJhY2NvdW50LmlkIjoiNGY5NDgwMzUtMjRhNC00Y2E5LTk2NWYtZGQzMTVjODExYTM2IiwiaWQiOiJhZWIyODcwYy01YWJiLTQxNGMtYjljZS1mODE5ODI3NTc5ZDIiLCJ2ZXJzaW9uIjoyLCJzdWIiOiJzZXNzaW9uIiwiYWNjb3VudCI6InNlc3Npb24iLCJleHAiOjE3MDYwNTUyOTksImlhdCI6MTcwNTM2NDA5OSwiaXNzIjoidG9rZW4tZW1pdHRlciIsImp0aSI6ImYwYzY1NzUzLWEyNGUtNDVhZS1iNjMyLWM3NGUzYjkzMWU2YSJ9.0Fmqr3NkbFlHV4qgVh-uYChJXa8iX1uwJXW2eKRMBUnn-xWYvI-k1BG5WgkDWR90VoVaNgCNmugvgm06MzxbHQ; IPS=Midia=0&Campanha=0&Parceiro=0; _DPSP_GA=GA1.3.1687758385.1705521184; VTEXSC=sc=1; CheckoutOrderFormOwnership=adFXpRLvXaGanRdQ7Hk8E%2F79OhMV58fZh5A%2BKNJd743mLYhFy3iJiPttbUExVOnB%2FCJAgBm2%2BX7pnw2aMRLRShoyR%2Bwuvfea6h7YiR3qQEOO%2FXoXFEch8qPFZ6JaicCR; _dc_gtm_UA-21096705-11=1; janus_sid=65e44b0b-30d0-4047-8f1a-86a6397ea05b; ISICI=InternalPage=_higiene-pessoal&InternalPart=Html&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; SGTS=40DFA1C07D15F7A1BD3A3A8761D76F47; ISS=InternalPage=_mamae-e-bebe&InternalPart=Refino%2bFull%2bText&InternalCampaign=dpsp-ativa_philips_08-12-23-marca-menu-120x120; vtex_segment=eyJjYW1wYWlnbnMiOm51bGwsImNoYW5uZWwiOiIxIiwicHJpY2VUYWJsZXMiOm51bGwsInJlZ2lvbklkIjpudWxsLCJ1dG1fY2FtcGFpZ24iOm51bGwsInV0bV9zb3VyY2UiOm51bGwsInV0bWlfY2FtcGFpZ24iOiJkcHNwLWF0aXZhX3BoaWxpcHNfMDgtMTItMjMtbWFyY2EtbWVudS0xMjB4MTIwIiwiY3VycmVuY3lDb2RlIjoiQlJMIiwiY3VycmVuY3lTeW1ib2wiOiJSJCIsImNvdW50cnlDb2RlIjoiQlJBIiwiY3VsdHVyZUluZm8iOiJwdC1CUiIsImNoYW5uZWxQcml2YWN5IjoicHVibGljIn0; _ga_S3V8W6KJ9D=GS1.1.1705612678.14.1.1705613140.60.0.0; _ga_WV12XD1259=GS1.1.1705612678.14.1.1705613140.60.0.0; urlLastSearch=http://www.drogariasaopaulo.com.br/lojas-parceirashttp://www.drogariasaopaulo.com.br/lojas-parceiras; _vss=BCEC2EA933B88747C2A00B77D0EA005963D431CC708CAE48B45954601903F601; OptanonConsent=isIABGlobal=false&datestamp=Thu+Jan+18+2024+18%3A25%3A41+GMT-0300+(Hor%C3%A1rio+Padr%C3%A3o+de+Bras%C3%ADlia)&version=6.9.0&hosts=&consentId=4f4c3742-8a94-4e84-b9d5-5df95bf65ca8&interactionCount=1&landingPath=NotLandingPage&groups=C0004%3A1%2CC0001%3A1%2CC0003%3A1&AwaitingReconsent=false&geolocation=BR%3BSP; OptanonAlertBoxClosed=2024-01-18T21:25:41.825Z'
pipeline_extracao_categoria(url_principal, extensao_categoria, url_api_parceiras, cookie)

Extraindo dados da categoria lojas-parceiras
Há 5756 produtos na página
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a12086&O=reviewRate+desc%2c+score+desc&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a12086O=OrderByNameASC&&O=reviewRate+desc%2c+score+desc&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a12086O=OrderByNameDESC&&O=reviewRate+desc%2c+score+desc&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Falha na solicitação. Código de status: 500
Aguardando 10 segundos antes da próxima tentativa...
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaopaulo.com.br/buscapagina?fq=H%3a12086O=OrderByPriceDESC&&O=reviewRate+desc%2c+score+desc&PS=48&sl=d4f23c65-3062-452f-a536-0939348d687c&cc=48&sm=0
Extraiu 2400 ids
Extraindo ids de https://www.drogariasaop

## Unificação dos arquivos para Entrega Final

In [18]:
arquivos = [
    'beleza_scrapping.csv',
    'cuidados-para-cabelos_scrapping.csv',
    'higiene-pessoal_scrapping.csv',
    'lojas-parceiras_scrapping.csv',
    'mamae-e-bebe_scrapping.csv',
    'medicamentos_scrapping.csv',
    'saude-bem-estar_scrapping.csv'
]

columns_rename = {'productName': 'produto',
                  'brand': 'marca',
                  'description': 'descricao',
                  'Price': 'preco com desconto',
                  'ListPrice': 'preco sem desconto'}

ordem_colunas = ['ean', 'produto', 'marca', 'farmacia', 'preco com desconto', 'preco sem desconto', '% desconto', 'descricao']

df = pd.DataFrame()
for arquivo in arquivos:
    categoria = arquivo.split('_scrapping')[0]
    df_temp = pd.read_csv(f'scrapping_produtos_por_categoria/'+arquivo)
    df_temp['categoria'] = categoria
    df = pd.concat([df, df_temp], axis=0)


df = df[~df.duplicated()]
df = df.reset_index(drop=True)
df.rename(columns=columns_rename, inplace=True)
df['farmacia'] = 'Drogaria São Paulo'
df['% desconto'] = (1 - (df['preco com desconto'] / df['preco sem desconto'])) * 100
df = df[ordem_colunas]

df.to_csv('arquivo_final.csv')
