In [2]:
import os
import re
from concurrent.futures import ThreadPoolExecutor
import requests
import pandas as pd
from bs4 import BeautifulSoup

### Extraindo os links dos produtos

In [3]:
categorias = ['oncologia', 'medicamentos-especiais', 'imunologia']
url_base = 'https://www.mundialfarma.com.br'
workers = os.cpu_count()

In [11]:
lista_links = []

def extrair_links_pagina(categoria, pagina):
    url = f'{url_base}/{categoria}?page={pagina}'
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', {'class': 'vtex-product-summary-2-x-clearLink h-100 flex flex-column'})]
        return links
    else:
        return []

def extrair_links_todas_paginas():
    with ThreadPoolExecutor(max_workers=workers) as executor:
        for categoria in categorias:
            for pagina in range(1, 51):
                print(pagina)
                arguments = (categoria, pagina)
                links_por_pagina = list(executor.map(lambda args: extrair_links_pagina(*args), [arguments]))

                if not links_por_pagina[0]:
                    break

                lista_links.extend(links_por_pagina[0])

extrair_links_todas_paginas()

lista_links_completa = [f'{url_base}{link}' for link in lista_links]
df = pd.DataFrame({'URL': lista_links_completa})
df_links = df.drop_duplicates().reset_index(drop=True)

df_links.to_csv('df_links.csv')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
1
2
3
4
5
6
7
8
9
1
2


### Extraindo as informações a partir de cada link

In [23]:
def extract_info(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Encontre todas as tags script
        script_tags = soup.find_all('script')

        # Procurar a string '__RUNTIME__' em cada tag script
        for script_tag in script_tags:
            if '__RUNTIME__' in script_tag.string:
                # Use expressões regulares para extrair informações adicionais
                match_ean = re.search(r'"ean"\s*:\s*"(.*?)"', script_tag.string)
                match_name = re.search(r'"productName"\s*:\s*"(.*?)"', script_tag.string)
                match_description = re.search(r'"description"\s*:\s*"(.*?)"', script_tag.string)
                match_price_c = re.search(r'"Price"\s*:\s*([\d.]+)', script_tag.string)
                match_price_s = re.search(r'"ListPrice"\s*:\s*([\d.]+)', script_tag.string)
                match_lab = re.search(r'"name"\s*:\s*"Laboratório","values"\s*:\s*{"type":"json","json":\["(.*?)"\]', script_tag.string)

                # Extrair valores correspondentes se encontrados
                ean = match_ean.group(1) if match_ean else None
                name = match_name.group(1) if match_name else None
                description = match_description.group(1) if match_description else None
                price_c = float(match_price.group(1)) if match_price_c else None
                price_s = float(match_price.group(1)) if match_price_s else None

                if price_c == price_s:
                    desconto = 0
                else:
                    desconto = round(((price_s - price_c) / price_s) * 100)

                lab = match_lab.group(1) if match_lab else None

                info = {'Produto': name, 'EAN': ean, 'Marca': lab, 'Descrição': description, 
                        'Preço sem desconto': price_s, 'Preço com desconto': price_c, 
                        '% de desconto': desconto, 'Farmácia': 'Mundial Farma', 'Região': 'Sudeste', 'Cidade': 'São Paulo'}
                return info
        else:
            print(f"Não foi encontrado '__RUNTIME__' em nenhuma tag script para {url}")
            return None
    except Exception as e:
        print(f'Erro ao processar a URL {url}: {e}')
        return None

infos = []

with ThreadPoolExecutor(max_workers=5) as executor:  # Ajuste o número de workers conforme necessário
    results = executor.map(extract_info, df_links['URL'])

    for result in results:
        if result:
            infos.append(result)

df_infos = pd.DataFrame(infos)
df_infos.to_csv('df_infos.csv')