
Scraper de headlines do Investing.com (PT-BR) por empresa.
Lê um Excel (BaseRefAtivos.xlsx) com:
  Empresa | Setor | Ticker BDR | Ticker Original (EUA) | Bolsa (EUA) | Link News

Saída: Parquet com colunas:
  id, datetime, source, headline, ticker, sector, country, url, language

Destaques de debug:
- logging com níveis, --verbose/--debug
- --only para filtrar por ticker ou empresa
- --max-pages (padrão 1400; use um valor baixo, p. ex. 15, para smoke test)
- --save-html-debug salva HTML bruto de páginas em ./_html_debug
- retries com backoff/jitter e tratamento de 429/5xx
- checkpoint incremental (--resume): mescla com parquet existente e evita retrabalho


In [7]:

from __future__ import annotations
import re
import time
import uuid
import os
import random
import argparse
from pathlib import Path
from typing import Optional, List, Dict
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser as du
from tqdm import tqdm
import logging

# ---------------- Config & Globals ----------------
# Time zone applied to every timestamp this module produces.
SP_TZ = ZoneInfo("America/Sao_Paulo")

# Default input spreadsheet and output Parquet file (relative to the working directory).
DEFAULT_EXCEL = "../data/BaseRefAtivos.xlsx"
DEFAULT_OUT = "../data/investing_news.parquet"
DEFAULT_MAX_PAGES = 1400  # default per-company page cap; pass a small --max-pages for a quick debug run

# Browser-like request headers installed on the shared session below.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://br.investing.com/", # tells the server which page the request claims to come from
    "DNT": "1", # Do Not Track
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
}

# One session reused by every request so the headers are sent each time.
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

# Matches relative Portuguese timestamps such as "5 minutos atrás" or "2 horas atrás".
RELATIVE_REGEX = re.compile(
    r"(?P<num>\d+)\s*(?P<unit>min|mins|minutos|minuto|hora|horas|dia|dias)\s*atr[aá]s",
    flags=re.IGNORECASE,
)

# ---------------- Logging ----------------
def setup_logging(verbose: bool = False, debug: bool = False) -> None:
    """Configure root logging for the scraper.

    Args:
        verbose: Emit DEBUG-level records when true.
        debug: Alias of ``verbose``; either flag enables DEBUG.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers (a second call in the same process, or a host that installed
    its own handler first). The level is therefore also set on the root
    logger explicitly so the requested verbosity always takes effect.
    """
    level = logging.DEBUG if (verbose or debug) else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-7s | %(message)s",
        datefmt="%H:%M:%S",
    )
    logging.getLogger().setLevel(level)

# Module-level logger used by every function in this file.
log = logging.getLogger(__name__)

# ---------------- Utils ----------------
def normalize_news_url(url: str) -> str:
    """Return the base ``...-news`` listing URL without any page suffix.

    Trailing slashes and a trailing ``/<n>`` page number are removed first,
    and only then is a truncated ``-new`` ending completed to ``-news``.
    Doing the typo fix last means inputs such as ``.../x-new/`` or
    ``.../x-new/2`` are repaired as well.

    Args:
        url: Listing URL as found in the spreadsheet; ``None`` or blank is
            accepted.

    Returns:
        The normalized base URL, or an empty string for blank input.
    """
    url = (url or "").strip()
    if not url:
        return url
    url = url.rstrip("/")
    # Drop a trailing page number so pagination can be appended manually.
    url = re.sub(r"/+\d+$", "", url)
    if url.endswith("-new"):
        url += "s"
    return url

def page_url(url_base: str, page: int) -> str:
    """Return the URL of listing page ``page`` under ``url_base``."""
    return "{}/{}".format(url_base, page)

# Portuguese month names and abbreviations mapped to the English
# abbreviations that dateutil's default parser recognises.
_PT_MONTHS_TO_EN = {
    "janeiro": "jan", "jan": "jan",
    "fevereiro": "feb", "fev": "feb",
    "março": "mar", "marco": "mar", "mar": "mar",
    "abril": "apr", "abr": "apr",
    "maio": "may", "mai": "may",
    "junho": "jun", "jun": "jun",
    "julho": "jul", "jul": "jul",
    "agosto": "aug", "ago": "aug",
    "setembro": "sep", "set": "sep",
    "outubro": "oct", "out": "oct",
    "novembro": "nov", "nov": "nov",
    "dezembro": "dec", "dez": "dec",
}
# Longest names first so full names win over their abbreviations.
_PT_MONTH_RE = re.compile(
    r"\b(" + "|".join(sorted(_PT_MONTHS_TO_EN, key=len, reverse=True)) + r")\b\.?"
)
_PT_CONNECTIVE_RE = re.compile(r"\bde\b")


def parse_datetime_from_time_tag(time_tag) -> Optional[datetime]:
    """Extract a Sao Paulo-aware datetime from a ``<time>`` element.

    Resolution order:
      1. Relative text matched by ``RELATIVE_REGEX`` (e.g. "5 minutos
         atrás"), subtracted from the current Sao Paulo time.
      2. The element's ``datetime`` attribute, parsed with dateutil.
      3. The element's text as an absolute date (e.g. "12 de out. de 2025").

    Args:
        time_tag: Parsed element exposing ``get_text()`` and ``get(name)``,
            or ``None``.

    Returns:
        A timezone-aware datetime in ``SP_TZ``, or ``None`` when nothing
        could be parsed. Naive parse results are assumed to be Sao Paulo
        local time.
    """
    if time_tag is None:
        return None

    text = (time_tag.get_text() or "").strip().lower()
    m = RELATIVE_REGEX.search(text)
    if m:
        num = int(m.group("num"))
        unit = m.group("unit")
        if unit.startswith("min"):
            delta = timedelta(minutes=num)
        elif unit.startswith("hora"):
            delta = timedelta(hours=num)
        else:
            delta = timedelta(days=num)
        return datetime.now(SP_TZ) - delta

    dt_attr = time_tag.get("datetime")
    if dt_attr:
        # Best-effort: a malformed attribute must not abort the scrape.
        try:
            dt = du.parse(dt_attr)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=SP_TZ)
            return dt.astimezone(SP_TZ)
        except Exception as e:
            log.debug(f"Falha parse datetime attr '{dt_attr}': {e}")

    # Fallback: absolute date in the text. dateutil's parse() accepts no
    # `languages` keyword and only knows English month names, so Portuguese
    # months are translated and the connective "de" removed beforehand.
    normalized = _PT_MONTH_RE.sub(lambda mo: _PT_MONTHS_TO_EN[mo.group(1)], text)
    normalized = _PT_CONNECTIVE_RE.sub(" ", normalized)
    try:
        dt = du.parse(normalized, dayfirst=True, fuzzy=True)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=SP_TZ)
        return dt.astimezone(SP_TZ)
    except Exception as e:
        log.debug(f"Falha parse texto de data '{text}': {e}")

    return None

def deduce_country_language_from_url(url: str) -> tuple[str, str]:
    """Map an article URL to a ``(country, language)`` pair.

    URLs containing ``br.investing.com`` yield ``("BR", "pt-BR")``; any
    other URL yields two empty strings.
    """
    is_brazilian_site = "br.investing.com" in url
    return ("BR", "pt-BR") if is_brazilian_site else ("", "")

def make_news_id(url: str, title: str) -> str:
    """Build a deterministic identifier for a news item.

    The stripped URL and stripped title are joined with ``||`` and hashed
    into a UUIDv5 under ``uuid.NAMESPACE_URL``. ``None`` is treated as an
    empty string for either argument.
    """
    clean_url = (url or "").strip()
    clean_title = (title or "").strip()
    key = f"{clean_url}||{clean_title}"
    return str(uuid.uuid5(uuid.NAMESPACE_URL, key))

def html_debug_dump(html: str, company: str, page: int, outdir: Path) -> None:
    """Write the raw HTML of one listing page to ``outdir`` for debugging.

    The file is named ``<safe company>_p<page, 4 digits>.html``. The dump is
    best-effort: failing to create the directory or to write the file is
    logged as a warning and never raised, so debugging output cannot abort
    a scrape.

    Args:
        html: Page markup to save.
        company: Label used to build the file name.
        page: Page number used to build the file name.
        outdir: Destination directory; created when missing.
    """
    fn = outdir / f"{safe_filename(company)}_p{page:04d}.html"
    try:
        outdir.mkdir(parents=True, exist_ok=True)
        fn.write_text(html, encoding="utf-8")
    except Exception as e:
        log.warning(f"Falha ao salvar HTML de debug: {e}")
    else:
        log.debug(f"HTML salvo para debug: {fn}")

def safe_filename(s: str) -> str:
    """Return ``s`` stripped, with each run of characters outside
    ``A-Z a-z 0-9 . _ -`` collapsed into a single underscore."""
    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
    return unsafe_run.sub("_", s.strip())

# ---------------- HTTP com backoff ----------------
def fetch(url: str, max_retries: int = 5, timeout: int = 25) -> Optional[requests.Response]:
    """GET ``url`` through the shared session, retrying transient failures.

    Network errors and HTTP 429/500/502/503/504 are retried with a linearly
    growing wait plus random jitter. HTTP 404/410 and any other non-200
    status return ``None`` immediately. No wait is performed after the
    final attempt, since nothing is left to retry.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to ``SESSION.get``.

    Returns:
        The response when the status is 200, otherwise ``None``.
    """
    for i in range(max_retries):
        is_last_attempt = i == max_retries - 1
        try:
            r = SESSION.get(url, timeout=timeout)
        except requests.RequestException as e:
            if is_last_attempt:
                log.warning(f"Erro rede: {e}; sem novas tentativas.")
                break
            wait = (1.2 * (i + 1)) + random.random()
            log.warning(f"Erro rede: {e}; retry em {wait:.1f}s")
            time.sleep(wait)
            continue

        if r.status_code == 200:
            return r
        if r.status_code in (404, 410):
            log.info(f"HTTP {r.status_code} em {url} (provável fim).")
            return None
        if r.status_code not in (429, 500, 502, 503, 504):
            log.warning(f"HTTP {r.status_code} em {url}; sem retry programado.")
            return None

        # Transient server-side status: back off, unless attempts are exhausted.
        if is_last_attempt:
            log.warning(f"HTTP {r.status_code} em {url}; sem novas tentativas.")
            break
        wait = (1.5 * (i + 1)) + random.random()
        log.warning(f"HTTP {r.status_code} em {url}; retry em {wait:.1f}s")
        time.sleep(wait)

    log.error(f"Falhou após {max_retries} tentativas: {url}")
    return None

# ---------------- Parsing ----------------
def parse_news_items(html: str, ticker: str, sector: str) -> List[Dict]:
    """Extract news records from one listing page.

    Looks for ``ul[data-test='news-list']`` and, inside it, every
    ``article[data-test='article-item']``. Articles without a title link
    are skipped. Site-relative links are prefixed with
    ``https://br.investing.com``.

    Args:
        html: Markup of the listing page.
        ticker: Value stored in each record's ``ticker`` field.
        sector: Value stored in each record's ``sector`` field.

    Returns:
        One dict per article with keys ``id``, ``datetime`` (ISO string or
        ``None``), ``source``, ``headline``, ``ticker``, ``sector``,
        ``country``, ``url`` and ``language``; an empty list when the list
        container or its articles are missing.
    """
    document = BeautifulSoup(html, "html.parser")
    news_list = document.find("ul", attrs={"data-test": "news-list"})
    if not news_list:
        log.debug("Ul[data-test='news-list'] não encontrada — página pode ter mudado.")
        return []

    articles = news_list.select("article[data-test='article-item']")
    if not articles:
        log.debug("Nenhum article[data-test='article-item'] encontrado nesta página.")
        return []

    rows: List[Dict] = []
    for article in articles:
        title_link = article.select_one("a[data-test='article-title-link']")
        if not title_link:
            log.debug("a[data-test='article-title-link'] ausente em um article; pulando.")
            continue

        headline = (title_link.get_text() or "").strip()
        href = title_link.get("href") or ""
        link = "https://br.investing.com" + href if href.startswith("/") else href

        provider_link = article.select_one("a[data-test='article-provider-link']")
        source = provider_link.get_text().strip() if provider_link else ""

        time_tag = article.select_one("time[data-test='article-publish-date']")
        published = parse_datetime_from_time_tag(time_tag)
        published_iso = published.isoformat() if published else None

        country, language = deduce_country_language_from_url(link)

        rows.append(
            {
                "id": make_news_id(link, headline),
                "datetime": published_iso,
                "source": source,
                "headline": headline,
                "ticker": ticker,
                "sector": sector,
                "country": country,
                "url": link,
                "language": language,
            }
        )
    return rows

def guess_last_page(html: str) -> Optional[int]:
    """Return the largest page number shown in the pagination links.

    Reads the text of every ``a`` under ``div.flex.items-center.gap-2`` and
    keeps the purely numeric ones. Returns ``None`` when none are numeric.
    """
    document = BeautifulSoup(html, "html.parser")
    labels = (
        (anchor.get_text() or "").strip()
        for anchor in document.select("div.flex.items-center.gap-2 a")
    )
    page_numbers = [int(label) for label in labels if label.isdigit()]
    return max(page_numbers, default=None)

# ---------------- Scraper por empresa ----------------
def scrape_company(
    link_news: str,
    ticker_orig: str,
    sector: str,
    company_label: str,
    polite_sleep: float = 0.7,
    max_pages: Optional[int] = DEFAULT_MAX_PAGES,
    save_html_debug: bool = False,
    html_debug_dir: Path = Path("./_html_debug"),
) -> List[Dict]:
    """Collect news records for one company across its paginated listing.

    Page 1 is fetched first. When its pagination reveals a last page number,
    pages 2..last (capped by ``max_pages``) are fetched and failed pages are
    skipped. Otherwise pages are fetched one by one until ``max_pages`` is
    exceeded or three consecutive pages fail or come back without items.

    Args:
        link_news: Listing URL from the spreadsheet.
        ticker_orig: Ticker stored in each record.
        sector: Sector stored in each record.
        company_label: Label used in log messages and debug file names.
        polite_sleep: Seconds to wait before each request after page 1.
        max_pages: Page cap; a falsy value disables the cap.
        save_html_debug: Save each fetched page's HTML when true.
        html_debug_dir: Directory for the saved HTML.

    Returns:
        Records from every parsed page; empty when the URL is blank or
        page 1 cannot be loaded.
    """
    url_base = normalize_news_url(link_news)
    if not url_base:
        log.warning(f"[{company_label}] Link News vazio.")
        return []

    first_url = page_url(url_base, 1)
    first_resp = fetch(first_url)
    if not first_resp:
        log.warning(f"[{company_label}] Não foi possível carregar a página 1: {first_url}")
        return []

    if save_html_debug:
        html_debug_dump(first_resp.text, company_label, 1, html_debug_dir)

    collected = parse_news_items(first_resp.text, ticker_orig, sector)
    log.info(f"[{company_label}] p1: {len(collected)} items.")

    def _load_and_parse(page_no: int) -> Optional[List[Dict]]:
        """Wait, fetch and parse one page; ``None`` means the fetch failed."""
        target = page_url(url_base, page_no)
        time.sleep(polite_sleep)
        resp = fetch(target)
        if not resp:
            return None
        if save_html_debug:
            html_debug_dump(resp.text, company_label, page_no, html_debug_dir)
        parsed = parse_news_items(resp.text, ticker_orig, sector)
        log.info(f"[{company_label}] p{page_no}: {len(parsed)} items.")
        return parsed

    last_page = guess_last_page(first_resp.text)
    if last_page is not None:
        # Known page count: walk the range, skipping pages that fail to load.
        total_pages = min(last_page, max_pages) if max_pages else last_page
        for page in range(2, total_pages + 1):
            chunk = _load_and_parse(page)
            if chunk is None:
                log.debug(f"[{company_label}] Falha ao carregar p{page}.")
                continue
            collected.extend(chunk)
        return collected

    # Unknown page count: keep going while pages yield items, stopping after
    # three consecutive failed or empty pages.
    page = 2
    empty_streak = 0
    while not (max_pages and page > max_pages):
        chunk = _load_and_parse(page)
        if chunk:
            collected.extend(chunk)
            empty_streak = 0
        else:
            empty_streak += 1
            if chunk is None:
                log.debug(f"[{company_label}] página {page} falhou ({empty_streak} vazias).")
            if empty_streak >= 3:
                if chunk is None:
                    log.info(f"[{company_label}] 3 páginas vazias seguidas; encerrando.")
                else:
                    log.info(f"[{company_label}] 3 páginas sem itens; encerrando.")
                break
        page += 1
    else:
        # Reached only when the loop condition turned false, i.e. the page cap was hit.
        log.info(f"[{company_label}] max_pages atingido ({max_pages}).")

    return collected

# ---------------- Execução principal ----------------
def read_excel(excel_path: Path) -> pd.DataFrame:
    """Load the reference spreadsheet and check its required columns.

    Args:
        excel_path: Location of the Excel file.

    Returns:
        The spreadsheet as a DataFrame.

    Raises:
        ValueError: When any required column is absent; the message lists
            the missing names in sorted order.
    """
    required = (
        "Empresa", "Setor", "Ticker BDR", "Ticker Original (EUA)", "Bolsa (EUA)", "Link News",
    )
    frame = pd.read_excel(excel_path)
    present = set(frame.columns)
    absent = sorted(name for name in required if name not in present)
    if absent:
        raise ValueError(f"Colunas faltantes no Excel: {absent}")
    return frame

def merge_incremental(df_new: pd.DataFrame, out_parquet: Path) -> pd.DataFrame:
    """Combine ``df_new`` with the rows already stored in ``out_parquet``.

    When the file exists, its rows are placed before ``df_new``, duplicates
    by ``id`` are dropped (the first occurrence is kept) and the index is
    reset. When it does not exist, ``df_new`` is returned unchanged.
    """
    if not out_parquet.exists():
        return df_new
    previous = pd.read_parquet(out_parquet)
    stacked = pd.concat([previous, df_new], ignore_index=True)
    return stacked.drop_duplicates(subset=["id"]).reset_index(drop=True)

def sort_by_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` ordered from newest to oldest by its ``datetime`` column.

    Each value is parsed with dateutil; values that cannot be parsed are
    treated as missing. The sort key is built on a copy, so the caller's
    frame is left without any helper column. A frame with no ``datetime``
    column is returned as-is.
    """
    if "datetime" not in df.columns:
        return df

    def _safe_parse_iso(x):
        # Best-effort: an unparseable or missing value becomes None.
        try:
            return du.parse(x)
        except Exception:
            return None

    keyed = df.assign(_dt_sort=df["datetime"].map(_safe_parse_iso))
    return keyed.sort_values("_dt_sort", ascending=False).drop(columns=["_dt_sort"])

def run(
    excel: Path,
    out_parquet: Path,
    only: Optional[str],
    max_pages: Optional[int],
    resume: bool,
    save_html_debug: bool,
    workers: int,
):
    """Scrape every company in the spreadsheet and persist the results.

    With ``resume`` set, each company's rows are merged into ``out_parquet``
    and written as soon as that company finishes. This also happens when the
    file does not exist yet, so nothing collected in resume mode is lost.
    Without ``resume``, rows are accumulated in memory and written once at
    the end, replacing the file.

    Args:
        excel: Reference spreadsheet.
        out_parquet: Output Parquet file.
        only: Optional filter on ticker or company name. It is handed to
            ``Series.str.contains``, which treats it as a regular expression.
        max_pages: Page cap per company; ``None`` uses ``DEFAULT_MAX_PAGES``.
        resume: Enable per-company incremental merging.
        save_html_debug: Save the HTML of every fetched page.
        workers: Reserved; processing is sequential and this value is unused.
    """
    def _cell(value) -> str:
        """Return a spreadsheet cell as stripped text; '' for a missing cell."""
        return "" if pd.isna(value) else str(value).strip()

    df_ref = read_excel(excel)

    # --only matches the original ticker OR the company name (case-insensitive, contains).
    if only:
        mask = (
            df_ref["Ticker Original (EUA)"].astype(str).str.contains(only, case=False, na=False) |
            df_ref["Empresa"].astype(str).str.contains(only, case=False, na=False)
        )
        df_ref = df_ref[mask].copy()
        log.info(f"Filtrando --only '{only}'. {len(df_ref)} linha(s) no Excel após filtro.")

    all_rows: List[Dict] = []

    # Sequential processing: simpler and easier to debug.
    for _, row in tqdm(df_ref.iterrows(), total=len(df_ref), desc="Empresas"):
        # Missing cells must become '' (not the text "nan") so that the
        # ticker-or-company fallback below can actually trigger.
        empresa = _cell(row["Empresa"])
        setor = _cell(row["Setor"])
        ticker_orig = _cell(row["Ticker Original (EUA)"])
        link_news = _cell(row["Link News"])

        if not link_news or link_news.lower() == "nan":
            log.warning(f"[{empresa}] Link News vazio; pulando.")
            continue

        label = ticker_orig or empresa
        items = scrape_company(
            link_news=link_news,
            ticker_orig=label,
            sector=setor,
            company_label=label,
            polite_sleep=0.7,
            max_pages=max_pages if max_pages is not None else DEFAULT_MAX_PAGES,
            save_html_debug=save_html_debug,
        )
        if not items:
            log.info(f"[{label}] Nenhuma notícia encontrada.")
            continue

        df_company = pd.DataFrame(items).drop_duplicates(subset=["id"]).reset_index(drop=True)

        if resume:
            # merge_incremental returns df_company untouched when the file
            # is absent, so the first checkpoint creates it.
            df_merged = merge_incremental(df_company, out_parquet)
            df_merged = sort_by_datetime(df_merged)
            df_merged.to_parquet(out_parquet, index=False)
            log.info(f"[{label}] Merge incremental -> {len(df_merged)} linhas em {out_parquet}")
        else:
            all_rows.extend(df_company.to_dict("records"))

    # Final flush; all_rows is only filled when not in resume mode.
    if all_rows:
        df = pd.DataFrame(all_rows).drop_duplicates(subset=["id"]).reset_index(drop=True)
        df = sort_by_datetime(df)
        df.to_parquet(out_parquet, index=False)
        log.info(f"Salvo {len(df):,} notícias em {out_parquet}")
    elif not out_parquet.exists():
        log.warning("Nenhuma notícia encontrada e arquivo de saída ainda não existe.")

# ---------------- CLI ----------------
def parse_args():
    """Parse the command-line options of the scraper.

    Unknown arguments are ignored so the parser also works when the process
    was started with extra arguments by a notebook host.

    Returns:
        The ``argparse.Namespace`` holding the recognised options.
    """
    p = argparse.ArgumentParser(description="Scraper de notícias do Investing.com (PT-BR)")
    p.add_argument("--excel", default=DEFAULT_EXCEL, help="Caminho do Excel de referência")
    p.add_argument("--out", default=DEFAULT_OUT, help="Arquivo Parquet de saída")
    p.add_argument("--only", default=None, help="Filtra por Ticker Original (EUA) ou Empresa (contém, case-insensitive)")
    # The help text shows the real default instead of a hard-coded number.
    p.add_argument("--max-pages", type=int, default=DEFAULT_MAX_PAGES, help=f"Limite máx. de páginas por ativo (padrão {DEFAULT_MAX_PAGES})")
    p.add_argument("--resume", action="store_true", help="Mescla incremental com parquet existente (checkpoint por empresa)")
    p.add_argument("--save-html-debug", action="store_true", help="Salva HTML das páginas em ./_html_debug")
    p.add_argument("--workers", type=int, default=1, help="(reservado) Nº de workers em paralelo (mantido sequencial por debug)")
    p.add_argument("--verbose", action="store_true", help="Logs detalhados (DEBUG)")
    p.add_argument("--debug", action="store_true", help="Equivalente a --verbose")

    # Tolerate extra arguments injected by Jupyter/VSCode.
    args, _unknown = p.parse_known_args()
    return args

def main():
    """CLI entry point: parse options, set up logging and run the scraper.

    The effective settings are logged before the run starts. Any exception
    escaping ``run`` is logged with its traceback and not re-raised.
    """
    args = parse_args()
    setup_logging(verbose=args.verbose or args.debug, debug=args.debug)

    excel, out_parquet = Path(args.excel), Path(args.out)

    log.info(f"Excel: {excel.resolve()}")
    log.info(f"Saída: {out_parquet.resolve()}")
    if args.only:
        log.info(f"Filtro --only: {args.only}")
    log.info(f"Limite de páginas: {args.max_pages}")
    if args.resume:
        log.info("Modo incremental: ON")
    if args.save_html_debug:
        log.info("Salvar HTML debug: ON")

    run_options = dict(
        excel=excel,
        out_parquet=out_parquet,
        only=args.only,
        max_pages=args.max_pages,
        resume=args.resume,
        save_html_debug=args.save_html_debug,
        workers=args.workers,
    )
    try:
        run(**run_options)
    except Exception as exc:
        log.exception(f"Falha fatal: {exc}")

# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

00:50:54 | INFO    | Excel: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/BaseRefAtivos.xlsx
00:50:54 | INFO    | Saída: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/investing_news.parquet
00:50:54 | INFO    | Limite de páginas: 1400
Empresas:   0%|          | 0/43 [00:00<?, ?it/s]00:51:45 | INFO    | [AAPL] p1: 10 items.
00:51:47 | INFO    | [AAPL] p2: 10 items.
00:51:49 | INFO    | [AAPL] p3: 10 items.
00:51:51 | INFO    | [AAPL] p4: 10 items.
00:51:53 | INFO    | [AAPL] p5: 10 items.
00:51:55 | INFO    | [AAPL] p6: 10 items.
00:51:56 | INFO    | [AAPL] p7: 10 items.
00:51:58 | INFO    | [AAPL] p8: 10 items.
00:52:00 | INFO    | [AAPL] p9: 10 items.
00:52:02 | INFO    | [AAPL] p10: 10 items.
00:52:04 | INFO    | [AAPL] p11: 10 items.
00:52:06 | INFO    | [AAPL] p12: 10 items.
00:52:08 | INFO    | [AAPL] p13: 10 items.
00:52:10 | INFO    | [AAPL] p14: 10 items.
00:52:12 | INFO    | [AAPL] p15: 1

In [11]:
from __future__ import annotations
import re
import time
import uuid
import os
import random
import argparse
from pathlib import Path
from typing import Optional, List, Dict
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser as du
from tqdm import tqdm
import logging

# ---------------- Config & Globals ----------------
# Time zone applied to every timestamp this module produces.
SP_TZ = ZoneInfo("America/Sao_Paulo")

# Default input spreadsheet, output Parquet file and per-company page cap.
DEFAULT_EXCEL = "../data/BaseRefAtivos.xlsx"
DEFAULT_OUT = "../data/investing_news.parquet"
DEFAULT_MAX_PAGES = 1400

# Browser-like request headers installed on the shared session below.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://br.investing.com/",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
}

# One session reused by every request so the headers are sent each time.
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

# Matches relative Portuguese timestamps such as "5 minutos atrás" or "2 horas atrás".
RELATIVE_REGEX = re.compile(
    r"(?P<num>\d+)\s*(?P<unit>min|mins|minutos|minuto|hora|horas|dia|dias)\s*atr[aá]s",
    flags=re.IGNORECASE,
)

# ---------------- Logging ----------------
def setup_logging(verbose: bool = False, debug: bool = False) -> None:
    """Configure root logging for the scraper.

    Args:
        verbose: Emit DEBUG-level records when true.
        debug: Alias of ``verbose``; either flag enables DEBUG.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers (a second call in the same process, or a host that installed
    its own handler first). The level is therefore also set on the root
    logger explicitly so the requested verbosity always takes effect.
    """
    level = logging.DEBUG if (verbose or debug) else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-7s | %(message)s",
        datefmt="%H:%M:%S",
    )
    logging.getLogger().setLevel(level)

# Module-level logger used by every function in this file.
log = logging.getLogger(__name__)

# ---------------- Utils ----------------
def normalize_news_url(url: str) -> str:
    """Return the base ``...-news`` listing URL without any page suffix.

    Trailing slashes and a trailing ``/<n>`` page number are removed first,
    and only then is a truncated ``-new`` ending completed to ``-news``.
    Doing the typo fix last means inputs such as ``.../x-new/`` or
    ``.../x-new/2`` are repaired as well.

    Args:
        url: Listing URL as found in the spreadsheet; ``None`` or blank is
            accepted.

    Returns:
        The normalized base URL, or an empty string for blank input.
    """
    url = (url or "").strip()
    if not url:
        return url
    url = url.rstrip("/")
    # Drop a trailing page number so pagination can be appended manually.
    url = re.sub(r"/+\d+$", "", url)
    if url.endswith("-new"):
        url += "s"
    return url

def page_url(url_base: str, page: int) -> str:
    """Return the URL of listing page ``page`` under ``url_base``."""
    return "{}/{}".format(url_base, page)

# Portuguese month names and abbreviations mapped to the English
# abbreviations that dateutil's default parser recognises.
_PT_MONTHS_TO_EN = {
    "janeiro": "jan", "jan": "jan",
    "fevereiro": "feb", "fev": "feb",
    "março": "mar", "marco": "mar", "mar": "mar",
    "abril": "apr", "abr": "apr",
    "maio": "may", "mai": "may",
    "junho": "jun", "jun": "jun",
    "julho": "jul", "jul": "jul",
    "agosto": "aug", "ago": "aug",
    "setembro": "sep", "set": "sep",
    "outubro": "oct", "out": "oct",
    "novembro": "nov", "nov": "nov",
    "dezembro": "dec", "dez": "dec",
}
# Longest names first so full names win over their abbreviations.
_PT_MONTH_RE = re.compile(
    r"\b(" + "|".join(sorted(_PT_MONTHS_TO_EN, key=len, reverse=True)) + r")\b\.?"
)
_PT_CONNECTIVE_RE = re.compile(r"\bde\b")


def parse_datetime_from_time_tag(time_tag) -> Optional[datetime]:
    """Extract a Sao Paulo-aware datetime from a ``<time>`` element.

    Resolution order:
      1. Relative text matched by ``RELATIVE_REGEX`` (e.g. "5 minutos
         atrás"), subtracted from the current Sao Paulo time.
      2. The element's ``datetime`` attribute, parsed with dateutil.
      3. The element's text as an absolute date (e.g. "12 de out. de 2025").

    Args:
        time_tag: Parsed element exposing ``get_text()`` and ``get(name)``,
            or ``None``.

    Returns:
        A timezone-aware datetime in ``SP_TZ``, or ``None`` when nothing
        could be parsed. Naive parse results are assumed to be Sao Paulo
        local time.
    """
    if time_tag is None:
        return None

    text = (time_tag.get_text() or "").strip().lower()
    m = RELATIVE_REGEX.search(text)
    if m:
        num = int(m.group("num"))
        unit = m.group("unit")
        if unit.startswith("min"):
            delta = timedelta(minutes=num)
        elif unit.startswith("hora"):
            delta = timedelta(hours=num)
        else:
            delta = timedelta(days=num)
        return datetime.now(SP_TZ) - delta

    dt_attr = time_tag.get("datetime")
    if dt_attr:
        # Best-effort: a malformed attribute must not abort the scrape.
        try:
            dt = du.parse(dt_attr)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=SP_TZ)
            return dt.astimezone(SP_TZ)
        except Exception as e:
            log.debug(f"Falha parse datetime attr '{dt_attr}': {e}")

    # Fallback: absolute date in the text. dateutil's parse() accepts no
    # `languages` keyword and only knows English month names, so Portuguese
    # months are translated and the connective "de" removed beforehand.
    normalized = _PT_MONTH_RE.sub(lambda mo: _PT_MONTHS_TO_EN[mo.group(1)], text)
    normalized = _PT_CONNECTIVE_RE.sub(" ", normalized)
    try:
        dt = du.parse(normalized, dayfirst=True, fuzzy=True)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=SP_TZ)
        return dt.astimezone(SP_TZ)
    except Exception as e:
        log.debug(f"Falha parse texto de data '{text}': {e}")

    return None

def deduce_country_language_from_url(url: str) -> tuple[str, str]:
    """Map an article URL to a ``(country, language)`` pair.

    URLs containing ``br.investing.com`` yield ``("BR", "pt-BR")``; any
    other URL yields two empty strings.
    """
    is_brazilian_site = "br.investing.com" in url
    return ("BR", "pt-BR") if is_brazilian_site else ("", "")

def make_news_id(url: str, title: str) -> str:
    """Build a deterministic identifier for a news item.

    The stripped URL and stripped title are joined with ``||`` and hashed
    into a UUIDv5 under ``uuid.NAMESPACE_URL``. ``None`` is treated as an
    empty string for either argument.
    """
    clean_url = (url or "").strip()
    clean_title = (title or "").strip()
    key = f"{clean_url}||{clean_title}"
    return str(uuid.uuid5(uuid.NAMESPACE_URL, key))

def html_debug_dump(html: str, company: str, page: int, outdir: Path) -> None:
    """Write the raw HTML of one listing page to ``outdir`` for debugging.

    The file is named ``<safe company>_p<page, 4 digits>.html``. The dump is
    best-effort: failing to create the directory or to write the file is
    logged as a warning and never raised, so debugging output cannot abort
    a scrape.

    Args:
        html: Page markup to save.
        company: Label used to build the file name.
        page: Page number used to build the file name.
        outdir: Destination directory; created when missing.
    """
    fn = outdir / f"{safe_filename(company)}_p{page:04d}.html"
    try:
        outdir.mkdir(parents=True, exist_ok=True)
        fn.write_text(html, encoding="utf-8")
    except Exception as e:
        log.warning(f"Falha ao salvar HTML de debug: {e}")
    else:
        log.debug(f"HTML salvo para debug: {fn}")

def safe_filename(s: str) -> str:
    """Return ``s`` stripped, with each run of characters outside
    ``A-Z a-z 0-9 . _ -`` collapsed into a single underscore."""
    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
    return unsafe_run.sub("_", s.strip())

# ---------------- HTTP com backoff ----------------
def fetch(url: str, max_retries: int = 5, timeout: int = 25) -> Optional[requests.Response]:
    """GET ``url`` through the shared session, retrying transient failures.

    Network errors and HTTP 429/500/502/503/504 are retried with a linearly
    growing wait plus random jitter. HTTP 404/410 and any other non-200
    status return ``None`` immediately. No wait is performed after the
    final attempt, since nothing is left to retry.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds, passed to ``SESSION.get``.

    Returns:
        The response when the status is 200, otherwise ``None``.
    """
    for i in range(max_retries):
        is_last_attempt = i == max_retries - 1
        try:
            r = SESSION.get(url, timeout=timeout)
        except requests.RequestException as e:
            if is_last_attempt:
                log.warning(f"Erro rede: {e}; sem novas tentativas.")
                break
            wait = (1.2 * (i + 1)) + random.random()
            log.warning(f"Erro rede: {e}; retry em {wait:.1f}s")
            time.sleep(wait)
            continue

        if r.status_code == 200:
            return r
        if r.status_code in (404, 410):
            log.info(f"HTTP {r.status_code} em {url} (provável fim).")
            return None
        if r.status_code not in (429, 500, 502, 503, 504):
            log.warning(f"HTTP {r.status_code} em {url}; sem retry programado.")
            return None

        # Transient server-side status: back off, unless attempts are exhausted.
        if is_last_attempt:
            log.warning(f"HTTP {r.status_code} em {url}; sem novas tentativas.")
            break
        wait = (1.5 * (i + 1)) + random.random()
        log.warning(f"HTTP {r.status_code} em {url}; retry em {wait:.1f}s")
        time.sleep(wait)

    log.error(f"Falhou após {max_retries} tentativas: {url}")
    return None

# ---------------- Parsing ----------------
def parse_news_items(html: str, ticker: str, sector: str) -> List[Dict]:
    """Extract news records from one listing page.

    Looks for ``ul[data-test='news-list']`` and, inside it, every
    ``article[data-test='article-item']``. Articles without a title link
    are skipped. Site-relative links are prefixed with
    ``https://br.investing.com``.

    Args:
        html: Markup of the listing page.
        ticker: Value stored in each record's ``ticker`` field.
        sector: Value stored in each record's ``sector`` field.

    Returns:
        One dict per article with keys ``id``, ``datetime`` (ISO string or
        ``None``), ``source``, ``headline``, ``ticker``, ``sector``,
        ``country``, ``url`` and ``language``; an empty list when the list
        container or its articles are missing.
    """
    document = BeautifulSoup(html, "html.parser")
    news_list = document.find("ul", attrs={"data-test": "news-list"})
    if not news_list:
        log.debug("Ul[data-test='news-list'] não encontrada — página pode ter mudado.")
        return []

    articles = news_list.select("article[data-test='article-item']")
    if not articles:
        log.debug("Nenhum article[data-test='article-item'] encontrado nesta página.")
        return []

    rows: List[Dict] = []
    for article in articles:
        title_link = article.select_one("a[data-test='article-title-link']")
        if not title_link:
            log.debug("a[data-test='article-title-link'] ausente em um article; pulando.")
            continue

        headline = (title_link.get_text() or "").strip()
        href = title_link.get("href") or ""
        link = "https://br.investing.com" + href if href.startswith("/") else href

        provider_link = article.select_one("a[data-test='article-provider-link']")
        source = provider_link.get_text().strip() if provider_link else ""

        time_tag = article.select_one("time[data-test='article-publish-date']")
        published = parse_datetime_from_time_tag(time_tag)
        published_iso = published.isoformat() if published else None

        country, language = deduce_country_language_from_url(link)

        rows.append(
            {
                "id": make_news_id(link, headline),
                "datetime": published_iso,
                "source": source,
                "headline": headline,
                "ticker": ticker,
                "sector": sector,
                "country": country,
                "url": link,
                "language": language,
            }
        )
    return rows

def guess_last_page(html: str) -> Optional[int]:
    """Return the largest page number shown in the pagination links.

    Reads the text of every ``a`` under ``div.flex.items-center.gap-2`` and
    keeps the purely numeric ones. Returns ``None`` when none are numeric.
    """
    document = BeautifulSoup(html, "html.parser")
    labels = (
        (anchor.get_text() or "").strip()
        for anchor in document.select("div.flex.items-center.gap-2 a")
    )
    page_numbers = [int(label) for label in labels if label.isdigit()]
    return max(page_numbers, default=None)

# ---------------- Scraper por empresa ----------------
def scrape_company(
    link_news: str,
    ticker_orig: str,
    sector: str,
    company_label: str,
    polite_sleep: float = 0.7,
    max_pages: Optional[int] = DEFAULT_MAX_PAGES,
    save_html_debug: bool = False,
    html_debug_dir: Path = Path("./_html_debug"),
) -> List[Dict]:
    """Collect news records for one company across its paginated listing.

    Page 1 is fetched first. When its pagination reveals a last page number,
    pages 2..last (capped by ``max_pages``) are fetched and failed pages are
    skipped. Otherwise pages are fetched one by one until ``max_pages`` is
    exceeded or three consecutive pages fail or come back without items.

    Args:
        link_news: Listing URL from the spreadsheet.
        ticker_orig: Ticker stored in each record.
        sector: Sector stored in each record.
        company_label: Label used in log messages and debug file names.
        polite_sleep: Seconds to wait before each request after page 1.
        max_pages: Page cap; a falsy value disables the cap.
        save_html_debug: Save each fetched page's HTML when true.
        html_debug_dir: Directory for the saved HTML.

    Returns:
        Records from every parsed page; empty when the URL is blank or
        page 1 cannot be loaded.
    """
    url_base = normalize_news_url(link_news)
    if not url_base:
        log.warning(f"[{company_label}] Link News vazio.")
        return []

    first_url = page_url(url_base, 1)
    first_resp = fetch(first_url)
    if not first_resp:
        log.warning(f"[{company_label}] Não foi possível carregar a página 1: {first_url}")
        return []

    if save_html_debug:
        html_debug_dump(first_resp.text, company_label, 1, html_debug_dir)

    collected = parse_news_items(first_resp.text, ticker_orig, sector)
    log.info(f"[{company_label}] p1: {len(collected)} items.")

    def _load_and_parse(page_no: int) -> Optional[List[Dict]]:
        """Wait, fetch and parse one page; ``None`` means the fetch failed."""
        target = page_url(url_base, page_no)
        time.sleep(polite_sleep)
        resp = fetch(target)
        if not resp:
            return None
        if save_html_debug:
            html_debug_dump(resp.text, company_label, page_no, html_debug_dir)
        parsed = parse_news_items(resp.text, ticker_orig, sector)
        log.info(f"[{company_label}] p{page_no}: {len(parsed)} items.")
        return parsed

    last_page = guess_last_page(first_resp.text)
    if last_page is not None:
        # Known page count: walk the range, skipping pages that fail to load.
        total_pages = min(last_page, max_pages) if max_pages else last_page
        for page in range(2, total_pages + 1):
            chunk = _load_and_parse(page)
            if chunk is None:
                log.debug(f"[{company_label}] Falha ao carregar p{page}.")
                continue
            collected.extend(chunk)
        return collected

    # Unknown page count: keep going while pages yield items, stopping after
    # three consecutive failed or empty pages.
    page = 2
    empty_streak = 0
    while not (max_pages and page > max_pages):
        chunk = _load_and_parse(page)
        if chunk:
            collected.extend(chunk)
            empty_streak = 0
        else:
            empty_streak += 1
            if chunk is None:
                log.debug(f"[{company_label}] página {page} falhou ({empty_streak} vazias).")
            if empty_streak >= 3:
                if chunk is None:
                    log.info(f"[{company_label}] 3 páginas vazias seguidas; encerrando.")
                else:
                    log.info(f"[{company_label}] 3 páginas sem itens; encerrando.")
                break
        page += 1
    else:
        # Reached only when the loop condition turned false, i.e. the page cap was hit.
        log.info(f"[{company_label}] max_pages atingido ({max_pages}).")

    return collected

# ---------------- Execução principal ----------------
def read_excel(excel_path: Path) -> pd.DataFrame:
    """Load the reference spreadsheet and check its required columns.

    Args:
        excel_path: Location of the Excel file.

    Returns:
        The spreadsheet as a DataFrame.

    Raises:
        ValueError: When any required column is absent; the message lists
            the missing names in sorted order.
    """
    required = (
        "Empresa", "Setor", "Ticker BDR", "Ticker Original (EUA)", "Bolsa (EUA)", "Link News",
    )
    frame = pd.read_excel(excel_path)
    present = set(frame.columns)
    absent = sorted(name for name in required if name not in present)
    if absent:
        raise ValueError(f"Colunas faltantes no Excel: {absent}")
    return frame

def merge_incremental(df_new: pd.DataFrame, out_parquet: Path) -> pd.DataFrame:
    """Combine ``df_new`` with the rows already stored in ``out_parquet``.

    When the file exists, its rows are placed before ``df_new``, duplicates
    by ``id`` are dropped (the first occurrence is kept) and the index is
    reset. When it does not exist, ``df_new`` is returned unchanged.
    """
    if not out_parquet.exists():
        return df_new
    previous = pd.read_parquet(out_parquet)
    stacked = pd.concat([previous, df_new], ignore_index=True)
    return stacked.drop_duplicates(subset=["id"]).reset_index(drop=True)

def sort_by_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` ordered from newest to oldest by its ``datetime`` column.

    Each value is parsed with dateutil; values that cannot be parsed are
    treated as missing. The sort key is built on a copy, so the caller's
    frame is left without any helper column. A frame with no ``datetime``
    column is returned as-is.
    """
    if "datetime" not in df.columns:
        return df

    def _safe_parse_iso(x):
        # Best-effort: an unparseable or missing value becomes None.
        try:
            return du.parse(x)
        except Exception:
            return None

    keyed = df.assign(_dt_sort=df["datetime"].map(_safe_parse_iso))
    return keyed.sort_values("_dt_sort", ascending=False).drop(columns=["_dt_sort"])

def run(
    excel: Path,
    out_parquet: Path,
    only: Optional[str],
    max_pages: Optional[int],
    save_html_debug: bool,
    workers: int,
):
    """Scrape every company in the spreadsheet, checkpointing after each one.

    For each company with news, the rows are written to an individual
    Parquet file under ``./backup`` and then merged into ``out_parquet``,
    which is rewritten sorted from newest to oldest.

    Args:
        excel: Reference spreadsheet.
        out_parquet: Main output Parquet file.
        only: Optional filter on ticker or company name. It is handed to
            ``Series.str.contains``, which treats it as a regular expression.
        max_pages: Page cap per company; ``None`` uses ``DEFAULT_MAX_PAGES``.
        save_html_debug: Save the HTML of every fetched page.
        workers: Reserved; processing is sequential and this value is unused.
    """
    def _cell(value) -> str:
        """Return a spreadsheet cell as stripped text; '' for a missing cell."""
        return "" if pd.isna(value) else str(value).strip()

    df_ref = read_excel(excel)
    if only:
        mask = (
            df_ref["Ticker Original (EUA)"].astype(str).str.contains(only, case=False, na=False) |
            df_ref["Empresa"].astype(str).str.contains(only, case=False, na=False)
        )
        df_ref = df_ref[mask].copy()
        log.info(f"Filtrando --only '{only}'. {len(df_ref)} linha(s) no Excel após filtro.")

    backup_dir = Path("./backup")
    backup_dir.mkdir(exist_ok=True)
    log.info(f"Diretório de backup individual: {backup_dir.resolve()}")

    for _, row in tqdm(df_ref.iterrows(), total=len(df_ref), desc="Empresas"):
        # Missing cells must become '' (not the text "nan") so that the
        # ticker-or-company fallback below can actually trigger.
        empresa = _cell(row["Empresa"])
        setor = _cell(row["Setor"])
        ticker_orig = _cell(row["Ticker Original (EUA)"])
        link_news = _cell(row["Link News"])

        if not link_news or link_news.lower() == "nan":
            log.warning(f"[{empresa}] Link News vazio; pulando.")
            continue

        label = ticker_orig or empresa
        items = scrape_company(
            link_news=link_news,
            ticker_orig=label,
            sector=setor,
            company_label=label,
            polite_sleep=0.7,
            max_pages=max_pages if max_pages is not None else DEFAULT_MAX_PAGES,
            save_html_debug=save_html_debug,
        )
        if not items:
            log.info(f"[{label}] Nenhuma notícia nova encontrada.")
            continue

        df_company = pd.DataFrame(items).drop_duplicates(subset=["id"]).reset_index(drop=True)

        # Per-company backup, written before the main file is touched.
        backup_filename = f"{safe_filename(label)}.parquet"
        backup_filepath = backup_dir / backup_filename
        df_company.to_parquet(backup_filepath, index=False)
        log.info(f"[{label}] Backup individual com {len(df_company)} notícias salvo em: {backup_filepath}")

        df_merged = merge_incremental(df_company, out_parquet)
        df_merged = sort_by_datetime(df_merged)
        df_merged.to_parquet(out_parquet, index=False)
        log.info(f"[{label}] Salvo. O arquivo principal agora tem {len(df_merged):,} linhas em {out_parquet}")

    log.info("Processo concluído.")
    if not out_parquet.exists():
         log.warning("Nenhuma notícia foi coletada e o arquivo de saída não foi criado.")

# ---------------- CLI ----------------
def parse_args():
    """Declare the command-line options and return the parsed namespace.

    ``parse_known_args`` is used, so arguments the parser does not recognise
    are ignored instead of aborting.
    """
    option_table = (
        ("--excel", {"default": DEFAULT_EXCEL, "help": "Caminho do Excel de referência"}),
        ("--out", {"default": DEFAULT_OUT, "help": "Arquivo Parquet de saída"}),
        ("--only", {"default": None, "help": "Filtra por Ticker Original (EUA) ou Empresa"}),
        ("--max-pages", {"type": int, "default": DEFAULT_MAX_PAGES, "help": "Limite máx. de páginas por ativo"}),
        ("--save-html-debug", {"action": "store_true", "help": "Salva HTML das páginas em ./_html_debug"}),
        ("--workers", {"type": int, "default": 1, "help": "(reservado) Nº de workers em paralelo"}),
        ("--verbose", {"action": "store_true", "help": "Logs detalhados (DEBUG)"}),
        ("--debug", {"action": "store_true", "help": "Equivalente a --verbose"}),
    )
    parser = argparse.ArgumentParser(description="Scraper de notícias do Investing.com (PT-BR)")
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    namespace, _ignored = parser.parse_known_args()
    return namespace

def main():
    """Entry point: parse options, configure logging and run the scraper.

    Any exception raised by ``run`` is logged with its traceback and not
    re-raised.

    Returns:
        ``0`` when ``run`` completed, ``1`` when it raised.
    """
    args = parse_args()
    setup_logging(verbose=args.verbose or args.debug, debug=args.debug)
    excel = Path(args.excel)
    out_parquet = Path(args.out)
    log.info(f"Excel: {excel.resolve()}")
    log.info(f"Saída: {out_parquet.resolve()}")
    if args.only:
        log.info(f"Filtro --only: {args.only}")
    log.info(f"Limite de páginas: {args.max_pages}")
    if args.save_html_debug:
        log.info("Salvar HTML debug: ON")
    try:
        run(
            excel=excel,
            out_parquet=out_parquet,
            only=args.only,
            max_pages=args.max_pages,
            save_html_debug=args.save_html_debug,
            workers=args.workers,
        )
    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it,
        # but let the caller see that the run did not succeed.
        log.exception(f"Falha fatal: {e}")
        return 1
    return 0

# Run the scraper when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

12:57:49 | INFO    | Excel: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/BaseRefAtivos.xlsx
12:57:49 | INFO    | Saída: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/investing_news.parquet
12:57:49 | INFO    | Limite de páginas: 1400
12:57:49 | INFO    | Diretório de backup individual: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/notebooks/backup
Empresas:   0%|          | 0/43 [00:00<?, ?it/s]12:57:50 | INFO    | [AAPL] p1: 10 items.
12:57:52 | INFO    | [AAPL] p2: 10 items.
12:57:54 | INFO    | [AAPL] p3: 10 items.
12:57:56 | INFO    | [AAPL] p4: 10 items.
12:57:57 | INFO    | [AAPL] p5: 10 items.
12:57:59 | INFO    | [AAPL] p6: 10 items.
12:58:01 | INFO    | [AAPL] p7: 10 items.
12:58:03 | INFO    | [AAPL] p8: 10 items.
12:58:05 | INFO    | [AAPL] p9: 10 items.
12:58:07 | INFO    | [AAPL] p10: 10 items.
12:58:09 | INFO    | [AAPL] p11: 10 items.
12:58:1

In [18]:
import pandas as pd
# Load the consolidated headlines Parquet file and display it as the cell result.
dados = pd.read_parquet("../data/investing_news.parquet")
dados

Unnamed: 0,id,datetime,source,headline,ticker,sector,country,url,language
0,e032930c-144c-5b07-9c80-9c2d77d27db6,2025-10-18T00:06:35-03:00,Reuters,Trump approves expanding credits for US auto p...,F,Automotivo,,https://www.investing.com/news/stock-market-ne...,
1,c2406a55-93a4-5461-a9f6-e6eadf9c4cb5,2025-10-17T22:14:28-03:00,Investing.com,"Salesforce CEO Benioff sells $558,809 in CRM s...",CRM,Tecnologia,,https://www.investing.com/news/insider-trading...,
2,398307ca-a2ab-597f-9749-37ca7f4cdd4f,2025-10-17T22:00:30-03:00,Reuters,Boeing can hike 737 MAX production to 42 plane...,BA,Aeroespacial,,https://www.investing.com/news/stock-market-ne...,
3,b5633d31-0f35-5321-b23e-ab1950271838,2025-10-17T21:48:46-03:00,Reuters,Novo Nordisk hires US pharma veteran as Trump ...,PFE,Farmacêutica,,https://www.investing.com/news/stock-market-ne...,
4,cfd3cde0-5790-5c89-9cbc-059573e6558c,2025-10-17T21:40:21-03:00,Investing.com,Boeing stock rises after FAA reportedly approv...,BA,Aeroespacial,,https://www.investing.com/news/stock-market-ne...,
...,...,...,...,...,...,...,...,...,...
137826,59410706-7745-5fdf-a621-795f3e99b47a,2008-11-09T15:12:07-02:00,Reuters,"RPT-Wall St Wk Ahead: Stocks eye retail data, ...",WMT,Varejo,,https://www.investing.com/news/forex-news/rpt-...,
137827,4efd2681-1b83-5391-b534-2c71713fa603,2008-10-30T10:54:07-02:00,Reuters,Fairtrade coffee sales steady in economic down...,SBUX,Alimentício,,https://www.investing.com/news/forex-news/fair...,
137828,e4b36c45-4e67-5f8b-a0c9-313f54e1d6a5,2008-10-29T10:54:19-02:00,Reuters,China vows harsh penalties for melamine eggs s...,WMT,Varejo,,https://www.investing.com/news/forex-news/chin...,
137829,52185042-a9c5-5882-a5ef-374e39a5f031,2008-10-29T10:40:42-02:00,Reuters,FACTBOX-Industry bailouts agreed or called for,F,Automotivo,,https://www.investing.com/news/forex-news/fact...,


In [15]:
from __future__ import annotations
import re
import time
import uuid
import os
import random
import argparse
from pathlib import Path
from typing import Optional, List, Dict
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser as du
from tqdm import tqdm
import logging

# ---------------- Config & Globals ----------------
# Timezone applied to every parsed publication timestamp.
SP_TZ = ZoneInfo("America/Sao_Paulo")

# Defaults for the --excel, --out and --max-pages command-line options.
DEFAULT_EXCEL = "../data/BaseRefAtivosNacionais.xlsx"
DEFAULT_OUT = "../data/investing_news_nacionais.parquet"
DEFAULT_MAX_PAGES = 1400

# Browser-like request headers installed on the shared session below.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://br.investing.com/",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
}

# One session reused by every request, so HEADERS is sent each time.
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

# Matches relative ages such as "5 minutos atrás" / "2 horas atras" / "3 dias atrás",
# capturing the amount in "num" and the unit in "unit".
RELATIVE_REGEX = re.compile(
    r"(?P<num>\d+)\s*(?P<unit>min|mins|minutos|minuto|hora|horas|dia|dias)\s*atr[aá]s",
    flags=re.IGNORECASE,
)

# ---------------- Logging ----------------
def setup_logging(verbose: bool = False, debug: bool = False) -> None:
    """Configure root logging for the scraper.

    Args:
        verbose: Log at DEBUG level instead of INFO.
        debug: Treated exactly like ``verbose``.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers (for example on a second call in the same interpreter), so the
    level is also set explicitly on the root logger; that way a later call
    with a different verbosity still takes effect.
    """
    level = logging.DEBUG if (verbose or debug) else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-7s | %(message)s",
        datefmt="%H:%M:%S",
    )
    logging.getLogger().setLevel(level)

log = logging.getLogger(__name__)

# ---------------- Utils ----------------
def normalize_news_url(url: str) -> str:
    """Return the base ``...-news`` listing URL, without any page suffix.

    The input is trimmed, trailing slashes and a trailing ``/<number>`` page
    segment are removed, and a URL left ending in ``-new`` is completed to
    ``-news``. An empty or ``None`` input yields ``""``.

    Args:
        url: Listing URL, possibly pointing at a specific page.

    Returns:
        The URL that ``/<page>`` can be appended to.
    """
    url = (url or "").strip()
    if not url:
        return url
    # Strip slashes and the page number first, so the "-new" repair below
    # also applies to inputs such as ".../foo-new/" and ".../foo-new/3".
    url = re.sub(r"/+$", "", url)
    url = re.sub(r"/+\d+$", "", url)
    if url.endswith("-new"):
        url += "s"
    return url

def page_url(url_base: str, page: int) -> str:
    """Return the URL of listing page ``page``: ``url_base`` followed by ``/<page>``."""
    return "{}/{}".format(url_base, page)

def _pt_parserinfo():
    """Return a dateutil ``parserinfo`` that knows Portuguese month names as well as English ones."""
    pt_months = [
        ("janeiro",),
        ("fev", "fevereiro"),
        ("março", "marco"),
        ("abr", "abril"),
        ("mai", "maio"),
        ("junho",),
        ("julho",),
        ("ago", "agosto"),
        ("set", "setembro"),
        ("out", "outubro"),
        ("novembro",),
        ("dez", "dezembro"),
    ]
    months = [tuple(en) + pt for en, pt in zip(du.parserinfo.MONTHS, pt_months)]

    class _PtInfo(du.parserinfo):
        MONTHS = months

    return _PtInfo(dayfirst=True)


def parse_datetime_from_time_tag(time_tag) -> Optional[datetime]:
    """Extract the publication time of an article from its ``<time>`` tag.

    Sources are tried in this order:
      1. relative text matched by ``RELATIVE_REGEX`` (e.g. "5 horas atrás"),
         subtracted from the current time in ``SP_TZ``;
      2. the tag's ``datetime`` attribute;
      3. the tag's visible text, parsed day-first and fuzzily.

    Values parsed without a timezone are taken to be in ``SP_TZ``; results of
    steps 2 and 3 are converted to ``SP_TZ``.

    Args:
        time_tag: Parsed tag object exposing ``get_text()`` and ``get()``, or ``None``.

    Returns:
        A timezone-aware datetime, or ``None`` when the tag is missing or
        nothing could be parsed.
    """
    if time_tag is None:
        return None

    text = (time_tag.get_text() or "").strip().lower()
    m = RELATIVE_REGEX.search(text)
    if m:
        num = int(m.group("num"))
        unit = m.group("unit")
        now_sp = datetime.now(SP_TZ)
        if unit.startswith("min"):
            return now_sp - timedelta(minutes=num)
        if unit.startswith("hora"):
            return now_sp - timedelta(hours=num)
        return now_sp - timedelta(days=num)

    dt_attr = time_tag.get("datetime")
    if dt_attr:
        try:
            dt = du.parse(dt_attr)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=SP_TZ)
            return dt.astimezone(SP_TZ)
        except Exception as e:
            # Best-effort: fall through to the visible text.
            log.debug(f"Falha parse datetime attr '{dt_attr}': {e}")

    if text:
        try:
            # dateutil has no ``languages`` option (passing one makes every call
            # raise TypeError); Portuguese month names are supplied through a
            # custom parserinfo instead.
            dt = du.parse(text, parserinfo=_pt_parserinfo(), dayfirst=True, fuzzy=True)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=SP_TZ)
            return dt.astimezone(SP_TZ)
        except Exception as e:
            log.debug(f"Falha parse texto de data '{text}': {e}")

    return None

def deduce_country_language_from_url(url: str) -> tuple[str, str]:
    """Infer ``(country, language)`` from an article URL.

    URLs containing ``br.investing.com`` give ``("BR", "pt-BR")``; every
    other URL gives two empty strings.
    """
    if "br.investing.com" not in url:
        return ("", "")
    return ("BR", "pt-BR")

def make_news_id(url: str, title: str) -> str:
    """Return a deterministic UUIDv5 string derived from the URL and title.

    Both values are trimmed (``None`` counts as empty) and joined with
    ``"||"`` before hashing in the URL namespace.
    """
    trimmed = [(part or "").strip() for part in (url, title)]
    return str(uuid.uuid5(uuid.NAMESPACE_URL, "||".join(trimmed)))

def html_debug_dump(html: str, company: str, page: int, outdir: Path) -> None:
    """Best-effort dump of a page's raw HTML for debugging.

    The file is written to ``outdir`` as ``<safe company>_p<page, 4 digits>.html``
    in UTF-8. Failures are logged as warnings and never raised.

    Args:
        html: Page source to store.
        company: Label used (after sanitising) as the file-name prefix.
        page: Page number used in the file name.
        outdir: Target directory; created if missing.
    """
    fn = outdir / f"{safe_filename(company)}_p{page:04d}.html"
    try:
        # Directory creation is inside the try as well: an unwritable debug
        # folder must not abort the scrape.
        outdir.mkdir(parents=True, exist_ok=True)
        fn.write_text(html, encoding="utf-8")
    except Exception as e:
        log.warning(f"Falha ao salvar HTML de debug: {e}")
    else:
        log.debug(f"HTML salvo para debug: {fn}")

def safe_filename(s: str) -> str:
    """Return ``s`` trimmed, with each run of characters other than ASCII
    letters, digits, ``.``, ``_`` and ``-`` replaced by a single ``_``."""
    trimmed = s.strip()
    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
    return unsafe_run.sub("_", trimmed)

# ---------------- HTTP com backoff ----------------
def fetch(url: str, max_retries: int = 5, timeout: int = 25) -> Optional[requests.Response]:
    """GET ``url`` through the shared session, retrying transient failures.

    Per attempt:
      * 200 -> the response is returned;
      * 404 / 410 -> ``None`` at once (logged as the probable end of the listing);
      * 429 / 500 / 502 / 503 / 504 and network errors -> retried after a
        linearly growing delay plus random jitter; a numeric ``Retry-After``
        header can lengthen that delay, up to a cap;
      * any other status -> ``None`` at once.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        The 200 response, or ``None`` when the page could not be obtained.
    """
    max_retry_after = 60.0  # upper bound, in seconds, on a server-requested delay

    for attempt in range(1, max_retries + 1):
        try:
            r = SESSION.get(url, timeout=timeout)
        except requests.RequestException as e:
            problem = f"Erro rede: {e}"
            wait = (1.2 * attempt) + random.random()
        else:
            status = r.status_code
            if status == 200:
                return r
            if status in (404, 410):
                log.info(f"HTTP {status} em {url} (provável fim).")
                return None
            if status not in (429, 500, 502, 503, 504):
                log.warning(f"HTTP {status} em {url}; sem retry programado.")
                return None
            problem = f"HTTP {status} em {url}"
            wait = (1.5 * attempt) + random.random()
            # Honour a delay requested by the server (seconds form only).
            retry_after = (r.headers.get("Retry-After") or "").strip()
            if retry_after.isdigit():
                wait = max(wait, min(float(retry_after), max_retry_after))

        if attempt == max_retries:
            # No further attempt follows; sleeping would only delay the failure.
            log.warning(problem)
            break
        log.warning(f"{problem}; retry em {wait:.1f}s")
        time.sleep(wait)

    log.error(f"Falhou após {max_retries} tentativas: {url}")
    return None

# ---------------- Parsing ----------------
def parse_news_items(html: str, ticker: str, sector: str) -> List[Dict]:
    """Extract one record per article from a news-listing page.

    Articles are looked up inside ``ul[data-test='news-list']``; articles
    without a title link are skipped. Site-relative links are prefixed with
    ``https://br.investing.com``.

    Args:
        html: Page source.
        ticker: Value stored in each record's ``ticker`` field.
        sector: Value stored in each record's ``sector`` field.

    Returns:
        A list of dicts with the keys ``id``, ``datetime`` (ISO string or
        ``None``), ``source``, ``headline``, ``ticker``, ``sector``,
        ``country``, ``url`` and ``language``; empty when the list or its
        articles are not found.
    """
    soup = BeautifulSoup(html, "html.parser")
    news_list = soup.find("ul", attrs={"data-test": "news-list"})
    if not news_list:
        log.debug("Ul[data-test='news-list'] não encontrada — página pode ter mudado.")
        return []

    articles = news_list.select("article[data-test='article-item']")
    if not articles:
        log.debug("Nenhum article[data-test='article-item'] encontrado nesta página.")
        return []

    records: List[Dict] = []
    for article in articles:
        title_link = article.select_one("a[data-test='article-title-link']")
        if not title_link:
            log.debug("a[data-test='article-title-link'] ausente em um article; pulando.")
            continue

        link = title_link.get("href") or ""
        if link.startswith("/"):
            link = "https://br.investing.com" + link
        title = (title_link.get_text() or "").strip()

        provider_link = article.select_one("a[data-test='article-provider-link']")
        provider = provider_link.get_text().strip() if provider_link else ""

        published = parse_datetime_from_time_tag(
            article.select_one("time[data-test='article-publish-date']")
        )
        country, language = deduce_country_language_from_url(link)

        record = {
            "id": make_news_id(link, title),
            "datetime": published.isoformat() if published else None,
            "source": provider,
            "headline": title,
            "ticker": ticker,
            "sector": sector,
            "country": country,
            "url": link,
            "language": language,
        }
        records.append(record)
    return records

def guess_last_page(html: str) -> Optional[int]:
    """Return the largest numeric label among the links matched by
    ``div.flex.items-center.gap-2 a``, or ``None`` when no such link has an
    all-digit label."""
    soup = BeautifulSoup(html, "html.parser")
    labels = (
        (link.get_text() or "").strip()
        for link in soup.select("div.flex.items-center.gap-2 a")
    )
    page_numbers = [int(label) for label in labels if label.isdigit()]
    return max(page_numbers, default=None)

# ---------------- Scraper por empresa ----------------
def scrape_company(
    link_news: str,
    ticker_orig: str,
    sector: str,
    company_label: str,
    polite_sleep: float = 0.7,
    max_pages: Optional[int] = DEFAULT_MAX_PAGES,
    save_html_debug: bool = False,
    html_debug_dir: Path = Path("./_html_debug"),
) -> List[Dict]:
    """Collect the news records of one company across its listing pages.

    Page 1 is always fetched. When ``guess_last_page`` finds a last page on
    it, pages 2..last (capped by ``max_pages``) are visited and failed pages
    are skipped. Otherwise pages are visited one by one until ``max_pages``
    is exceeded or three consecutive pages fail or come back without items.

    Args:
        link_news: Listing URL of the company.
        ticker_orig: Ticker stored in every record.
        sector: Sector stored in every record.
        company_label: Label used in log messages and debug file names.
        polite_sleep: Seconds slept before each request after page 1.
        max_pages: Highest page number to visit; a falsy value means no limit.
        save_html_debug: When true, every fetched page is dumped to ``html_debug_dir``.
        html_debug_dir: Directory for the debug dumps.

    Returns:
        The records of all pages, in page order; empty when the URL is blank
        or page 1 could not be loaded.
    """
    base = normalize_news_url(link_news)
    if not base:
        log.warning(f"[{company_label}] Link News vazio.")
        return []

    first_url = page_url(base, 1)
    first_resp = fetch(first_url)
    if not first_resp:
        log.warning(f"[{company_label}] Não foi possível carregar a página 1: {first_url}")
        return []

    if save_html_debug:
        html_debug_dump(first_resp.text, company_label, 1, html_debug_dir)

    collected = parse_news_items(first_resp.text, ticker_orig, sector)
    log.info(f"[{company_label}] p1: {len(collected)} items.")

    last_page = guess_last_page(first_resp.text)

    if last_page is not None:
        # Pagination is known: walk a fixed range, skipping pages that fail.
        final_page = min(last_page, max_pages) if max_pages else last_page
        for page in range(2, final_page + 1):
            time.sleep(polite_sleep)
            resp = fetch(page_url(base, page))
            if not resp:
                log.debug(f"[{company_label}] Falha ao carregar p{page}.")
                continue

            if save_html_debug:
                html_debug_dump(resp.text, company_label, page, html_debug_dir)

            found = parse_news_items(resp.text, ticker_orig, sector)
            log.info(f"[{company_label}] p{page}: {len(found)} items.")
            collected.extend(found)
        return collected

    # Pagination unknown: keep going while pages yield news, giving up after
    # three consecutive misses (failed fetch or page without items).
    consecutive_misses = 0
    page = 2
    while True:
        if max_pages and page > max_pages:
            log.info(f"[{company_label}] max_pages atingido ({max_pages}).")
            break

        time.sleep(polite_sleep)
        resp = fetch(page_url(base, page))
        if resp:
            if save_html_debug:
                html_debug_dump(resp.text, company_label, page, html_debug_dir)

            found = parse_news_items(resp.text, ticker_orig, sector)
            log.info(f"[{company_label}] p{page}: {len(found)} items.")
            if found:
                collected.extend(found)
                consecutive_misses = 0
            else:
                consecutive_misses += 1
                if consecutive_misses >= 3:
                    log.info(f"[{company_label}] 3 páginas sem itens; encerrando.")
                    break
        else:
            consecutive_misses += 1
            log.debug(f"[{company_label}] página {page} falhou ({consecutive_misses} vazias).")
            if consecutive_misses >= 3:
                log.info(f"[{company_label}] 3 páginas vazias seguidas; encerrando.")
                break
        page += 1

    return collected

# ---------------- Execução principal ----------------
def read_excel(excel_path: Path) -> pd.DataFrame:
    """Load the reference workbook and check that it has the expected columns.

    Raises:
        ValueError: When any of "Empresa", "Setor", "Ticker B3",
            "Ticker ADR", "Bolsa (EUA)" or "Link News" is missing.
    """
    frame = pd.read_excel(excel_path)
    # Column layout of the workbook for domestic (B3) assets.
    required = ["Empresa", "Setor", "Ticker B3", "Ticker ADR", "Bolsa (EUA)", "Link News"]
    present = set(frame.columns)
    absent = sorted(name for name in required if name not in present)
    if absent:
        raise ValueError(f"Colunas faltantes no Excel: {absent}")
    return frame

def merge_incremental(df_new: pd.DataFrame, out_parquet: Path) -> pd.DataFrame:
    """Combine ``df_new`` with the rows already stored in ``out_parquet``.

    When the file does not exist, ``df_new`` is returned as is. Otherwise
    the stored rows come first, ``df_new`` is appended, rows repeating an
    earlier ``id`` are dropped and the index is renumbered.
    """
    if not out_parquet.exists():
        return df_new
    stored = pd.read_parquet(out_parquet)
    combined = pd.concat([stored, df_new], ignore_index=True)
    return combined.drop_duplicates(subset=["id"]).reset_index(drop=True)

def sort_by_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` ordered from newest to oldest by its ``datetime`` column.

    Values that cannot be parsed are treated as missing. A frame without a
    ``datetime`` column is returned unchanged. The input frame is never
    modified.
    """
    def _safe_parse_iso(x):
        try:
            return du.parse(x)
        except Exception:
            # Best-effort: a missing or malformed value must not abort the sort.
            return None

    if "datetime" not in df.columns:
        return df
    # assign() works on a copy, so the caller's frame does not gain the
    # temporary sort-key column.
    keyed = df.assign(_dt_sort=df["datetime"].map(_safe_parse_iso))
    return keyed.sort_values("_dt_sort", ascending=False).drop(columns=["_dt_sort"])

def run(
    excel: Path,
    out_parquet: Path,
    only: Optional[str],
    max_pages: Optional[int],
    save_html_debug: bool,
    workers: int,
):
    """Scrape every company listed in the reference workbook and persist the results.

    For each row with a news link, the company's headlines are scraped,
    saved to ``./backup/<label>.parquet`` and merged into ``out_parquet``,
    which is rewritten (sorted newest first) after every company.

    Args:
        excel: Reference workbook, loaded with ``read_excel``.
        out_parquet: Consolidated output file.
        only: Optional case-insensitive literal text; only rows whose
            "Ticker B3" or "Empresa" contains it are processed.
        max_pages: Page limit per company; ``None`` means ``DEFAULT_MAX_PAGES``.
        save_html_debug: Forwarded to ``scrape_company``.
        workers: Accepted but not used by this function.
    """
    df_ref = read_excel(excel)
    if only:
        # regex=False: the filter is a plain substring, so tickers containing
        # regex metacharacters (".", "(", ...) match literally and cannot fail.
        mask = (
            df_ref["Ticker B3"].astype(str).str.contains(only, case=False, na=False, regex=False)
            | df_ref["Empresa"].astype(str).str.contains(only, case=False, na=False, regex=False)
        )
        df_ref = df_ref[mask].copy()
        log.info(f"Filtrando --only '{only}'. {len(df_ref)} linha(s) no Excel após filtro.")

    backup_dir = Path("./backup")
    backup_dir.mkdir(exist_ok=True)
    log.info(f"Diretório de backup individual: {backup_dir.resolve()}")

    for _, row in tqdm(df_ref.iterrows(), total=len(df_ref), desc="Empresas"):
        empresa = str(row["Empresa"]).strip()
        setor = str(row["Setor"]).strip()
        # A blank ticker cell is read as NaN; str() would turn it into the
        # truthy text "nan" and defeat the fallback to the company name below.
        raw_ticker = row["Ticker B3"]
        ticker_orig = "" if pd.isna(raw_ticker) else str(raw_ticker).strip()
        link_news = str(row["Link News"]).strip()

        if not link_news or link_news.lower() == "nan":
            log.warning(f"[{empresa}] Link News vazio; pulando.")
            continue

        label = ticker_orig or empresa
        items = scrape_company(
            link_news=link_news,
            ticker_orig=label,
            sector=setor,
            company_label=label,
            polite_sleep=0.7,
            max_pages=max_pages if max_pages is not None else DEFAULT_MAX_PAGES,
            save_html_debug=save_html_debug,
        )
        if not items:
            log.info(f"[{label}] Nenhuma notícia nova encontrada.")
            continue

        df_company = pd.DataFrame(items).drop_duplicates(subset=["id"]).reset_index(drop=True)

        # Per-company backup, written before touching the consolidated file.
        backup_filename = f"{safe_filename(label)}.parquet"
        backup_filepath = backup_dir / backup_filename
        df_company.to_parquet(backup_filepath, index=False)
        log.info(f"[{label}] Backup individual com {len(df_company)} notícias salvo em: {backup_filepath}")

        df_merged = merge_incremental(df_company, out_parquet)
        df_merged = sort_by_datetime(df_merged)
        df_merged.to_parquet(out_parquet, index=False)
        log.info(f"[{label}] Salvo. O arquivo principal agora tem {len(df_merged):,} linhas em {out_parquet}")

    log.info("Processo concluído.")
    if not out_parquet.exists():
        log.warning("Nenhuma notícia foi coletada e o arquivo de saída não foi criado.")

# ---------------- CLI ----------------
def parse_args():
    """Declare the command-line options and return the parsed namespace.

    ``parse_known_args`` is used, so arguments the parser does not recognise
    are ignored instead of aborting.
    """
    option_table = (
        ("--excel", {"default": DEFAULT_EXCEL, "help": "Caminho do Excel de referência"}),
        ("--out", {"default": DEFAULT_OUT, "help": "Arquivo Parquet de saída"}),
        # The help text names the "Ticker B3" column used by the filter.
        ("--only", {"default": None, "help": "Filtra por Ticker B3 ou Empresa"}),
        ("--max-pages", {"type": int, "default": DEFAULT_MAX_PAGES, "help": "Limite máx. de páginas por ativo"}),
        ("--save-html-debug", {"action": "store_true", "help": "Salva HTML das páginas em ./_html_debug"}),
        ("--workers", {"type": int, "default": 1, "help": "(reservado) Nº de workers em paralelo"}),
        ("--verbose", {"action": "store_true", "help": "Logs detalhados (DEBUG)"}),
        ("--debug", {"action": "store_true", "help": "Equivalente a --verbose"}),
    )
    parser = argparse.ArgumentParser(description="Scraper de notícias do Investing.com (PT-BR)")
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    namespace, _ignored = parser.parse_known_args()
    return namespace

def main():
    """Entry point: parse options, configure logging and run the scraper.

    Any exception raised by ``run`` is logged with its traceback and not
    re-raised.

    Returns:
        ``0`` when ``run`` completed, ``1`` when it raised.
    """
    args = parse_args()
    setup_logging(verbose=args.verbose or args.debug, debug=args.debug)
    excel = Path(args.excel)
    out_parquet = Path(args.out)
    log.info(f"Excel: {excel.resolve()}")
    log.info(f"Saída: {out_parquet.resolve()}")
    if args.only:
        log.info(f"Filtro --only: {args.only}")
    log.info(f"Limite de páginas: {args.max_pages}")
    if args.save_html_debug:
        log.info("Salvar HTML debug: ON")
    try:
        run(
            excel=excel,
            out_parquet=out_parquet,
            only=args.only,
            max_pages=args.max_pages,
            save_html_debug=args.save_html_debug,
            workers=args.workers,
        )
    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it,
        # but let the caller see that the run did not succeed.
        log.exception(f"Falha fatal: {e}")
        return 1
    return 0

# Run the scraper when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

02:32:34 | INFO    | Excel: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/BaseRefAtivosNacionais.xlsx
02:32:34 | INFO    | Saída: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/investing_news_nacionais.parquet
02:32:34 | INFO    | Limite de páginas: 1400
02:32:34 | INFO    | Diretório de backup individual: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/notebooks/backup
Empresas:   0%|          | 0/34 [00:00<?, ?it/s]02:32:36 | INFO    | [VALE3] p1: 10 items.
02:32:38 | INFO    | [VALE3] p2: 10 items.
02:32:40 | INFO    | [VALE3] p3: 10 items.
02:32:42 | INFO    | [VALE3] p4: 10 items.
02:32:44 | INFO    | [VALE3] p5: 10 items.
02:32:45 | INFO    | [VALE3] p6: 10 items.
02:32:47 | INFO    | [VALE3] p7: 10 items.
02:32:50 | INFO    | [VALE3] p8: 10 items.
02:32:52 | INFO    | [VALE3] p9: 10 items.
02:32:54 | INFO    | [VALE3] p10: 10 items.
02:32:55 | INFO    | 

In [19]:
from __future__ import annotations
import re
import time
import uuid
import os
import random
import argparse
from pathlib import Path
from typing import Optional, List, Dict
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser as du
from tqdm import tqdm
import logging

# ---------------- Config & Globals ----------------
# Timezone applied to every parsed publication timestamp.
SP_TZ = ZoneInfo("America/Sao_Paulo")

# Defaults for the --excel, --out and --max-pages command-line options.
DEFAULT_EXCEL = "../data/BaseRefAtivosNacionais.xlsx"
DEFAULT_OUT = "../data/investing_news_nacionais_que_faltaram.parquet"
DEFAULT_MAX_PAGES = 1400

# Browser-like request headers installed on the shared session below.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://br.investing.com/",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
}

# One session reused by every request, so HEADERS is sent each time.
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

# Matches relative ages such as "5 minutos atrás" / "2 horas atras" / "3 dias atrás",
# capturing the amount in "num" and the unit in "unit".
RELATIVE_REGEX = re.compile(
    r"(?P<num>\d+)\s*(?P<unit>min|mins|minutos|minuto|hora|horas|dia|dias)\s*atr[aá]s",
    flags=re.IGNORECASE,
)

# ---------------- Logging ----------------
def setup_logging(verbose: bool = False, debug: bool = False) -> None:
    """Configure root logging for the scraper.

    Args:
        verbose: Log at DEBUG level instead of INFO.
        debug: Treated exactly like ``verbose``.

    ``logging.basicConfig`` does nothing when the root logger already has
    handlers (for example on a second call in the same interpreter), so the
    level is also set explicitly on the root logger; that way a later call
    with a different verbosity still takes effect.
    """
    level = logging.DEBUG if (verbose or debug) else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)-7s | %(message)s",
        datefmt="%H:%M:%S",
    )
    logging.getLogger().setLevel(level)

log = logging.getLogger(__name__)

# ---------------- Utils ----------------
def normalize_news_url(url: str) -> str:
    """Return the base ``...-news`` listing URL, without any page suffix.

    The input is trimmed, trailing slashes and a trailing ``/<number>`` page
    segment are removed, and a URL left ending in ``-new`` is completed to
    ``-news``. An empty or ``None`` input yields ``""``.

    Args:
        url: Listing URL, possibly pointing at a specific page.

    Returns:
        The URL that ``/<page>`` can be appended to.
    """
    url = (url or "").strip()
    if not url:
        return url
    # Strip slashes and the page number first, so the "-new" repair below
    # also applies to inputs such as ".../foo-new/" and ".../foo-new/3".
    url = re.sub(r"/+$", "", url)
    url = re.sub(r"/+\d+$", "", url)
    if url.endswith("-new"):
        url += "s"
    return url

def page_url(url_base: str, page: int) -> str:
    """Return the URL of listing page ``page``: ``url_base`` followed by ``/<page>``."""
    return "{}/{}".format(url_base, page)

def _pt_parserinfo():
    """Return a dateutil ``parserinfo`` that knows Portuguese month names as well as English ones."""
    pt_months = [
        ("janeiro",),
        ("fev", "fevereiro"),
        ("março", "marco"),
        ("abr", "abril"),
        ("mai", "maio"),
        ("junho",),
        ("julho",),
        ("ago", "agosto"),
        ("set", "setembro"),
        ("out", "outubro"),
        ("novembro",),
        ("dez", "dezembro"),
    ]
    months = [tuple(en) + pt for en, pt in zip(du.parserinfo.MONTHS, pt_months)]

    class _PtInfo(du.parserinfo):
        MONTHS = months

    return _PtInfo(dayfirst=True)


def parse_datetime_from_time_tag(time_tag) -> Optional[datetime]:
    """Extract the publication time of an article from its ``<time>`` tag.

    Sources are tried in this order:
      1. relative text matched by ``RELATIVE_REGEX`` (e.g. "5 horas atrás"),
         subtracted from the current time in ``SP_TZ``;
      2. the tag's ``datetime`` attribute;
      3. the tag's visible text, parsed day-first and fuzzily.

    Values parsed without a timezone are taken to be in ``SP_TZ``; results of
    steps 2 and 3 are converted to ``SP_TZ``.

    Args:
        time_tag: Parsed tag object exposing ``get_text()`` and ``get()``, or ``None``.

    Returns:
        A timezone-aware datetime, or ``None`` when the tag is missing or
        nothing could be parsed.
    """
    if time_tag is None:
        return None

    text = (time_tag.get_text() or "").strip().lower()
    m = RELATIVE_REGEX.search(text)
    if m:
        num = int(m.group("num"))
        unit = m.group("unit")
        now_sp = datetime.now(SP_TZ)
        if unit.startswith("min"):
            return now_sp - timedelta(minutes=num)
        if unit.startswith("hora"):
            return now_sp - timedelta(hours=num)
        return now_sp - timedelta(days=num)

    dt_attr = time_tag.get("datetime")
    if dt_attr:
        try:
            dt = du.parse(dt_attr)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=SP_TZ)
            return dt.astimezone(SP_TZ)
        except Exception as e:
            # Best-effort: fall through to the visible text.
            log.debug(f"Falha parse datetime attr '{dt_attr}': {e}")

    if text:
        try:
            # dateutil has no ``languages`` option (passing one makes every call
            # raise TypeError); Portuguese month names are supplied through a
            # custom parserinfo instead.
            dt = du.parse(text, parserinfo=_pt_parserinfo(), dayfirst=True, fuzzy=True)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=SP_TZ)
            return dt.astimezone(SP_TZ)
        except Exception as e:
            log.debug(f"Falha parse texto de data '{text}': {e}")

    return None

def deduce_country_language_from_url(url: str) -> tuple[str, str]:
    """Infer ``(country, language)`` from an article URL.

    URLs containing ``br.investing.com`` give ``("BR", "pt-BR")``; every
    other URL gives two empty strings.
    """
    if "br.investing.com" not in url:
        return ("", "")
    return ("BR", "pt-BR")

def make_news_id(url: str, title: str) -> str:
    """Return a deterministic UUIDv5 string derived from the URL and title.

    Both values are trimmed (``None`` counts as empty) and joined with
    ``"||"`` before hashing in the URL namespace.
    """
    trimmed = [(part or "").strip() for part in (url, title)]
    return str(uuid.uuid5(uuid.NAMESPACE_URL, "||".join(trimmed)))

def html_debug_dump(html: str, company: str, page: int, outdir: Path) -> None:
    """Best-effort dump of a page's raw HTML for debugging.

    The file is written to ``outdir`` as ``<safe company>_p<page, 4 digits>.html``
    in UTF-8. Failures are logged as warnings and never raised.

    Args:
        html: Page source to store.
        company: Label used (after sanitising) as the file-name prefix.
        page: Page number used in the file name.
        outdir: Target directory; created if missing.
    """
    fn = outdir / f"{safe_filename(company)}_p{page:04d}.html"
    try:
        # Directory creation is inside the try as well: an unwritable debug
        # folder must not abort the scrape.
        outdir.mkdir(parents=True, exist_ok=True)
        fn.write_text(html, encoding="utf-8")
    except Exception as e:
        log.warning(f"Falha ao salvar HTML de debug: {e}")
    else:
        log.debug(f"HTML salvo para debug: {fn}")

def safe_filename(s: str) -> str:
    """Return ``s`` trimmed, with each run of characters other than ASCII
    letters, digits, ``.``, ``_`` and ``-`` replaced by a single ``_``."""
    trimmed = s.strip()
    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
    return unsafe_run.sub("_", trimmed)

# ---------------- HTTP com backoff ----------------
def fetch(url: str, max_retries: int = 5, timeout: int = 25) -> Optional[requests.Response]:
    """GET ``url`` through the shared session, retrying transient failures.

    Per attempt:
      * 200 -> the response is returned;
      * 404 / 410 -> ``None`` at once (logged as the probable end of the listing);
      * 429 / 500 / 502 / 503 / 504 and network errors -> retried after a
        linearly growing delay plus random jitter; a numeric ``Retry-After``
        header can lengthen that delay, up to a cap;
      * any other status -> ``None`` at once.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        The 200 response, or ``None`` when the page could not be obtained.
    """
    max_retry_after = 60.0  # upper bound, in seconds, on a server-requested delay

    for attempt in range(1, max_retries + 1):
        try:
            r = SESSION.get(url, timeout=timeout)
        except requests.RequestException as e:
            problem = f"Erro rede: {e}"
            wait = (1.2 * attempt) + random.random()
        else:
            status = r.status_code
            if status == 200:
                return r
            if status in (404, 410):
                log.info(f"HTTP {status} em {url} (provável fim).")
                return None
            if status not in (429, 500, 502, 503, 504):
                log.warning(f"HTTP {status} em {url}; sem retry programado.")
                return None
            problem = f"HTTP {status} em {url}"
            wait = (1.5 * attempt) + random.random()
            # Honour a delay requested by the server (seconds form only).
            retry_after = (r.headers.get("Retry-After") or "").strip()
            if retry_after.isdigit():
                wait = max(wait, min(float(retry_after), max_retry_after))

        if attempt == max_retries:
            # No further attempt follows; sleeping would only delay the failure.
            log.warning(problem)
            break
        log.warning(f"{problem}; retry em {wait:.1f}s")
        time.sleep(wait)

    log.error(f"Falhou após {max_retries} tentativas: {url}")
    return None

# ---------------- Parsing ----------------
def parse_news_items(html: str, ticker: str, sector: str) -> List[Dict]:
    """Extract one record per article from a news-listing page.

    Articles are looked up inside ``ul[data-test='news-list']``; articles
    without a title link are skipped. Site-relative links are prefixed with
    ``https://br.investing.com``.

    Args:
        html: Page source.
        ticker: Value stored in each record's ``ticker`` field.
        sector: Value stored in each record's ``sector`` field.

    Returns:
        A list of dicts with the keys ``id``, ``datetime`` (ISO string or
        ``None``), ``source``, ``headline``, ``ticker``, ``sector``,
        ``country``, ``url`` and ``language``; empty when the list or its
        articles are not found.
    """
    soup = BeautifulSoup(html, "html.parser")
    news_list = soup.find("ul", attrs={"data-test": "news-list"})
    if not news_list:
        log.debug("Ul[data-test='news-list'] não encontrada — página pode ter mudado.")
        return []

    articles = news_list.select("article[data-test='article-item']")
    if not articles:
        log.debug("Nenhum article[data-test='article-item'] encontrado nesta página.")
        return []

    records: List[Dict] = []
    for article in articles:
        title_link = article.select_one("a[data-test='article-title-link']")
        if not title_link:
            log.debug("a[data-test='article-title-link'] ausente em um article; pulando.")
            continue

        link = title_link.get("href") or ""
        if link.startswith("/"):
            link = "https://br.investing.com" + link
        title = (title_link.get_text() or "").strip()

        provider_link = article.select_one("a[data-test='article-provider-link']")
        provider = provider_link.get_text().strip() if provider_link else ""

        published = parse_datetime_from_time_tag(
            article.select_one("time[data-test='article-publish-date']")
        )
        country, language = deduce_country_language_from_url(link)

        record = {
            "id": make_news_id(link, title),
            "datetime": published.isoformat() if published else None,
            "source": provider,
            "headline": title,
            "ticker": ticker,
            "sector": sector,
            "country": country,
            "url": link,
            "language": language,
        }
        records.append(record)
    return records

def guess_last_page(html: str) -> Optional[int]:
    """Return the largest numeric label among the links matched by
    ``div.flex.items-center.gap-2 a``, or ``None`` when no such link has an
    all-digit label."""
    soup = BeautifulSoup(html, "html.parser")
    labels = (
        (link.get_text() or "").strip()
        for link in soup.select("div.flex.items-center.gap-2 a")
    )
    page_numbers = [int(label) for label in labels if label.isdigit()]
    return max(page_numbers, default=None)

# ---------------- Scraper por empresa ----------------
def scrape_company(
    link_news: str,
    ticker_orig: str,
    sector: str,
    company_label: str,
    polite_sleep: float = 0.7,
    max_pages: Optional[int] = DEFAULT_MAX_PAGES,
    save_html_debug: bool = False,
    html_debug_dir: Path = Path("./_html_debug"),
) -> List[Dict]:
    """Collect the news records of one company across its listing pages.

    Page 1 is always fetched. When ``guess_last_page`` finds a last page on
    it, pages 2..last (capped by ``max_pages``) are visited and failed pages
    are skipped. Otherwise pages are visited one by one until ``max_pages``
    is exceeded or three consecutive pages fail or come back without items.

    Args:
        link_news: Listing URL of the company.
        ticker_orig: Ticker stored in every record.
        sector: Sector stored in every record.
        company_label: Label used in log messages and debug file names.
        polite_sleep: Seconds slept before each request after page 1.
        max_pages: Highest page number to visit; a falsy value means no limit.
        save_html_debug: When true, every fetched page is dumped to ``html_debug_dir``.
        html_debug_dir: Directory for the debug dumps.

    Returns:
        The records of all pages, in page order; empty when the URL is blank
        or page 1 could not be loaded.
    """
    base = normalize_news_url(link_news)
    if not base:
        log.warning(f"[{company_label}] Link News vazio.")
        return []

    first_url = page_url(base, 1)
    first_resp = fetch(first_url)
    if not first_resp:
        log.warning(f"[{company_label}] Não foi possível carregar a página 1: {first_url}")
        return []

    if save_html_debug:
        html_debug_dump(first_resp.text, company_label, 1, html_debug_dir)

    collected = parse_news_items(first_resp.text, ticker_orig, sector)
    log.info(f"[{company_label}] p1: {len(collected)} items.")

    last_page = guess_last_page(first_resp.text)

    if last_page is not None:
        # Pagination is known: walk a fixed range, skipping pages that fail.
        final_page = min(last_page, max_pages) if max_pages else last_page
        for page in range(2, final_page + 1):
            time.sleep(polite_sleep)
            resp = fetch(page_url(base, page))
            if not resp:
                log.debug(f"[{company_label}] Falha ao carregar p{page}.")
                continue

            if save_html_debug:
                html_debug_dump(resp.text, company_label, page, html_debug_dir)

            found = parse_news_items(resp.text, ticker_orig, sector)
            log.info(f"[{company_label}] p{page}: {len(found)} items.")
            collected.extend(found)
        return collected

    # Pagination unknown: keep going while pages yield news, giving up after
    # three consecutive misses (failed fetch or page without items).
    consecutive_misses = 0
    page = 2
    while True:
        if max_pages and page > max_pages:
            log.info(f"[{company_label}] max_pages atingido ({max_pages}).")
            break

        time.sleep(polite_sleep)
        resp = fetch(page_url(base, page))
        if resp:
            if save_html_debug:
                html_debug_dump(resp.text, company_label, page, html_debug_dir)

            found = parse_news_items(resp.text, ticker_orig, sector)
            log.info(f"[{company_label}] p{page}: {len(found)} items.")
            if found:
                collected.extend(found)
                consecutive_misses = 0
            else:
                consecutive_misses += 1
                if consecutive_misses >= 3:
                    log.info(f"[{company_label}] 3 páginas sem itens; encerrando.")
                    break
        else:
            consecutive_misses += 1
            log.debug(f"[{company_label}] página {page} falhou ({consecutive_misses} vazias).")
            if consecutive_misses >= 3:
                log.info(f"[{company_label}] 3 páginas vazias seguidas; encerrando.")
                break
        page += 1

    return collected

# ---------------- Execução principal ----------------
def read_excel(excel_path: Path) -> pd.DataFrame:
    """Load the reference workbook and check that it has the expected columns.

    Args:
        excel_path: Location of the reference Excel file.

    Returns:
        The sheet contents exactly as returned by ``pd.read_excel``.

    Raises:
        ValueError: If any required column is absent; the message lists the
            missing column names in sorted order.
    """
    # Column layout used for the national (B3) asset sheet.
    required = frozenset(
        ("Empresa", "Setor", "Ticker B3", "Ticker ADR", "Bolsa (EUA)", "Link News")
    )
    frame = pd.read_excel(excel_path)
    absent = required.difference(frame.columns)
    if absent:
        raise ValueError(f"Colunas faltantes no Excel: {sorted(absent)}")
    return frame

def merge_incremental(df_new: pd.DataFrame, out_parquet: Path) -> pd.DataFrame:
    """Combine freshly scraped rows with the rows already stored on disk.

    Args:
        df_new: Newly collected rows; must contain an ``id`` column when a
            previous output file exists.
        out_parquet: Path of the accumulated Parquet output.

    Returns:
        ``df_new`` itself (unchanged, not copied) when ``out_parquet`` does
        not exist. Otherwise the stored rows followed by the new rows, with
        repeated ``id`` values dropped and the index renumbered from zero.
    """
    if not out_parquet.exists():
        return df_new

    stored = pd.read_parquet(out_parquet)
    combined = pd.concat([stored, df_new], ignore_index=True)
    # Stored rows come first in the concatenation, so on an ``id`` clash the
    # row that was already on disk is the one that is kept.
    return combined.drop_duplicates(subset=["id"]).reset_index(drop=True)

def sort_by_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* ordered from newest to oldest ``datetime``.

    The ``datetime`` column is parsed value by value into a temporary sort
    key. The key is built on a copy, so the caller's DataFrame is never
    modified (no helper column is left behind on the input).

    Args:
        df: Table to sort. Its ``datetime`` column, when present, is expected
            to hold date strings.

    Returns:
        A new DataFrame sorted by the parsed timestamps in descending order,
        with the same columns as the input. When *df* has no ``datetime``
        column, *df* itself is returned untouched.
    """
    def _safe_parse_iso(x):
        # Deliberately best-effort: any value that cannot be parsed becomes
        # None so that a single bad timestamp cannot abort the whole sort.
        try:
            return du.parse(x)
        except Exception:
            return None

    if "datetime" not in df.columns:
        return df

    sort_keys = df["datetime"].map(_safe_parse_iso)
    # ``assign`` works on a copy, keeping the temporary key off the input.
    return (
        df.assign(_dt_sort=sort_keys)
        .sort_values("_dt_sort", ascending=False)
        .drop(columns=["_dt_sort"])
    )

def _cell_text(value) -> str:
    """Return a spreadsheet cell as stripped text, mapping missing values to ``""``."""
    if pd.isna(value):
        return ""
    text = str(value).strip()
    # A cell that was already stringified upstream shows up as the text "nan".
    return "" if text.lower() == "nan" else text


def run(
    excel: Path,
    out_parquet: Path,
    only: Optional[str],
    max_pages: Optional[int],
    save_html_debug: bool,
    workers: int,
):
    """Scrape every company listed in the reference Excel and persist the news.

    For each row of the sheet the company's news pages are scraped, a
    per-company backup Parquet is written under ``./backup`` and the rows are
    merged into ``out_parquet``. The main file is rewritten after every
    company, so an interrupted run keeps what was already collected.

    Args:
        excel: Path of the reference Excel file.
        out_parquet: Path of the accumulated Parquet output.
        only: Optional filter; rows are kept when it is found
            (case-insensitively) in "Ticker B3" or "Empresa". It is passed to
            ``Series.str.contains`` with that method's default matching mode.
        max_pages: Per-company page limit; ``None`` means ``DEFAULT_MAX_PAGES``.
        save_html_debug: Forwarded to ``scrape_company``.
        workers: Accepted for interface compatibility; not used here.

    Raises:
        ValueError: Propagated from ``read_excel`` when columns are missing.
    """
    df_ref = read_excel(excel)
    if only:
        # Match the filter against either the B3 ticker or the company name.
        mask = (
            df_ref["Ticker B3"].astype(str).str.contains(only, case=False, na=False) |
            df_ref["Empresa"].astype(str).str.contains(only, case=False, na=False)
        )
        df_ref = df_ref[mask].copy()
        log.info(f"Filtrando --only '{only}'. {len(df_ref)} linha(s) no Excel após filtro.")

    backup_dir = Path("./backup")
    backup_dir.mkdir(exist_ok=True)
    log.info(f"Diretório de backup individual: {backup_dir.resolve()}")

    # Create the destination folder up front so a long scrape cannot fail at
    # the very first write because the directory is missing.
    out_parquet.parent.mkdir(parents=True, exist_ok=True)

    page_limit = max_pages if max_pages is not None else DEFAULT_MAX_PAGES

    for _, row in tqdm(df_ref.iterrows(), total=len(df_ref), desc="Empresas"):
        empresa = str(row["Empresa"]).strip()
        setor = str(row["Setor"]).strip()
        # Missing cells must become "" (not the text "nan"); otherwise the
        # fallback to the company name below would never trigger.
        ticker_orig = _cell_text(row["Ticker B3"])
        link_news = _cell_text(row["Link News"])

        if not link_news:
            log.warning(f"[{empresa}] Link News vazio; pulando.")
            continue

        # The ticker identifies the company; fall back to its name if absent.
        label = ticker_orig or empresa
        items = scrape_company(
            link_news=link_news,
            ticker_orig=label,
            sector=setor,
            company_label=label,
            polite_sleep=0.7,
            max_pages=page_limit,
            save_html_debug=save_html_debug,
        )
        if not items:
            log.info(f"[{label}] Nenhuma notícia nova encontrada.")
            continue

        df_company = pd.DataFrame(items).drop_duplicates(subset=["id"]).reset_index(drop=True)

        # Per-company backup, written before touching the main file.
        backup_filename = f"{safe_filename(label)}.parquet"
        backup_filepath = backup_dir / backup_filename
        df_company.to_parquet(backup_filepath, index=False)
        log.info(f"[{label}] Backup individual com {len(df_company)} notícias salvo em: {backup_filepath}")

        # Incremental checkpoint: merge with what is on disk and rewrite it.
        df_merged = merge_incremental(df_company, out_parquet)
        df_merged = sort_by_datetime(df_merged)
        df_merged.to_parquet(out_parquet, index=False)
        log.info(f"[{label}] Salvo. O arquivo principal agora tem {len(df_merged):,} linhas em {out_parquet}")

    log.info("Processo concluído.")
    if not out_parquet.exists():
        log.warning("Nenhuma notícia foi coletada e o arquivo de saída não foi criado.")

# ---------------- CLI ----------------
def parse_args():
    """Build the command-line parser and return the recognised options.

    Unrecognised arguments are ignored (``parse_known_args``) instead of
    causing an error.

    Returns:
        The ``argparse.Namespace`` holding the known options.
    """
    parser = argparse.ArgumentParser(description="Scraper de notícias do Investing.com (PT-BR)")

    # (flag, keyword arguments) pairs, registered in this exact order.
    option_specs = (
        ("--excel", {"default": DEFAULT_EXCEL, "help": "Caminho do Excel de referência"}),
        ("--out", {"default": DEFAULT_OUT, "help": "Arquivo Parquet de saída"}),
        ("--only", {"default": None, "help": "Filtra por Ticker B3 ou Empresa"}),
        ("--max-pages", {"type": int, "default": DEFAULT_MAX_PAGES, "help": "Limite máx. de páginas por ativo"}),
        ("--save-html-debug", {"action": "store_true", "help": "Salva HTML das páginas em ./_html_debug"}),
        ("--workers", {"type": int, "default": 1, "help": "(reservado) Nº de workers em paralelo"}),
        ("--verbose", {"action": "store_true", "help": "Logs detalhados (DEBUG)"}),
        ("--debug", {"action": "store_true", "help": "Equivalente a --verbose"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    known, _ignored = parser.parse_known_args()
    return known

def main():
    """Entry point: parse options, configure logging and run the scraper.

    The effective configuration (input, output, filter, page limit, HTML
    debug flag) is logged before the run starts. Any exception raised by
    ``run`` is logged with its traceback and not re-raised, so this function
    always returns normally.
    """
    args = parse_args()
    setup_logging(verbose=args.verbose or args.debug, debug=args.debug)

    excel = Path(args.excel)
    out_parquet = Path(args.out)
    log.info(f"Excel: {excel.resolve()}")
    log.info(f"Saída: {out_parquet.resolve()}")
    if args.only:
        log.info(f"Filtro --only: {args.only}")
    log.info(f"Limite de páginas: {args.max_pages}")
    if args.save_html_debug:
        log.info("Salvar HTML debug: ON")

    try:
        run(
            excel=excel,
            out_parquet=out_parquet,
            only=args.only,
            max_pages=args.max_pages,
            save_html_debug=args.save_html_debug,
            workers=args.workers,
        )
    except Exception as e:
        # Top-level boundary: report the failure instead of propagating it.
        log.exception(f"Falha fatal: {e}")

# Run the scraper only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

17:40:11 | INFO    | Excel: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/BaseRefAtivosNacionais.xlsx
17:40:11 | INFO    | Saída: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/investing_news_nacionais_que_faltaram.parquet
17:40:11 | INFO    | Limite de páginas: 1400
17:40:11 | INFO    | Diretório de backup individual: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/notebooks/backup
Empresas:   0%|          | 0/7 [00:00<?, ?it/s]17:40:13 | INFO    | [SUZB3] p1: 10 items.
17:40:16 | INFO    | [SUZB3] p2: 10 items.
17:40:18 | INFO    | [SUZB3] p3: 10 items.
17:40:20 | INFO    | [SUZB3] p4: 10 items.
17:40:22 | INFO    | [SUZB3] p5: 10 items.
17:40:24 | INFO    | [SUZB3] p6: 10 items.
17:40:26 | INFO    | [SUZB3] p7: 10 items.
17:40:28 | INFO    | [SUZB3] p8: 10 items.
17:40:30 | INFO    | [SUZB3] p9: 10 items.
17:40:32 | INFO    | [SUZB3] p10: 10 items.
17:40:34 

In [3]:
import pandas as pd
# Load the national-assets news Parquet file for inspection.
dados = pd.read_parquet("../data/investing_news_nacionais.parquet")
# Bare expression: the notebook renders the DataFrame as the cell output.
dados

Unnamed: 0,id,datetime,source,headline,ticker,sector,country,url,language
0,d417c9ca-acaf-5ac9-a970-594e8b23c79b,2025-10-17T19:54:29.054784-03:00,Investing.com,CFO da Apple vende ações no valor de US$ 1 milhão,VALE3,Mineração,BR,https://br.investing.com/news/insider-trading-...,pt-BR
1,aac62e8d-9ba7-5a4c-872e-8083150260f4,2025-10-17T17:05:25.602700-03:00,Reuters,Petrobras amplia vendas de petróleo para Índia...,PETR3 / PETR4,Óleo e Gás,BR,https://br.investing.com/news/commodities-news...,pt-BR
2,a1b8d827-9c97-5eca-9d15-c48b496df1de,2025-10-17T14:02:15.632513-03:00,Reuters,João Fukunaga deixa presidência da Previ,BBDC3 / BBDC4,Financeiro,BR,https://br.investing.com/news/stock-market-new...,pt-BR
3,38e8d2d0-3a35-5fa4-a110-84445f1ae950,2025-10-17T13:32:36.446860-03:00,Reuters,Ibovespa sobe e flerta com 143 mil pontos; Pri...,VALE3,Mineração,BR,https://br.investing.com/news/stock-market-new...,pt-BR
4,988120f3-3841-54c7-8f89-7a95c7978b33,2025-10-17T11:54:29.055123-03:00,Investing.com,Apple garante direitos exclusivos de streaming...,VALE3,Mineração,BR,https://br.investing.com/news/stock-market-new...,pt-BR
...,...,...,...,...,...,...,...,...,...
59083,a2e68f61-9668-53f7-8402-945bb287279f,2014-03-25T13:38:00-03:00,Investing.com,Bolsas dos EUA abrem em alta após relatório im...,VALE3,Mineração,BR,https://br.investing.com/news/stock-market-new...,pt-BR
59084,eb76dcb3-d4f0-594b-913c-928215d78cfb,2014-03-25T11:43:00-03:00,Investing.com,Bolsas dos EUA sobem antes de dados; Dow Jones...,VALE3,Mineração,BR,https://br.investing.com/news/stock-market-new...,pt-BR
59085,3dd9b357-c4fe-5b24-a420-5ed910b9327f,1970-01-01T00:00:01-03:00,Money Times,"Itaú BBA eleva em 38,5% o preço-alvo para Maga...",ITUB3 / ITUB4,Financeiro,BR,https://br.investing.com/news/stock-market-new...,pt-BR
59086,cc9d69b3-e437-594c-ad87-d8c91b34cc3e,1970-01-01T00:00:01-03:00,Reuters,Fraqueza em Wall Street dita queda do Ibovespa...,ITUB3 / ITUB4,Financeiro,BR,https://br.investing.com/news/stock-market-new...,pt-BR


In [17]:
# Group by 'ticker' and count the rows of each group ('contagem' column)
agrupado = dados.groupby('ticker').size().reset_index(name='contagem')
agrupado

Unnamed: 0,ticker,contagem
0,ABEV3,722
1,AZUL4,1389
2,B3SA3,6050
3,BBDC3 / BBDC4,2961
4,BRFS3,1427
5,CMIG3 / CMIG4,1400
6,CSAN3,613
7,CSNA3,1010
8,EMBR3,2216
9,GGBR3 / GGBR4,1219


In [9]:
# Take the first 10 rows
sample = dados.head(10)
# Write them to a CSV file, leaving out the index column
sample.to_csv("sample_bdrs.csv", index=False)