In [None]:
%pip install openai lxml beautifulsoup4

In [1]:
import os
os.environ["OPENAI_API_KEY"] = "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"


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


In [8]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extractor dla BusinessInsider (businessinsider.com.pl) – analogiczny do wersji
obserwatora, ale z własnymi selektorami HTML.
"""
import os, sys, time, json, requests, pandas as pd
from bs4 import BeautifulSoup
from openai import OpenAI

# ------------ KONFIGURACJA ------------
INPUT_CSV = "full_businessinsider_com_pl.csv"
OUTPUT_CSV = "businessinsider_llm_extracted.csv"
CHECKPOINT_CSV = "checkpoint_progress_businessinsider.csv"
MAX_ARTICLES = 800
MODEL_NAME = "gpt-4.1-mini"
CHECKPOINT_INTERVAL = 10

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "pl-PL,pl;q=0.9,en-US;q=0.7,en;q=0.6",
    "Connection": "keep-alive",
}

client = OpenAI(timeout=60.0)

# ------------ CHECKPOINTING ------------
def load_checkpoint():
    if os.path.exists(CHECKPOINT_CSV):
        try:
            checkpoint_df = pd.read_csv(CHECKPOINT_CSV)
            if len(checkpoint_df) > 0:
                last_idx = checkpoint_df["processed_idx"].max()
                print(f"[INFO] Wznawiam od artykułu {last_idx + 1}")
                return last_idx, checkpoint_df.to_dict("records")
        except Exception as e:
            print(f"[WARN] Błąd przy wczytywaniu checkpoint: {e}")
    return -1, []

def save_checkpoint(results, last_processed_idx):
    try:
        checkpoint_data = []
        for i, result in enumerate(results):
            r = result.copy()
            r["processed_idx"] = i
            checkpoint_data.append(r)
        pd.DataFrame(checkpoint_data).to_csv(CHECKPOINT_CSV, index=False)
        print(f"[INFO] Checkpoint zapisany po artykule {last_processed_idx}")
    except Exception as e:
        print(f"[ERROR] Błąd przy zapisywaniu checkpoint: {e}")

# ------------ POBIERANIE HTML ------------
def fetch_once(url: str, timeout: float = 20.0) -> str | None:
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return resp.text
    except requests.RequestException as e:
        print(f"[ERROR] Błąd pobierania {url}: {e}", file=sys.stderr)
        return None

def fetch_html(url: str, timeout: float = 20.0, max_retries: int = 3) -> str | None:
    html = None
    for attempt in range(max_retries):
        html = fetch_once(url, timeout=timeout)
        if html is not None:
            break
        wait = 2 ** attempt
        print(f"[WARN] Retry {attempt+1}/{max_retries} za {wait}s dla {url}")
        time.sleep(wait)
    return html

# ------------ EKSTRAKCJA TREŚCI ------------
def extract_article_text_businessinsider(html: str) -> str:
    """
    Próbuje złapać treść z układu BusinessInsider. Szuka kontenerów article /
    article_content / grid, a w razie czego zbiera paragrafy.
    """
    if not html or len(html) < 50:
        return ""
    try:
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "noscript", "iframe", "header", "footer", "nav"]):
            tag.decompose()

        # najpierw <article>
        article = soup.find("article")
        candidates = []
        if article:
            candidates.append(article)

        # typowe kontenery body
        for cls in [
            "article_content",
            "article__content",
            "article-body",
            "content-container",
            "all-columns-wide",
            "article",
        ]:
            div = soup.find("div", class_=lambda c: c and cls in c)
            if div:
                candidates.append(div)

        text_parts = []
        for node in candidates:
            parts = [p.get_text(" ", strip=True) for p in node.find_all("p") if p.get_text(strip=True)]
            if parts:
                text_parts.extend(parts)
        if not text_parts:
            text_parts = [p.get_text(" ", strip=True) for p in soup.find_all("p") if p.get_text(strip=True)]

text = "\n".join(text_parts)
        return text
    except Exception as e:
        print(f"[WARN] extract_article_text_businessinsider error: {e}")
        return ""

# ------------ WYWOŁANIE LLM ------------
def call_llm_extract(article_text: str, url: str) -> dict | None:
    system_prompt = (
        "Jesteś analitykiem ekonomicznym. "
        "Dostajesz artykuł prasowy i masz z niego wyciągnąć wyłącznie konkretne prognozy makroekonomiczne "
        "(inflacja, PKB, stopy procentowe, bezrobocie, wynagrodzenia, kurs walut, deficyt, dług publiczny). "
        "Zwracaj szczególną uwagę na rozróżnienie między deficytem a długiem publicznym. "
        "Jeżeli nie ma prognoz liczbowych — has_forecast=false."
    )

    user_prompt = f"""
URL artykułu: {url}

Pełny tekst artykułu:
"""{article_text}"""

Zwróć odpowiedź jako JSON w schemacie:
{{
  "has_forecast": bool,
  "main_topic": "...",
  "country": "...",
  "forecasts": [
    {{
      "variable": "...",
      "value": ...,
      "unit": "...",
      "horizon": "...",
      "direction": "...",
      "who_forecasts": "...",
      "quote": "..."
    }}
  ]
}}
"""

    try:
        start = time.time()
        print(f"[DEBUG] LLM start dla {url} (len_tekstu={len(article_text)})")
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format={"type": "json_object"},
        )
        elapsed = time.time() - start
        print(f"[DEBUG] LLM done dla {url} w {elapsed:.1f}s")
        return json.loads(response.choices[0].message.content)
    except Exception as e:
        print(f"[ERROR] Błąd wywołania OpenAI dla {url}: {e}", file=sys.stderr)
        return None

# ------------ GŁÓWNA PĘTLA ------------
def main():
    if not os.getenv("OPENAI_API_KEY"):
        print("[ERROR] Brak zmiennej środowiskowej OPENAI_API_KEY.", file=sys.stderr)
        sys.exit(1)

    try:
        df = pd.read_csv(INPUT_CSV)
    except Exception as e:
        print(f"[ERROR] Nie udało się wczytać {INPUT_CSV}: {e}", file=sys.stderr)
        sys.exit(1)

    if "url" not in df.columns:
        print("[ERROR] Brak kolumny 'url' w CSV.", file=sys.stderr)
        sys.exit(1)

    print(f"[INFO] Wczytano {len(df)} rekordów.")
    df_subset = df.head(MAX_ARTICLES).copy()

    last_processed_idx, results = load_checkpoint()
    start_idx = last_processed_idx + 1

    for idx, row in df_subset.iterrows():
        if idx < start_idx:
            continue
        url = row["url"]
        print(f"
[INFO] ({idx+1}/{len(df_subset)}) Przetwarzam: {url}")

        html = fetch_html(url)
        if not html:
            print("[WARN] Brak HTML — pomijam.")
            continue

        article_text = extract_article_text_businessinsider(html)
        print(f"[DEBUG] Długość tekstu: {len(article_text)}")
        if len(article_text) < 200:
            print(f"[WARN] Za mało tekstu ({len(article_text)}), pomijam.")
            continue

        llm_data = call_llm_extract(article_text, url)
        if llm_data is None:
            print("[WARN] LLM zwrócił błąd lub timeout — pomijam.")
            continue

        result_row = {
            "url": url,
            "query": row.get("query", ""),
            "title_search": row.get("title_search", ""),
            "snippet_search": row.get("snippet_search", ""),
            "google_detected_date": row.get("google_detected_date", ""),
            "has_forecast": llm_data.get("has_forecast"),
            "main_topic": llm_data.get("main_topic"),
            "country": llm_data.get("country"),
            "forecasts_json": json.dumps(llm_data.get("forecasts", []), ensure_ascii=False),
        }
        results.append(result_row)

        if len(results) % CHECKPOINT_INTERVAL == 0:
            save_checkpoint(results, idx)
        time.sleep(0.2)

    if not results:
        print("[WARN] Brak wyników — nic do zapisania.")
        return

    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print(f"
[INFO] Zapisano {len(results)} rezultatów do {OUTPUT_CSV}")

    if os.path.exists(CHECKPOINT_CSV):
        os.remove(CHECKPOINT_CSV)
        print("[INFO] Usunięto checkpoint.")

if __name__ == "__main__":
    main()


IndentationError: unindent does not match any outer indentation level (<string>, line 121)