In [1]:
%pip install openai lxml beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from getpass import getpass

# SECURITY: never write an API key into notebook source — it ends up in version
# control and in every shared copy of the file. The key that used to be hardcoded
# in this cell must be treated as compromised and revoked/rotated.
# The key is taken from the environment when present; otherwise it is prompted
# for interactively (input is not echoed and is not stored in the notebook).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")


[REDACTED — API key removed from cell output; rotate this key]


In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import time
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openai import OpenAI

# ------------ KONFIGURACJA ------------

# Input CSV with the article URLs. The default is the original author-local path;
# set the MONEY_INPUT_CSV environment variable to run the notebook on another
# machine without editing the source.
INPUT_CSV = os.environ.get(
    "MONEY_INPUT_CSV",
    "/Users/filipniewczas/studia/cogni/magisterka kod/csv ze stron/money_pl_combined.csv",
)
# Final results file and the temporary progress file used for resuming.
OUTPUT_CSV = "money_llm_extracted.csv"
CHECKPOINT_CSV = "checkpoint_progress.csv"
# Only the first MAX_ARTICLES rows of the input are processed.
MAX_ARTICLES = 800
MODEL_NAME = "gpt-4.1-mini"
# A checkpoint is written every CHECKPOINT_INTERVAL collected results.
CHECKPOINT_INTERVAL = 10

# Browser-like request headers sent with every article download.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "pl-PL,pl;q=0.9,en-US;q=0.7,en;q=0.6",
    "Connection": "keep-alive",
}

# Module-level OpenAI client shared by all LLM calls. No API key is passed
# explicitly, so the SDK is expected to pick it up from the environment
# (main() checks OPENAI_API_KEY before doing any work). timeout=120.0 is the
# per-request timeout handed to the client — presumably seconds, per the SDK.
client = OpenAI(timeout=120.0)

# ------------ CHECKPOINTING ------------

def load_checkpoint():
    if os.path.exists(CHECKPOINT_CSV):
        try:
            df = pd.read_csv(CHECKPOINT_CSV)
            if len(df) > 0:
                last_idx = df["processed_idx"].max()
                print(f"[INFO] Wznawiam od artykułu {last_idx + 1}")
                return last_idx, df.to_dict("records")
        except Exception as e:
            print(f"[WARN] Błąd przy wczytywaniu checkpoint: {e}")
    return -1, []


def save_checkpoint(results, last_idx):
    try:
        df = pd.DataFrame(results)
        df["processed_idx"] = range(len(df))  # żeby load_checkpoint miało kolumnę
        df.to_csv(CHECKPOINT_CSV, index=False)
        print(f"[INFO] Checkpoint zapisany po artykule {last_idx}")
    except Exception as e:
        print(f"[ERROR] Nie mogę zapisać checkpointu: {e}")


# ------------ POBIERANIE HTML ------------

def fetch_html(url: str, timeout: float = 20.0, max_retries: int = 3) -> str | None:
    for attempt in range(max_retries):
        try:
            r = requests.get(url, headers=HEADERS, timeout=timeout)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except Exception as e:
            wait = 2 ** attempt
            print(f"[WARN] Pobieranie {url} nieudane: {e} (retry za {wait}s)")
            time.sleep(wait)
    return None


# ------------ EXTRACTOR MONEY.PL ------------

def extract_article_text_money(html: str) -> str:
    """
    Extract the article body text from a money.pl page.

    Expected structure (on newer articles):
    <main class="wp-main-article">
      <article ...>
        ...
        <div class="wp-content-text-raw x-tts" ...>
          <p>...</p>
        </div>
        ...

    Returns the paragraphs and h2-h4 headings joined with newlines, with
    repeated fragments removed (first occurrence kept). Returns "" for
    missing/very short input or when parsing fails.
    """
    if not html or len(html) < 50:
        return ""

    def class_contains(fragment):
        # Predicate for BeautifulSoup's class_ filter: substring match.
        return lambda value: value and fragment in value

    try:
        soup = BeautifulSoup(html, "html.parser")

        # Strip elements that never carry article text.
        for junk in soup(["script", "style", "noscript", "iframe", "header", "footer", "nav"]):
            junk.decompose()

        # Search roots: the money.pl main container, then the first <article>.
        main_node = soup.find("main", class_=class_contains("wp-main-article"))
        article_node = soup.find("article")
        roots = [node for node in (main_node, article_node) if node]

        # Dedicated content divs hold the actual text.
        containers = [
            div
            for root in roots
            for div in root.find_all("div", class_=class_contains("wp-content-text-raw"))
        ]
        # Without dedicated divs fall back to <article>, then to whatever roots exist.
        if not containers:
            containers = [article_node] if article_node else roots

        skipped_prefixes = ("czytaj także", "zobacz także")
        collected: list[str] = []
        for container in containers:
            for element in container.find_all(["p", "h2", "h3", "h4"]):
                fragment = element.get_text(" ", strip=True)
                lowered = fragment.lower()
                # Drop empty nodes and editorial cross-links.
                if not fragment or lowered.startswith(skipped_prefixes):
                    continue
                # Drop short advertisement labels.
                if "reklama" in lowered and len(lowered) < 80:
                    continue
                collected.append(fragment)

        # Global fallback: every non-empty <p> in the document.
        if not collected:
            collected = [
                paragraph.get_text(" ", strip=True)
                for paragraph in soup.find_all("p")
                if paragraph.get_text(strip=True)
            ]

        # dict keys keep insertion order, giving order-preserving de-duplication.
        return "\n".join(dict.fromkeys(collected))

    except Exception as e:
        print(f"[WARN] extractor error (money.pl): {e}")
        return ""


# ------------ LLM ------------

def call_llm_extract(text: str, url: str) -> dict | None:
    """Ask the LLM to extract macroeconomic forecasts from one article.

    Args:
        text: Plain article text to analyse.
        url: Article URL; included in the prompt and in log messages.

    Returns:
        The parsed JSON object returned by the model, or ``None`` when the
        request fails, the reply is not valid JSON, or the reply is valid JSON
        but not an object (callers use ``.get`` on the result).
    """
    system_prompt = (
        "Jesteś analitykiem ekonomicznym. Ekstrahuj wyłącznie prognozy makroekonomiczne "
        "(inflacja, PKB, bezrobocie, płace, deficyt, dług, stopy procentowe). "
        "Jeżeli brak konkretnych liczb — has_forecast=false."
    )

    user_prompt = f"""
URL: {url}

Tekst artykułu:
\"\"\"{text}\"\"\"

Zwróć JSON w formacie:
{{
  "has_forecast": bool,
  "main_topic": "...",
  "country": "...",
  "forecasts": [
    {{
      "variable": "...",
      "value": ...,
      "unit": "...",
      "horizon": "...",
      "direction": "...",
      "who_forecasts": "...",
      "quote": "..."
    }}
  ]
}}
"""

    try:
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments.
        start = time.perf_counter()
        print(f"[DEBUG] LLM start: {url} (len={len(text)})")
        r = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            response_format={"type": "json_object"},
        )
        dt = time.perf_counter() - start
        print(f"[DEBUG] LLM done: {url} w {dt:.1f}s")

        data = json.loads(r.choices[0].message.content)

    except Exception as e:
        print(f"[ERROR] LLM error dla {url}: {e}")
        return None

    # Honour the declared return type: anything other than a JSON object would
    # crash the caller on data.get(...).
    if not isinstance(data, dict):
        print(f"[ERROR] LLM error dla {url}: odpowiedź nie jest obiektem JSON")
        return None
    return data


# ------------ GŁÓWNA PĘTLA ------------

def _process_article(idx: int, row, total: int) -> dict | None:
    """Fetch, extract and analyse one input row; return its output record or None if skipped."""
    url = row["url"]
    print(f"\n[INFO] ({idx+1}/{total}) Przetwarzam: {url}")

    # Empty CSV cells are read as NaN (a float); skip them instead of spending
    # the full retry/backoff cycle on a request that cannot succeed.
    if not isinstance(url, str) or not url.strip():
        print("[WARN] Pusty lub niepoprawny URL — pomijam")
        return None

    html = fetch_html(url)
    if not html:
        print("[WARN] Brak HTML — pomijam")
        return None

    text = extract_article_text_money(html)
    print(f"[DEBUG] Długość tekstu: {len(text)}")

    # Very short extractions are skipped rather than sent to the LLM.
    if len(text) < 200:
        print("[WARN] Za mało treści — pomijam")
        return None

    data = call_llm_extract(text, url)
    if data is None:
        print("[WARN] LLM nie zwrócił danych — pomijam")
        return None

    return {
        "url": url,
        "query": row.get("query", ""),
        "title_search": row.get("title_search", ""),
        "snippet_search": row.get("snippet_search", ""),
        "google_detected_date": row.get("google_detected_date", ""),
        "has_forecast": data.get("has_forecast"),
        "main_topic": data.get("main_topic"),
        "country": data.get("country"),
        "forecasts_json": json.dumps(data.get("forecasts", []), ensure_ascii=False),
    }


def main():
    """Run the whole pipeline: read URLs, process each article, write OUTPUT_CSV.

    Progress is checkpointed every CHECKPOINT_INTERVAL collected results and
    also when the run is interrupted from the keyboard, so a later run can
    resume. The checkpoint file is removed after a successful full run.
    Exits with status 1 when the API key, the input file, or its ``url``
    column is missing.
    """
    if not os.getenv("OPENAI_API_KEY"):
        print("[ERROR] Brak OPENAI_API_KEY")
        sys.exit(1)

    try:
        df = pd.read_csv(INPUT_CSV)
    except Exception as e:
        print(f"[ERROR] Nie mogę wczytać CSV: {e}")
        sys.exit(1)

    if "url" not in df.columns:
        print("[ERROR] Brak kolumny 'url' w pliku CSV")
        sys.exit(1)

    # reset_index guarantees that idx below is the 0-based row position, which
    # is what the checkpoint's resume index refers to.
    df = df.head(MAX_ARTICLES).reset_index(drop=True)
    total = len(df)
    print(f"[INFO] Wczytano {total} URLi")

    last_idx, results = load_checkpoint()
    start = last_idx + 1
    last_done = last_idx  # last row that was completely handled (kept or skipped)

    try:
        for idx, row in df.iloc[start:].iterrows():
            result = _process_article(idx, row, total)
            if result is not None:
                results.append(result)

                if len(results) % CHECKPOINT_INTERVAL == 0:
                    save_checkpoint(results, idx)

                # Small pause between successfully processed articles.
                time.sleep(0.2)
            last_done = idx
    except KeyboardInterrupt:
        # Do not lose the results gathered since the last periodic checkpoint.
        print("[WARN] Przerwano — zapisuję checkpoint")
        if results:
            save_checkpoint(results, last_done)
        raise

    pd.DataFrame(results).to_csv(OUTPUT_CSV, index=False)
    print(f"[INFO] Zapisano {len(results)} rekordów do {OUTPUT_CSV}")

    if os.path.exists(CHECKPOINT_CSV):
        os.remove(CHECKPOINT_CSV)
        print("[INFO] Checkpoint usunięty")

# Run the pipeline when this code is executed as the main module (not when it
# is imported from elsewhere).
if __name__ == "__main__":
    main()


[INFO] Wczytano 678 URLi

[INFO] (1/678) Przetwarzam: https://www.money.pl/gospodarka/nbp-prognozuje-przyszlosc-tak-ma-sie-zmienic-inflacja-place-bezrobocie-7219230797712256a.html
[DEBUG] Długość tekstu: 4153
[DEBUG] LLM start: https://www.money.pl/gospodarka/nbp-prognozuje-przyszlosc-tak-ma-sie-zmienic-inflacja-place-bezrobocie-7219230797712256a.html (len=4153)
[DEBUG] LLM done: https://www.money.pl/gospodarka/nbp-prognozuje-przyszlosc-tak-ma-sie-zmienic-inflacja-place-bezrobocie-7219230797712256a.html w 11.1s

[INFO] (2/678) Przetwarzam: https://www.money.pl/gospodarka/ten-dokument-zawazyl-na-decyzji-rpp-wiemy-co-pokazuje-nowa-projekcja-inflacji-7218606490475296a.html
[DEBUG] Długość tekstu: 2168
[DEBUG] LLM start: https://www.money.pl/gospodarka/ten-dokument-zawazyl-na-decyzji-rpp-wiemy-co-pokazuje-nowa-projekcja-inflacji-7218606490475296a.html (len=2168)
[DEBUG] LLM done: https://www.money.pl/gospodarka/ten-dokument-zawazyl-na-decyzji-rpp-wiemy-co-pokazuje-nowa-projekcja-inflacji-7