In [10]:
import os, sys
from pathlib import Path
from dotenv import load_dotenv

# Pin the kernel's working directory to the repository checkout.
REPO = Path("/Users/chunghyunhan/Projects/agentics").resolve()
os.chdir(REPO)

# Put the in-repo sources at the front of the import path, but only once
# so that re-running the cell does not stack duplicate entries.
SRC = REPO / "src"
if sys.path.count(str(SRC)) == 0:
    sys.path[:0] = [str(SRC)]

# Kept as the cell's last expression so its return value is displayed.
load_dotenv(REPO / ".env")


True

In [11]:
import os, sys
from pathlib import Path
from dotenv import load_dotenv

REPO = Path("/Users/chunghyunhan/Projects/agentics").resolve()
os.chdir(REPO)

from pprint import pprint

# Report which interpreter and working directory this kernel is using.
print("Python:", sys.executable)
print("CWD   :", os.getcwd())

# Force-load .env from the pinned repository root.
load_dotenv(REPO / ".env", override=True)

# Check the most important keys; only presence is shown, never the value.
must = ["OPENAI_API_KEY", "AGENTICS_LLM_PROVIDER"]
pprint(dict((name, "SET" if os.getenv(name) else None) for name in must))

# (Recommended) pin the provider explicitly; falls back to "openai" when the
# variable is unset or empty.
os.environ["AGENTICS_LLM_PROVIDER"] = os.environ.get("AGENTICS_LLM_PROVIDER") or "openai"
print("AGENTICS_LLM_PROVIDER =", os.environ["AGENTICS_LLM_PROVIDER"])


Python: /Users/chunghyunhan/Projects/.venv/bin/python
CWD   : /Users/chunghyunhan/Projects/agentics
{'AGENTICS_LLM_PROVIDER': 'SET', 'OPENAI_API_KEY': 'SET'}
AGENTICS_LLM_PROVIDER = openai


In [15]:
import json, re, subprocess, sys, time, os
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Dict, Any, Tuple, Set

import pandas as pd
from tqdm.auto import tqdm  # pip install tqdm

# ========= Config =========
# Repository checkout holding the input CSV and the decision script.
REPO_ROOT = Path("/Users/chunghyunhan/Projects/agentics").resolve()
CSV_PATH = REPO_ROOT / "dao_finished_proposals_stats.csv"
SCRIPT = REPO_ROOT / "examples" / "agentics_proposal_decision.py"

# DEFAULT_RUN_DIR is where decision_*.json artifacts are looked up after a run;
# OUTPUT_DIR receives the per-proposal copies, failure logs and run logs.
DEFAULT_RUN_DIR = REPO_ROOT / "Decision_runs"
DEFAULT_RUN_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR = REPO_ROOT / "Decision_runs_result_saving"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Performance / robustness knobs
SUBPROC_TIMEOUT_SEC = 600           # kill if a single run hangs > 10 minutes
SLEEP_BETWEEN_RUNS_SEC = 0.5        # spacing to respect MCP/LLM rate limits
CHECKPOINT_EVERY = 25               # write a checkpoint parquet every N runs
RESUME_IF_EXISTS = True             # skip proposals that already have saved_json
FAIL_STOP_AFTER = None              # e.g., set to 10 to abort after 10 failures

# Half-open positional row window [ROW_START, ROW_STOP) of the CSV to process.
# Named here so the batch being run is visible and tunable in one place.
ROW_START = 3064
ROW_STOP = 3065

# ========= Load =========
# .iloc makes the positional intent explicit, and .copy() detaches the window
# from the full frame so later column assignments do not hit a slice view.
df = pd.read_csv(CSV_PATH).iloc[ROW_START:ROW_STOP].copy()
df



Unnamed: 0,space,proposal_id,title,end_iso,num_voters,vp_min,vp_25%,vp_median,vp_75%,vp_max,vp_mean,vp_std,sp_sum
3064,lido-snapshot.eth,0xe2165bbde749b0f0bb7d8c78447eb64e5ff4e700b790...,Proposal to fund the Protocol Guild Pilot via...,2022-04-16T17:00:00+00:00,61,0.1,0.3,2.559323,396.0,20010000.0,1467490.0,4795811.0,89516920.0


In [16]:

# Defensive check: stop immediately, naming the first absent column, when the
# loaded frame lacks a column that the processing code reads.
for col in ("space", "proposal_id"):
    if col in df.columns:
        continue
    raise ValueError(f"Missing required column in CSV: {col!r}")

# ========= Helpers =========
def snapshot_url(space: str, proposal_id: str) -> str:
    """Build the snapshot.org URL of a proposal.

    Args:
        space: Space identifier, inserted verbatim into the URL.
        proposal_id: Proposal identifier, inserted verbatim into the URL.

    Returns:
        The URL string ``https://snapshot.org/#/<space>/proposal/<proposal_id>``.
    """
    url = f"https://snapshot.org/#/{space}/proposal/{proposal_id}"
    return url

# Matches every character that is NOT an ASCII letter, digit, dot,
# underscore or hyphen.
_SANITIZE = re.compile(r"[^A-Za-z0-9._-]")

def sanitize(s: str) -> str:
    """Return ``str(s)`` with each character outside ``[A-Za-z0-9._-]`` replaced by ``_``.

    The input is converted with ``str()`` first, so non-string values are accepted.
    """
    text = str(s)
    return _SANITIZE.sub("_", text)

def derive_filename(space: str, index: int, proposal_id: str) -> Path:
    """Legacy canonical output path, kept for backward compatibility.

    The file name is ``<sanitized space>_<index zero-padded to 4 digits>_<sanitized
    proposal id>.json`` and the file lives directly under ``OUTPUT_DIR``.
    """
    stem = "_".join((sanitize(space), f"{index:04d}", sanitize(proposal_id)))
    return OUTPUT_DIR / (stem + ".json")

def has_saved(space: str, proposal_id: str) -> bool:
    """Tell whether a result file for this (space, proposal) already exists.

    The check ignores the index part of the file name: any file in
    ``OUTPUT_DIR`` matching ``<space>_*_<proposal_id>.json`` (both parts
    sanitized) counts, so reordering or slicing the CSV does not break resume.
    """
    matches = OUTPUT_DIR.glob(f"{sanitize(space)}_*_{sanitize(proposal_id)}.json")
    for _ in matches:
        # The first hit is enough; the remaining matches are never consumed.
        return True
    return False

def list_decision_jsons() -> Dict[str, Path]:
    """Snapshot the ``decision_*.json`` files currently in ``DEFAULT_RUN_DIR``.

    Returns:
        A mapping from file name to its path.
    """
    found: Dict[str, Path] = {}
    for artifact in DEFAULT_RUN_DIR.glob("decision_*.json"):
        found[artifact.name] = artifact
    return found

def detect_new_decision(before: Dict[str, Path]) -> Optional[Path]:
    """Find the decision json that appeared since ``before`` was captured.

    Args:
        before: Name-to-path mapping taken (via ``list_decision_jsons``) prior
            to the run.

    Returns:
        Among files whose name is absent from ``before``, the one with the
        greatest modification time; ``None`` when no new file exists.
    """
    newest: Optional[Path] = None
    newest_mtime = None
    for name, path in list_decision_jsons().items():
        if name in before:
            continue
        mtime = path.stat().st_mtime
        # Strict comparison keeps the earliest-listed file on ties.
        if newest is None or mtime > newest_mtime:
            newest, newest_mtime = path, mtime
    return newest

def run_decision(url: str) -> subprocess.CompletedProcess:
    """Run the interactive decision script once, answering its prompts via stdin.

    Args:
        url: Snapshot proposal URL, sent as the first line of stdin.

    Returns:
        The ``CompletedProcess`` with stdout/stderr captured as text. A non-zero
        exit status is NOT raised (``check=False``); callers inspect ``returncode``.

    Raises:
        subprocess.TimeoutExpired: when the child exceeds ``SUBPROC_TIMEOUT_SEC``.
    """
    # One stdin line per prompt, in the order the script asks them.
    canned_input = "".join((
        f"{url}\n",  # Snapshot Proposal URL>
        "n\n",       # reuse focus areas? -> no
        "\n",        # custom focus (blank)
    ))

    # The child inherits the current environment with <repo>/src prepended to
    # PYTHONPATH. os.pathsep is used instead of a literal ":" so the value is
    # also valid on platforms whose path-list separator differs.
    env = os.environ.copy()
    src_path = str(REPO_ROOT / "src")
    existing = env.get("PYTHONPATH")
    env["PYTHONPATH"] = src_path if not existing else os.pathsep.join((src_path, existing))

    return subprocess.run(
        [sys.executable, str(SCRIPT)],
        input=canned_input,
        text=True,
        capture_output=True,
        cwd=REPO_ROOT,
        env=env,
        check=False,            # don't raise; we inspect returncode
        timeout=SUBPROC_TIMEOUT_SEC,
    )

# Captures the path printed after "Saved:" that points at a decision_*.json
# under a Decision_runs directory. The stem accepts letters as well as digits,
# ":" and "-", because real artifact names end in a "Z" UTC suffix
# (e.g. decision_20251002T213904Z.json) that a digits-plus-"T" class rejects.
_SAVED_REGEX = re.compile(r"Saved:\s*(.*Decision_runs/decision_[0-9A-Za-z:\-]+\.json)")

def extract_saved_path(stdout: str) -> Optional[Path]:
    """Pull the saved decision-json path out of the script's stdout.

    Args:
        stdout: Captured standard output. ``None``/empty is treated as no
            output, and ``bytes`` are decoded as UTF-8 (undecodable bytes are
            replaced) so a partially captured stream can still be scanned.

    Returns:
        The resolved path from the first ``Saved: ...`` match, or ``None`` when
        there is no match. The path is not checked for existence.
    """
    if isinstance(stdout, (bytes, bytearray)):
        stdout = bytes(stdout).decode("utf-8", errors="replace")
    m = _SAVED_REGEX.search(stdout or "")
    return Path(m.group(1)).resolve() if m else None

def summarize_status(rec: Dict[str, Any]) -> str:
    """Collapse a run record into a short status label.

    Checked in this order:
      * ``"skip"``     - the record is flagged ``skipped``;
      * ``"ok"``       - return code is 0 and ``saved_json`` is set;
      * ``"timeout"``  - the record is flagged ``timeout``;
      * ``"fail(rc=<returncode>)"`` - anything else.
    """
    rc = rec.get("returncode")
    if rec.get("skipped"):
        label = "skip"
    elif rc == 0 and rec.get("saved_json"):
        label = "ok"
    elif rec.get("timeout"):
        label = "timeout"
    else:
        label = f"fail(rc={rc})"
    return label

# ========= Pre-compute already-done set (index-independent) =========
# Saved results are named "<space>_<index>_<proposal_id>.json" (see
# derive_filename). The space part is matched greedily so that a sanitized
# space containing "_" is kept whole instead of being cut at its first
# underscore; the numeric index between two underscores anchors the split.
_SAVED_NAME_RE = re.compile(r"^(?P<space>.+)_(?P<index>\d+)_(?P<pid>.+)\.json$")

def _parse_saved_name(name: str) -> Optional[Tuple[str, str]]:
    """Return ``(sanitized space, sanitized proposal id)`` for a saved-result
    file name, or ``None`` when the name does not follow the saved-file layout."""
    m = _SAVED_NAME_RE.match(name)
    return (m.group("space"), m.group("pid")) if m else None

done_pairs: Set[Tuple[str, str]] = set()
if RESUME_IF_EXISTS:
    # Parse existing files in OUTPUT_DIR, e.g. aavedao.eth_0001_0xabcde....json
    for p in OUTPUT_DIR.glob("*.json"):
        pair = _parse_saved_name(p.name)
        # Unparseable names are ignored here; the per-iteration has_saved()
        # check in the main loop still covers them.
        if pair is not None:
            done_pairs.add(pair)

# Filter DF to remaining work (so we don't even iterate finished ones)
if RESUME_IF_EXISTS and len(done_pairs) > 0:
    # Compare with sanitized names to match file naming. The keys are built
    # on the side rather than written into df, so the loaded frame is never
    # mutated (no helper columns leak, no write into a slice of another frame).
    resume_keys = zip(df["space"].map(sanitize), df["proposal_id"].map(sanitize))
    # An explicit boolean Series aligned on df.index stays well-typed even
    # when df has no rows.
    already_done = pd.Series(
        [key in done_pairs for key in resume_keys],
        index=df.index,
        dtype=bool,
    )
    df_remaining = df[~already_done]
else:
    df_remaining = df

print(f"Total rows: {len(df)} | Remaining after resume filter: {len(df_remaining)}")

# ========= Main =========
# Runs the decision script once per remaining proposal, copies the produced
# decision json into OUTPUT_DIR under its canonical name, records one result
# dict per proposal, and writes checkpoint / final parquet logs.

def _as_text(stream) -> str:
    """Normalize captured subprocess output to ``str``.

    A ``subprocess.TimeoutExpired`` can carry ``bytes`` (even when the run was
    started with ``text=True``) or ``None`` in ``stdout``/``stderr``. Converting
    here keeps regex scanning, log writing and the result columns uniformly
    textual.
    """
    if stream is None:
        return ""
    if isinstance(stream, (bytes, bytearray)):
        return bytes(stream).decode("utf-8", errors="replace")
    return str(stream)

results = []
fail_count = 0
start_all = time.time()

progress = tqdm(
    df_remaining.itertuples(index=True, name="Proposal"),
    total=len(df_remaining),
    desc="Processing Snapshot proposals",
    dynamic_ncols=True,
    leave=True,
)

for row in progress:
    idx = int(row.Index)
    space = str(row.space)
    pid = str(row.proposal_id)
    url = snapshot_url(space, pid)

    # Index-independent skip (fast path)
    if RESUME_IF_EXISTS and has_saved(space, pid):
        rec = {
            "index": idx,
            "space": space,
            "proposal_id": pid,
            "snapshot_url": url,
            "returncode": 0,
            "stdout": "",
            "stderr": "",
            "saved_json": str(next(OUTPUT_DIR.glob(f"{sanitize(space)}_*_{sanitize(pid)}.json"))),
            # Present (as None) so skipped and executed records share one schema.
            "saved_json_source": None,
            "skipped": True,
            "timeout": False,
            "started_at": None,
            "ended_at": None,
            "elapsed_sec": 0.0,
        }
        results.append(rec)
        progress.set_postfix({"status": "skip", "space": space, "id": pid[:8]})
        continue

    # For new runs, also compute canonical target (kept for continuity)
    target_path = derive_filename(space, idx, pid)

    # Show what's being processed right now
    progress.set_postfix({"status": "run", "space": space, "id": pid[:8]})
    tqdm.write(f"[{idx}] {space} :: {pid} -> {url}")

    t0 = time.time()
    started_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    proc = None
    timeout_hit = False
    # Snapshot of existing artifacts, used by the directory-diff fallback below.
    before_decisions = list_decision_jsons()
    try:
        proc = run_decision(url)
    except subprocess.TimeoutExpired as te:
        # Keep the exception object: it exposes stdout/stderr but has no
        # returncode, so the record's returncode becomes None.
        timeout_hit = True
        proc = te

    elapsed = time.time() - t0
    ended_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Try to pull the saved path from stdout (normalized to text first, since
    # the timeout path may hand back bytes or None).
    stdout_text = _as_text(getattr(proc, "stdout", ""))
    stderr_text = _as_text(getattr(proc, "stderr", ""))
    saved_path = extract_saved_path(stdout_text)
    saved_origin = "stdout" if saved_path else None

    # Fallback: pick the newest decision json that appeared during the run.
    # Not attempted after a timeout.
    if (not saved_path or not saved_path.exists()) and not timeout_hit:
        fallback_path = detect_new_decision(before_decisions)
        if fallback_path and fallback_path.exists():
            saved_path = fallback_path
            saved_origin = "fallback"
            tqdm.write(f"    Fallback picked {fallback_path.name} from Decision_runs")

    renamed_path = None
    if saved_path and saved_path.exists():
        # Move into our OUTPUT_DIR with canonical name (copy, then best-effort
        # removal of the source; a failed removal is reported, not fatal).
        target_path.write_bytes(saved_path.read_bytes())
        try:
            saved_path.unlink()
        except Exception as exc:
            tqdm.write(f"    Warning: could not remove {saved_path}: {exc}")
        renamed_path = target_path

    rec = {
        "index": idx,
        "space": space,
        "proposal_id": pid,
        "snapshot_url": url,
        "returncode": getattr(proc, "returncode", None),
        "stdout": stdout_text,
        "stderr": stderr_text,
        "saved_json": str(renamed_path) if renamed_path else None,
        "saved_json_source": saved_origin,
        "skipped": False,
        "timeout": timeout_hit,
        "started_at": started_at,
        "ended_at": ended_at,
        "elapsed_sec": round(elapsed, 3),
    }
    results.append(rec)

    status = summarize_status(rec)
    progress.set_postfix({"status": status, "t": f"{elapsed:.1f}s", "space": space, "id": pid[:8]})

    # Only "fail(...)" statuses count toward FAIL_STOP_AFTER and get a log
    # file; a "timeout" status is tallied separately in the final summary.
    if status.startswith("fail"):
        fail_count += 1
        tqdm.write(f"  -> FAIL ({status}). See stderr below:")
        if stderr_text:
            head = "\n".join(stderr_text.splitlines()[:12])
            tqdm.write(head)
        fail_log = OUTPUT_DIR / f"fail_{idx:04d}_{sanitize(space)}_{sanitize(pid)[:8]}.log"
        try:
            fail_log.write_text(
                f"URL: {url}\nReturnCode: {rec['returncode']}\nTimeout: {timeout_hit}\n"
                f"--- STDOUT ---\n{stdout_text}\n\n--- STDERR ---\n{stderr_text}\n",
                encoding="utf-8",
            )
        except Exception as exc:
            # Logging is best-effort; report the problem but keep the batch going.
            tqdm.write(f"    Warning: could not write {fail_log.name}: {exc}")
        if FAIL_STOP_AFTER and fail_count >= FAIL_STOP_AFTER:
            tqdm.write(f"Aborting after {fail_count} failures (FAIL_STOP_AFTER).")
            break

    # checkpoint save
    if len(results) % CHECKPOINT_EVERY == 0:
        ckpt = pd.DataFrame(results)
        ckpt.to_parquet(OUTPUT_DIR / "run_log_ckpt.parquet", index=False)
        tqdm.write(f"[checkpoint] wrote {len(results)} records")

    # spacing to be gentle with MCP/LLM rate limits
    time.sleep(SLEEP_BETWEEN_RUNS_SEC)

# Release the progress bar even when the loop ended early via break.
progress.close()

# Final save
results_df = pd.DataFrame(results)
results_df.to_parquet(OUTPUT_DIR / "run_log.parquet", index=False)

tqdm.write(
    f"Done. {len(results)} processed | "
    f"ok={sum(1 for r in results if summarize_status(r)=='ok')} | "
    f"skip={sum(1 for r in results if r.get('skipped'))} | "
    f"timeout={sum(1 for r in results if r.get('timeout'))} | "
    f"fail={sum(1 for r in results if summarize_status(r).startswith('fail'))}"
)



Total rows: 1 | Remaining after resume filter: 1


Processing Snapshot proposals:   0%|          | 0/1 [00:00<?, ?it/s]

[3064] lido-snapshot.eth :: 0xe2165bbde749b0f0bb7d8c78447eb64e5ff4e700b7905023fdd0eec820ebe5b4 -> https://snapshot.org/#/lido-snapshot.eth/proposal/0xe2165bbde749b0f0bb7d8c78447eb64e5ff4e700b7905023fdd0eec820ebe5b4
    Fallback picked decision_20251002T213904Z.json from Decision_runs
Done. 1 processed | ok=1 | skip=0 | timeout=0 | fail=0
