In [5]:
import os
import time
import random
import requests
from datetime import datetime, timezone
from typing import List, Dict, Optional
import pandas as pd
from tqdm import tqdm   # 🔹 tqdm 추가

# ---------- Basic configuration ----------
SNAPSHOT_API = "https://hub.snapshot.org/graphql"  # GraphQL endpoint that gql() posts to
TIMEOUT = 30          # per-request timeout handed to requests (seconds)
BASE_SLEEP  = 0.6     # fixed pause taken by _sleep() before every request (seconds)
MAX_RETRIES = 5       # maximum number of retries gql() makes per call
BACKOFF_BASE = 1.7    # retry delay is BACKOFF_BASE ** retries (seconds)
JITTER = (0.1, 0.35)  # (low, high) bounds of the random extra delay added to sleeps

# One shared session for all requests of this cell, sent with a custom User-Agent.
session = requests.Session()
session.headers.update({"User-Agent": "target-proposals/0.1"})

# ---------- 유틸 ----------
def _sleep():
    """Pause for BASE_SLEEP seconds plus a random extra taken from the JITTER range."""
    low, high = JITTER
    pause = BASE_SLEEP + random.uniform(low, high)
    time.sleep(pause)

def _now_ts() -> int:
    """Return the current UTC time as whole seconds since the Unix epoch."""
    current = datetime.now(tz=timezone.utc)
    return int(current.timestamp())

def _ts_to_iso(ts: int) -> str:
    """Format a Unix timestamp (seconds) as an ISO-8601 string in UTC.

    `ts` is passed through int() first, so numeric strings are accepted too.
    """
    seconds = int(ts)
    moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
    return moment.isoformat()

# ---------- Snapshot GraphQL ----------
# Query for one page of proposals belonging to a single space, ordered by
# "created" in descending order. $first is the page size and $skip the offset;
# fetch_all_proposals() advances $skip to walk through the pages.
PROPOSALS_Q = """
query($space: String!, $first: Int!, $skip: Int!) {
  proposals(
    first: $first
    skip: $skip
    where: { space_in: [$space] }
    orderBy: "created"
    orderDirection: desc
  ) {
    id
    title
    author
    body
    discussion
    start
    end
    state
  }
}
"""

def gql(query: str, variables: Optional[dict] = None) -> dict:
    """POST a GraphQL query to SNAPSHOT_API and return the decoded JSON body.

    Every attempt is preceded by the `_sleep()` pause. Connection-level
    failures and HTTP 429/502/503/504 responses are retried up to MAX_RETRIES
    times, waiting BACKOFF_BASE ** retries seconds plus jitter; a purely
    numeric Retry-After header replaces the backoff delay.

    Args:
        query: GraphQL query text.
        variables: Query variables; an empty dict is sent when omitted.

    Returns:
        The body of the HTTP 200 response parsed as JSON.

    Raises:
        requests.RequestException: When the request still fails after all
            retries have been used.
        requests.HTTPError: For any non-200 response that is not retried.
    """
    payload = {"query": query, "variables": variables or {}}
    retries = 0
    while True:
        _sleep()
        try:
            r = session.post(SNAPSHOT_API, json=payload, timeout=TIMEOUT)
        except requests.RequestException:
            if retries < MAX_RETRIES:
                delay = (BACKOFF_BASE ** retries) + random.uniform(*JITTER)
                time.sleep(delay)
                retries += 1
                continue
            raise
        if r.status_code == 200:
            return r.json()
        if r.status_code in (429, 502, 503, 504) and retries < MAX_RETRIES:
            ra = r.headers.get("Retry-After")
            delay = float(ra) if (ra and ra.isdigit()) else (BACKOFF_BASE ** retries)
            time.sleep(delay + random.uniform(*JITTER))
            retries += 1
            continue
        # raise_for_status() only raises for 4xx/5xx. Any other non-200 status
        # (e.g. 204) used to fall through and repeat the request forever, so
        # fail explicitly instead.
        r.raise_for_status()
        raise requests.HTTPError(
            f"Unexpected HTTP status {r.status_code} from {SNAPSHOT_API}",
            response=r,
        )

def fetch_all_proposals(space: str, batch: int = 100) -> List[dict]:
    """Download the proposals of one Snapshot space, page by page.

    Pages of `batch` proposals are requested with an increasing `skip` offset
    until a page comes back empty, short, or without usable data.

    Args:
        space: Snapshot space id, e.g. "aavedao.eth".
        batch: Page size sent as the query's `first` variable.

    Returns:
        The proposal dicts in the order the API returned them.
    """
    # NOTE(review): the Snapshot hub is believed to cap `first`/`skip`; very
    # large spaces may need cursor-style paging - confirm against the API docs.
    out, skip = [], 0
    while True:
        data = gql(PROPOSALS_Q, {"space": space, "first": batch, "skip": skip})
        # A GraphQL error response may carry "data": null, so a missing key
        # and a null value are both treated as "nothing more to read".
        payload = data.get("data") if isinstance(data, dict) else None
        if not isinstance(payload, dict):
            break
        chunk = payload.get("proposals") or []
        if not chunk:
            break
        out.extend(chunk)
        if len(chunk) < batch:
            # A short page means this was the last one.
            break
        skip += batch
    return out

def finished_only(proposals: List[dict]) -> List[dict]:
    """Keep proposals whose state is "closed" and whose end time is not in the future."""
    cutoff = _now_ts()
    kept = []
    for proposal in proposals:
        if proposal.get("state") != "closed":
            continue
        # A missing or empty "end" counts as 0, i.e. already over.
        if int(proposal.get("end") or 0) > cutoff:
            continue
        kept.append(proposal)
    return kept

def with_discussion_only(proposals: List[dict]) -> List[dict]:
    """Keep only proposals whose Snapshot `discussion` field is non-blank."""
    selected = []
    for proposal in proposals:
        link = proposal.get("discussion") or ""
        if link.strip():
            selected.append(proposal)
    return selected

# ---------- Execution ----------
# Snapshot space ids whose proposals are collected by the loop below.
SPACES = [
    "aavedao.eth",
    "arbitrumfoundation.eth",
    "snapshot.dcl.eth",
    "balancer.eth",
    "cvx.eth",
    "1inch.eth",
    "aurafinance.eth",
    "lido-snapshot.eth",
    "uniswapgovernance.eth",
    "metislayer2.eth",
]

os.makedirs("Target_proposals", exist_ok=True)

# Column order of every per-space CSV. Passing it explicitly means a space
# with no matching proposals still gets a header row, so the file stays
# readable with pandas.read_csv instead of being written empty.
proposal_columns = [
    "space", "id", "title", "author", "discussion",
    "start", "end", "end_iso", "state",
]

summary_rows = []

print("🚀 Snapshot 프로포절 수집 시작...\n")

# Show progress over the spaces with tqdm
for space in tqdm(SPACES, desc="Processing Spaces"):
    # 1) Fetch every proposal of this space
    all_props = fetch_all_proposals(space)

    # 2) Keep only finished (closed) proposals
    finished_props = finished_only(all_props)

    # 3) Keep only proposals that carry a discussion link
    discussion_props = with_discussion_only(finished_props)

    # 4) Write the per-space CSV
    df = pd.DataFrame([{
        "space": space,
        "id": p.get("id"),
        "title": p.get("title"),
        "author": p.get("author"),
        "discussion": p.get("discussion"),
        "start": int(p.get("start") or 0),
        "end": int(p.get("end") or 0),
        "end_iso": _ts_to_iso(p.get("end") or 0),
        "state": p.get("state"),
    } for p in discussion_props], columns=proposal_columns)
    df.to_csv(f"Target_proposals/{space}_with_discussion.csv", index=False)

    # 5) Update the summary; the denominator is the number of finished proposals
    total = len(finished_props)
    count_with_disc = len(discussion_props)
    percent = (count_with_disc / total * 100.0) if total > 0 else 0.0

    summary_rows.append({
        "space": space,
        "total_closed": total,
        "with_discussion": count_with_disc,
        "without_discussion": max(total - count_with_disc, 0),
        "pct_with_discussion": round(percent, 2),
    })

# ---------- Save summary ----------
# Order the spaces by the share of finished proposals that have a discussion link.
summary_df = pd.DataFrame(summary_rows)
summary_df = summary_df.sort_values("pct_with_discussion", ascending=False)
summary_df.to_csv("Target_proposals/summary_with_discussion_by_space.csv", index=False)

print(
    "\n✅ 완료!",
    " - Target_proposals/summary_with_discussion_by_space.csv",
    " - Target_proposals/<space>_with_discussion.csv (per space)",
    sep="\n",
)


🚀 Snapshot 프로포절 수집 시작...



Processing Spaces: 100%|██████████| 10/10 [01:30<00:00,  9.01s/it]


✅ 완료!
 - Target_proposals/summary_with_discussion_by_space.csv
 - Target_proposals/<space>_with_discussion.csv (per space)





In [8]:
import os, glob, json, time, random, re, requests
from urllib.parse import urlparse
from datetime import datetime, timezone
import pandas as pd
from tqdm import tqdm

# -------- HTTP & utilities --------
TIMEOUT = 30          # per-request timeout handed to requests (seconds)
BASE_SLEEP  = 0.6     # fixed pause taken by _sleep() before every request (seconds)
MAX_RETRIES = 5       # maximum number of retries fetch_url() makes per call
BACKOFF_BASE = 1.7    # retry delay is BACKOFF_BASE ** retries (seconds)
JITTER = (0.1, 0.35)  # (low, high) bounds of the random extra delay added to sleeps

# One shared session for all requests of this cell, sent with a custom User-Agent.
session = requests.Session()
session.headers.update({"User-Agent": "target-proposals-save-comments/0.1"})

def _sleep():
    """Wait BASE_SLEEP seconds plus a random amount drawn from the JITTER range."""
    jitter_low, jitter_high = JITTER
    extra = random.uniform(jitter_low, jitter_high)
    time.sleep(BASE_SLEEP + extra)

def fetch_url(url: str) -> requests.Response:
    """GET `url` through the shared session, retrying transient failures.

    Each attempt is preceded by `_sleep()`. Request exceptions and HTTP
    429/502/503/504 responses are retried up to MAX_RETRIES times, waiting
    BACKOFF_BASE ** attempt seconds plus jitter (a purely numeric Retry-After
    header replaces the backoff). Any other response - including a retryable
    status once retries are exhausted - is returned as-is, so the caller must
    check `status_code`. A request exception is re-raised after the last retry.
    """
    retryable = (429, 502, 503, 504)
    attempt = 0
    while True:
        _sleep()
        can_retry = attempt < MAX_RETRIES
        try:
            resp = session.get(url, timeout=TIMEOUT)
        except requests.RequestException:
            if not can_retry:
                raise
            time.sleep((BACKOFF_BASE ** attempt) + random.uniform(*JITTER))
            attempt += 1
            continue
        # 200 is not in `retryable`, so success leaves through this branch too.
        if resp.status_code not in retryable or not can_retry:
            return resp
        hint = resp.headers.get("Retry-After")
        if hint and hint.isdigit():
            wait = float(hint)
        else:
            wait = BACKOFF_BASE ** attempt
        time.sleep(wait + random.uniform(*JITTER))
        attempt += 1

def is_discourse_thread(url: str) -> bool:
    """Heuristically decide whether `url` looks like a forum topic link.

    True only when the value starts with "http" and its URL path contains
    "/t/" (case-insensitive). Anything that cannot be inspected, such as a
    non-string value, yields False.
    """
    try:
        if not url.startswith("http"):
            return False
        path = urlparse(url).path.lower()
        return "/t/" in path
    except Exception:
        return False

def to_ts(iso: str) -> int:
    """Convert an ISO-8601 timestamp string to whole Unix seconds.

    A trailing "Z" is accepted as UTC. Strings without any UTC offset are
    interpreted as UTC, so the result does not depend on the timezone of the
    machine running the code.

    Args:
        iso: Timestamp such as "2023-11-14T22:13:20.000Z".

    Returns:
        Seconds since the Unix epoch, or 0 when `iso` is not a string or
        cannot be parsed.
    """
    try:
        parsed = datetime.fromisoformat(iso.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # Non-string input (None, NaN, ...) or an unparseable string.
        return 0
    if parsed.tzinfo is None:
        # Naive datetimes would otherwise be read in the local timezone.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return int(parsed.timestamp())

def fetch_discourse_thread(url: str, max_pages: int = 10):
    """Download a forum topic as JSON and collect its posts.

    The query string and trailing slash are dropped from `url`, ".json" is
    appended when missing, and pages 1..max_pages are requested through
    `fetch_url` (page 1 without a `page` parameter). Paging stops at the first
    response that is not HTTP 200, is not a JSON object, or has no posts.

    Args:
        url: Topic URL as stored in the proposal's discussion field.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        {"thread": header, "posts": posts}. `header` holds title, slug,
        created_at, posts_count, tags and the original url taken from the
        first page (it stays {} when the first page is unusable); `posts` is
        a list of dicts with the selected fields of each post.
    """
    base = url.split("?")[0].rstrip("/")
    if not base.endswith(".json"):
        base = base + ".json"
    posts, header = [], {}
    for page in range(1, max_pages + 1):
        # Append the page parameter rather than str.replace(), which would
        # rewrite every ".json" occurring anywhere in the URL.
        u = base if page == 1 else f"{base}?page={page}"
        rr = fetch_url(u)
        if rr.status_code != 200:
            break
        try:
            j = rr.json()
        except ValueError:
            # A 200 answer with a non-JSON body (e.g. an HTML page from a
            # site that merely has "/t/" in its path) is not a usable thread.
            break
        if not isinstance(j, dict):
            break
        if page == 1:
            header = {
                "title": j.get("title"),
                "slug": j.get("slug"),
                "created_at": j.get("created_at"),
                "posts_count": j.get("posts_count"),
                "tags": j.get("tags"),
                "url": url,
            }
        # "post_stream" or "posts" may be absent or null.
        chunk = (j.get("post_stream") or {}).get("posts") or []
        if not chunk:
            break
        posts.extend(
            {
                "id": p.get("id"),
                "username": p.get("username"),
                "user_id": p.get("user_id"),
                "created_at": p.get("created_at"),
                "updated_at": p.get("updated_at"),
                "raw": p.get("raw"),
                "cooked": p.get("cooked"),
                "post_number": p.get("post_number"),
                "reply_to_post_number": p.get("reply_to_post_number"),
            }
            for p in chunk
        )
    return {"thread": header, "posts": posts}

# -------- Load inputs & sample --------
os.makedirs("Target_proposals", exist_ok=True)
comments_dir = "Target_proposals/comments"
os.makedirs(comments_dir, exist_ok=True)

files = sorted(glob.glob("Target_proposals/*_with_discussion.csv"))
if not files:
    raise FileNotFoundError("Target_proposals/*_with_discussion.csv 가 없습니다. 먼저 생성 코드를 실행하세요.")

# Columns every per-space CSV must provide to be usable here.
needed = {"space","id","title","author","discussion","start","end","end_iso","state"}

dfs = []
for f in files:
    # Loading stays best-effort (a bad file must not stop the run), but every
    # skipped file is reported instead of being dropped silently.
    try:
        df = pd.read_csv(f)
    except Exception as exc:
        print(f"⚠️ skipped {f}: could not be read ({exc})")
        continue
    missing = needed - set(df.columns)
    if missing:
        print(f"⚠️ skipped {f}: missing columns {sorted(missing)}")
        continue
    dfs.append(df)

if not dfs:
    raise RuntimeError("읽을 수 있는 with_discussion CSV가 없습니다.")

all_df = pd.concat(dfs, ignore_index=True)
if all_df.empty:
    raise RuntimeError("with_discussion 행이 비어 있습니다.")

# Fixed random_state keeps the sample reproducible between runs.
sample_n = min(10, len(all_df))
sample_df = all_df.sample(n=sample_n, random_state=42).reset_index(drop=True)

# -------- Output setup: combined files --------
combined_csv_path = os.path.join(comments_dir, "comments_all.csv")
combined_jsonl_path = os.path.join(comments_dir, "comments_all.jsonl")
skip_log_path = os.path.join(comments_dir, "skipped_non_discourse.csv")

# Column layouts, shared by the header-only files written up front and by the
# per-proposal CSVs so that even an empty result keeps its header row.
comment_columns = [
    "space","proposal_id","proposal_title","discussion_url",
    "post_id","post_number","reply_to_post_number",
    "author_username","created_at","created_ts","text_raw","text_html"
]
skip_columns = ["space","proposal_id","proposal_title","discussion_url","reason"]

# Reset the combined outputs so results of an earlier run never linger.
pd.DataFrame(columns=comment_columns).to_csv(combined_csv_path, index=False)
open(combined_jsonl_path, "w", encoding="utf-8").close()
pd.DataFrame(columns=skip_columns).to_csv(skip_log_path, index=False)

print(f"📝 샘플 {sample_n}개 댓글 저장 시작...\n")

combined_rows = []
skipped_rows = []

for _, row in tqdm(sample_df.iterrows(), total=sample_n, desc="Saving comments"):
    url = str(row["discussion"]).strip()
    space = row["space"]; pid = row["id"]; title = row["title"]
    end_raw = row.get("end", 0)
    end_ts = int(end_raw) if pd.notna(end_raw) else 0

    if not is_discourse_thread(url):
        skipped_rows.append({
            "space": space, "proposal_id": pid, "proposal_title": title,
            "discussion_url": url, "reason": "non-discourse (no structured comments)"
        })
        continue

    # One unreachable or malformed thread must not abort the whole run and
    # lose the comments already collected; record it in the skip log instead.
    try:
        data = fetch_discourse_thread(url, max_pages=10)
    except (requests.RequestException, ValueError) as exc:
        skipped_rows.append({
            "space": space, "proposal_id": pid, "proposal_title": title,
            "discussion_url": url, "reason": f"fetch failed: {exc}"
        })
        continue

    posts = data.get("posts", [])
    # Keep only comments written no later than the proposal's end
    if end_ts:
        posts = [p for p in posts if p.get("created_at") and to_ts(p["created_at"]) <= end_ts]

    # Build each row once; it feeds both the CSV outputs and the JSONL "meta".
    rows = [{
        "space": space,
        "proposal_id": pid,
        "proposal_title": title,
        "discussion_url": url,
        "post_id": p.get("id"),
        "post_number": p.get("post_number"),
        "reply_to_post_number": p.get("reply_to_post_number"),
        "author_username": p.get("username"),
        "created_at": p.get("created_at"),
        "created_ts": to_ts(p.get("created_at") or ""),
        "text_raw": p.get("raw"),
        "text_html": p.get("cooked"),
    } for p in posts]

    # ---- Per-proposal CSV ----
    per_path = os.path.join(comments_dir, f"{space}__{pid}__comments.csv")
    per_df = pd.DataFrame(rows, columns=comment_columns)
    per_df.to_csv(per_path, index=False)

    # ---- Append to the combined CSV rows / JSONL file ----
    if rows:
        combined_rows.extend(per_df.to_dict(orient="records"))
        with open(combined_jsonl_path, "a", encoding="utf-8") as wf:
            for meta in rows:
                record = {
                    # Prefer the raw text, fall back to the rendered HTML.
                    "text": meta["text_raw"] or meta["text_html"] or "",
                    "meta": meta,
                }
                wf.write(json.dumps(record, ensure_ascii=False) + "\n")

# Write the combined CSV
if combined_rows:
    pd.DataFrame(combined_rows).to_csv(combined_csv_path, mode="w", index=False)

# Write the skip log
if skipped_rows:
    pd.DataFrame(skipped_rows).to_csv(skip_log_path, mode="w", index=False)

print("\n✅ 완료!")
print(" - 프로포절별 댓글 CSV: Target_proposals/comments/<space>__<proposal_id>__comments.csv")
print(f" - 통합 CSV: {combined_csv_path}")
print(f" - 통합 JSONL(LLM 친화): {combined_jsonl_path}")
print(f" - 스킵 로그(비-디스코스): {skip_log_path}")


📝 샘플 10개 댓글 저장 시작...



Saving comments: 100%|██████████| 10/10 [00:25<00:00,  2.56s/it]


✅ 완료!
 - 프로포절별 댓글 CSV: Target_proposals/comments/<space>__<proposal_id>__comments.csv
 - 통합 CSV: Target_proposals/comments/comments_all.csv
 - 통합 JSONL(LLM 친화): Target_proposals/comments/comments_all.jsonl
 - 스킵 로그(비-디스코스): Target_proposals/comments/skipped_non_discourse.csv



