In [111]:
import os

# === Configuration ===
ROOT_DIR = "/Users/chunghyunhan/Projects/agentics/Decision_runs_result_saving"  # <<-- change to your folder
OUT_CSV  = os.path.join(ROOT_DIR, "decision_runs_consolidated.csv")
OUT_JSONL= os.path.join(ROOT_DIR, "decision_runs_consolidated.jsonl")

# If True, reprocess existing snapshot_urls so schema changes (new columns) are reflected.
UPSERT_ON_SCHEMA_CHANGE = True
print("ROOT_DIR:", ROOT_DIR)


ROOT_DIR: /Users/chunghyunhan/Projects/agentics/Decision_runs_result_saving


In [112]:
# === Helpers ===
import json, os, glob, pandas as pd
import re
from typing import Any, Dict, List, Optional

def safe_get(d: Dict[str, Any], path: List[Any], default=None):
    cur = d
    for p in path:
        if isinstance(cur, dict) and p in cur:
            cur = cur[p]
        else:
            return default
    return cur

def flatten_evidence_quotes(evidence_list):
    if not isinstance(evidence_list, list):
        return ""
    return "
".join([f"- {ev.get(\"quote\").strip()}" for ev in evidence_list if isinstance(ev, dict) and ev.get("quote")])

def collect_similar_ids(sim_list):
    if not isinstance(sim_list, list):
        return ""
    ids = []
    for x in sim_list:
        pid = (x.get("proposal_id") or (x.get("cleaned") or {}).get("proposal_id"))
        if pid:
            ids.append(pid)
    return ",".join(ids)

def build_similar_cleaned_columns(sim_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Flatten ALL 'cleaned' fields from similar_proposals_data with numbered prefixes,
    EXCEPT 'winning_option_index' which we intentionally drop.
    """
    out = {}
    if not isinstance(sim_list, list):
        return out
    for i, item in enumerate(sim_list, start=1):
        cleaned = item.get("cleaned") or {}
        for k, v in cleaned.items():
            if k == "winning_option_index":  # <- DROP this key
                continue
            col = f"similar_proposal_{i}_{k}"
            out[col] = v
    return out

def build_similar_raw_impacts(sim_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Create per-index raw impact columns instead of list columns:
      similar_proposal_{i}_price_impact_pct_raw
      similar_proposal_{i}_tvl_impact_pct_raw
    """
    out = {}
    if not isinstance(sim_list, list):
        return out
    for i, item in enumerate(sim_list, start=1):
        raw = item.get("raw") or {}
        tvl_pct   = (raw.get("tvl_impact") or {}).get("abnormal_change_pct")
        price_pct = (raw.get("price_impact") or {}).get("abnormal_change_pct")
        if price_pct is not None:
            out[f"similar_proposal_{i}_price_impact_pct_raw"] = price_pct
        if tvl_pct is not None:
            out[f"similar_proposal_{i}_tvl_impact_pct_raw"]   = tvl_pct
    return out

def build_similar_market_columns(sim_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    out = {}
    if not isinstance(sim_list, list):
        return out
    for i, item in enumerate(sim_list, start=1):
        for key in ("price_impact_pct", "price_impact_market_pct", "price_impact_market_adjusted_pct", "tvl_impact_pct"):
            value = item.get(key)
            if value is not None:
                out[f"similar_proposal_{i}_{key}"] = value
    return out

def parse_snapshot_url(snapshot_url: str) -> Dict[str, Optional[str]]:
    if not isinstance(snapshot_url, str):
        return {"proposal_id": None, "space": None}
    match = re.search(r"#/([^/]+)/proposal/([^/?#]+)", snapshot_url)
    if match:
        space, proposal_id = match.group(1), match.group(2)
        return {"proposal_id": proposal_id, "space": space}
    return {"proposal_id": None, "space": None}

def extract_record(d: Dict[str, Any]) -> Dict[str, Any]:
    snapshot_url = d.get("snapshot_url") or safe_get(d, ["decision","snapshot_url"], "")
    tmc = d.get("timeline_metrics_current") or {}
    decision = d.get("decision", {})
    simdata  = d.get("similar_proposals_data", [])

    avr = decision.get("actual_vote_result") or {}
    sim_props = decision.get("similar_proposals") or []

    snapshot_parts = parse_snapshot_url(snapshot_url)
    row = {
        "snapshot_url": snapshot_url,
        "proposal_id": snapshot_parts["proposal_id"],
        "space": snapshot_parts["space"],
        "end_iso": decision.get("event_end_utc"),
        "total_votes": tmc.get("total_votes"),
        "vp_by_quartile": json.dumps(tmc.get("vp_by_quartile")) if tmc.get("vp_by_quartile") is not None else None,
        "spike_index": tmc.get("spike_index"),
        "spike_follow_support_ratio": tmc.get("spike_follow_support_ratio"),
        "stairwise_ratio": tmc.get("stairwise_ratio"),
        "half_slope_diff": tmc.get("half_slope_diff"),

        # ⛔️ (변경) 더 이상 list 컬럼을 만들지 않음
        # "similar_raw_tvl_impact_pct_list": ...
        # "similar_raw_price_impact_pct_list": ...

        "selected_choice_label": decision.get("selected_choice_label"),
        "summary": decision.get("summary"),
        "key_arguments_for": " | ".join(decision.get("key_arguments_for") or []),
        "key_arguments_against": " | ".join(decision.get("key_arguments_against") or []),
        "evidence_quotes": flatten_evidence_quotes(decision.get("evidence")),
        "available_choices": ", ".join(decision.get("available_choices") or []) if isinstance(decision.get("available_choices"), list) else decision.get("available_choices"),

        "token_price_impact_pct": decision.get("token_price_impact_pct"),
        "token_price_market_pct": decision.get("token_price_market_pct"),
        "token_price_market_adjusted_pct": decision.get("token_price_market_adjusted_pct"),
        "tvl_impact_pct": decision.get("tvl_impact_pct"),
        "actual_vote_result_winner": avr.get("winner_label"),
        "actual_vote_scores_total": avr.get("scores_total"),
        "actual_vote_margin_pct": avr.get("margin_pct"),

        "decision_stance": decision.get("decision_stance"),
        "ai_final_conclusion": decision.get("ai_final_conclusion"),
        "ai_final_reason": decision.get("ai_final_reason"),
        "similar_proposals": collect_similar_ids(sim_props),
        "ex_post_price_impact_pct": decision.get("ex_post_price_impact_pct"),
        "ex_post_tvl_impact_pct": decision.get("ex_post_tvl_impact_pct"),

        "market_index_impact_pct": d.get("market_index_impact_pct"),
        "market_adjusted_price_impact_pct": d.get("market_adjusted_price_impact_pct"),
        "agentic_ai_choice": d.get("agentic_ai_choice"),
        "actual_outcome": d.get("actual_outcome"),
        "match_result": d.get("match_result"),
        "forum_sentiment_summary": json.dumps(d.get("forum_sentiment_summary")) if d.get("forum_sentiment_summary") else None,
    }

    # (추가) cleaned 필드(일부 키 제외)와 raw per-index 임팩트 병합
    row.update(build_similar_cleaned_columns(simdata))
    row.update(build_similar_raw_impacts(simdata))
    row.update(build_similar_market_columns(simdata))
    return row


SyntaxError: unterminated string literal (detected at line 18) (2342366468.py, line 18)

In [114]:

# === Consolidate and print 'newly added' count; always overwrite outputs ===
# 1) Load existing consolidated to know which snapshot_urls we already have
if os.path.exists(OUT_CSV):
    existing_df = pd.read_csv(OUT_CSV)
    known_urls = set(existing_df.get("snapshot_url", pd.Series(dtype=str)).dropna().astype(str))
else:
    existing_df = pd.DataFrame()
    known_urls = set()

# 2) Read ALL json files currently in the folder
rows = []
for fp in sorted(glob.glob(os.path.join(ROOT_DIR, "*.json"))):
    try:
        with open(fp, "r") as f:
            data = json.load(f)
    except Exception:
        continue
    rows.append(extract_record(data))

new_df = pd.DataFrame(rows)

# 3) Count how many are NEW vs existing, by snapshot_url
file_urls = set(new_df.get("snapshot_url", pd.Series(dtype=str)).dropna().astype(str))
new_urls = file_urls - known_urls
num_new = len(new_urls)

# 4) Upsert (union schema, prefer latest rows) and ALWAYS overwrite outputs
union_cols = sorted(set(existing_df.columns) | set(new_df.columns))
existing_df = existing_df.reindex(columns=union_cols, fill_value=pd.NA)
new_df      = new_df.reindex(columns=union_cols,      fill_value=pd.NA)
final_df = pd.concat([existing_df, new_df], ignore_index=True)

if "snapshot_url" in final_df.columns:
    final_df = final_df.drop_duplicates(subset=["snapshot_url"], keep="last")

# 5) Save (overwrite)
final_df.to_csv(OUT_CSV, index=False)
with open(OUT_JSONL, "w") as f:
    for _, r in final_df.iterrows():
        f.write(json.dumps({k:v for k,v in r.items() if pd.notna(v)}, ensure_ascii=False) + "\n")

print(f"[NEW ADDED THIS RUN]: {num_new}")
print(f"[TOTAL ROWS NOW]    : {len(final_df)}")
print(f"CSV   -> {OUT_CSV}")
print(f"JSONL -> {OUT_JSONL}")

# 6) Optional: small preview
display_cols = [c for c in final_df.columns if c.startswith("similar_proposal_")][:8]
preview = final_df[["snapshot_url"] + display_cols].head(5)
try:
    from caas_jupyter_tools import display_dataframe_to_user
    display_dataframe_to_user("Preview (first 5 rows with similar_proposal_* cols)", preview)
except Exception:
    display(preview)


[NEW ADDED THIS RUN]: 2
[TOTAL ROWS NOW]    : 2
CSV   -> /Users/chunghyunhan/Projects/agentics/Decision_runs_result_saving/decision_runs_consolidated.csv
JSONL -> /Users/chunghyunhan/Projects/agentics/Decision_runs_result_saving/decision_runs_consolidated.jsonl


  final_df = pd.concat([existing_df, new_df], ignore_index=True)


Unnamed: 0,snapshot_url,similar_proposal_1_author,similar_proposal_1_change_stance,similar_proposal_1_end_utc,similar_proposal_1_margin_abs,similar_proposal_1_margin_pct,similar_proposal_1_price_impact_market_adjusted_pct,similar_proposal_1_price_impact_market_pct,similar_proposal_1_price_impact_pct
0,https://snapshot.org/#/aavedao.eth/proposal/0x...,0xF1dF824419879Bb8a7E758173523F88EfB7Af193,To change,2024-10-18T10:01:59Z,681545.386417,0.884058,-0.4164,1.7272,1.3108
1,https://snapshot.org/#/aavedao.eth/proposal/0x...,0xF1dF824419879Bb8a7E758173523F88EfB7Af193,To change,2025-07-26T11:31:29Z,777987.210231,0.999992,0.3538,0.4195,0.7733
