# SiteSage Evaluation & Rubric Revision

This notebook orchestrates three stages:
1. Load Dianping collection data to compute ground-truth (GT) ratios for identical brands operating at different addresses.
2. Run the SiteSage agent flow once per location (per brand) and reuse those cached outputs to build comparison pairs.
3. Re-score each pair with the latest rubric (skipping aligned cases) and run `rubric_revision`—with a lightweight test harness that reuses stored sessions—to iteratively adjust the rubric.

Tweak the configuration cells as needed before running each stage.

In [25]:
from __future__ import annotations

import csv
import hashlib
import json
import re
from dataclasses import dataclass
from itertools import combinations
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

import dotenv
from openai import OpenAI

dotenv.load_dotenv()

from sitesage_backend import run_sitesage_session, parse_json_from_text

DATA_PATH = Path("data/dianping_collection_data.csv")
PROMPT_TEMPLATE = (
    "I want to open a boutique coffee shop optimized for morning commuters at {}. "
    "Please run the full SiteSage workflow and deliver the final report."
)
SESSION_LANGUAGE = "zh"
CACHE_DIR = Path("save/evaluate_cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

RATIO_THRESHOLD = 0.1

session_results: Dict[Tuple[str, str], Dict[str, object]] = {}
location_index: Dict[str, 'LocationStat'] = {}
evaluated_pairs: List[Dict[str, object]] = []
test_pairs: List[Dict[str, object]] = []
SCORING_CACHE: Dict[Tuple[str, str], Dict[str, float]] = {}

def _fmt(value):
    return f"{value:.2f}" if isinstance(value, (int, float)) else "n/a"


In [26]:
@dataclass(frozen=True)
class LocationStat:
    brand: str
    store_name: str
    address: str
    total_reviews: float
    avg_reviews_per_day: float
    sample_days: int


def parse_brand_name(store_name: str) -> str:
    clean_name = (store_name or "").strip()
    if "(" in clean_name:
        return clean_name.split("(", 1)[0].strip()
    if "（" in clean_name:
        return clean_name.split("（", 1)[0].strip()
    return clean_name or "unknown"


def load_location_stats(csv_path: Path) -> List[LocationStat]:
    stats: Dict[Tuple[str, str, str], Dict[str, float]] = {}
    with csv_path.open(newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        for row in reader:
            store_name = (row["store"] or "").strip()
            brand = parse_brand_name(store_name)
            address = (row["address"] or "").strip()
            review_cnt = int(row["review_cnt"])
            key = (brand, store_name, address)
            entry = stats.setdefault(key, {"total": 0, "days": 0})
            entry["total"] += review_cnt
            entry["days"] += 1
    locations: List[LocationStat] = []
    for (brand, store_name, address), values in stats.items():
        days = max(int(values["days"]), 1)
        total = float(values["total"])
        locations.append(
            LocationStat(
                brand=brand,
                store_name=store_name,
                address=address,
                total_reviews=total,
                avg_reviews_per_day=total / days,
                sample_days=days,
            )
        )
    locations.sort(key=lambda loc: (loc.brand, -loc.total_reviews))
    return locations


def group_locations_by_brand(locations: Iterable[LocationStat]) -> Dict[str, List[LocationStat]]:
    groups: Dict[str, List[LocationStat]] = {}
    for loc in locations:
        groups.setdefault(loc.brand, []).append(loc)
    for locs in groups.values():
        locs.sort(key=lambda loc: -loc.total_reviews)
    return groups


def build_location_pairs(groups: Dict[str, List[LocationStat]]) -> List[Dict[str, object]]:
    pairs: List[Dict[str, object]] = []
    for brand, locs in groups.items():
        if len(locs) < 2:
            continue
        for i in range(len(locs)):
            for j in range(i + 1, len(locs)):
                loc_a = locs[i]
                loc_b = locs[j]
                ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
                pairs.append(
                    {
                        "brand": brand,
                        "location_a": loc_a,
                        "location_b": loc_b,
                        "gt_ratio": ratio,
                        "gt_preference": "A" if ratio >= 1 else "B",
                        "gt_difference": loc_a.total_reviews - loc_b.total_reviews,
                    }
                )
    return pairs


def load_initial_rubric_text() -> str:
    rubric_dir = Path("rubrics")
    sections: List[str] = []
    for name in ("customer_rubric.md", "traffic_rubric.md", "competition_rubric.md"):
        path = rubric_dir / name
        if path.exists():
            sections.append(path.read_text(encoding="utf-8"))
    return "\n\n".join(sections).strip()


def slugify(value: str) -> str:
    slug = re.sub(r"[^a-zA-Z0-9]+", "-", value).strip("-").lower()
    return slug or "location"


def build_session_id(location: LocationStat) -> str:
    brand_slug = slugify(location.brand)
    address_slug = slugify(location.address)[:40]
    return f"eval-{brand_slug}-{address_slug}"


def load_final_report_text(payload: Dict[str, object]) -> str:
    final_report = payload.get("final_report", {}) or {}
    path_str = final_report.get("report_path") if isinstance(final_report, dict) else None
    if isinstance(path_str, str) and path_str:
        path = Path(path_str)
        if path.exists():
            return path.read_text(encoding="utf-8")
    if isinstance(final_report, dict):
        return final_report.get("report_md", "")
    return ""


def extract_report_sections(payload: Dict[str, object]) -> Dict[str, str]:
    sections: Dict[str, str] = {}
    raw_reports = payload.get("reports") or {}
    for key in ("customer", "traffic", "competition"):
        value = raw_reports.get(key) if isinstance(raw_reports, dict) else None
        if isinstance(value, str):
            sections[key] = value
        elif isinstance(value, dict):
            sections[key] = value.get("report_md", "")
        else:
            sections[key] = ""
    return sections


def run_session_for_location(location: LocationStat) -> Dict[str, object]:
    session_id = build_session_id(location)
    cache_path = CACHE_DIR / f"{session_id}.json"
    prompt = PROMPT_TEMPLATE.format(location.address)
    if cache_path.exists():
        payload = json.loads(cache_path.read_text(encoding="utf-8"))
    else:
        payload = run_sitesage_session(session_id, prompt, language=SESSION_LANGUAGE)
        cache_path.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8"
        )
    report_md = load_final_report_text(payload)
    report_sections = extract_report_sections(payload)
    return {
        "session_id": session_id,
        "prompt": prompt,
        "final_score": payload.get("final_score"),
        "scores": payload.get("scores", {}),
        "report_md": report_md,
        "report_sections": report_sections,
        "raw": payload,
    }


In [27]:
locations = load_location_stats(DATA_PATH)
brand_groups = group_locations_by_brand(locations)
location_index = {loc.address: loc for loc in locations}
location_pairs = build_location_pairs(brand_groups)

print(f"Loaded {len(locations)} store/location combinations across {len(brand_groups)} brands.")
print(f"Generated {len(location_pairs)} same-brand comparison pairs.")
print("\nPair catalog (first 10 shown):")
for idx, pair in enumerate(location_pairs[:10]):
    print(
        f"{idx:02d}: [{pair['brand']}] {pair['location_a'].store_name} vs {pair['location_b'].store_name} -> "
        f"GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
    )


Loaded 13 store/location combinations across 4 brands.
Generated 21 same-brand comparison pairs.

Pair catalog (first 10 shown):
00: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(比斯特上海购物村店) -> GT ratio 1.22 (prefers A)
01: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(武康路店) -> GT ratio 1.98 (prefers A)
02: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 2.11 (prefers A)
03: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(建国西路店) -> GT ratio 5.16 (prefers A)
04: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(武康路店) -> GT ratio 1.62 (prefers A)
05: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 1.73 (prefers A)
06: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(建国西路店) -> GT ratio 4.23 (prefers A)
07: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(武康路店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 1.07 (prefers A)
08: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(武康路店) 

In [30]:
# Run SiteSage for every location under the selected brand once (cached after first run).
target_brand = "Starbucks 甄选"
max_locations_per_brand = 6
brand_locations = brand_groups.get(target_brand, [])[:max_locations_per_brand]

if not brand_locations:
    raise ValueError(f"Brand {target_brand} not found in dataset.")

for loc in brand_locations:
    key = (loc.brand, loc.address)
    if key in session_results:
        print(f"Reusing cached session for {loc.store_name} @ {loc.address}")
        continue
    result = run_session_for_location(loc)
    session_results[key] = result
    print(
        f"Ran session {result['session_id']} -> final score {_fmt(result['final_score'])}"
    )

print("Available sessions for brand:")
for loc in brand_locations:
    key = (loc.brand, loc.address)
    result = session_results.get(key)
    print(
        f"- {loc.store_name} ({loc.address}) -> final score {_fmt(result.get('final_score') if result else None)}"
    )


Reusing cached session for Starbucks 甄选(美罗城店) @ 上海徐汇区肇家兵路1111号219单元
Reusing cached session for Starbucks 甄选(白玉兰广场1F店) @ 东长治路588号白玉兰广场商场1楼16号商铺
Reusing cached session for Starbucks 甄选(世茂广场店) @ 上海黄浦区南京东路829号G05商铺
Ran session eval-starbucks-1515-e1-03 -> final score 8.11
Ran session eval-starbucks-5001-1-gf12 -> final score 7.74
Available sessions for brand:
- Starbucks 甄选(美罗城店) (上海徐汇区肇家兵路1111号219单元) -> final score 8.32
- Starbucks 甄选(白玉兰广场1F店) (东长治路588号白玉兰广场商场1楼16号商铺) -> final score 8.07
- Starbucks 甄选(世茂广场店) (上海黄浦区南京东路829号G05商铺) -> final score 8.32
- Starbucks 甄选(静安嘉里中心f1店) (上海静安区南京西路1515号嘉里中心商场e1-03) -> final score 8.11
- Starbucks 甄选(莘庄仲盛店) (上海闵行区都市路5001号仲盛世界商城1层GF12商铺) -> final score 7.74


In [31]:
# Build comparison pairs using cached session outputs (no reruns).
evaluated_pairs = []

def _component(result: Dict[str, object], name: str):
    scores = result.get("scores") or {}
    if isinstance(scores, dict):
        value = scores.get(name)
        if isinstance(value, dict):
            return value.get("score")
        return value
    return None

for loc_a, loc_b in combinations(brand_locations, 2):
    key_a = (loc_a.brand, loc_a.address)
    key_b = (loc_b.brand, loc_b.address)
    res_a = session_results.get(key_a)
    res_b = session_results.get(key_b)
    if not res_a or not res_b:
        print(f"Missing session data for {loc_a.store_name} or {loc_b.store_name}, skipping.")
        continue
    ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
    evaluated_pairs.append(
        {
            "brand": loc_a.brand,
            "location_a": loc_a,
            "location_b": loc_b,
            "gt_ratio": ratio,
            "gt_preference": "A" if ratio >= 1 else "B",
            "location_a_result": res_a,
            "location_b_result": res_b,
        }
    )

print(f"Prepared {len(evaluated_pairs)} cached comparison pairs.")
for idx, pair in enumerate(evaluated_pairs, start=1):
    res_a = pair["location_a_result"]
    res_b = pair["location_b_result"]
    print(
        f"Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Final scores -> A {_fmt(res_a.get('final_score'))} | B {_fmt(res_b.get('final_score'))}"
    )
    print(
        f"    Components A (C/T/K): {_fmt(_component(res_a, 'customer'))}/"
        f"{_fmt(_component(res_a, 'traffic'))}/{_fmt(_component(res_a, 'competition'))}"
    )
    print(
        f"    Components B (C/T/K): {_fmt(_component(res_b, 'customer'))}/"
        f"{_fmt(_component(res_b, 'traffic'))}/{_fmt(_component(res_b, 'competition'))}"
    )
    print(
        f"    GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
    )


Prepared 10 cached comparison pairs.
Pair 1: Starbucks 甄选(美罗城店) vs Starbucks 甄选(白玉兰广场1F店)
    Final scores -> A 8.32 | B 8.07
    Components A (C/T/K): 9.00/9.40/6.00
    Components B (C/T/K): 8.70/8.80/6.30
    GT ratio 1.74 (prefers A)
Pair 2: Starbucks 甄选(美罗城店) vs Starbucks 甄选(世茂广场店)
    Final scores -> A 8.32 | B 8.32
    Components A (C/T/K): 9.00/9.40/6.00
    Components B (C/T/K): 9.00/9.40/6.00
    GT ratio 2.27 (prefers A)
Pair 3: Starbucks 甄选(美罗城店) vs Starbucks 甄选(静安嘉里中心f1店)
    Final scores -> A 8.32 | B 8.11
    Components A (C/T/K): 9.00/9.40/6.00
    Components B (C/T/K): 9.10/9.40/5.90
    GT ratio 2.38 (prefers A)
Pair 4: Starbucks 甄选(美罗城店) vs Starbucks 甄选(莘庄仲盛店)
    Final scores -> A 8.32 | B 7.74
    Components A (C/T/K): 9.00/9.40/6.00
    Components B (C/T/K): 8.30/8.80/5.60
    GT ratio 4.35 (prefers A)
Pair 5: Starbucks 甄选(白玉兰广场1F店) vs Starbucks 甄选(世茂广场店)
    Final scores -> A 8.07 | B 8.32
    Components A (C/T/K): 8.70/8.80/6.30
    Components B (C/T/K): 9.00/9.

In [32]:
# Lightweight test harness: load existing SiteSage runs from save/test_* directories.
SCORE_PATTERNS = {
    "customer": re.compile(r"Customer Analysis:\s*([0-9.]+)/10"),
    "traffic": re.compile(r"Traffic .*?:\s*([0-9.]+)/10"),
    "competition": re.compile(r"Competition Analysis:\s*([0-9.]+)/10"),
}
FINAL_PATTERN = re.compile(r"Final Weighted Score:\s*([0-9.]+)/10")

REPORT_FILES = {
    "customer": "02_customer.md",
    "traffic": "03_traffic.md",
    "competition": "04_competition.md",
}

def _extract_score(pattern, text: str) -> float | None:
    match = pattern.search(text)
    return float(match.group(1)) if match else None

def load_session_from_directory(session_id: str) -> Dict[str, object]:
    session_dir = Path("save") / session_id
    evaluation_path = session_dir / "05_evaluation.md"
    final_report_path = session_dir / "07_final_report.md"
    eval_text = evaluation_path.read_text(encoding="utf-8")
    report_md = final_report_path.read_text(encoding="utf-8")
    component_scores = {k: _extract_score(pattern, eval_text) for k, pattern in SCORE_PATTERNS.items()}
    final_score = _extract_score(FINAL_PATTERN, eval_text)
    report_sections = {
        key: (session_dir / fname).read_text(encoding="utf-8")
        for key, fname in REPORT_FILES.items()
    }
    return {
        "session_id": session_id,
        "final_score": final_score,
        "scores": component_scores,
        "report_md": report_md,
        "report_sections": report_sections,
    }

test_configs = [
    {"session_id": "test_0", "address": "上海静安区南京西路1515号嘉里中心商场e1-03"},
    {"session_id": "test_1", "address": "上海徐汇区肇家兵路1111号219单元"},
    {"session_id": "test_2", "address": "上海闵行区都市路5001号仲盛世界商城1层GF12商铺"},
]
test_entries = []
for cfg in test_configs:
    location = location_index.get(cfg["address"])
    if not location:
        print(f"Address {cfg['address']} not found in dataset, skipping.")
        continue
    payload = load_session_from_directory(cfg["session_id"])
    test_entries.append({"location": location, "result": payload})
    session_results.setdefault((location.brand, location.address), payload)
    print(
        f"Loaded {cfg['session_id']} for {location.store_name} -> final score {_fmt(payload.get('final_score'))}"
    )

test_pairs = []
for left, right in combinations(test_entries, 2):
    loc_a = left["location"]
    loc_b = right["location"]
    res_a = left["result"]
    res_b = right["result"]
    ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
    test_pairs.append(
        {
            "brand": loc_a.brand,
            "location_a": loc_a,
            "location_b": loc_b,
            "gt_ratio": ratio,
            "gt_preference": "A" if ratio >= 1 else "B",
            "location_a_result": res_a,
            "location_b_result": res_b,
        }
    )

print(f"Prepared {len(test_pairs)} test pairs from saved sessions.")
for idx, pair in enumerate(test_pairs, start=1):
    print(
        f"Test Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Final scores -> A {_fmt(pair['location_a_result'].get('final_score'))} | B {_fmt(pair['location_b_result'].get('final_score'))}"
    )
    print(
        f"    GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
    )


Loaded test_0 for Starbucks 甄选(静安嘉里中心f1店) -> final score 8.10
Loaded test_1 for Starbucks 甄选(美罗城店) -> final score 7.60
Loaded test_2 for Starbucks 甄选(莘庄仲盛店) -> final score 7.80
Prepared 3 test pairs from saved sessions.
Test Pair 1: Starbucks 甄选(静安嘉里中心f1店) vs Starbucks 甄选(美罗城店)
    Final scores -> A 8.10 | B 7.60
    GT ratio 0.42 (prefers B)
Test Pair 2: Starbucks 甄选(静安嘉里中心f1店) vs Starbucks 甄选(莘庄仲盛店)
    Final scores -> A 8.10 | B 7.80
    GT ratio 1.83 (prefers A)
Test Pair 3: Starbucks 甄选(美罗城店) vs Starbucks 甄选(莘庄仲盛店)
    Final scores -> A 7.60 | B 7.80
    GT ratio 4.35 (prefers A)


In [43]:
def score_location_with_rubric(result: Dict, rubric_text: str, section: str) -> Dict[str, float]:
    assert(section in ["customer", "traffic", "competition"])
    report = result["report_sections"][section]
    user_prompt = f"""## RUBRIC
{rubric_text}

## REPORT
{report}

Return strict JSON with """
    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.1",
        reasoning={"effort": "medium"},
        input=[
            {"role": "system", "content": "You score retail site analyses strictly according to the provided rubric."},
            {"role": "user", "content": [{"type": "input_text", "text": user_prompt}]},
        ],
    )
    payload = parse_json_from_text(response.output_text)

    def _to_float(key: str) -> float:
        val = payload.get(key)
        try:
            return float(val)
        except Exception:
            return 0.0

    scored = {
        "customer": _to_float("customer"),
        "traffic": _to_float("traffic"),
        "competition": _to_float("competition"),
        "final_score": _to_float("final_score"),
    }
    if not scored["final_score"]:
        comps = [scored["customer"], scored["traffic"], scored["competition"]]
        scored["final_score"] = sum(comps) / 3 if comps else 0.0
    SCORING_CACHE[cache_key] = scored
    return scored


def evaluate_pair_with_rubric(pair: Dict[str, object], rubric_text: str) -> Dict[str, object]:
    scores_a = score_location_with_rubric(pair["location_a_result"], rubric_text)
    scores_b = score_location_with_rubric(pair["location_b_result"], rubric_text)
    final_a = scores_a.get("final_score") or 0.0
    final_b = scores_b.get("final_score") or 0.0
    predicted_ratio = final_a / max(final_b, 1e-6)
    gt_ratio = pair["gt_ratio"]
    score_diff = predicted_ratio - gt_ratio
    order_matches = (predicted_ratio >= 1 and gt_ratio >= 1) or (predicted_ratio <= 1 and gt_ratio <= 1)
    within_threshold = abs(score_diff) <= RATIO_THRESHOLD
    return {
        "predicted_ratio": predicted_ratio,
        "gt_ratio": gt_ratio,
        "score_diff": score_diff,
        "order_matches": order_matches,
        "within_threshold": within_threshold,
        "scores_a": scores_a,
        "scores_b": scores_b,
    }


def rubric_revision(report1, report2, rubric, anchor, score1, score2, gt_location_score):
    system_prompt = """You are designed to revise the evaluation rubric by comparing two reports.
User will upload:
1. report1: one report evaluating the potential of one store open in location1.
2. report2: one report evaluating the potential of the same store open in location2.
3. score1: score evaluated from the report1.
4. score2: score evaluated form the report2.
5. Ground Truth: the score of location1/location2, meaning the relative traffic of location1 to location2
6. rubric: the current rubric for scoring and evaluating the location.
7. Anchor Rubric: an anchor rubric for reference, the revised rubric should not be too far from the anchor

You should think in your reasoning:
1. Find what goes wrong in the rubric that leads to final error in scoring.
2. Based on scores, reports and ground truth, adjust the rubric, to ensure the order of the score meets ground truth, if possible, keep the difference of gt's loc_a / loc_b and predicted loc_a score1 / loc_b score2 similar.
3. Add some hardcoded rubric when needed.

After thinking, you should directly output **REVISED RUBRIC**, which only includes Customer Potential Rubric, Traffic & Accessibility Potential Rubric and Competition Rubric.

The rubric should be general to all kinds of stores."""

    user_prompt = f"""---- 
# REPORT1
{report1} 
-----
-----
# REPORT2
{report2}
-----

## Score1: {score1}
## Score2: {score2}
## GT: location1 / location2 = {gt_location_score}

-----
## Rubric
{rubric}

## Anchor Rubric
{anchor}"""

    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.1",
        reasoning={"effort": "medium"},
        input=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [{"type": "input_text", "text": user_prompt}],
            },
        ],
    )
    return response.output_text




In [42]:
current_rubric = load_initial_rubric_text()
anchor_rubric = current_rubric
if not current_rubric:
    raise ValueError("Unable to load the baseline rubric text.")

In [44]:
# Choose which pair set to feed into the rubric revision loop.
pairs_for_revision = evaluated_pairs  # swap to evaluated_pairs after running brand sessions

if not pairs_for_revision:
    raise ValueError("No comparison pairs available; run the setup cells first.")

rubric_history = [{"iteration": 0, "rubric": current_rubric}]

for idx, pair in enumerate(pairs_for_revision, start=1):
    alignment = evaluate_pair_with_rubric(pair, current_rubric)
    print(
        f"Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Predicted ratio {alignment['predicted_ratio']:.2f} vs GT {alignment['gt_ratio']:.2f} -> diff {alignment['score_diff']:.3f}"
    )
    if alignment["order_matches"] and alignment["within_threshold"]:
        print("    Alignment within threshold, skipping revision for this pair.")
        print("-" * 40)
        continue

    res_a = pair["location_a_result"]
    res_b = pair["location_b_result"]
    revised = rubric_revision(
        report1=res_a['report_sections']['customer'],
        report2=res_b['report_sections']['customer'],
        rubric=current_rubric,
        anchor=anchor_rubric,
        score1=alignment["scores_a"].get("final_score"),
        score2=alignment["scores_b"].get("final_score"),
        gt_location_score=f"{pair['gt_ratio']:.4f}",
    )
    current_rubric = revised.strip()
    rubric_history.append({"iteration": idx, "rubric": current_rubric})
    heading = current_rubric.splitlines()[0] if current_rubric.splitlines() else current_rubric[:80]
    print(
        f"    Rubric updated (iter {idx}) -> {heading}"
    )
    print("-" * 40)

revised_rubric_path = Path("rubrics/revised_rubric.md")
revised_rubric_path.write_text(current_rubric, encoding="utf-8")
print(f"Latest rubric saved to {revised_rubric_path.resolve()}")



Pair 1: Starbucks 甄选(美罗城店) vs Starbucks 甄选(白玉兰广场1F店)
    Predicted ratio 1.05 vs GT 1.74 -> diff -0.692
    Rubric updated (iter 1) -> **REVISED RUBRIC**
----------------------------------------
Pair 2: Starbucks 甄选(美罗城店) vs Starbucks 甄选(世茂广场店)
    Predicted ratio 1.01 vs GT 2.27 -> diff -1.263
    Rubric updated (iter 2) -> **REVISED RUBRIC**
----------------------------------------
Pair 3: Starbucks 甄选(美罗城店) vs Starbucks 甄选(静安嘉里中心f1店)
    Predicted ratio 1.00 vs GT 2.38 -> diff -1.385
    Rubric updated (iter 3) -> **REVISED RUBRIC**
----------------------------------------
Pair 4: Starbucks 甄选(美罗城店) vs Starbucks 甄选(莘庄仲盛店)
    Predicted ratio 1.02 vs GT 4.35 -> diff -3.326
    Rubric updated (iter 4) -> **REVISED RUBRIC**
----------------------------------------
Pair 5: Starbucks 甄选(白玉兰广场1F店) vs Starbucks 甄选(世茂广场店)
    Predicted ratio 0.97 vs GT 1.31 -> diff -0.340
    Rubric updated (iter 5) -> **REVISED RUBRIC**
----------------------------------------
Pair 6: Starbucks 甄选(白玉兰广场1F店

KeyboardInterrupt: 

In [48]:
res_a['report_sections']

'## 1. How far will this coffee shop realistically draw customers?\n\nGiven:\n- Boutique, quality-focused but mainly takeaway\n- Morning commuter focus\n- Located inside a major commercial complex on Nanjing East Road, near People’s Square hub\n\nReasonable “influence radii”:\n\n- **Core walk‑by & habitual commuter radius: 0–300 m**\n  - People using nearby metro exits, office workers in adjacent towers, mall staff.\n- **Primary trade area: ~500 m**\n  - Most morning commuters & office workers willing to walk 3–7 minutes for coffee if it’s on their natural path.\n- **Extended trade area: up to 800–1,000 m**\n  - Some office workers and hotel guests; more occasional than daily.\n\nBelow I’ll use **500 m and 1,000 m** as main reference radii.\n\n---\n\n## 2. Nearby residential population & age profile\n\nWorldPop data around the coordinates (31.234548, 121.475928):\n\n### 2.1 Population within different radii\n\n**Within 500 m**\n- Total population: **≈ 25,012 people**\n- Age structure:\

In [45]:

revised_rubric_path = Path("rubrics/revised_rubric.md")
revised_rubric_path.write_text(current_rubric, encoding="utf-8")
print(f"Latest rubric saved to {revised_rubric_path.resolve()}")



Latest rubric saved to /Users/bob/Downloads/baidu_cloud/留学-UofT/Courses/ece1786 - NLP/project/SiteSage/src/rubrics/revised_rubric.md
