# SiteSage Evaluation & Rubric Revision

This notebook orchestrates three stages:
1. Load Dianping collection data to compute ground-truth (GT) ratios for identical brands operating at different addresses.
2. Run the SiteSage agent flow once per location (per brand) and reuse those cached outputs to build comparison pairs.
3. Re-score each pair with the latest rubric (skipping aligned cases) and run `rubric_revision`—with a lightweight test harness that reuses stored sessions—to iteratively adjust the rubric.

Tweak the configuration cells as needed before running each stage.

In [1]:
from __future__ import annotations

import csv
import hashlib
import json
import re
from dataclasses import dataclass
from itertools import combinations
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
from sitesage_backend import fix_json_error

import dotenv
from openai import OpenAI

dotenv.load_dotenv()

from sitesage_backend import run_sitesage_session_async, parse_json_from_text

DATA_PATH = Path("data/dianping_collection_data.csv")
PROMPT_TEMPLATE = (
    "I want to open a coffee shop at {}. "
)
SESSION_LANGUAGE = "zh"
CACHE_DIR = Path("save/evaluate_cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

RATIO_THRESHOLD = 0.6

session_results: Dict[Tuple[str, str], Dict[str, object]] = {}
location_index: Dict[str, 'LocationStat'] = {}
evaluated_pairs: List[Dict[str, object]] = []
test_pairs: List[Dict[str, object]] = []
SCORING_CACHE: Dict[Tuple[str, str], Dict[str, float]] = {}

def _fmt(value):
    return f"{value:.2f}" if isinstance(value, (int, float)) else "n/a"


In [2]:
# necessary functions
@dataclass(frozen=True)
class LocationStat:
    brand: str
    store_name: str
    address: str
    total_reviews: float
    avg_reviews_per_day: float
    sample_days: int


def parse_brand_name(store_name: str) -> str:
    clean_name = (store_name or "").strip()
    if "(" in clean_name:
        return clean_name.split("(", 1)[0].strip()
    if "（" in clean_name:
        return clean_name.split("（", 1)[0].strip()
    return clean_name or "unknown"


def load_location_stats(csv_path: Path) -> List[LocationStat]:
    stats: Dict[Tuple[str, str, str], Dict[str, float]] = {}
    with csv_path.open(newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        for row in reader:
            store_name = (row["store"] or "").strip()
            brand = parse_brand_name(store_name)
            address = (row["address"] or "").strip()
            review_cnt = int(row["review_cnt"])
            key = (brand, store_name, address)
            entry = stats.setdefault(key, {"total": 0, "days": 0})
            entry["total"] += review_cnt
            entry["days"] += 1
    locations: List[LocationStat] = []
    for (brand, store_name, address), values in stats.items():
        days = max(int(values["days"]), 1)
        total = float(values["total"])
        locations.append(
            LocationStat(
                brand=brand,
                store_name=store_name,
                address=address,
                total_reviews=total,
                avg_reviews_per_day=total / days,
                sample_days=days,
            )
        )
    locations.sort(key=lambda loc: (loc.brand, -loc.total_reviews))
    return locations


def group_locations_by_brand(locations: Iterable[LocationStat]) -> Dict[str, List[LocationStat]]:
    groups: Dict[str, List[LocationStat]] = {}
    for loc in locations:
        groups.setdefault(loc.brand, []).append(loc)
    for locs in groups.values():
        locs.sort(key=lambda loc: -loc.total_reviews)
    return groups


def build_location_pairs(groups: Dict[str, List[LocationStat]]) -> List[Dict[str, object]]:
    pairs: List[Dict[str, object]] = []
    for brand, locs in groups.items():
        if len(locs) < 2:
            continue
        for i in range(len(locs)):
            for j in range(i + 1, len(locs)):
                loc_a = locs[i]
                loc_b = locs[j]
                ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
                pairs.append(
                    {
                        "brand": brand,
                        "location_a": loc_a,
                        "location_b": loc_b,
                        "gt_ratio": ratio,
                        "gt_preference": "A" if ratio >= 1 else "B",
                        "gt_difference": loc_a.total_reviews - loc_b.total_reviews,
                    }
                )
    return pairs


def load_initial_rubric_text():
    rubric_dir = Path("rubrics")
    sections = {}
    for name in ("customer_rubric.md", "traffic_rubric.md", "competition_rubric.md"):
        path = rubric_dir / name
        sections[name.split("_")[0]] = path.read_text(encoding="utf-8")
    return sections


def slugify(value: str) -> str:
    slug = re.sub(r"[^a-zA-Z0-9]+", "-", value).strip("-").lower()
    return slug or "location"


def build_session_id(location: LocationStat) -> str:
    brand_slug = slugify(location.brand)
    address_slug = slugify(location.address)[:40]
    return f"eval-{brand_slug}-{address_slug}"


def load_final_report_text(payload: Dict[str, object]) -> str:
    final_report = payload.get("final_report", {}) or {}
    path_str = final_report.get("report_path") if isinstance(final_report, dict) else None
    if isinstance(path_str, str) and path_str:
        path = Path(path_str)
        if path.exists():
            return path.read_text(encoding="utf-8")
    if isinstance(final_report, dict):
        return final_report.get("report_md", "")
    return ""


def extract_report_sections(payload: Dict[str, object]) -> Dict[str, str]:
    sections: Dict[str, str] = {}
    raw_reports = payload.get("reports") or {}
    for key in ("customer", "traffic", "competition"):
        value = raw_reports.get(key) if isinstance(raw_reports, dict) else None
        if isinstance(value, str):
            sections[key] = value
        elif isinstance(value, dict):
            sections[key] = value.get("report_md", "")
        else:
            sections[key] = ""
    return sections


async def run_session_for_location(location: LocationStat) -> Dict[str, object]:
    session_id = build_session_id(location)
    cache_path = CACHE_DIR / f"{session_id}.json"
    prompt = PROMPT_TEMPLATE.format(location.address)
    if cache_path.exists():
        payload = json.loads(cache_path.read_text(encoding="utf-8"))
    else:
        payload = await run_sitesage_session_async(session_id, prompt, language=SESSION_LANGUAGE)
        cache_path.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8"
        )
    report_md = load_final_report_text(payload)
    report_sections = extract_report_sections(payload)
    return {
        "session_id": session_id,
        "prompt": prompt,
        "final_score": payload.get("final_score"),
        "scores": payload.get("scores", {}),
        "report_md": report_md,
        "report_sections": report_sections,
        "raw": payload,
    }


In [3]:
# build catalogs
locations = load_location_stats(DATA_PATH)
brand_groups = group_locations_by_brand(locations)
location_index = {loc.address: loc for loc in locations}
location_pairs = build_location_pairs(brand_groups)

print(f"Loaded {len(locations)} store/location combinations across {len(brand_groups)} brands.")
print(f"Generated {len(location_pairs)} same-brand comparison pairs.")
for idx, pair in enumerate(location_pairs):
    print(
        f"{idx:02d}: [{pair['brand']}] {pair['location_a'].store_name} vs {pair['location_b'].store_name} -> "
        f"GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
    )


Loaded 25 store/location combinations across 4 brands.
Generated 111 same-brand comparison pairs.
00: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(比斯特上海购物村店) -> GT ratio 1.22 (prefers A)
01: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(武康路店) -> GT ratio 1.98 (prefers A)
02: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 2.11 (prefers A)
03: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(建国西路店) -> GT ratio 5.16 (prefers A)
04: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(武康路店) -> GT ratio 1.62 (prefers A)
05: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 1.73 (prefers A)
06: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(建国西路店) -> GT ratio 4.23 (prefers A)
07: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(武康路店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 1.07 (prefers A)
08: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(武康路店) vs % Arabica阿拉比卡咖啡(建国西路店) -> GT

## Run SiteSage for every location under the selected brand once

In [4]:
# Run SiteSage for every location under the selected brand once (cached after first run).
target_brand = "Manner Coffee"
max_locations_per_brand = 20
brand_locations = brand_groups.get(target_brand, [])[:max_locations_per_brand]

if not brand_locations:
    raise ValueError(f"Brand {target_brand} not found in dataset.")

for loc in brand_locations:
    key = (loc.brand, loc.address)
    if key in session_results:
        print(f"Reusing cached session for {loc.store_name} @ {loc.address}")
        continue
    result = await run_session_for_location(loc)
    session_results[key] = result
    print(
        f"Ran session {result['session_id']} -> final score {_fmt(result['final_score'])}"
    )

print("Available sessions for brand:")
for loc in brand_locations:
    key = (loc.brand, loc.address)
    result = session_results.get(key)
    print(
        f"- {loc.store_name} ({loc.address}) -> final score {_fmt(result.get('final_score') if result else None)}"
    )


Ran session eval-manner-coffee-60 -> final score 5.63
Ran session eval-manner-coffee-8-d-lg2-45 -> final score 7.94
Ran session eval-manner-coffee-243 -> final score 4.88
Ran session eval-manner-coffee-1111-1-101-2-201 -> final score 4.51
Ran session eval-manner-coffee-2690-5 -> final score 4.36
Ran session eval-manner-coffee-1-3-333e-f -> final score 4.29
Ran session eval-manner-coffee-205-101 -> final score 4.55


[+5.908  s] RT.Session  : DEBUG    - Session b9f9585a-5098-4fea-8ee6-565b86953f2f is initialized
[+5.909  s] RT.Publisher: DEBUG    - RequestCreation(current_node_id=None, new_request_id=33c7d3d9-b687-49d4-ac7c-f8ecc1066950, running_mode=async, new_node_type=EasyToolCallLLM, args=(), kwargs={'user_input': 'Extract store info and resolve the place. Use tools as needed and return the required JSON.\n\nUser request:\nI want to open a coffee shop at 上海东大名路999号北外滩来福士广场B2层26A号（近提篮桥站3号口）. '})
[+5.910  s] RT          : INFO     - START CREATED UnderstandingAgent
[92m14:17:58 - LiteLLM:INFO[0m: utils.py:3383 - 
LiteLLM completion() model= gpt-5.1; provider = openai


### Build comparison pairs using cached session outputs (no reruns).

In [8]:
# Build comparison pairs using cached session outputs (no reruns).
evaluated_pairs = []

def _component(result: Dict[str, object], name: str):
    scores = result.get("scores") or {}
    if isinstance(scores, dict):
        value = scores.get(name)
        if isinstance(value, dict):
            return value.get("score")
        return value
    return None

for loc_a, loc_b in combinations(brand_locations, 2):
    key_a = (loc_a.brand, loc_a.address)
    key_b = (loc_b.brand, loc_b.address)
    res_a = session_results.get(key_a)
    res_b = session_results.get(key_b)
    if not res_a or not res_b:
        print(f"Missing session data for {loc_a.store_name} or {loc_b.store_name}, skipping.")
        continue
    ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
    evaluated_pairs.append(
        {
            "brand": loc_a.brand,
            "location_a": loc_a,
            "location_b": loc_b,
            "gt_ratio": ratio,
            "gt_preference": "A" if ratio >= 1 else "B",
            "location_a_result": res_a,
            "location_b_result": res_b,
        }
    )

print(f"Prepared {len(evaluated_pairs)} cached comparison pairs.")
for idx, pair in enumerate(evaluated_pairs, start=1):
    res_a = pair["location_a_result"]
    res_b = pair["location_b_result"]
    pt_ratio = res_a.get('final_score') / res_b.get('final_score')
    if pt_ratio > 1: pt_preference = "A"
    else: pt_preference = "B"
    print(
        f"Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Final scores -> A {_fmt(res_a.get('final_score'))} | B {_fmt(res_b.get('final_score'))}"
    )
    print(
        f"    Components A (C/T/K): {_fmt(_component(res_a, 'customer'))}/"
        f"{_fmt(_component(res_a, 'traffic'))}/{_fmt(_component(res_a, 'competition'))}"
    )
    print(
        f"    Components B (C/T/K): {_fmt(_component(res_b, 'customer'))}/"
        f"{_fmt(_component(res_b, 'traffic'))}/{_fmt(_component(res_b, 'competition'))}"
    )
    print(
        f"    GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
        f"    PT ratio {pt_ratio:.2f} (prefers {pt_preference})"
    )


Prepared 10 cached comparison pairs.
Pair 1: % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(比斯特上海购物村店)
    Final scores -> A 5.78 | B 4.43
    Components A (C/T/K): 4.00/6.20/7.10
    Components B (C/T/K): 4.40/2.90/6.50
    GT ratio 1.22 (prefers A)    PT ratio 1.30 (prefers A)
Pair 2: % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(武康路店)
    Final scores -> A 5.78 | B 5.03
    Components A (C/T/K): 4.00/6.20/7.10
    Components B (C/T/K): 5.50/4.60/5.10
    GT ratio 1.98 (prefers A)    PT ratio 1.15 (prefers A)
Pair 3: % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(上海西岸中環店)
    Final scores -> A 5.78 | B 3.51
    Components A (C/T/K): 4.00/6.20/7.10
    Components B (C/T/K): 3.80/2.10/5.10
    GT ratio 2.11 (prefers A)    PT ratio 1.65 (prefers A)
Pair 4: % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(建国西路店)
    Final scores -> A 5.78 | B 4.38
    Components A (C/T/K): 4.00/6.20/7.10
    Components B (C/T/K): 4.50/5.20/3.20
    GT ratio 5.16 (prefers A)    PT ratio 1.32 (prefers A)
Pair 5: % Arab

### (Optional) Test data

In [6]:
# Lightweight test harness: load existing SiteSage runs from save/test_* directories.
SCORE_PATTERNS = {
    "customer": re.compile(r"Customer Analysis:\s*([0-9.]+)/10"),
    "traffic": re.compile(r"Traffic .*?:\s*([0-9.]+)/10"),
    "competition": re.compile(r"Competition Analysis:\s*([0-9.]+)/10"),
}
FINAL_PATTERN = re.compile(r"Final Weighted Score:\s*([0-9.]+)/10")

REPORT_FILES = {
    "customer": "02_customer.md",
    "traffic": "03_traffic.md",
    "competition": "04_competition.md",
}

def _extract_score(pattern, text: str) -> float | None:
    match = pattern.search(text)
    return float(match.group(1)) if match else None

def load_session_from_directory(session_id: str) -> Dict[str, object]:
    session_dir = Path("save") / session_id
    evaluation_path = session_dir / "05_evaluation.md"
    final_report_path = session_dir / "07_final_report.md"
    eval_text = evaluation_path.read_text(encoding="utf-8")
    report_md = final_report_path.read_text(encoding="utf-8")
    component_scores = {k: _extract_score(pattern, eval_text) for k, pattern in SCORE_PATTERNS.items()}
    final_score = _extract_score(FINAL_PATTERN, eval_text)
    report_sections = {
        key: (session_dir / fname).read_text(encoding="utf-8")
        for key, fname in REPORT_FILES.items()
    }
    return {
        "session_id": session_id,
        "final_score": final_score,
        "scores": component_scores,
        "report_md": report_md,
        "report_sections": report_sections,
    }

test_configs = [
    {"session_id": "test_0", "address": "上海静安区南京西路1515号嘉里中心商场e1-03"},
    {"session_id": "test_1", "address": "上海徐汇区肇家兵路1111号219单元"},
    {"session_id": "test_2", "address": "上海闵行区都市路5001号仲盛世界商城1层GF12商铺"},
]
test_entries = []
for cfg in test_configs:
    location = location_index.get(cfg["address"])
    if not location:
        print(f"Address {cfg['address']} not found in dataset, skipping.")
        continue
    payload = load_session_from_directory(cfg["session_id"])
    test_entries.append({"location": location, "result": payload})
    session_results.setdefault((location.brand, location.address), payload)
    print(
        f"Loaded {cfg['session_id']} for {location.store_name} -> final score {_fmt(payload.get('final_score'))}"
    )

test_pairs = []
for left, right in combinations(test_entries, 2):
    loc_a = left["location"]
    loc_b = right["location"]
    res_a = left["result"]
    res_b = right["result"]
    ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
    test_pairs.append(
        {
            "brand": loc_a.brand,
            "location_a": loc_a,
            "location_b": loc_b,
            "gt_ratio": ratio,
            "gt_preference": "A" if ratio >= 1 else "B",
            "location_a_result": res_a,
            "location_b_result": res_b,
        }
    )

print(f"Prepared {len(test_pairs)} test pairs from saved sessions.")
for idx, pair in enumerate(test_pairs, start=1):
    print(
        f"Test Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Final scores -> A {_fmt(pair['location_a_result'].get('final_score'))} | B {_fmt(pair['location_b_result'].get('final_score'))}"
    )
    print(
        f"    GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
    )


Loaded test_0 for Starbucks 甄选(静安嘉里中心f1店) -> final score 8.10
Loaded test_1 for Starbucks 甄选(美罗城店) -> final score 7.60
Loaded test_2 for Starbucks 甄选(莘庄仲盛店) -> final score 7.80
Prepared 3 test pairs from saved sessions.
Test Pair 1: Starbucks 甄选(静安嘉里中心f1店) vs Starbucks 甄选(美罗城店)
    Final scores -> A 8.10 | B 7.60
    GT ratio 0.42 (prefers B)
Test Pair 2: Starbucks 甄选(静安嘉里中心f1店) vs Starbucks 甄选(莘庄仲盛店)
    Final scores -> A 8.10 | B 7.80
    GT ratio 1.83 (prefers A)
Test Pair 3: Starbucks 甄选(美罗城店) vs Starbucks 甄选(莘庄仲盛店)
    Final scores -> A 7.60 | B 7.80
    GT ratio 4.35 (prefers A)


## Rubric functions build

problem:
- should I add "the rubric should be generalized to all stores"
    - the rubric does not hold for all stores!

In [29]:
from prompts.evaluation import EVALUATION_AGENT_SYSTEM, EVALUATION_SEPARATE_AGENT_SYSTEM
def score_location_with_rubric(customer_report: str,
                               customer_rubric: str,
                               traffic_report: str,
                               traffic_rubric: str,
                               competition_report: str,
                               competition_rubric: str,
                               weights: Dict[str, float]
                            ) -> Dict[str, float]:
    user_prompt = f"""Evaluate three analysis reports using the provided rubrics. Score objectively and provide detailed justifications.

---

CUSTOMER ANALYSIS REPORT:
{customer_report}

CUSTOMER SCORING RUBRIC:
{customer_rubric}

---

TRAFFIC & ACCESSIBILITY REPORT:
{traffic_report}

TRAFFIC SCORING RUBRIC:
{traffic_rubric}

---

COMPETITION ANALYSIS REPORT:
{competition_report}

COMPETITION SCORING RUBRIC:
{competition_rubric}

---

Evaluate each report according to its rubric. Return the JSON with scores and justifications."""
    
    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.1",
        reasoning={"effort": "low"},
        input=[
            {"role": "system", "content": EVALUATION_AGENT_SYSTEM},
            {"role": "user", "content": [{"type": "input_text", "text": user_prompt}]},
        ],
    )
    try:
        payload = parse_json_from_text(response.output_text)
    except Exception as e:
        print("error in parsing revision, try again ...", e)
        payload = parse_json_from_text(fix_json_error(response.output_text))

    ejson = {
        "customer": payload["customer"],
        "traffic": payload["traffic"],
        "competition": payload["competition"],
    }

    evaluation_scores = {
        "customer": ejson.get("customer", {"score": 0.0, "justification": ""}),
        "traffic": ejson.get("traffic", {"score": 0.0, "justification": ""}),
        "competition": ejson.get("competition", {"score": 0.0, "justification": ""}),
    }

    # Calculate final weighted score
    customer_score = float(evaluation_scores["customer"].get("score", 0.0))
    traffic_score = float(evaluation_scores["traffic"].get("score", 0.0))
    competition_score = float(evaluation_scores["competition"].get("score", 0.0))
    final_score = (weights["customer"] * customer_score) + (weights["traffic"] * traffic_score) + (weights["competition"] * competition_score)

    scored = {
        "final_score": final_score,
        "customer_score": ejson["customer"]["score"],
        "traffic_score": ejson["traffic"]["score"],
        "competition_score": ejson["competition"]["score"],
        "customer_criterion_scores": ejson["customer"]["criterion_scores"],
        "traffic_criterion_scores": ejson["traffic"]["criterion_scores"],
        "competition_criterion_scores": ejson["competition"]["criterion_scores"],
        "weights": weights
    }
    return scored

def score_location_with_rubric_separate(customer_report: str,
                                        customer_rubric: str,
                                        traffic_report: str,
                                        traffic_rubric: str,
                                        competition_report: str,
                                        competition_rubric: str,
                                        weights: Dict[str, float]
                                       ) -> Dict[str, float]:
    user_prompt = """Evaluate the analysis report using the provided rubrics. Score objectively and provide detailed justifications.

---

ANALYSIS REPORT:
{report}

SCORING RUBRIC:
{rubric}

---

Evaluate report according to its rubric. Return the JSON with scores and justifications."""
    
    client = OpenAI()
    def _run_analysis(report, rubric):
        response = client.responses.create(
            model="gpt-5.1",
            reasoning={"effort": "low"},
            input=[
                {"role": "system", "content": EVALUATION_SEPARATE_AGENT_SYSTEM},
                {"role": "user", "content": [{"type": "input_text", "text": user_prompt.format(report = report, rubric = rubric)}]},
            ],
        )
        try:
            payload = parse_json_from_text(response.output_text)
        except Exception as e:
            print("error in parsing revision, try again ...", e)
            payload = parse_json_from_text(fix_json_error(response.output_text))
        return payload

    ejson = {
        "customer": _run_analysis(customer_report, customer_rubric),
        "traffic": _run_analysis(traffic_report, traffic_rubric),
        "competition": _run_analysis(competition_report, competition_rubric),
    }

    evaluation_scores = {
        "customer": ejson.get("customer", {"score": 0.0, "justification": ""}),
        "traffic": ejson.get("traffic", {"score": 0.0, "justification": ""}),
        "competition": ejson.get("competition", {"score": 0.0, "justification": ""}),
    }

    # Calculate final weighted score
    customer_score = float(evaluation_scores["customer"].get("score", 0.0))
    traffic_score = float(evaluation_scores["traffic"].get("score", 0.0))
    competition_score = float(evaluation_scores["competition"].get("score", 0.0))
    final_score = (weights["customer"] * customer_score) + (weights["traffic"] * traffic_score) + (weights["competition"] * competition_score)

    scored = {
        "final_score": final_score,
        "customer_score": ejson["customer"]["score"],
        "traffic_score": ejson["traffic"]["score"],
        "competition_score": ejson["competition"]["score"],
        "customer_criterion_scores": ejson["customer"]["criterion_scores"],
        "traffic_criterion_scores": ejson["traffic"]["criterion_scores"],
        "competition_criterion_scores": ejson["competition"]["criterion_scores"],
        "weights": weights
    }
    return scored

def evaluate_pair_with_rubric(pair: Dict, rubrics: Dict[str, str], separate: bool = False):
    loc_a = pair["location_a_result"]
    loc_b = pair["location_b_result"]

    if separate:
        score_func = score_location_with_rubric_separate
    else:
        score_func = score_location_with_rubric

    scores_a = score_func(
        loc_a["report_sections"]["customer"],
        rubrics["customer"],
        loc_a["report_sections"]["traffic"],
        rubrics["traffic"],
        loc_a["report_sections"]["competition"],
        rubrics["competition"],
        loc_a["raw"]["weights"]
    )

    scores_b = score_func(
        loc_b["report_sections"]["customer"],
        rubrics["customer"],
        loc_b["report_sections"]["traffic"],
        rubrics["traffic"],
        loc_b["report_sections"]["competition"],
        rubrics["competition"],
        loc_b["raw"]["weights"]
    )

    final_a = scores_a.get("final_score") or 0.0
    final_b = scores_b.get("final_score") or 0.0
    predicted_ratio = final_a / max(final_b, 1e-6)
    gt_ratio = pair["gt_ratio"]
    score_diff = max(predicted_ratio, gt_ratio) / min(predicted_ratio, gt_ratio)
    order_matches = (predicted_ratio >= 1 and gt_ratio >= 1) or (predicted_ratio <= 1 and gt_ratio <= 1)
    within_threshold = abs(score_diff - 1) <= RATIO_THRESHOLD
    return {
        "predicted_ratio": predicted_ratio,
        "gt_ratio": gt_ratio,
        "score_diff": score_diff,
        "order_matches": order_matches,
        "within_threshold": within_threshold,
        "scores_a": scores_a,
        "scores_b": scores_b,
    }


def rubric_revision(report1, report2, rubric, score1, score2, gt_location_score, pred_location_score, separate: bool = False):
    if separate:
        output_format = """{
    "rubric": "string (revised rubric)",
    "weakness": List[string] (short bullet point: main problem of the current rubric),
    "problems": List[string] (short bullet point: problems in the reports that contributed to the mismatch),
    "revisions": List[string] (short bullet point: what revision has been done to the rubric to make it better) 
}
"""
    else:
        output_format = """{
    "customer_rubric": "string (revised customer rubric)",
    "traffic_rubric": "string (revised traffic rubric)",
    "competition_rubric": "string (revised competition rubric)"
    "weakness": List[string] (short bullet point: main problem of the current rubric),
    "problems": List[string] (short bullet point: problems in the reports that contributed to the mismatch),
    "revisions": List[string] (short bullet point: what revision has been done to the rubric to make it better) 
}"""
    system_prompt = """You are a rubric-tuning agent. Your job is to revise the evaluation rubric by comparing two location evaluation reports and their scores.

The user will provide:
1. report1: a report evaluating the potential of opening a store in location1.
2. report2: a report evaluating the potential of opening the same store in location2.
3. score1: the current score given to report1 using the existing rubric.
4. score2: the current score given to report2 using the existing rubric.
5. Ground Truth: a numeric ratio GT = location1/location2, representing the relative traffic of location1 to location2 (GT > 1 means location1 has higher traffic; GT < 1 means location2 has higher traffic).
6. Predicted: a numeric ratio Pred = location1/location2 derived from the current scores (for example, Pred = score1 / score2).
7. rubric: JSON, the current rubric for scoring and evaluating the locations (including dimensions, weights, and criteria).

Your goals:
- Diagnose what is wrong with the current rubric that leads to a mismatch between GT and Pred (e.g., wrong ordering, too small difference, or reversed preference).
- Propose a revised rubric that:
  - makes the score ordering consistent with ground truth (if GT > 1, we prefer score1 > score2; if GT << 1, we prefer score2 >> score1),
  - increases the sensitivity of scores to real differences in location quality,
  - and remains general enough to be applied to other locations and stores.

When reasoning (internally, do NOT show your chain-of-thought to the user):
1. Compare GT and Pred:
   - If GT and Pred have opposite ordering (e.g. GT > 1 but score1 < score2), treat this as a serious rubric failure.
   - If |GT - Pred| is large (e.g., GT >> 1 but Pred ≈ 1), treat this as evidence that the rubric is not capturing real differences between locations.
2. Inspect the original rubric JSON and identify:
   - which dimensions are overweighted or underweighted,
   - which important dimensions are missing,
   - and which criteria are too vague or not measurable.
3. Adjust the rubric:
   - You may add new dimensions, delete dimensions, or change the weight of dimensions.
   - You must ensure the total sum of all dimension weights is 100%.
   - You are encouraged to introduce clearer, more granular levels (e.g., 3–5 levels with numeric thresholds) so that differences between locations produce more distinct scores.
   - You may add hard-coded numeric thresholds (e.g., population ranges, traffic counts, distance to competitors) to make scoring more objective and easier to apply.

Reminder: 
   - You should give rubric, do not integrate any other information in the rubric such as suggestions.
   - The rubric should be grounded in the concrete observations from the two reports.
   - Keep each component in rubrics concise and short with bullet points.
   - The weight for each component in rubric should be bigger than 8%.
   - The revised rubric must not include information about the store such as "coffee shop", "boutique".

Output format:
Return ONLY a single valid JSON object, with no extra text, in the following format:
{output_format}

- "weakness" should focus on issues in the existing rubric.
- "problems" should focus on issues in how the reports were written or interpreted.
""".format(output_format = output_format)

    user_prompt = f"""---- 
# REPORT1
{report1} 
## Score1
{score1}
-----
-----
# REPORT2
{report2}
## Score2
{score2}
-----

## GT: location1 : location2 = {gt_location_score}
## Predicted: location1 : location2 = {pred_location_score}

-----
-----

## Rubric
{rubric}
-----
"""

    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.1",
        reasoning={"effort": "medium"},
        input=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [{"type": "input_text", "text": user_prompt}],
            },
        ],
    )
    return response.output_text

In [32]:
current_rubric = load_initial_rubric_text()

In [33]:
current_rubric

{'customer': "# Customer Potential Rubric (Further Revised, Location-Oriented, Commuter-Focused)\n\nPurpose: Score 0–10 based on the strength of the realistically reachable customer base and how well it matches the specific store type and its key earning dayparts.\n\n---\n\n## Sub-dimension weights for Customer (sum = 100%)\n\n1. Population density and catchment strength – 10%\n2. Demographic and spending fit – 10%\n3. Customer behavior, daypart and peak demand – 70%\n4. Opportunities and risks – 10%\n\n---\n\n## 1. Population density and catchment strength (10%)\n\nEvaluate both residential and daytime or visitor population within a practical catchment, typically 5–10 minutes' walk (≈300–800 m) in dense urban areas.\n\n### Excellent (9–10)\n\n9.5–10.0 (city-level super-hub)\n- Within roughly 10 minutes' walk at least one of:\n  - Residents ≥80k plus strong daytime inflow (major offices and at least two large malls or complexes), or\n  - Estimated average daytime plus visitor populatio

# Rubric Revision - Revise all rubrics together - not recommended

In [31]:
# RUBRIC REVISION LOOP
# Choose which pair set to feed into the rubric revision loop.
pairs_for_revision = evaluated_pairs  # swap to evaluated_pairs after running brand sessions

if not pairs_for_revision:
    raise ValueError("No comparison pairs available; run the setup cells first.")

rubric_history = [{"iteration": 0, "rubric": current_rubric}]

for idx, pair in enumerate(pairs_for_revision, start=1):
    alignment = evaluate_pair_with_rubric(pair, current_rubric)
    print(
        f"Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Predicted ratio {alignment['predicted_ratio']:.2f} vs GT {alignment['gt_ratio']:.2f} -> diff {alignment['score_diff']:.3f}"
    )
    if alignment["order_matches"] and alignment["within_threshold"]:
        print("    Alignment within threshold, skipping revision for this pair.")
        print("-" * 40)
        continue

    res_a = pair["location_a_result"]
    res_b = pair["location_b_result"]
    revised = rubric_revision(
        report1=res_a['report_sections'],
        report2=res_b['report_sections'],
        rubric=current_rubric,
        score1=alignment["scores_a"],
        score2=alignment["scores_b"],
        gt_location_score=f"{pair['gt_ratio']:.4f}",
        pred_location_score=f"{alignment['predicted_ratio']:.4f}"
    )
    try:
        revised = parse_json_from_text(revised)
    except Exception as e:
        print("error in parsing revision, try again ...", e)
        revised = parse_json_from_text(fix_json_error(revised))
    current_rubric = {"customer": revised["customer_rubric"], "traffic": revised["traffic_rubric"], "competition": revised["competition_rubric"]}
    weakness = revised.get("weakness", "")
    problems = revised.get("problems", "")
    rubric_history.append({"iteration": idx, "rubric": revised, "weakness": weakness, "problem": problems})
    print(
        f"[[[ Rubric updated (iter {idx}) ]]]\nWeakness in rubric: {weakness}\nProblem in reports: {problems}"
    )
    print("-" * 40)

revised_rubric_path = "rubrics/revised_rubric.json"
json.dump(revised, open(revised_rubric_path, 'w'), ensure_ascii=False, indent=4)
print(f"Latest rubric saved to {revised_rubric_path}")


Pair 1: Starbucks 甄选(美罗城店) vs Starbucks 甄选(白玉兰广场1F店)
    Predicted ratio 1.02 vs GT 1.74 -> diff 1.699
[[[ Rubric updated (iter 1) ]]]
Weakness in rubric: ['Traffic rubric under-specifies numeric thresholds for effective pass-by volumes, causing super-hubs and mid-tier nodes to bunch together at similar scores.', "Customer rubric does not tie top scores to explicit daily cup ranges or peak-hour throughput, so locations with very different demand levels can both be scored as 'excellent'.", 'Competition rubric allows dense hyper-clusters inside malls to appear only moderately negative, which can overly offset or flatten genuine traffic advantages.', 'Sub-dimension weights emphasize qualitative judgments without clear hard caps for detour time, making micro-access convenience overrule large differences in raw volume.']
Problem in reports: ["Report2 uses strong qualitative language ('structurally solid', 'highly favorable') and optimistic ranges that make a clearly smaller node feel simila

KeyboardInterrupt: 

In [None]:
revised_rubric_path = "rubrics/revised_rubric.json"
json.dump(revised, open(revised_rubric_path, 'w'), ensure_ascii=False, indent=4)
print(f"Latest rubric saved to {revised_rubric_path}")

In [22]:
def summary_problems(problems: str):
    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.1",
        reasoning={"effort": "medium"},
        input=[
            {"role": "user", "content": [{"type": "input_text", "text": f"Help me to summarize this, extract bullet points on what information is missing in terms of customer, traffic and competition analysis of a location. \nProblems: {problems}"}]},
        ],
    )
    return response.output_text

In [23]:
summary_problems(problems)

'Here’s a synthesized summary plus a focused list of *what’s missing* in terms of customer, traffic, and competition analysis.\n\n---\n\n## Overall summary\n\nAcross all the reports, analysis repeatedly stays at the *macro-node* level (big station ridership, whole-mall visitors, district traffic) and uses narrative ranges. What’s missing is a single, consistent, storefront-level view of:\n\n- How many **reachable customers** actually pass within buying distance during the **key morning window**.\n- How **micro-location** (exact corridor, entrance, opening times, walking distance, exits used) changes that reachable base.\n- How **competition on the same flow line** converts that demand into realistic per-store volumes.\n\nWithout this, weaker sites look artificially similar to stronger super-hubs.\n\n---\n\n## Missing information – Customer analysis\n\n- **Comparable “reachable peak base” per site**\n  - A single numeric estimate of realistic customers reachable at the storefront in 7:0

# RUBRIC REVISION LOOP with SEPARATION

In [None]:
# RUBRIC REVISION LOOP with SEPARATION
# Choose which pair set to feed into the rubric revision loop.
import copy
pairs_for_revision = evaluated_pairs  # swap to evaluated_pairs after running brand sessions

if not pairs_for_revision:
    raise ValueError("No comparison pairs available; run the setup cells first.")

rubric_history = [{"iteration": 0, "rubric": current_rubric}]

for idx, pair in enumerate(pairs_for_revision, start=1):
    alignment = evaluate_pair_with_rubric(pair, current_rubric, separate=True)
    print(
        f"Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Predicted ratio {alignment['predicted_ratio']:.2f} vs GT {alignment['gt_ratio']:.2f} -> diff {alignment['score_diff']:.3f}"
    )

    score1 = alignment["scores_a"]["final_score"]
    score2 = alignment["scores_b"]["final_score"]
    print(f"Before revision, score comparison: {score1} <-> {score2}")

    if alignment["order_matches"] and alignment["within_threshold"]:
        print("    Alignment within threshold, skipping revision for this pair.")
        print("-" * 40)
        continue
    
    wkns = []
    pbls = []
    res_a = pair["location_a_result"]
    res_b = pair["location_b_result"]
    for analysis in ["customer", "traffic", "competition"]:
        score1 = alignment["scores_a"]["{}_score".format(analysis)]
        score2 = alignment["scores_b"]["{}_score".format(analysis)]
        print(f"Start revising {analysis}: score1: {score1}, score2: {score2}")
        revised = rubric_revision(
            report1=res_a['report_sections'][analysis],
            report2=res_b['report_sections'][analysis],
            rubric=current_rubric[analysis],
            score1=alignment["scores_a"]["{}_score".format(analysis)],
            score2=alignment["scores_b"]["{}_score".format(analysis)],
            gt_location_score=f"{pair['gt_ratio']:.4f}",
            pred_location_score=f"{alignment['predicted_ratio']:.4f}",
            separate=True
        )
        try:
            revised = parse_json_from_text(revised)
        except Exception as e:
            print("error in parsing revision, try again ...", e)
            revised = parse_json_from_text(fix_json_error(revised))
        current_rubric[analysis] = revised["rubric"]
        wkns.append(revised.get("weakness", ""))
        pbls.append(revised.get("problems", ""))

    rubric_history.append({"iteration": idx, "rubric": copy.deepcopy(current_rubric), "weakness": wkns, "problem": pbls})

    alignment_after = evaluate_pair_with_rubric(pair, current_rubric, separate=True)

    score1 = alignment_after["scores_a"]["final_score"]
    score2 = alignment_after["scores_b"]["final_score"]
    print(f"After revising, score comparison: {score1} <-> {score2}")

    for analysis in ["customer", "traffic", "competition"]:
        score1 = alignment_after["scores_a"]["{}_score".format(analysis)]
        score2 = alignment_after["scores_b"]["{}_score".format(analysis)]
        print(f"{analysis} : {score1} <-> {score2}")
    
    print(
        f"[[[ Rubric updated (iter {idx}) ]]]\nWeakness in rubric: {wkns}\nProblem in reports: {pbls}"
    )
    print("-" * 40)

revised_rubric_path = "rubrics/revised_rubric.json"
json.dump(revised, open(revised_rubric_path, 'w'), ensure_ascii=False, indent=4)
print(f"Latest rubric saved to {revised_rubric_path}")


Pair 1: Starbucks 甄选(美罗城店) vs Starbucks 甄选(白玉兰广场1F店)
    Predicted ratio 1.02 vs GT 1.74 -> diff 1.704
Before revision, score comparison: 7.639000000000001 <-> 7.484000000000002
Start revising customer: score1: 9.3, score2: 8.9
Start revising traffic: score1: 8.2, score2: 8.3
Start revising competition: score1: 4.9, score2: 4.7
After revising, score comparison: 8.048 <-> 7.336
customer : 8.1 <-> 7.5
traffic : 8.9 <-> 8.0
competition : 6.8 <-> 6.2
[[[ Rubric updated (iter 1) ]]]
Weakness in rubric: [["Overweights a single, qualitative 'customer behavior' dimension (70%) without quantitative thresholds, causing different-scale hubs to receive similar high scores.", 'Underweights raw scale of reachable traffic and concrete volume estimates (only 10% for population/catchment), even though this is the main driver of customer potential.', "Lacks explicit numeric bands for footfall or cup volumes, so evaluators can rate both a city super-hub and a solid but smaller node as 'Excellent'.", 'Doe

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
rubric_history

In [35]:

revised_rubric_path = "rubrics/revised_rubric.json"
json.dump(revised, open(revised_rubric_path, 'w'), ensure_ascii=False, indent=4)
print(f"Latest rubric saved to {revised_rubric_path}")


Latest rubric saved to rubrics/revised_rubric.json


In [38]:

rubric_history_path = "rubrics/rubric_history.json"
json.dump(rubric_history, open(rubric_history_path, 'w'), ensure_ascii=False, indent=4)
print(f"rubric history saved to {rubric_history_path}")


rubric history saved to rubrics/rubric_history.json


In [36]:
current_rubric

{'customer': "Customer Potential Rubric (Repeat‑Commuter Urban Coffee) – Revised\n\nUse this rubric to score locations 0–10 based on the strength of the **realistically reachable** customer base for a commuter‑focused boutique coffee shop (mainly takeaway).\n\nGeneral principles\n- Evaluate a **single conservative typical weekday**.\n- Use the same fixed **3‑hour morning window** (e.g. 07:30–10:30) for all sites.\n- Base all volume estimates on people passing within **~30–40 m of the storefront along natural paths**, not whole‑station or whole‑mall totals.\n- Always separate:\n  - **repeat, routine users** (commuters, office workers, students, nearby residents using the path ≥ 3 weekdays/week), vs.\n  - **irregular leisure / tourist flows** (occasional shoppers, sightseers, cruise guests, park visitors, etc.).\n- Prioritise **stable repeat flows** over large but irregular tourist traffic.\n- For metro interchanges, only count people who either exit to street near the storefront or walk

In [37]:
print(current_rubric["customer"])

Customer Potential Rubric (Repeat‑Commuter Urban Coffee) – Revised

Use this rubric to score locations 0–10 based on the strength of the **realistically reachable** customer base for a commuter‑focused boutique coffee shop (mainly takeaway).

General principles
- Evaluate a **single conservative typical weekday**.
- Use the same fixed **3‑hour morning window** (e.g. 07:30–10:30) for all sites.
- Base all volume estimates on people passing within **~30–40 m of the storefront along natural paths**, not whole‑station or whole‑mall totals.
- Always separate:
  - **repeat, routine users** (commuters, office workers, students, nearby residents using the path ≥ 3 weekdays/week), vs.
  - **irregular leisure / tourist flows** (occasional shoppers, sightseers, cruise guests, park visitors, etc.).
- Prioritise **stable repeat flows** over large but irregular tourist traffic.
- For metro interchanges, only count people who either exit to street near the storefront or walk the specific corridor whe

## Final Comparison
- Experiment 1: use separate evaluation to calculate the accuracy
- Experiment 2: use all-in-one evaluation to calculate the accuracy

In [68]:
pair['location_a'].store_name

'Starbucks 甄选(静安嘉里中心f1店)'

In [74]:
rubric_history[0]['rubric']

{'customer': "Customer Potential Rubric (Repeat‑Commuter Urban Coffee) – Revised\n\nUse this rubric to score locations 0–10 based on the strength of the **realistically reachable** customer base for a commuter‑focused boutique coffee shop (mainly takeaway).\n\nGeneral principles\n- Evaluate a **single conservative typical weekday**.\n- Use the same fixed **3‑hour morning window** (e.g. 07:30–10:30) for all sites.\n- Base all volume estimates on people passing within **~30–40 m of the storefront along natural paths**, not whole‑station or whole‑mall totals.\n- Always separate:\n  - **repeat, routine users** (commuters, office workers, students, nearby residents using the path ≥ 3 weekdays/week), vs.\n  - **irregular leisure / tourist flows** (occasional shoppers, sightseers, cruise guests, park visitors, etc.).\n- Prioritise **stable repeat flows** over large but irregular tourist traffic.\n- For metro interchanges, only count people who either exit to street near the storefront or walk

In [82]:
current_rubric

{'customer': "Customer Potential Rubric (Repeat‑Commuter Urban Coffee) – Revised\n\nUse this rubric to score locations 0–10 based on the strength of the **realistically reachable** customer base for a commuter‑focused boutique coffee shop (mainly takeaway).\n\nGeneral principles\n- Evaluate a **single conservative typical weekday**.\n- Use the same fixed **3‑hour morning window** (e.g. 07:30–10:30) for all sites.\n- Base all volume estimates on people passing within **~30–40 m of the storefront along natural paths**, not whole‑station or whole‑mall totals.\n- Always separate:\n  - **repeat, routine users** (commuters, office workers, students, nearby residents using the path ≥ 3 weekdays/week), vs.\n  - **irregular leisure / tourist flows** (occasional shoppers, sightseers, cruise guests, park visitors, etc.).\n- Prioritise **stable repeat flows** over large but irregular tourist traffic.\n- For metro interchanges, only count people who either exit to street near the storefront or walk

In [85]:
# Input ------------------------------------------------------------------------------------------------------------------
pairs_for_comparison = evaluated_pairs  # swap to evaluated_pairs after running brand sessions
# tested_rubric = current_rubric # the rubric to test on
tested_rubric = load_initial_rubric_text() # the rubric to test on

# Experiment -------------------------------------------------------------------------------------------------------------
if not pairs_for_comparison:
    raise ValueError("No comparison pairs available; run the setup cells first.")

store2score = {}
total_pairs = len(pairs_for_comparison)
hit_pairs = 0
ratio_differences = []

for pair in pairs_for_comparison:
    loc_a = pair['location_a_result']
    loc_b = pair['location_b_result']
    store1 = pair['location_a'].address
    store2 = pair['location_b'].address
    if store1 not in store2score:
        print("calculate score for store:", store1)
        scores_a = score_location_with_rubric_separate(
            loc_a["report_sections"]["customer"],
            tested_rubric["customer"],
            loc_a["report_sections"]["traffic"],
            tested_rubric["traffic"],
            loc_a["report_sections"]["competition"],
            tested_rubric["competition"],
            loc_a["raw"]["weights"]
        )
        store2score[store1] = scores_a
    if store2 not in store2score:
        print("calculate score for store:", store2)
        scores_b = score_location_with_rubric_separate(
            loc_b["report_sections"]["customer"],
            tested_rubric["customer"],
            loc_b["report_sections"]["traffic"],
            tested_rubric["traffic"],
            loc_b["report_sections"]["competition"],
            tested_rubric["competition"],
            loc_b["raw"]["weights"]
        )
        store2score[store2] = scores_b

    score1 = store2score[store1]["final_score"]
    score2 = store2score[store2]["final_score"]
    """{
        "final_score": final_score,
        "customer_score": ejson["customer"]["score"],
        "traffic_score": ejson["traffic"]["score"],
        "competition_score": ejson["competition"]["score"],
        "customer_criterion_scores": ejson["customer"]["criterion_scores"],
        "traffic_criterion_scores": ejson["traffic"]["criterion_scores"],
        "competition_criterion_scores": ejson["competition"]["criterion_scores"]
    }"""
    hit = False
    if (pair["gt_ratio"] > 0 and score1 > score2) or (pair["gt_ratio"] < 0 and score1 < score2):
        hit = True
        hit_pairs += 1
        ratio = max(score1, score2) / (min(score1, score2) + 1e-5)
        ratio_differences.append(abs(pair["gt_ratio"] - ratio))
    
    store1_customer_score = store2score[store1]["customer_score"]
    store1_traffic_score = store2score[store1]["traffic_score"]
    store1_comp_score = store2score[store1]["competition_score"]
    store2_customer_score = store2score[store2]["customer_score"]
    store2_traffic_score = store2score[store2]["traffic_score"]
    store2_comp_score = store2score[store2]["competition_score"]
    gt = pair["gt_ratio"]
    print(f"Accurate: {hit} \
          \n-----Store1: {pair['location_a'].store_name} - Scores: {score1} {store1_customer_score} {store1_traffic_score} {store1_comp_score} \
          \n-----Store2: {pair['location_b'].store_name} - Scores: {score2} {store2_customer_score} {store2_traffic_score} {store2_comp_score} \
          \n-----GT: {gt}")
    
ratio_difference = sum(ratio_differences) / (len(ratio_differences) + 1e-5)
print(f"hit: {hit_pairs} / {total_pairs}, Accuracy: {(hit_pairs / total_pairs * 100):.2f}, Ratio Difference: {ratio_difference:.1f}")

calculate score for store: 上海徐汇区肇家兵路1111号219单元
calculate score for store: 东长治路588号白玉兰广场商场1楼16号商铺
Accurate: False           
-----Store1: Starbucks 甄选(美罗城店) - Scores: 7.7170000000000005 9.3 8.4 4.9           
-----Store2: Starbucks 甄选(白玉兰广场1F店) - Scores: 7.956000000000001 9.0 9.4 4.7           
-----GT: 1.7391304347826086
calculate score for store: 上海黄浦区南京东路829号G05商铺
Accurate: True           
-----Store1: Starbucks 甄选(美罗城店) - Scores: 7.7170000000000005 9.3 8.4 4.9           
-----Store2: Starbucks 甄选(世茂广场店) - Scores: 7.28 9.1 8.4 3.6           
-----GT: 2.272727272727273
calculate score for store: 上海静安区南京西路1515号嘉里中心商场e1-03
Accurate: True           
-----Store1: Starbucks 甄选(美罗城店) - Scores: 7.7170000000000005 9.3 8.4 4.9           
-----Store2: Starbucks 甄选(静安嘉里中心f1店) - Scores: 7.460000000000001 8.7 8.4 4.7           
-----GT: 2.380952380952381
calculate score for store: 上海闵行区都市路5001号仲盛世界商城1层GF12商铺
Accurate: True           
-----Store1: Starbucks 甄选(美罗城店) - Scores: 7.7170000000000005 9.3

Experiment 2: use end-to-end evaluation

In [79]:
rubric_history[-1]['rubric']['rubric']

"Competition & Node Potential Rubric (v6)\n\nGoal\n- Score 0–10 on **net competitive favorability** for a new coffee-focused store at a given node.\n- Score should track **realistic sustainable daily cups (RSV)** for this store.\n- Higher-RSV nodes must score **clearly higher**, and large differences in effective traffic should translate to **meaningful score gaps**.\n\nDimension weights (sum = 100%)\n1. Effective Node Volume (RSV proxy) – **65%**\n2. Saturation & Per-capita Competition – **15%**\n3. Positioning White Space – **10%**\n4. Anchor / Incumbent Risk – **10%**\n\nFinal score = weighted average of 4 dimensions, then adjusted by gates in Section 5.\n\n---\n1. Effective Node Volume (RSV proxy) – 65%\n\n1.1 Required inputs\n- **Effective passers (by daypart, within 0–50 m of frontage)**\n  - Morning peak (7:30–10:30) – only people passing within ~0–50 m on real desire lines.\n  - Other dayparts (lunch, afternoon, evening) – only if flows pass within ~0–50 m.\n- **Daily-path base

In [81]:
rubric_history[-2]['rubric']

{'rubric': "Competition and Node Potential Rubric (v5)\n\nGoal\n- Score 0–10 on net competitive favorability for a new coffee-focused store at a given node.\n- Score should track realistic sustainable daily cups (RSV) for this store, not just fame of the area.\n- Higher-RSV nodes must receive clearly higher scores, and extremely saturated flagship clusters must be visibly penalized.\n\nDimension weights (sum = 100%)\n1. Effective Node Volume (RSV) – 55%\n2. Saturation & Competitor Density – 20%\n3. Positioning White Space – 15%\n4. Anchor / Incumbent Risk – 10%\n\nFinal score = weighted average of 4 dimensions, then adjusted by gates in Section 5.\n\n---\n1. Effective Node Volume (RSV) – 55%\n\n1.1 Required inputs\n- Effective passers on primary flows:\n  - Morning peak (7:30–10:30): people passing within 0–50 m of frontage on main desire lines (not whole station entries).\n  - Other dayparts if relevant (lunch, afternoon, evening) within 0–50 m.\n- Relevant base:\n  - Office workers a

In [71]:
store2score_2 = {}
# Input ------------------------------------------------------------------------------------------------------------------
pairs_for_comparison = evaluated_pairs  # swap to evaluated_pairs after running brand sessions
tested_rubric = rubric_history[0]['rubric'] # the rubric to test on

# Experiment -------------------------------------------------------------------------------------------------------------
if not pairs_for_comparison:
    raise ValueError("No comparison pairs available; run the setup cells first.")

total_pairs = len(pairs_for_comparison)
hit_pairs = 0
ratio_differences = []

for pair in pairs_for_comparison:
    loc_a = pair['location_a_result']
    loc_b = pair['location_b_result']
    store1 = pair['location_a'].address
    store2 = pair['location_b'].address
    if store1 not in store2score_2:
        print("calculate score for store:", store1)
        scores_a = score_location_with_rubric(
            loc_a["report_sections"]["customer"],
            tested_rubric["customer"],
            loc_a["report_sections"]["traffic"],
            tested_rubric["traffic"],
            loc_a["report_sections"]["competition"],
            tested_rubric["competition"],
            loc_a["raw"]["weights"]
        )
        store2score_2[store1] = scores_a
    if store2 not in store2score_2:
        print("calculate score for store:", store2)
        scores_b = score_location_with_rubric(
            loc_b["report_sections"]["customer"],
            tested_rubric["customer"],
            loc_b["report_sections"]["traffic"],
            tested_rubric["traffic"],
            loc_b["report_sections"]["competition"],
            tested_rubric["competition"],
            loc_b["raw"]["weights"]
        )
        store2score_2[store2] = scores_b

    score1 = store2score_2[store1]["final_score"]
    score2 = store2score_2[store2]["final_score"]
    """{
        "final_score": final_score,
        "customer_score": ejson["customer"]["score"],
        "traffic_score": ejson["traffic"]["score"],
        "competition_score": ejson["competition"]["score"],
        "customer_criterion_scores": ejson["customer"]["criterion_scores"],
        "traffic_criterion_scores": ejson["traffic"]["criterion_scores"],
        "competition_criterion_scores": ejson["competition"]["criterion_scores"]
    }"""
    hit = False
    if (pair["gt_ratio"] > 0 and score1 > score2) or (pair["gt_ratio"] < 0 and score1 < score2):
        hit = True
        hit_pairs += 1
        ratio = max(score1, score2) / (min(score1, score2) + 1e-5)
        ratio_differences.append(abs(pair["gt_ratio"] - ratio))
    
    store1_customer_score = store2score_2[store1]["customer_score"]
    store1_traffic_score = store2score_2[store1]["traffic_score"]
    store1_comp_score = store2score_2[store1]["competition_score"]
    store2_customer_score = store2score_2[store2]["customer_score"]
    store2_traffic_score = store2score_2[store2]["traffic_score"]
    store2_comp_score = store2score_2[store2]["competition_score"]
    gt = pair["gt_ratio"]
    print(f"Accurate: {hit} \
          \n-----Store1: {pair['location_a'].store_name} - Scores: {score1} {store1_customer_score} {store1_traffic_score} {store1_comp_score} \
          \n-----Store2: {pair['location_b'].store_name} - Scores: {score2} {store2_customer_score} {store2_traffic_score} {store2_comp_score} \
          \n-----GT: {gt}")
    
ratio_difference = sum(ratio_differences) / (len(ratio_differences) + 1e-5)
print(f"hit: {hit_pairs} / {total_pairs}, Accuracy: {(hit_pairs / total_pairs * 100):.2f}, Ratio Difference: {ratio_difference:.1f}")

calculate score for store: 上海徐汇区肇家兵路1111号219单元
calculate score for store: 东长治路588号白玉兰广场商场1楼16号商铺
error in parsing revision, try again ... Expecting ',' delimiter: line 23 column 5 (char 1342)
Accurate: True           
-----Store1: Starbucks 甄选(美罗城店) - Scores: 7.5600000000000005 7.6 8.0 6.9           
-----Store2: Starbucks 甄选(白玉兰广场1F店) - Scores: 6.996 7.5 7.5 5.7           
-----GT: 1.7391304347826086
calculate score for store: 上海黄浦区南京东路829号G05商铺
Accurate: False           
-----Store1: Starbucks 甄选(美罗城店) - Scores: 7.5600000000000005 7.6 8.0 6.9           
-----Store2: Starbucks 甄选(世茂广场店) - Scores: 7.5600000000000005 8.6 8.1 5.6           
-----GT: 2.272727272727273
calculate score for store: 上海静安区南京西路1515号嘉里中心商场e1-03
Accurate: True           
-----Store1: Starbucks 甄选(美罗城店) - Scores: 7.5600000000000005 7.6 8.0 6.9           
-----Store2: Starbucks 甄选(静安嘉里中心f1店) - Scores: 6.912000000000001 7.4 7.3 5.8           
-----GT: 2.380952380952381
calculate score for store: 上海闵行区都市路5001号仲盛世界商城1层

problems:
1. when the revision is done for coffee shops, the revised rubric becomes overfitting.

In [87]:
print(current_rubric['customer'])

Customer Potential Rubric (Repeat‑Commuter Urban Coffee) – Revised

Use this rubric to score locations 0–10 based on the strength of the **realistically reachable** customer base for a commuter‑focused boutique coffee shop (mainly takeaway).

General principles
- Evaluate a **single conservative typical weekday**.
- Use the same fixed **3‑hour morning window** (e.g. 07:30–10:30) for all sites.
- Base all volume estimates on people passing within **~30–40 m of the storefront along natural paths**, not whole‑station or whole‑mall totals.
- Always separate:
  - **repeat, routine users** (commuters, office workers, students, nearby residents using the path ≥ 3 weekdays/week), vs.
  - **irregular leisure / tourist flows** (occasional shoppers, sightseers, cruise guests, park visitors, etc.).
- Prioritise **stable repeat flows** over large but irregular tourist traffic.
- For metro interchanges, only count people who either exit to street near the storefront or walk the specific corridor whe

In [91]:
print(current_rubric["traffic"])

Traffic & Accessibility Rubric (v8 – commuter-focused coffee)

Purpose
- Score each location 0–10 based only on traffic & accessibility for a weekday-morning, commuter-focused specialty coffee shop.
- Output score is a weighted average of sub-dimensions (0–10 each). Weights sum to 100%.
- Emphasis is on: *actual* morning pass-by at the storefront, how on‑path the store is, and the size/quality of captive office populations.

Weights (sum 100%)
- A. Effective morning pass‑by at storefront (0–30 m) – **38%**
- B. Commuter detour & path fit – **15%**
- C. Commuter composition & peak-hour alignment – **10%**
- D. Macro transit relevance & capture – **10%**
- E. Surrounding employment & resident density (1 km) – **8%**
- F. Captive on-site / same-complex population – **12%**
- G. In-project visibility & unit placement – **5%**
- H. Car & drop-off accessibility – **2%**

Global caps (apply after weighted average)
- If A (morning pass-by) < 5.0 → overall score ≤ 6.5.
- If A < 4.0 → overall sc