# SiteSage Evaluation & Rubric Revision

This notebook orchestrates three stages:
1. Load Dianping collection data to compute ground-truth (GT) ratios for identical brands operating at different addresses.
2. Run the SiteSage agent flow once per location (per brand) and reuse those cached outputs to build comparison pairs.
3. Re-score each pair with the latest rubric (skipping aligned cases) and run `rubric_revision`—with a lightweight test harness that reuses stored sessions—to iteratively adjust the rubric.

Tweak the configuration cells as needed before running each stage.

In [2]:
from __future__ import annotations

import csv
import hashlib
import json
import re
from dataclasses import dataclass
from itertools import combinations
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
from sitesage_backend import fix_json_error

import dotenv
from openai import OpenAI

dotenv.load_dotenv()

from sitesage_backend import run_sitesage_session_async, parse_json_from_text

DATA_PATH = Path("data/dianping_collection_data.csv")
PROMPT_TEMPLATE = (
    "I want to open a boutique coffee shop optimized for morning commuters at {}. "
    "Please run the full SiteSage workflow and deliver the final report."
)
SESSION_LANGUAGE = "zh"
CACHE_DIR = Path("save/evaluate_cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

RATIO_THRESHOLD = 0.6

session_results: Dict[Tuple[str, str], Dict[str, object]] = {}
location_index: Dict[str, 'LocationStat'] = {}
evaluated_pairs: List[Dict[str, object]] = []
test_pairs: List[Dict[str, object]] = []
SCORING_CACHE: Dict[Tuple[str, str], Dict[str, float]] = {}

def _fmt(value):
    return f"{value:.2f}" if isinstance(value, (int, float)) else "n/a"


In [3]:
@dataclass(frozen=True)
class LocationStat:
    brand: str
    store_name: str
    address: str
    total_reviews: float
    avg_reviews_per_day: float
    sample_days: int


def parse_brand_name(store_name: str) -> str:
    clean_name = (store_name or "").strip()
    if "(" in clean_name:
        return clean_name.split("(", 1)[0].strip()
    if "（" in clean_name:
        return clean_name.split("（", 1)[0].strip()
    return clean_name or "unknown"


def load_location_stats(csv_path: Path) -> List[LocationStat]:
    stats: Dict[Tuple[str, str, str], Dict[str, float]] = {}
    with csv_path.open(newline="", encoding="utf-8") as fh:
        reader = csv.DictReader(fh)
        for row in reader:
            store_name = (row["store"] or "").strip()
            brand = parse_brand_name(store_name)
            address = (row["address"] or "").strip()
            review_cnt = int(row["review_cnt"])
            key = (brand, store_name, address)
            entry = stats.setdefault(key, {"total": 0, "days": 0})
            entry["total"] += review_cnt
            entry["days"] += 1
    locations: List[LocationStat] = []
    for (brand, store_name, address), values in stats.items():
        days = max(int(values["days"]), 1)
        total = float(values["total"])
        locations.append(
            LocationStat(
                brand=brand,
                store_name=store_name,
                address=address,
                total_reviews=total,
                avg_reviews_per_day=total / days,
                sample_days=days,
            )
        )
    locations.sort(key=lambda loc: (loc.brand, -loc.total_reviews))
    return locations


def group_locations_by_brand(locations: Iterable[LocationStat]) -> Dict[str, List[LocationStat]]:
    groups: Dict[str, List[LocationStat]] = {}
    for loc in locations:
        groups.setdefault(loc.brand, []).append(loc)
    for locs in groups.values():
        locs.sort(key=lambda loc: -loc.total_reviews)
    return groups


def build_location_pairs(groups: Dict[str, List[LocationStat]]) -> List[Dict[str, object]]:
    pairs: List[Dict[str, object]] = []
    for brand, locs in groups.items():
        if len(locs) < 2:
            continue
        for i in range(len(locs)):
            for j in range(i + 1, len(locs)):
                loc_a = locs[i]
                loc_b = locs[j]
                ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
                pairs.append(
                    {
                        "brand": brand,
                        "location_a": loc_a,
                        "location_b": loc_b,
                        "gt_ratio": ratio,
                        "gt_preference": "A" if ratio >= 1 else "B",
                        "gt_difference": loc_a.total_reviews - loc_b.total_reviews,
                    }
                )
    return pairs


def load_initial_rubric_text():
    rubric_dir = Path("rubrics")
    sections = {}
    for name in ("customer_rubric.md", "traffic_rubric.md", "competition_rubric.md"):
        path = rubric_dir / name
        sections[name.split("_")[0]] = path.read_text(encoding="utf-8")
    return sections


def slugify(value: str) -> str:
    slug = re.sub(r"[^a-zA-Z0-9]+", "-", value).strip("-").lower()
    return slug or "location"


def build_session_id(location: LocationStat) -> str:
    brand_slug = slugify(location.brand)
    address_slug = slugify(location.address)[:40]
    return f"eval-{brand_slug}-{address_slug}"


def load_final_report_text(payload: Dict[str, object]) -> str:
    final_report = payload.get("final_report", {}) or {}
    path_str = final_report.get("report_path") if isinstance(final_report, dict) else None
    if isinstance(path_str, str) and path_str:
        path = Path(path_str)
        if path.exists():
            return path.read_text(encoding="utf-8")
    if isinstance(final_report, dict):
        return final_report.get("report_md", "")
    return ""


def extract_report_sections(payload: Dict[str, object]) -> Dict[str, str]:
    sections: Dict[str, str] = {}
    raw_reports = payload.get("reports") or {}
    for key in ("customer", "traffic", "competition"):
        value = raw_reports.get(key) if isinstance(raw_reports, dict) else None
        if isinstance(value, str):
            sections[key] = value
        elif isinstance(value, dict):
            sections[key] = value.get("report_md", "")
        else:
            sections[key] = ""
    return sections


async def run_session_for_location(location: LocationStat) -> Dict[str, object]:
    session_id = build_session_id(location)
    cache_path = CACHE_DIR / f"{session_id}.json"
    prompt = PROMPT_TEMPLATE.format(location.address)
    if cache_path.exists():
        payload = json.loads(cache_path.read_text(encoding="utf-8"))
    else:
        payload = await run_sitesage_session_async(session_id, prompt, language=SESSION_LANGUAGE)
        cache_path.write_text(
            json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8"
        )
    report_md = load_final_report_text(payload)
    report_sections = extract_report_sections(payload)
    return {
        "session_id": session_id,
        "prompt": prompt,
        "final_score": payload.get("final_score"),
        "scores": payload.get("scores", {}),
        "report_md": report_md,
        "report_sections": report_sections,
        "raw": payload,
    }


In [4]:
locations = load_location_stats(DATA_PATH)
brand_groups = group_locations_by_brand(locations)
location_index = {loc.address: loc for loc in locations}
location_pairs = build_location_pairs(brand_groups)

print(f"Loaded {len(locations)} store/location combinations across {len(brand_groups)} brands.")
print(f"Generated {len(location_pairs)} same-brand comparison pairs.")
print("\nPair catalog (first 10 shown):")
for idx, pair in enumerate(location_pairs[:10]):
    print(
        f"{idx:02d}: [{pair['brand']}] {pair['location_a'].store_name} vs {pair['location_b'].store_name} -> "
        f"GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
    )


Loaded 13 store/location combinations across 4 brands.
Generated 21 same-brand comparison pairs.

Pair catalog (first 10 shown):
00: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(比斯特上海购物村店) -> GT ratio 1.22 (prefers A)
01: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(武康路店) -> GT ratio 1.98 (prefers A)
02: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 2.11 (prefers A)
03: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(虹桥机场T2店) vs % Arabica阿拉比卡咖啡(建国西路店) -> GT ratio 5.16 (prefers A)
04: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(武康路店) -> GT ratio 1.62 (prefers A)
05: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 1.73 (prefers A)
06: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(比斯特上海购物村店) vs % Arabica阿拉比卡咖啡(建国西路店) -> GT ratio 4.23 (prefers A)
07: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(武康路店) vs % Arabica阿拉比卡咖啡(上海西岸中環店) -> GT ratio 1.07 (prefers A)
08: [% Arabica阿拉比卡咖啡] % Arabica阿拉比卡咖啡(武康路店) 

In [4]:
# Run SiteSage for every location under the selected brand once (cached after first run).
target_brand = "Starbucks 甄选"
max_locations_per_brand = 6
brand_locations = brand_groups.get(target_brand, [])[:max_locations_per_brand]

if not brand_locations:
    raise ValueError(f"Brand {target_brand} not found in dataset.")

for loc in brand_locations:
    key = (loc.brand, loc.address)
    if key in session_results:
        print(f"Reusing cached session for {loc.store_name} @ {loc.address}")
        continue
    result = await run_session_for_location(loc)
    session_results[key] = result
    print(
        f"Ran session {result['session_id']} -> final score {_fmt(result['final_score'])}"
    )

print("Available sessions for brand:")
for loc in brand_locations:
    key = (loc.brand, loc.address)
    result = session_results.get(key)
    print(
        f"- {loc.store_name} ({loc.address}) -> final score {_fmt(result.get('final_score') if result else None)}"
    )


[+254.151s] RT.Session  : DEBUG    - Session 33bf8f47-db16-4ce1-92b4-25bff9b0d8ef is initialized
[+254.153s] RT.Publisher: DEBUG    - RequestCreation(current_node_id=None, new_request_id=77504152-1577-4de6-91dc-f7dea1760444, running_mode=async, new_node_type=EasyToolCallLLM, args=(), kwargs={'user_input': 'Extract store info and resolve the place. Use tools as needed and return the required JSON.\n\nUser request:\nI want to open a boutique coffee shop optimized for morning commuters at 上海徐汇区肇家兵路1111号219单元. Please run the full SiteSage workflow and deliver the final report.'})
[+254.154s] RT          : INFO     - START CREATED UnderstandingAgent
[92m10:56:19 - LiteLLM:INFO[0m: utils.py:3383 - 
LiteLLM completion() model= gpt-5.1; provider = openai
[92m10:56:21 - LiteLLM:INFO[0m: utils.py:1277 - Wrapper: Completed Call, calling success_handler
[+255.740s] RT.Publisher: DEBUG    - RequestCreation(current_node_id=07a7d5d1-be2e-4616-a8b4-e534c6e63687, new_request_id=98108bb6-e1e2-4eb7-8

In [5]:
# Build comparison pairs using cached session outputs (no reruns).
evaluated_pairs = []

def _component(result: Dict[str, object], name: str):
    scores = result.get("scores") or {}
    if isinstance(scores, dict):
        value = scores.get(name)
        if isinstance(value, dict):
            return value.get("score")
        return value
    return None

for loc_a, loc_b in combinations(brand_locations, 2):
    key_a = (loc_a.brand, loc_a.address)
    key_b = (loc_b.brand, loc_b.address)
    res_a = session_results.get(key_a)
    res_b = session_results.get(key_b)
    if not res_a or not res_b:
        print(f"Missing session data for {loc_a.store_name} or {loc_b.store_name}, skipping.")
        continue
    ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
    evaluated_pairs.append(
        {
            "brand": loc_a.brand,
            "location_a": loc_a,
            "location_b": loc_b,
            "gt_ratio": ratio,
            "gt_preference": "A" if ratio >= 1 else "B",
            "location_a_result": res_a,
            "location_b_result": res_b,
        }
    )

print(f"Prepared {len(evaluated_pairs)} cached comparison pairs.")
for idx, pair in enumerate(evaluated_pairs, start=1):
    res_a = pair["location_a_result"]
    res_b = pair["location_b_result"]
    print(
        f"Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Final scores -> A {_fmt(res_a.get('final_score'))} | B {_fmt(res_b.get('final_score'))}"
    )
    print(
        f"    Components A (C/T/K): {_fmt(_component(res_a, 'customer'))}/"
        f"{_fmt(_component(res_a, 'traffic'))}/{_fmt(_component(res_a, 'competition'))}"
    )
    print(
        f"    Components B (C/T/K): {_fmt(_component(res_b, 'customer'))}/"
        f"{_fmt(_component(res_b, 'traffic'))}/{_fmt(_component(res_b, 'competition'))}"
    )
    print(
        f"    GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
    )


Prepared 10 cached comparison pairs.
Pair 1: Starbucks 甄选(美罗城店) vs Starbucks 甄选(白玉兰广场1F店)
    Final scores -> A 7.15 | B 7.30
    Components A (C/T/K): 8.70/8.50/3.80
    Components B (C/T/K): 8.30/8.40/4.60
    GT ratio 1.74 (prefers A)
Pair 2: Starbucks 甄选(美罗城店) vs Starbucks 甄选(世茂广场店)
    Final scores -> A 7.15 | B 7.84
    Components A (C/T/K): 8.70/8.50/3.80
    Components B (C/T/K): 9.00/9.10/4.70
    GT ratio 2.27 (prefers A)
Pair 3: Starbucks 甄选(美罗城店) vs Starbucks 甄选(静安嘉里中心f1店)
    Final scores -> A 7.15 | B 7.83
    Components A (C/T/K): 8.70/8.50/3.80
    Components B (C/T/K): 8.60/8.70/5.70
    GT ratio 2.38 (prefers A)
Pair 4: Starbucks 甄选(美罗城店) vs Starbucks 甄选(莘庄仲盛店)
    Final scores -> A 7.15 | B 7.16
    Components A (C/T/K): 8.70/8.50/3.80
    Components B (C/T/K): 8.40/8.30/4.10
    GT ratio 4.35 (prefers A)
Pair 5: Starbucks 甄选(白玉兰广场1F店) vs Starbucks 甄选(世茂广场店)
    Final scores -> A 7.30 | B 7.84
    Components A (C/T/K): 8.30/8.40/4.60
    Components B (C/T/K): 9.00/9.

In [6]:
# Lightweight test harness: load existing SiteSage runs from save/test_* directories.
SCORE_PATTERNS = {
    "customer": re.compile(r"Customer Analysis:\s*([0-9.]+)/10"),
    "traffic": re.compile(r"Traffic .*?:\s*([0-9.]+)/10"),
    "competition": re.compile(r"Competition Analysis:\s*([0-9.]+)/10"),
}
FINAL_PATTERN = re.compile(r"Final Weighted Score:\s*([0-9.]+)/10")

REPORT_FILES = {
    "customer": "02_customer.md",
    "traffic": "03_traffic.md",
    "competition": "04_competition.md",
}

def _extract_score(pattern, text: str) -> float | None:
    match = pattern.search(text)
    return float(match.group(1)) if match else None

def load_session_from_directory(session_id: str) -> Dict[str, object]:
    session_dir = Path("save") / session_id
    evaluation_path = session_dir / "05_evaluation.md"
    final_report_path = session_dir / "07_final_report.md"
    eval_text = evaluation_path.read_text(encoding="utf-8")
    report_md = final_report_path.read_text(encoding="utf-8")
    component_scores = {k: _extract_score(pattern, eval_text) for k, pattern in SCORE_PATTERNS.items()}
    final_score = _extract_score(FINAL_PATTERN, eval_text)
    report_sections = {
        key: (session_dir / fname).read_text(encoding="utf-8")
        for key, fname in REPORT_FILES.items()
    }
    return {
        "session_id": session_id,
        "final_score": final_score,
        "scores": component_scores,
        "report_md": report_md,
        "report_sections": report_sections,
    }

test_configs = [
    {"session_id": "test_0", "address": "上海静安区南京西路1515号嘉里中心商场e1-03"},
    {"session_id": "test_1", "address": "上海徐汇区肇家兵路1111号219单元"},
    {"session_id": "test_2", "address": "上海闵行区都市路5001号仲盛世界商城1层GF12商铺"},
]
test_entries = []
for cfg in test_configs:
    location = location_index.get(cfg["address"])
    if not location:
        print(f"Address {cfg['address']} not found in dataset, skipping.")
        continue
    payload = load_session_from_directory(cfg["session_id"])
    test_entries.append({"location": location, "result": payload})
    session_results.setdefault((location.brand, location.address), payload)
    print(
        f"Loaded {cfg['session_id']} for {location.store_name} -> final score {_fmt(payload.get('final_score'))}"
    )

test_pairs = []
for left, right in combinations(test_entries, 2):
    loc_a = left["location"]
    loc_b = right["location"]
    res_a = left["result"]
    res_b = right["result"]
    ratio = loc_a.total_reviews / max(loc_b.total_reviews, 1e-6)
    test_pairs.append(
        {
            "brand": loc_a.brand,
            "location_a": loc_a,
            "location_b": loc_b,
            "gt_ratio": ratio,
            "gt_preference": "A" if ratio >= 1 else "B",
            "location_a_result": res_a,
            "location_b_result": res_b,
        }
    )

print(f"Prepared {len(test_pairs)} test pairs from saved sessions.")
for idx, pair in enumerate(test_pairs, start=1):
    print(
        f"Test Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Final scores -> A {_fmt(pair['location_a_result'].get('final_score'))} | B {_fmt(pair['location_b_result'].get('final_score'))}"
    )
    print(
        f"    GT ratio {pair['gt_ratio']:.2f} (prefers {pair['gt_preference']})"
    )


Loaded test_0 for Starbucks 甄选(静安嘉里中心f1店) -> final score 8.10
Loaded test_1 for Starbucks 甄选(美罗城店) -> final score 7.60
Loaded test_2 for Starbucks 甄选(莘庄仲盛店) -> final score 7.80
Prepared 3 test pairs from saved sessions.
Test Pair 1: Starbucks 甄选(静安嘉里中心f1店) vs Starbucks 甄选(美罗城店)
    Final scores -> A 8.10 | B 7.60
    GT ratio 0.42 (prefers B)
Test Pair 2: Starbucks 甄选(静安嘉里中心f1店) vs Starbucks 甄选(莘庄仲盛店)
    Final scores -> A 8.10 | B 7.80
    GT ratio 1.83 (prefers A)
Test Pair 3: Starbucks 甄选(美罗城店) vs Starbucks 甄选(莘庄仲盛店)
    Final scores -> A 7.60 | B 7.80
    GT ratio 4.35 (prefers A)


In [24]:
from prompts.evaluation import EVALUATION_AGENT_SYSTEM, EVALUATION_SEPARATE_AGENT_SYSTEM
def score_location_with_rubric(customer_report: str,
                               customer_rubric: str,
                               traffic_report: str,
                               traffic_rubric: str,
                               competition_report: str,
                               competition_rubric: str,
                               weights: Dict[str, float]
                            ) -> Dict[str, float]:
    user_prompt = f"""Evaluate three analysis reports using the provided rubrics. Score objectively and provide detailed justifications.

---

CUSTOMER ANALYSIS REPORT:
{customer_report}

CUSTOMER SCORING RUBRIC:
{customer_rubric}

---

TRAFFIC & ACCESSIBILITY REPORT:
{traffic_report}

TRAFFIC SCORING RUBRIC:
{traffic_rubric}

---

COMPETITION ANALYSIS REPORT:
{competition_report}

COMPETITION SCORING RUBRIC:
{competition_rubric}

---

Evaluate each report according to its rubric. Return the JSON with scores and justifications."""
    
    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.1",
        reasoning={"effort": "low"},
        input=[
            {"role": "system", "content": EVALUATION_AGENT_SYSTEM},
            {"role": "user", "content": [{"type": "input_text", "text": user_prompt}]},
        ],
    )
    try:
        payload = parse_json_from_text(response.output_text)
    except Exception as e:
        print("error in parsing revision, try again ...", e)
        payload = parse_json_from_text(fix_json_error(response.output_text))

    ejson = {
        "customer": payload["customer"],
        "traffic": payload["traffic"],
        "competition": payload["competition"],
    }

    evaluation_scores = {
        "customer": ejson.get("customer", {"score": 0.0, "justification": ""}),
        "traffic": ejson.get("traffic", {"score": 0.0, "justification": ""}),
        "competition": ejson.get("competition", {"score": 0.0, "justification": ""}),
    }

    # Calculate final weighted score
    customer_score = float(evaluation_scores["customer"].get("score", 0.0))
    traffic_score = float(evaluation_scores["traffic"].get("score", 0.0))
    competition_score = float(evaluation_scores["competition"].get("score", 0.0))
    final_score = (weights["customer"] * customer_score) + (weights["traffic"] * traffic_score) + (weights["competition"] * competition_score)

    scored = {
        "final_score": final_score,
        "customer_score": ejson["customer"]["score"],
        "traffic_score": ejson["traffic"]["score"],
        "competition_score": ejson["competition"]["score"],
        "customer_criterion_scores": ejson["customer"]["criterion_scores"],
        "traffic_criterion_scores": ejson["traffic"]["criterion_scores"],
        "competition_criterion_scores": ejson["competition"]["criterion_scores"],
        "weights": weights
    }
    return scored

def score_location_with_rubric_separate(customer_report: str,
                                        customer_rubric: str,
                                        traffic_report: str,
                                        traffic_rubric: str,
                                        competition_report: str,
                                        competition_rubric: str,
                                        weights: Dict[str, float]
                                       ) -> Dict[str, float]:
    user_prompt = """Evaluate the analysis report using the provided rubrics. Score objectively and provide detailed justifications.

---

ANALYSIS REPORT:
{report}

SCORING RUBRIC:
{rubric}

---

Evaluate report according to its rubric. Return the JSON with scores and justifications."""
    
    client = OpenAI()
    def _run_analysis(report, rubric):
        response = client.responses.create(
            model="gpt-5.1",
            reasoning={"effort": "low"},
            input=[
                {"role": "system", "content": EVALUATION_SEPARATE_AGENT_SYSTEM},
                {"role": "user", "content": [{"type": "input_text", "text": user_prompt.format(report = report, rubric = rubric)}]},
            ],
        )
        try:
            payload = parse_json_from_text(response.output_text)
        except Exception as e:
            print("error in parsing revision, try again ...", e)
            payload = parse_json_from_text(fix_json_error(response.output_text))
        return payload

    ejson = {
        "customer": _run_analysis(customer_report, customer_rubric),
        "traffic": _run_analysis(traffic_report, traffic_rubric),
        "competition": _run_analysis(competition_report, competition_rubric),
    }

    evaluation_scores = {
        "customer": ejson.get("customer", {"score": 0.0, "justification": ""}),
        "traffic": ejson.get("traffic", {"score": 0.0, "justification": ""}),
        "competition": ejson.get("competition", {"score": 0.0, "justification": ""}),
    }

    # Calculate final weighted score
    customer_score = float(evaluation_scores["customer"].get("score", 0.0))
    traffic_score = float(evaluation_scores["traffic"].get("score", 0.0))
    competition_score = float(evaluation_scores["competition"].get("score", 0.0))
    final_score = (weights["customer"] * customer_score) + (weights["traffic"] * traffic_score) + (weights["competition"] * competition_score)

    scored = {
        "final_score": final_score,
        "customer_score": ejson["customer"]["score"],
        "traffic_score": ejson["traffic"]["score"],
        "competition_score": ejson["competition"]["score"],
        "customer_criterion_scores": ejson["customer"]["criterion_scores"],
        "traffic_criterion_scores": ejson["traffic"]["criterion_scores"],
        "competition_criterion_scores": ejson["competition"]["criterion_scores"],
        "weights": weights
    }
    return scored

def evaluate_pair_with_rubric(pair: Dict, rubrics: Dict[str, str], separate: bool = False):
    loc_a = pair["location_a_result"]
    loc_b = pair["location_b_result"]

    if separate:
        score_func = score_location_with_rubric_separate
    else:
        score_func = score_location_with_rubric

    scores_a = score_func(
        loc_a["report_sections"]["customer"],
        rubrics["customer"],
        loc_a["report_sections"]["traffic"],
        rubrics["traffic"],
        loc_a["report_sections"]["competition"],
        rubrics["competition"],
        loc_a["raw"]["weights"]
    )

    scores_b = score_func(
        loc_b["report_sections"]["customer"],
        rubrics["customer"],
        loc_b["report_sections"]["traffic"],
        rubrics["traffic"],
        loc_b["report_sections"]["competition"],
        rubrics["competition"],
        loc_b["raw"]["weights"]
    )

    final_a = scores_a.get("final_score") or 0.0
    final_b = scores_b.get("final_score") or 0.0
    predicted_ratio = final_a / max(final_b, 1e-6)
    gt_ratio = pair["gt_ratio"]
    score_diff = max(predicted_ratio, gt_ratio) / min(predicted_ratio, gt_ratio)
    order_matches = (predicted_ratio >= 1 and gt_ratio >= 1) or (predicted_ratio <= 1 and gt_ratio <= 1)
    within_threshold = abs(score_diff - 1) <= RATIO_THRESHOLD
    return {
        "predicted_ratio": predicted_ratio,
        "gt_ratio": gt_ratio,
        "score_diff": score_diff,
        "order_matches": order_matches,
        "within_threshold": within_threshold,
        "scores_a": scores_a,
        "scores_b": scores_b,
    }


def rubric_revision(report1, report2, rubric, score1, score2, gt_location_score, pred_location_score, separate: bool = False):
    if separate:
        output_format = """{
    "rubric": "string (revised rubric)",
    "weakness": List[string] (short bullet point: main problem of the current rubric),
    "problems": List[string] (short bullet point: problems in the reports that contributed to the mismatch),
    "revisions": List[string] (short bullet point: what revision has been done to the rubric to make it better) 
}
"""
    else:
        output_format = """{
    "customer_rubric": "string (revised customer rubric)",
    "traffic_rubric": "string (revised traffic rubric)",
    "competition_rubric": "string (revised competition rubric)"
    "weakness": List[string] (short bullet point: main problem of the current rubric),
    "problems": List[string] (short bullet point: problems in the reports that contributed to the mismatch),
    "revisions": List[string] (short bullet point: what revision has been done to the rubric to make it better) 
}"""
    system_prompt = """You are a rubric-tuning agent. Your job is to revise the evaluation rubric by comparing two location evaluation reports and their scores.

The user will provide:
1. report1: a report evaluating the potential of opening a store in location1.
2. report2: a report evaluating the potential of opening the same store in location2.
3. score1: the current score given to report1 using the existing rubric.
4. score2: the current score given to report2 using the existing rubric.
5. Ground Truth: a numeric ratio GT = location1/location2, representing the relative traffic of location1 to location2 (GT > 1 means location1 has higher traffic; GT < 1 means location2 has higher traffic).
6. Predicted: a numeric ratio Pred = location1/location2 derived from the current scores (for example, Pred = score1 / score2).
7. rubric: JSON, the current rubric for scoring and evaluating the locations (including dimensions, weights, and criteria).

Your goals:
- Diagnose what is wrong with the current rubric that leads to a mismatch between GT and Pred (e.g., wrong ordering, too small difference, or reversed preference).
- Propose a revised rubric that:
  - makes the score ordering consistent with ground truth (if GT > 1, we prefer score1 > score2; if GT << 1, we prefer score2 >> score1),
  - increases the sensitivity of scores to real differences in location quality,
  - and remains general enough to be applied to other locations and stores.

When reasoning (internally, do NOT show your chain-of-thought to the user):
1. Compare GT and Pred:
   - If GT and Pred have opposite ordering (e.g. GT > 1 but score1 < score2), treat this as a serious rubric failure.
   - If |GT - Pred| is large (e.g., GT >> 1 but Pred ≈ 1), treat this as evidence that the rubric is not capturing real differences between locations.
2. Inspect the original rubric JSON and identify:
   - which dimensions are overweighted or underweighted,
   - which important dimensions are missing,
   - and which criteria are too vague or not measurable.
3. Adjust the rubric:
   - You may add new dimensions, delete dimensions, or change the weight of dimensions.
   - You must ensure the total sum of all dimension weights is 100%.
   - You are encouraged to introduce clearer, more granular levels (e.g., 3–5 levels with numeric thresholds) so that differences between locations produce more distinct scores.
   - You may add hard-coded numeric thresholds (e.g., population ranges, traffic counts, distance to competitors) to make scoring more objective and easier to apply.

Reminder: 
   - You should give rubric, do not integrate any other information in the rubric such as suggestions.
   - The rubric should be grounded in the concrete observations from the two reports.
   - Keep each component in rubrics concise and short with bullet points.

Output format:
Return ONLY a single valid JSON object, with no extra text, in the following format:
{output_format}

- "weakness" should focus on issues in the existing rubric.
- "problems" should focus on issues in how the reports were written or interpreted.
""".format(output_format = output_format)

    user_prompt = f"""---- 
# REPORT1
{report1} 
## Score1
{score1}
-----
-----
# REPORT2
{report2}
## Score2
{score2}
-----

## GT: location1 : location2 = {gt_location_score}
## Predicted: location1 : location2 = {pred_location_score}

-----
-----

## Rubric
{rubric}
-----
"""

    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.1",
        reasoning={"effort": "medium"},
        input=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [{"type": "input_text", "text": user_prompt}],
            },
        ],
    )
    return response.output_text

In [8]:
current_rubric = load_initial_rubric_text()

In [None]:
# RUBRIC REVISION LOOP
# Choose which pair set to feed into the rubric revision loop.
pairs_for_revision = evaluated_pairs  # swap to evaluated_pairs after running brand sessions

if not pairs_for_revision:
    raise ValueError("No comparison pairs available; run the setup cells first.")

rubric_history = [{"iteration": 0, "rubric": current_rubric}]

for idx, pair in enumerate(pairs_for_revision, start=1):
    alignment = evaluate_pair_with_rubric(pair, current_rubric)
    print(
        f"Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Predicted ratio {alignment['predicted_ratio']:.2f} vs GT {alignment['gt_ratio']:.2f} -> diff {alignment['score_diff']:.3f}"
    )
    if alignment["order_matches"] and alignment["within_threshold"]:
        print("    Alignment within threshold, skipping revision for this pair.")
        print("-" * 40)
        continue

    res_a = pair["location_a_result"]
    res_b = pair["location_b_result"]
    revised = rubric_revision(
        report1=res_a['report_sections'],
        report2=res_b['report_sections'],
        rubric=current_rubric,
        score1=alignment["scores_a"],
        score2=alignment["scores_b"],
        gt_location_score=f"{pair['gt_ratio']:.4f}",
        pred_location_score=f"{alignment['predicted_ratio']:.4f}"
    )
    try:
        revised = parse_json_from_text(revised)
    except Exception as e:
        print("error in parsing revision, try again ...", e)
        revised = parse_json_from_text(fix_json_error(revised))
    current_rubric = {"customer": revised["customer_rubric"], "traffic": revised["traffic_rubric"], "competition": revised["competition_rubric"]}
    weakness = revised.get("weakness", "")
    problems = revised.get("problems", "")
    rubric_history.append({"iteration": idx, "rubric": revised, "weakness": weakness, "problem": problems})
    print(
        f"[[[ Rubric updated (iter {idx}) ]]]\nWeakness in rubric: {weakness}\nProblem in reports: {problems}"
    )
    print("-" * 40)

revised_rubric_path = "rubrics/revised_rubric.json"
json.dump(revised, open(revised_rubric_path, 'w'), ensure_ascii=False, indent=4)
print(f"Latest rubric saved to {revised_rubric_path}")


BadRequestError: Error code: 400 - {'error': {'message': "Unsupported value: 'minimal' is not supported with the 'gpt-5.1' model. Supported values are: 'none', 'low', 'medium', and 'high'.", 'type': 'invalid_request_error', 'param': 'reasoning.effort', 'code': 'unsupported_value'}}

In [None]:
revised_rubric_path = "rubrics/revised_rubric.json"
json.dump(revised, open(revised_rubric_path, 'w'), ensure_ascii=False, indent=4)
print(f"Latest rubric saved to {revised_rubric_path}")

Latest rubric saved to rubrics/revised_rubric.json


In [20]:
rubric_history[0]

{'iteration': 0,
 'rubric': {'customer': "# Customer Potential Rubric (Location‑Oriented, Further Revised)\n\nScore 0–10 based on the strength of the catchment and how well it matches the specific store type and its *key dayparts* (for the format in question, e.g. morning commute, lunch, after‑work, weekend family time).\n\n### Weights (sum = 100%)\n\n1. Population Density & Catchment Strength – **30%**  \n2. Demographic & Spending Fit – **20%**  \n3. Customer Behavior, Daypart & Demand Potential – **35%**  \n4. Opportunities & Risks – **15%**\n\nRationale: For daypart‑concentrated formats (commuter coffee, lunch‑driven QSR, evening entertainment), *who is present and reachable in the key hours* matters more than raw all‑day headcount. Daypart & Demand Potential therefore carries the highest weight; Population Density is important but secondary once timing and access are factored in.\n\n> **Consistency rule (between sites):** When comparing multiple sites for the *same* format, use the

In [19]:
for x in rubric_history:
    print(x["problem"])

KeyError: 'problem'

In [12]:
# Aggregate and revise the 

In [21]:
problems = """Problem in reports: ['Both reports emphasize strong commuter potential but do not provide directly comparable, numeric storefront-level morning pass-by counts, encouraging conservative, similar peak-demand scores.', 'Report1 spends substantial space detailing extreme competition in Xujiahui, which likely pulled down its competition score more aggressively than Report2, even though its demand is much higher.', 'Report2’s language (“教科书级”“位置极佳”) strongly frames the site as ideal, possibly biasing scorers to rate its traffic and customer potential closer to Xujiahui than the underlying numbers justify.', 'Micro-location uncertainty (exact corridor/entrance placement) is similar for both sites, but neither report clearly contrasts how that uncertainty might still leave Xujiahui with much larger absolute flows.', 'Neither report explicitly converts station/district flows into a consistent, numeric estimate of key-daypart reachable population, so scorers defaulted to mid-range daypart and traffic-volume sub-scores for both.']
Problem in reports: ['Both reports focus heavily on macro node strength (metro hub status, big commercial district scale) and total daily visitors, but provide limited quantified estimates of actual storefront-level pass-by within 0–30 m in the key morning window, making it easy to overestimate weaker micro-locations.', 'Report 2 places substantial emphasis on Nanjing East Road’s tourist and all-day retail traffic, which is less relevant for a morning-commuter-focused concept; this likely inflated perceived customer potential relative to the morning-accessible base.', 'Neither report fully quantifies or verifies early-morning mall access patterns (which entrances and corridors are open at 7:00–8:00, which metro exits truly feed into the store’s corridor), even though this is precisely the factor that can create a >2× difference in realized traffic between superficially similar hubs.', 'Competition analysis in both reports is descriptive and brand-count focused but does not clearly distinguish between competitors directly on the same primary commuter flow and those on side flows; this blurs the assessment of real competitive pressure on the specific storefront.', 'The write-ups for People’s Square underplay the practical friction created by many dispersed metro exits and complex vertical circulation inside Shimao Plaza, which likely reduces the proportion of total station passengers who actually pass the store in the morning.']
Problem in reports: ['Both reports emphasize that each node is a tier-1 CBD super-hub and use large, loosely comparable district or station numbers instead of putting both sites on the same metric: realistic storefront-level peak pass-by.', "The Xujiahui (location1) report underlines massive station integration but does not push the relative magnitude versus Jing'an Kerry (location2) into the scoring narrative, so its 2.4x advantage is not reflected in daypart and traffic-volume sub-scores.", "The Jing'an Kerry report likely over-attributes Jing'an Temple Station flows to Kerry Centre corridors without clearly discounting passengers who use other exits, malls, or street paths, leading to overestimation of its effective morning catchment.", "Neither report forces an explicit, side-by-side numeric comparison of reachable peak-window customers (for example, estimated morning commuters able to buy within 3 minutes detour), which encourages scoring both sites into the same 'excellent' band.", 'Analysts allowed top-level category weights to drift between locations and did not apply the intended traffic-heavy weighting consistently, further compressing score differences.']
Problem in reports: ["The two reports use different, non-comparable notions of 'people near the store' (district-level footfall, whole-mall visitors, rough station volumes), and neither converts these consistently into a single metric of reachable peak customers at the storefront.", 'Both reports rely on wide numerical ranges and reasoned estimates without clearly stating a final, comparable estimate of peak-window reachable customers and corridor-level pass-by for scoring purposes.', 'Micro-location risk inside the malls (exact corridor tier, sightlines, and whether early-morning access is guaranteed) is acknowledged qualitatively but not translated into quantitative penalties, leading to optimistic traffic and customer scores, especially for the weaker site.', "The suburban hub site is treated as a generally 'very strong' commuter environment without sufficiently discounting the long walk from the station, partial funneling of commuters into the mall, and the lower-intensity CBD office base compared with the core city hub.", 'Competition descriptions are rich but not clearly tied back to per-store volume or observable underperformance, so saturation and risk are not used to differentiate the two sites as strongly as the underlying traffic differences would warrant.']
Problem in reports: ['Both reports describe traffic and customer bases with wide ranges and narrative (e.g., node-level metro ridership, whole-mall footfall) instead of stating a single, comparable estimate of reachable peak customers and corridor-level pass-by within the 7:00–10:00 window.', 'The Xinzhuang (location2) report leans heavily on hub-scale volumes (station ridership, bus hub totals, whole-mall visitors) without clearly downgrading to the subset that actually passes the storefront in the morning, likely overstating its effective peak reach relative to the ground truth.', 'Early-morning access constraints for interior mall corridors (especially at Zhongsheng) are flagged qualitatively but not translated into hard caps in the daypart/traffic sub-scores, making the site look closer to a true commuter super-hub than it really is.', 'The North Bund (location1) report and the Xinzhuang report use slightly different catchment logic and time windows for key numbers, so their peak-base estimates are not strictly like-for-like even though the rubric expects that.', "Neither report provides a concise summary table that ties all later scores back to a few core quantitative inputs (reachable peak base, corridor pass-by, competitor count), making it harder to enforce the rubric's relative-scaling rules."]
Problem in reports: ['Analysts did not compute or show a single, comparable estimate of reachable peak base and storefront-level pass-by for each site; instead they used overlapping ranges (e.g. 10–20k vs 10–16k morning passers-by), which made the two locations look artificially similar despite ground-truth traffic being ~1.9× higher at location 1.', 'They effectively reused old category weights (customer 0.32, traffic 0.40, competition 0.28) rather than the fixed 0.25/0.55/0.20 weights, diluting the impact of traffic differences on the final score.', 'They applied early-opening uncertainty caps in a way that set both locations’ key daypart sub-scores to roughly the same level instead of first differentiating underlying peak potential and then adjusting for access risk.', 'Narrative descriptions clearly distinguished People’s Square as a central CBD super-hub but sub-scores for customer and traffic were compressed into the mid 7s for both sites, and the 9–10 bands reserved for top-tier hubs were not used, leading to almost identical final scores.', 'Competition scoring treated both hyper-clustered malls similarly and did not introduce any offsetting differentiation, which is acceptable, but in combination with muted customer/traffic separation it left the overall comparison almost flat.']
Problem in reports: ['Neither report converts metro ridership + office/shopping populations into a single comparable ‘reachable peak base in 7:00–10:00’ figure for each site; instead they rely on narrative ranges, which makes it easy to overstate Xinzhuang’s parity with Jing’an Kerry.', 'Report2 likely overestimates the share of Xinzhuang Station flows that pass close enough to Zhongsheng’s 1F corridor (400–650 m away) to be realistic commuter customers, while under-penalizing uncertainty about early-morning mall opening for a 7:00–10:00 concept.', 'Report1 under-translates Jing’an Temple’s three-line CBD interchange plus very dense Grade-A offices into explicit peak-window numbers, so its structural super-hub advantage over Xinzhuang in both reachable base and storefront pass-by is understated in scoring.', 'Both competition sections describe hyper-saturation but stop at venue counts; they do not clearly map supply vs. demand into per-store volume bands, so competition scores for the two nodes end up very similar despite different overall market scale.']"""

In [22]:
def summary_problems(problems: str):
    client = OpenAI()
    response = client.responses.create(
        model="gpt-5.1",
        reasoning={"effort": "medium"},
        input=[
            {"role": "user", "content": [{"type": "input_text", "text": f"Help me to summarize this, extract bullet points on what information is missing in terms of customer, traffic and competition analysis of a location. \nProblems: {problems}"}]},
        ],
    )
    return response.output_text

In [23]:
summary_problems(problems)

'Here’s a synthesized summary plus a focused list of *what’s missing* in terms of customer, traffic, and competition analysis.\n\n---\n\n## Overall summary\n\nAcross all the reports, analysis repeatedly stays at the *macro-node* level (big station ridership, whole-mall visitors, district traffic) and uses narrative ranges. What’s missing is a single, consistent, storefront-level view of:\n\n- How many **reachable customers** actually pass within buying distance during the **key morning window**.\n- How **micro-location** (exact corridor, entrance, opening times, walking distance, exits used) changes that reachable base.\n- How **competition on the same flow line** converts that demand into realistic per-store volumes.\n\nWithout this, weaker sites look artificially similar to stronger super-hubs.\n\n---\n\n## Missing information – Customer analysis\n\n- **Comparable “reachable peak base” per site**\n  - A single numeric estimate of realistic customers reachable at the storefront in 7:0

In [None]:
resp = "Here’s a synthesized summary plus a focused list of *what’s missing* in terms of customer, traffic, and competition analysis.\n\n---\n\n## Overall summary\n\nAcross all the reports, analysis repeatedly stays at the *macro-node* level (big station ridership, whole-mall visitors, district traffic) and uses narrative ranges. What’s missing is a single, consistent, storefront-level view of:\n\n- How many **reachable customers** actually pass within buying distance during the **key morning window**.\n- How **micro-location** (exact corridor, entrance, opening times, walking distance, exits used) changes that reachable base.\n- How **competition on the same flow line** converts that demand into realistic per-store volumes.\n\nWithout this, weaker sites look artificially similar to stronger super-hubs.\n\n---\n\n## Missing information – Customer analysis\n\n- **Comparable “reachable peak base” per site**\n  - A single numeric estimate of realistic customers reachable at the storefront in 7:00–10:00 (not just station/mall totals or broad ranges).\n  - Explicit conversion from:\n    - Metro ridership (by line, by exit)\n    - Office worker and resident populations\n    - Relevant shoppers/commuters\n  - Into: “X–Y potential buyers who can purchase with ≤3 minutes detour.”\n\n- **Relevant customer segments for the concept**\n  - Split of:\n    - Morning commuters vs. all-day shoppers vs. tourists vs. local residents.\n  - Clear indication of which segments are actually addressable for a **morning-focused** concept, and which are mostly irrelevant (e.g., late-day tourists on Nanjing East Road).\n\n- **Impact of urban context on customer base**\n  - Comparative density and quality of office stock (core CBD vs. suburban hub) translated into:\n    - Estimated white-collar base.\n    - Expected purchase frequency and ticket size.\n  - Adjustments for suburban vs. CBD context rather than treating both as equivalent “very strong” environments.\n\n- **Customer accessibility frictions**\n  - Quantified impact of:\n    - Long walking distances from key stations (e.g., 400–650 m corridor walks).\n    - Complex vertical circulation (multiple levels, transfers, escalators) that reduce the share of total station passengers who *actually* reach the store.\n    - Dispersed station exits diluting flows (e.g., People’s Square, Jing’an Temple).\n\n---\n\n## Missing information – Traffic analysis\n\n- **Storefront-level pass-by counts**\n  - Directly measured or consistently modeled **pass-by within 0–30 m of the storefront** in the 7:00–10:00 window.\n  - A single, like-for-like figure per site instead of:\n    - “10–20k vs 10–16k” overlapping ranges.\n    - Whole-station or whole-mall daily numbers.\n\n- **Conversion logic from macro flows to corridor-level flows**\n  - For each location:\n    - Share of station passengers using each **specific exit**.\n    - Share of those who enter the particular **mall** vs. street or other projects.\n    - Share of mall entrants that use the **exact corridor** the store sits on.\n  - Explicit discounting of flows that:\n    - Use other exits.\n    - Go to other malls or street paths.\n    - Never pass near the candidate storefront.\n\n- **Early-morning access patterns (7:00–8:00)**\n  - Verified details for each site:\n    - Which **entrances** and **corridors** are open in early morning.\n    - Actual earliest **opening times** of relevant mall doors and internal corridors.\n    - Whether there is a **continuous, open route** from main commuter exits to the storefront in the peak window.\n  - Hard numeric adjustments:\n    - Cap on maximum reachable pass-by if certain corridors are closed or only partially open.\n\n- **Micro-location quality, quantified**\n  - Corridor tier and positioning:\n    - Primary vs. secondary vs. tertiary corridors.\n    - Distance from main funnels and escalators.\n  - Sightlines and visibility:\n    - Whether the storefront is visible from main approach lines.\n  - Numeric penalties/discounts for:\n    - Being off the main flow.\n    - Being on an upper/lower level with less direct flow.\n    - Indirect access or detours.\n\n- **Consistent catchment and time-window definitions**\n  - Same:\n    - Time windows (e.g., 7:00–10:00).\n    - Catchment logic (e.g., “≤3 minutes detour from main path”).\n  - Applied identically across all sites so that “reachable peak base” and “pass-by” figures are directly comparable.\n\n- **Concise quantitative summary per site**\n  - A clear table summarizing for each location:\n    - Reachable peak base (7:00–10:00) at storefront.\n    - Estimated corridor-level pass-by (0–30 m).\n    - Key access risks/constraints and the applied numeric penalties.\n\n---\n\n## Missing information – Competition analysis\n\n- **Flow-relevant competitor mapping**\n  - Differentiation between:\n    - Competitors **on the same primary commuter corridor** (true direct competition).\n    - Competitors on **secondary/side flows** with much lower shared traffic.\n  - Distance and flow overlap:\n    - How many direct competitors are within, say, 30–50 m on the same line of flow.\n\n- **Demand vs. supply quantification**\n  - Numeric **demand-per-store** view:\n    - Reachable peak base / number of directly competing stores on the same flow.\n  - Classification into per-store volume bands:\n    - E.g., “likely >X cups/day,” “Y–Z cups/day,” “high risk of <A cups/day.”\n  - Use of these bands to:\n    - Differentiate saturation risk between nodes of different absolute scale.\n\n- **Performance-based competition evidence**\n  - Where possible:\n    - Observed queues, visible sales intensity, or operator feedback per competitor cluster.\n    - Identification of **underperforming** or frequently rotated units as signs of over-saturation.\n  - Use of this data to adjust competition scores, not just count brands.\n\n- **Concept and daypart relevance of competitors**\n  - Separation of:\n    - Direct, same-daypart competitors (e.g., morning coffee/bakery, quick breakfast).\n    - Indirect or off-peak competitors (dessert shops, late-night bars, tourist-only formats).\n  - Adjusted competition pressure score focusing on competitors that target the **same morning commuter** demand.\n\n- **Quantitative competition summary**\n  - Per location:\n    - Number of directly relevant competitors on main commuter corridor.\n    - Estimated average demand per competitor.\n    - Qualitative notes tied back to numeric implications (e.g., “hyper-clustered but also hyper-demand; per-store volumes still high” vs. “many stores but limited corridor traffic; high risk of underperformance”).\n\n---\n\nIf you’d like, I can turn this into a checklist/template you can use for future site reports (with concrete data fields to fill in for each location)."
print(resp)

In [None]:
resp = "Here’s a synthesized summary plus a focused list of *what’s missing* in terms of customer, traffic, and competition analysis.\n\n---\n\n## Overall summary\n\nAcross all the reports, analysis repeatedly stays at the *macro-node* level (big station ridership, whole-mall visitors, district traffic) and uses narrative ranges. What’s missing is a single, consistent, storefront-level view of:\n\n- How many **reachable customers** actually pass within buying distance during the **key morning window**.\n- How **micro-location** (exact corridor, entrance, opening times, walking distance, exits used) changes that reachable base.\n- How **competition on the same flow line** converts that demand into realistic per-store volumes.\n\nWithout this, weaker sites look artificially similar to stronger super-hubs.\n\n---\n\n## Missing information – Customer analysis\n\n- **Comparable “reachable peak base” per site**\n  - A single numeric estimate of realistic customers reachable at the storefront in 7:00–10:00 (not just station/mall totals or broad ranges).\n  - Explicit conversion from:\n    - Metro ridership (by line, by exit)\n    - Office worker and resident populations\n    - Relevant shoppers/commuters\n  - Into: “X–Y potential buyers who can purchase with ≤3 minutes detour.”\n\n- **Relevant customer segments for the concept**\n  - Split of:\n    - Morning commuters vs. all-day shoppers vs. tourists vs. local residents.\n  - Clear indication of which segments are actually addressable for a **morning-focused** concept, and which are mostly irrelevant (e.g., late-day tourists on Nanjing East Road).\n\n- **Impact of urban context on customer base**\n  - Comparative density and quality of office stock (core CBD vs. suburban hub) translated into:\n    - Estimated white-collar base.\n    - Expected purchase frequency and ticket size.\n  - Adjustments for suburban vs. CBD context rather than treating both as equivalent “very strong” environments.\n\n- **Customer accessibility frictions**\n  - Quantified impact of:\n    - Long walking distances from key stations (e.g., 400–650 m corridor walks).\n    - Complex vertical circulation (multiple levels, transfers, escalators) that reduce the share of total station passengers who *actually* reach the store.\n    - Dispersed station exits diluting flows (e.g., People’s Square, Jing’an Temple).\n\n---\n\n## Missing information – Traffic analysis\n\n- **Storefront-level pass-by counts**\n  - Directly measured or consistently modeled **pass-by within 0–30 m of the storefront** in the 7:00–10:00 window.\n  - A single, like-for-like figure per site instead of:\n    - “10–20k vs 10–16k” overlapping ranges.\n    - Whole-station or whole-mall daily numbers.\n\n- **Conversion logic from macro flows to corridor-level flows**\n  - For each location:\n    - Share of station passengers using each **specific exit**.\n    - Share of those who enter the particular **mall** vs. street or other projects.\n    - Share of mall entrants that use the **exact corridor** the store sits on.\n  - Explicit discounting of flows that:\n    - Use other exits.\n    - Go to other malls or street paths.\n    - Never pass near the candidate storefront.\n\n- **Early-morning access patterns (7:00–8:00)**\n  - Verified details for each site:\n    - Which **entrances** and **corridors** are open in early morning.\n    - Actual earliest **opening times** of relevant mall doors and internal corridors.\n    - Whether there is a **continuous, open route** from main commuter exits to the storefront in the peak window.\n  - Hard numeric adjustments:\n    - Cap on maximum reachable pass-by if certain corridors are closed or only partially open.\n\n- **Micro-location quality, quantified**\n  - Corridor tier and positioning:\n    - Primary vs. secondary vs. tertiary corridors.\n    - Distance from main funnels and escalators.\n  - Sightlines and visibility:\n    - Whether the storefront is visible from main approach lines.\n  - Numeric penalties/discounts for:\n    - Being off the main flow.\n    - Being on an upper/lower level with less direct flow.\n    - Indirect access or detours.\n\n- **Consistent catchment and time-window definitions**\n  - Same:\n    - Time windows (e.g., 7:00–10:00).\n    - Catchment logic (e.g., “≤3 minutes detour from main path”).\n  - Applied identically across all sites so that “reachable peak base” and “pass-by” figures are directly comparable.\n\n- **Concise quantitative summary per site**\n  - A clear table summarizing for each location:\n    - Reachable peak base (7:00–10:00) at storefront.\n    - Estimated corridor-level pass-by (0–30 m).\n    - Key access risks/constraints and the applied numeric penalties.\n\n---\n\n## Missing information – Competition analysis\n\n- **Flow-relevant competitor mapping**\n  - Differentiation between:\n    - Competitors **on the same primary commuter corridor** (true direct competition).\n    - Competitors on **secondary/side flows** with much lower shared traffic.\n  - Distance and flow overlap:\n    - How many direct competitors are within, say, 30–50 m on the same line of flow.\n\n- **Demand vs. supply quantification**\n  - Numeric **demand-per-store** view:\n    - Reachable peak base / number of directly competing stores on the same flow.\n  - Classification into per-store volume bands:\n    - E.g., “likely >X cups/day,” “Y–Z cups/day,” “high risk of <A cups/day.”\n  - Use of these bands to:\n    - Differentiate saturation risk between nodes of different absolute scale.\n\n- **Performance-based competition evidence**\n  - Where possible:\n    - Observed queues, visible sales intensity, or operator feedback per competitor cluster.\n    - Identification of **underperforming** or frequently rotated units as signs of over-saturation.\n  - Use of this data to adjust competition scores, not just count brands.\n\n- **Concept and daypart relevance of competitors**\n  - Separation of:\n    - Direct, same-daypart competitors (e.g., morning coffee/bakery, quick breakfast).\n    - Indirect or off-peak competitors (dessert shops, late-night bars, tourist-only formats).\n  - Adjusted competition pressure score focusing on competitors that target the **same morning commuter** demand.\n\n- **Quantitative competition summary**\n  - Per location:\n    - Number of directly relevant competitors on main commuter corridor.\n    - Estimated average demand per competitor.\n    - Qualitative notes tied back to numeric implications (e.g., “hyper-clustered but also hyper-demand; per-store volumes still high” vs. “many stores but limited corridor traffic; high risk of underperformance”).\n\n---\n\nIf you’d like, I can turn this into a checklist/template you can use for future site reports (with concrete data fields to fill in for each location)."
print(resp)

In [None]:
print(resp)


In [11]:
current_rubric

{'customer': "# Customer Potential Rubric (Further Revised, Location-Oriented, Commuter-Focused)\n\nPurpose: Score 0–10 based on the strength of the realistically reachable customer base and how well it matches the specific store type and its key earning dayparts.\n\n---\n\n## Sub-dimension weights for Customer (sum = 100%)\n\n1. Population density and catchment strength – 10%\n2. Demographic and spending fit – 10%\n3. Customer behavior, daypart and peak demand – 70%\n4. Opportunities and risks – 10%\n\n---\n\n## 1. Population density and catchment strength (10%)\n\nEvaluate both residential and daytime or visitor population within a practical catchment, typically 5–10 minutes' walk (≈300–800 m) in dense urban areas.\n\n### Excellent (9–10)\n\n9.5–10.0 (city-level super-hub)\n- Within roughly 10 minutes' walk at least one of:\n  - Residents ≥80k plus strong daytime inflow (major offices and at least two large malls or complexes), or\n  - Estimated average daytime plus visitor populatio

In [13]:
pair = evaluated_pairs[0]
alignment1 = evaluate_pair_with_rubric(pair, current_rubric, separate=False)
alignment2 = evaluate_pair_with_rubric(pair, current_rubric, separate=True)

In [17]:
alignment2["scores_a"]

{'final_score': 7.068,
 'customer_score': 8.6,
 'traffic_score': 8.2,
 'competition_score': 4.0,
 'customer_criterion_scores': {'Population density and catchment strength': 9.7,
  'Demographic and spending fit': 9.2,
  'Customer behavior, daypart and peak demand': 8.5,
  'Opportunities and risks': 8.0},
 'traffic_criterion_scores': {'Public transit and connectivity quality': 9.0,
  'Walkability, parking and road access': 8.0,
  'Target customer mobility fit': 9.0,
  'Traffic volume and temporal benefits': 7.8},
 'competition_criterion_scores': {'Competitor density and micro-proximity': 3.3,
  'Market saturation versus demand': 3.8,
  'Positioning opportunity': 5.2,
  'Competitive risk': 3.7},
 'weights': {'customer': 0.32,
  'traffic': 0.38,
  'competition': 0.3,
  'justification': 'For this boutique coffee shop optimized for morning commuters, traffic & accessibility is weighted highest at 0.38 because the business model depends on capturing high-frequency, time-sensitive visits durin

In [21]:
revised

{'weakness': ['Hub guidance is too soft and conflicts with hyper-cluster rules, allowing mid-range scores in clearly over-supplied, interior-mall clusters.',
  'Level definitions rely on vague terms like “several” or “many” instead of concrete outlet-count and distance thresholds, causing inconsistent scoring between locations.',
  'Interior-mall and flagship/hero-unit situations are not penalized strongly or consistently enough in competitor-density and risk scoring.',
  'Market-saturation scoring is weakly tied to explicit demand-versus-capacity ratios, so obvious over-supply can still receive ‘adequate’ scores.',
  'Positioning-opportunity rules are broad and optimistic, letting small differentiation angles offset very severe competitive pressure.'],
 'problems': ['Reports emphasize qualitative descriptions over normalized metrics (exact outlet counts by radius, per-outlet demand), making it easy for evaluators to under- or over-estimate hyper-density.',
  'Both reports repeatedly s

In [22]:
# RUBRIC REVISION LOOP with SEPARATION
# Choose which pair set to feed into the rubric revision loop.
pairs_for_revision = evaluated_pairs  # swap to evaluated_pairs after running brand sessions

if not pairs_for_revision:
    raise ValueError("No comparison pairs available; run the setup cells first.")

rubric_history = [{"iteration": 0, "rubric": current_rubric}]

for idx, pair in enumerate(pairs_for_revision, start=1):
    alignment = evaluate_pair_with_rubric(pair, current_rubric, separate=True)
    print(
        f"Pair {idx}: {pair['location_a'].store_name} vs {pair['location_b'].store_name}"
    )
    print(
        f"    Predicted ratio {alignment['predicted_ratio']:.2f} vs GT {alignment['gt_ratio']:.2f} -> diff {alignment['score_diff']:.3f}"
    )

    score1 = alignment["scores_a"]["final_score"]
    score2 = alignment["scores_b"]["final_score"]
    print(f"Before revision, score comparison: {score1} <-> {score2}")

    if alignment["order_matches"] and alignment["within_threshold"]:
        print("    Alignment within threshold, skipping revision for this pair.")
        print("-" * 40)
        continue
    
    wkns = []
    pbls = []
    res_a = pair["location_a_result"]
    res_b = pair["location_b_result"]
    for analysis in ["customer", "traffic", "competition"]:
        score1 = alignment["scores_a"]["{}_score".format(analysis)]
        score2 = alignment["scores_b"]["{}_score".format(analysis)]
        print(f"Start revising {analysis}: score1: {score1}, score2: {score2}")
        revised = rubric_revision(
            report1=res_a['report_sections'][analysis],
            report2=res_b['report_sections'][analysis],
            rubric=current_rubric[analysis],
            score1=alignment["scores_a"]["{}_score".format(analysis)],
            score2=alignment["scores_b"]["{}_score".format(analysis)],
            gt_location_score=f"{pair['gt_ratio']:.4f}",
            pred_location_score=f"{alignment['predicted_ratio']:.4f}",
            separate=True
        )
        try:
            revised = parse_json_from_text(revised)
        except Exception as e:
            print("error in parsing revision, try again ...", e)
            revised = parse_json_from_text(fix_json_error(revised))
        current_rubric[analysis] = revised["rubric"]
        wkns.append(revised.get("weakness", ""))
        pbls.append(revised.get("problems", ""))

    rubric_history.append({"iteration": idx, "rubric": revised, "weakness": wkns, "problem": pbls})

    alignment_after = evaluate_pair_with_rubric(pair, current_rubric, separate=True)

    score1 = alignment_after["scores_a"]["final_score"]
    score2 = alignment_after["scores_b"]["final_score"]
    print(f"After revising, score comparison: {score1} <-> {score2}")

    for analysis in ["customer", "traffic", "competition"]:
        score1 = alignment_after["scores_a"]["{}_score".format(analysis)]
        score2 = alignment_after["scores_b"]["{}_score".format(analysis)]
        print(f"{analysis} : {score1} <-> {score2}")
    
    print(
        f"[[[ Rubric updated (iter {idx}) ]]]\nWeakness in rubric: {wkns}\nProblem in reports: {pbls}"
    )
    print("-" * 40)

revised_rubric_path = "rubrics/revised_rubric.json"
json.dump(revised, open(revised_rubric_path, 'w'), ensure_ascii=False, indent=4)
print(f"Latest rubric saved to {revised_rubric_path}")


Pair 1: Starbucks 甄选(美罗城店) vs Starbucks 甄选(白玉兰广场1F店)
    Predicted ratio 1.03 vs GT 1.74 -> diff 1.683
Before revision, score comparison: 6.619999999999999 <-> 6.408
Start revising customer: score1: 8.2, score2: 7.6
Start revising traffic: score1: 8.7, score2: 7.7
Start revising competition: score1: 2.3, score2: 3.2
After revising, score comparison: 6.624 <-> 6.540000000000001
customer : 8.2 <-> 7.6
traffic : 8.7 <-> 7.7
competition : 2.3 <-> 3.2
[[[ Rubric updated (iter 1) ]]]
Weakness in rubric: [['Top score bands are too broad, causing very different hubs (multi线城市级枢纽 vs 单一综合体商务区) to fall into a similar 8–9 分区间，压缩差距', 'Macro catchment scale is under-weighted relative to demographic fit, so高收入但体量明显更小的片区得分被拉高，接近一线城市级商圈', 'Peak commuter flow criteria侧重模糊的人流区间，未区分多线换乘枢纽与单线站点的结构性差异，导致通勤体量差异被低估', 'Micro-location accessibility标准较笼统，没有量化“主动线直线门面”和“次级通道/退让一排”的差别，使二线位置在强商圈中过于加分', 'Opportunities / risks未明确“高度依赖单一综合体” vs “多元锚点分布”的结构风险差异，使依赖单项目的点位被高估稳定性和上限'], ['Transit node strength bands are to

KeyboardInterrupt: 