## Evaluation Framework (No Ground-Truth Setting)

This notebook evaluates the **correctness, robustness, and internal consistency** of the Access4All accessibility scores stored in `default.access4all_final_scores`.

Because no reliable ground-truth labels exist for large-scale accessibility quality across cities, standard supervised metrics (e.g., accuracy, RMSE) are not applicable. Instead, we validate the scoring logic using **axiom-based and property-driven evaluation**, ensuring that the results obey mandatory domain constraints (hard blocks), exhibit correct directional behavior (monotonicity), remain stable under small parameter changes (rank stability), respect non-compensatory veto logic, and avoid redundancy between scoring components.


### This cell runs the evaluation suite on `default.access4all_final_scores`: hard invariants, slope-based monotonicity, rank stability under small weight perturbations, conflict/veto behavior, and component correlations.


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import numpy as np

# --- Base configuration ---
# Table under evaluation; point this at another table to evaluate it instead.
TABLE_NAME = "default.access4all_final_scores"

# Weights applied to v1_score, v2_score and v3_score when a score is
# recomputed by the rank-stability check.
W1 = 0.20
W2 = 0.35
W3 = 0.45

# Every check below scans the table again, so keep it cached.
df = spark.table(TABLE_NAME).cache()

# ---------------------------------------------------------
# PILLAR 1 & 2: Logical validity and monotonicity checks
# ---------------------------------------------------------

def run_logical_checks(df):
    """Count violations of the hard invariants on ``final_score`` and print them.

    Two invariants are checked:

    * a row whose ``any_hard_block`` flag is true must not have a positive
      ``final_score``;
    * ``final_score`` must lie in the closed interval [0, 1].

    Each line is prefixed with a pass marker only when its count is zero, so a
    failing check is never reported as a success.

    Args:
        df: Spark DataFrame with ``any_hard_block`` and ``final_score`` columns.

    Returns:
        dict with keys ``"hard_block_fail"`` and ``"range_fail"`` holding the
        two violation counts.
    """
    print("--- Pillar 1: Hard Invariants & Correctness ---")

    score = F.col("final_score")
    blocked_but_scored = (F.col("any_hard_block") == True) & (score > 0)
    out_of_range = (score < 0) | (score > 1)

    # `when` without `otherwise` yields NULL for non-matching rows and `count`
    # ignores NULLs, so each aggregate equals the matching filter().count()
    # while the table is scanned only once.
    row = df.agg(
        F.count(F.when(blocked_but_scored, 1)).alias("hard_block_fail"),
        F.count(F.when(out_of_range, 1)).alias("range_fail"),
    ).first()

    hard_block_fail = row["hard_block_fail"]
    range_fail = row["range_fail"]

    def _marker(violations):
        # Pass marker only when the invariant actually holds.
        return "✅" if violations == 0 else "❌"

    print(f"{_marker(hard_block_fail)} Hard Block Violations: {hard_block_fail}")
    print(f"{_marker(range_fail)} Score Range Violations [0,1]: {range_fail}")

    return {"hard_block_fail": hard_block_fail, "range_fail": range_fail}

def run_monotonicity_tests(df):
    """Show the average ``v3_score`` per terrain-slope bucket.

    Rows are bucketed on ``slope_p50_pct_200m`` and the mean ``v3_score`` and
    row count of every bucket are displayed.  The expectation being eyeballed
    is that the average falls as the slope bucket gets steeper.

    Rows with a missing slope get their own bucket; without it they would fall
    through to the final ``otherwise`` branch and be counted as extreme slopes,
    distorting that bucket's average.

    Args:
        df: Spark DataFrame with ``slope_p50_pct_200m`` and ``v3_score`` columns.
    """
    print("\n--- Pillar 2: Monotonicity (Directional Sanity) ---")

    # Terrain slope (v3): score should decrease as slope increases
    print("V3 Score by Slope Bucket:")

    slope = F.col("slope_p50_pct_200m")
    # Labels carry a numeric prefix so that sorting on the label text lists
    # the buckets from flattest to steepest, with the unknown bucket last.
    bucket = (
        F.when(slope.isNull(), "5. Unknown (null)")
         .when(slope < 3, "1. Flat (<3%)")
         .when(slope < 6, "2. Mild (3–6%)")
         .when(slope < 10, "3. Steep (6–10%)")
         .otherwise("4. Extreme (>=10%)")
    )

    (
        df.withColumn("slope_bucket", bucket)
          .groupBy("slope_bucket")
          .agg(
              F.avg("v3_score").alias("avg_v3"),
              F.count("*").alias("count"),
          )
          .orderBy("slope_bucket")
          .show()
    )

# ---------------------------------------------------------
# PILLAR 5: Rank stability (sensitivity analysis)
# ---------------------------------------------------------

def test_rank_stability(df, perturbation=0.05, top_k=1000):
    """Measure how much the top-K set changes when the v3 weight is nudged.

    Two rankings are built with the same weighted-harmonic formula over
    ``v1_score``, ``v2_score`` and ``v3_score``: one with the base weights
    ``W1, W2, W3`` and one with ``W3 + perturbation``.  Using one formula for
    both sides means the reported overlap reflects the weight change only.
    The stored ``final_score`` is not used as the baseline because it is also
    subject to the hard-block rule, which this formula does not model.

    Ties are broken on ``property_id`` so repeated runs select the same rows.
    A warning is printed when the overlap falls below 80%.

    Args:
        df: Spark DataFrame with ``property_id``, ``v1_score``, ``v2_score``
            and ``v3_score`` columns.
        perturbation: Absolute amount added to ``W3`` for the second ranking.
        top_k: Number of top-ranked listings to compare; must be positive.

    Returns:
        The share of the baseline top-K that is still in the perturbed top-K
        (a float in [0, 1]), or ``None`` when the table has no rows to rank.

    Raises:
        ValueError: If ``top_k`` is not a positive number.
    """
    if top_k <= 0:
        raise ValueError(f"top_k must be positive, got {top_k}")

    print(f"\n--- Pillar 5: Rank Stability (Top {top_k}) ---")

    def _weighted_harmonic(w1, w2, w3):
        # The small epsilon keeps a zero component score from dividing by zero.
        return F.lit(w1 + w2 + w3) / (
            (F.lit(w1) / (F.col("v1_score") + 1e-6))
            + (F.lit(w2) / (F.col("v2_score") + 1e-6))
            + (F.lit(w3) / (F.col("v3_score") + 1e-6))
        )

    def _top_ids(score_expr):
        # orderBy + limit avoids an unpartitioned window, which would move
        # every row into a single partition just to number them.
        rows = (
            df.withColumn("_rank_score", score_expr)
              .orderBy(F.desc("_rank_score"), F.asc("property_id"))
              .limit(top_k)
              .select("property_id")
              .collect()
        )
        return {row["property_id"] for row in rows}

    orig_set = _top_ids(_weighted_harmonic(W1, W2, W3))
    new_set = _top_ids(_weighted_harmonic(W1, W2, W3 + perturbation))

    if not orig_set:
        print("No rows available to rank; rank stability not computed.")
        return None

    # Divide by the number of listings actually ranked: a table with fewer
    # than top_k rows would otherwise report an artificially low overlap.
    overlap = len(orig_set & new_set) / len(orig_set)
    print(f"Rank Stability Score (Top-K Overlap): {overlap:.2%}")

    if overlap < 0.80:
        print("⚠️ Warning: Ranking is overly sensitive to small weight changes.")

    return overlap

# ---------------------------------------------------------
# PILLAR 6: Conflict auditing (veto behavior)
# ---------------------------------------------------------

def audit_conflicts(df):
    """Report listings whose property score is high while the terrain score is low.

    A listing is flagged when ``v1_score`` is above 0.8 and ``v3_score`` is
    below 0.2.  The function prints how many listings are flagged, how many of
    them still have a ``final_score`` above 0.4 (treated as a veto failure),
    and shows up to five flagged rows when any exist.

    Args:
        df: Spark DataFrame with ``property_id``, ``city``, ``v1_score``,
            ``v3_score`` and ``final_score`` columns.
    """
    print("\n--- Pillar 6: Conflict Auditing (Veto Check) ---")

    good_property = F.col("v1_score") > 0.8
    bad_terrain = F.col("v3_score") < 0.2
    flagged = df.where(good_property & bad_terrain)

    n_flagged = flagged.count()
    print(f"Found {n_flagged} listings with good property but bad terrain.")

    # A working veto keeps the final score low for every flagged listing.
    n_veto_failures = flagged.where(F.col("final_score") > 0.4).count()
    print(f"Veto Failures (final score stayed too high): {n_veto_failures}")

    if n_flagged <= 0:
        return

    preview_columns = ["property_id", "city", "v1_score", "v3_score", "final_score"]
    flagged.select(*preview_columns).show(5)

# ---------------------------------------------------------
# PILLAR 8: Component independence
# ---------------------------------------------------------

def check_correlations(df):
    """Print the correlation of every pair of component scores.

    The three pairs formed from ``v1_score``, ``v2_score`` and ``v3_score``
    are evaluated with ``df.stat.corr`` and printed to four decimals.  A high
    value (above roughly 0.7) suggests two components carry redundant
    information; the function only reports the numbers and flags nothing.

    Args:
        df: Spark DataFrame with the three component score columns.
    """
    print("\n--- Pillar 8: Component Independence (Pearson Correlation) ---")
    components = ("v1_score", "v2_score", "v3_score")

    # Every unordered pair exactly once: (v1, v2), (v1, v3), (v2, v3).
    pairs = [
        (left, right)
        for position, left in enumerate(components)
        for right in components[position + 1:]
    ]

    for left, right in pairs:
        pearson = df.stat.corr(left, right)
        print(f"Correlation {left} vs {right}: {pearson:.4f}")

# ---------------------------------------------------------
# Execution
# ---------------------------------------------------------

# Run each check against the cached table, in the order the pillars are defined.
for pillar_check in (
    run_logical_checks,
    run_monotonicity_tests,
    test_rank_stability,
    audit_conflicts,
    check_correlations,
):
    pillar_check(df)


### This cell evaluates `default.access4all_final_scores` via hard invariants, slope-bucket monotonicity (median/p90), Top-K rank stability under a small renormalized weight perturbation, conflict/veto auditing, and component Pearson correlations.

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Table under evaluation.
TABLE_NAME = "default.access4all_final_scores"

# Weights applied to v1_score, v2_score and v3_score in the weighted harmonic mean.
W1 = 0.20
W2 = 0.35
W3 = 0.45

# Largest possible bonus: rate 0.15 times (1 - 0.5), reached when v3_score is 1
# (assumes v3_score never exceeds 1 -- TODO confirm).
BONUS_MAX = 0.075

# Size of the top-K set compared in the rank-stability check.
K = 1000

# Cached because every section below scans the table again.
df = spark.table(TABLE_NAME).cache()

# -------------------------
# Pillar 1: Core invariants (minimal but correct)
# -------------------------
print("--- Pillar 1: Hard Invariants ---")

_final = F.col("final_score")
_hard_blocked = F.col("any_hard_block")

# A hard-blocked listing must not keep a positive score.
viol_hard_block = df.where((_hard_blocked == True) & (_final > 0)).count()

# The score must stay inside the closed interval [0, 1].
viol_range = df.where((_final < 0) | (_final > 1)).count()

# A zero component makes the harmonic mean zero, and the bonus needs
# v3 >= 0.5, so a zero in any component should leave the final score at zero.
# Hard-blocked rows are excluded here because they are counted above.
_zero_component = (
    (F.col("v1_score") == 0)
    | (F.col("v2_score") == 0)
    | (F.col("v3_score") == 0)
)
viol_any_zero = df.where(
    _zero_component & (_final > 0) & (_hard_blocked == False)
).count()

for _label, _violations in (
    ("Hard block => final_score=0 violations:", viol_hard_block),
    ("final_score in [0,1] violations:", viol_range),
    ("Any(v1,v2,v3)=0 => final_score=0 violations:", viol_any_zero),
):
    print(_label, _violations)

# -------------------------
# Pillar 2: Monotonicity sanity (use median)
# -------------------------
print("\n--- Pillar 2: Monotonicity (median by bucket) ---")

_slope = F.col("slope_p50_pct_200m")

# Assign every row to a slope bucket; rows without a slope get their own
# "NULL" bucket so they are not mistaken for steep terrain.
slope_bucketed = (
    df.withColumn(
        "slope_bucket",
        F.when(_slope.isNull(), F.lit("NULL"))
         .when(_slope < 3, F.lit("<3%"))
         .when(_slope < 6, F.lit("3-6%"))
         .when(_slope < 10, F.lit("6-10%"))
         .when(_slope < 15, F.lit("10-15%"))
         .otherwise(F.lit(">=15%"))
    )
)

# Sorting on the label text would list "10-15%" before "3-6%" and "<3%" after
# "6-10%", hiding the trend this check exists to show.  Rank the buckets
# explicitly from flattest to steepest, with the NULL bucket last.
_bucket_rank = (
    F.when(F.col("slope_bucket") == "<3%", F.lit(0))
     .when(F.col("slope_bucket") == "3-6%", F.lit(1))
     .when(F.col("slope_bucket") == "6-10%", F.lit(2))
     .when(F.col("slope_bucket") == "10-15%", F.lit(3))
     .when(F.col("slope_bucket") == ">=15%", F.lit(4))
     .otherwise(F.lit(5))
)

# Median and 90th percentile of v3_score per bucket; both should fall as the
# slope grows if v3 behaves monotonically.
(slope_bucketed.groupBy("slope_bucket")
 .agg(
     F.count("*").alias("n"),
     F.expr("percentile_approx(v3_score, 0.5)").alias("median_v3"),
     F.expr("percentile_approx(v3_score, 0.9)").alias("p90_v3"),
 )
 .withColumn("_bucket_rank", _bucket_rank)
 .orderBy("_bucket_rank")
 .drop("_bucket_rank")
 .show(200, truncate=False)
)

# -------------------------
# Helper: recompute final_score under perturbed weights (aligned to YOUR spec)
# -------------------------
def compute_final_expr(v1, v2, v3, v2_hb, v3_hb, w1, w2, w3):
    """Build the Spark column expression for the final score under given weights.

    The score is the weighted harmonic mean of the three component scores
    (zero as soon as any component is zero), plus a bonus of
    ``(v3 - 0.5) * 0.15`` when ``v3 >= 0.5``, clamped to [0, 1].  A hard block
    on v2 or v3 forces the result to zero.

    When a component score is NULL and no component is zero, the result is
    NULL.  Spark's ``least``/``greatest`` skip NULL arguments, so clamping a
    NULL raw score would otherwise turn it into 1.0 and place unscored rows
    at the top of any ranking.

    Args:
        v1, v2, v3: Columns holding the component scores.
        v2_hb, v3_hb: Boolean columns flagging a hard block on v2 / v3.
        w1, w2, w3: Python numbers, the weights of v1, v2 and v3.

    Returns:
        A Spark ``Column`` evaluating to the final score.
    """
    any_zero = (v1 == 0) | (v2 == 0) | (v3 == 0)
    any_hb = v2_hb | v3_hb

    # The zero guard comes first so the division below never sees a zero.
    harmonic = F.when(
        any_zero, F.lit(0.0)
    ).otherwise(
        (F.lit(w1 + w2 + w3)) / (F.lit(w1)/v1 + F.lit(w2)/v2 + F.lit(w3)/v3)
    )

    bonus = F.when(v3 >= 0.5, (v3 - 0.5) * 0.15).otherwise(F.lit(0.0))
    raw = harmonic + bonus

    # `when` without `otherwise` yields NULL, which keeps a NULL raw score
    # NULL instead of letting least()/greatest() replace it with a bound.
    clamped = F.when(
        raw.isNotNull(),
        F.greatest(F.lit(0.0), F.least(F.lit(1.0), raw)),
    )

    return F.when(any_hb, F.lit(0.0)).otherwise(clamped)

# -------------------------
# Extra Pillar: Rank Stability (Top-K overlap) — Correct + deterministic
# -------------------------
print(f"\n--- Rank Stability (Top {K} overlap) ---")

# Baseline top-K by the stored final score.  Many rows can share a score
# (for example 0.0), so ties are broken on property_id; without a tie-breaker
# the rows kept by limit() may differ from run to run.
orig_top_ids = set(
    r["property_id"]
    for r in (
        df.orderBy(F.desc("final_score"), F.asc("property_id"))
          .select("property_id")
          .limit(K)
          .collect()
    )
)

# Perturb weights (example: +5% to w3, then renormalize so weights sum to 1).
# The weighted harmonic mean is unchanged by a common rescaling of all
# weights, so renormalizing only keeps the weights themselves comparable.
perturb = 0.05
w1p, w2p, w3p = W1, W2, W3 * (1 + perturb)
s = w1p + w2p + w3p
w1p, w2p, w3p = w1p/s, w2p/s, w3p/s

df_p = df.withColumn(
    "final_score_perturbed",
    compute_final_expr(
        F.col("v1_score"), F.col("v2_score"), F.col("v3_score"),
        F.col("v2_hard_block"), F.col("v3_hard_block"),
        w1p, w2p, w3p
    )
)

# Same ordering rule as the baseline so both top-K sets are reproducible.
new_top_ids = set(
    r["property_id"]
    for r in (
        df_p.orderBy(F.desc("final_score_perturbed"), F.asc("property_id"))
            .select("property_id")
            .limit(K)
            .collect()
    )
)

# Divide by the number of listings actually ranked: with fewer than K rows in
# the table, dividing by K would understate the overlap.
_n_ranked = len(orig_top_ids)
overlap = (
    len(orig_top_ids.intersection(new_top_ids)) / _n_ranked
    if _n_ranked
    else float("nan")
)
# The percent format rounds, so float noise in the perturbation is not truncated.
print(f"Top-{K} overlap after +{perturb:.0%} w3 (renormalized): {overlap:.2%}")

# -------------------------
# Conflict Auditing: good v1 but bad v3 should veto final score
# -------------------------
print("\n--- Conflict Auditing (Veto check) ---")

# Listings whose property score is high while the terrain score is low.
_strong_property = F.col("v1_score") >= 0.8
_weak_terrain = F.col("v3_score") <= 0.2
conflicts = df.where(_strong_property & _weak_terrain)

n_conf = conflicts.count()
# A conflict that still scores above 0.4 means the veto did not hold.
n_bad = conflicts.where(F.col("final_score") > 0.4).count()

print("Conflicts (v1>=0.8 & v3<=0.2):", n_conf)
print("Veto failures (final_score>0.4 among conflicts):", n_bad)

_audit_columns = [
    "property_id",
    "city",
    "v1_score",
    "v2_score",
    "v3_score",
    "final_score",
    "limiting_layer",
    "reasons",
]
conflicts.select(*_audit_columns).show(20, truncate=False)

# -------------------------
# Independence: correlations (fix the loop)
# -------------------------
print("\n--- Component Independence (Pearson corr) ---")
cols = ["v1_score","v2_score","v3_score"]

# Visit every unordered pair of component columns exactly once.
for _position, _left in enumerate(cols):
    for _right in cols[_position + 1:]:
        _pearson = df.stat.corr(_left, _right)
        print(f"corr({_left}, {_right}) = {_pearson:.4f}")
