In [1]:
!pip -q install sentence-transformers scikit-learn matplotlib networkx

import os, re, math, io, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import ndcg_score

warnings.filterwarnings("ignore")
np.set_printoptions(suppress=True)

In [2]:
# ---- EDIT THIS PATH ----
CSV_PATH = "/content/Capstone_Final_Dataset - Sheet1.csv"  # e.g., uploaded to Colab
OUT_DIR  = "ops_supervised_outputs"
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

BRIEF = ("IterLight builds data-driven funding analytics for education equity, "
         "prioritizing K-12 and workforce learning initiatives that improve access, "
         "outcomes, and operational impact.")

K_NEIGHBORS = 5
LAMBDA_CV   = 0.30   # risk penalty
BETA_UCB    = 0.20   # exploration bonus
BOOTSTRAPS  = 20
RANDOM_SEED = 42

os.makedirs(OUT_DIR, exist_ok=True)

def _safe_str(x): return "" if (pd.isna(x) or x is None) else str(x)

def _winsorize(a, lo=5, hi=95):
    a = np.asarray(a, float)
    L, H = np.nanpercentile(a, lo), np.nanpercentile(a, hi)
    return np.clip(a, L, H)

In [3]:
_money_re = re.compile(r"(?i)\$?\s*([0-9][0-9,\.]*)\s*([kKmM]?)")
def money_to_float(s):
    if pd.isna(s): return np.nan
    s = str(s)
    m = _money_re.findall(s)
    if not m: return np.nan
    vals = []
    for num, suf in m:
        x = float(num.replace(",", ""))
        if suf.lower() == "k": x *= 1_000
        elif suf.lower() == "m": x *= 1_000_000
        vals.append(x)
    return (min(vals)+max(vals))/2.0 if len(vals)>=2 else vals[0]

def detect_columns(df: pd.DataFrame):
    # org / name
    name_candidates = [c for c in df.columns if re.search(r"(org|name|foundation|funder|company|partner|entity|institution)", c, re.I)]
    org_col = name_candidates[0] if name_candidates else df.columns[0]
    # text fields
    text_cols = [c for c in df.columns if re.search(r"(mission|summary|about|description|focus|program|area|topic|category|sector|geo|region|location|audience|type)", c, re.I)]
    # grant-ish
    grant_cols = [c for c in df.columns if re.search(r"(grant|fund|amount|award|size|avg|average|median|min|max|range|typical)", c, re.I)]
    # label
    label_candidates = [c for c in df.columns if re.search(r"(label|response|outcome|y|clicked|engaged|replied|positive)", c, re.I)]
    label_col = label_candidates[0] if label_candidates else None
    return org_col, text_cols, grant_cols, label_col

def build_texts(df, org_col, text_cols):
    texts = df[org_col].map(_safe_str)
    for c in text_cols:
        texts = texts + " | " + df[c].map(_safe_str)
    return texts.fillna("").astype(str).tolist()

In [4]:
def sentence_transformer_embeddings(texts, model_name=MODEL_NAME):
    try:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer(model_name)
        Z = model.encode(texts, normalize_embeddings=True, show_progress_bar=False)
        return np.asarray(Z), f"Transformer embeddings: {model_name}", None, None
    except Exception as e:
        return None, f"Transformer unavailable ({e}); using TF-IDF + SVD fallback.", None, None

def tfidf_svd_embeddings(texts, max_features=3000, svd_dim=64, seed=RANDOM_SEED):
    tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=max_features, min_df=1)
    X = tfidf.fit_transform(texts)
    svd = TruncatedSVD(n_components=min(svd_dim, X.shape[1]-1), random_state=seed)
    Z = svd.fit_transform(X)
    Z = Z / (np.linalg.norm(Z, axis=1, keepdims=True) + 1e-9)
    return Z, "Fallback embedding: TF-IDF + TruncatedSVD", tfidf, svd

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

def knn_stats(Z, grant_dollars, k=K_NEIGHBORS):
    n = len(grant_dollars)
    k = min(max(2, k), max(2, n-1))
    nbrs = NearestNeighbors(n_neighbors=k, metric="cosine").fit(Z)
    distances, indices = nbrs.kneighbors(Z, n_neighbors=k)

    mu = np.full(n, np.nan, dtype=float)
    sigma_dollars = np.full(n, np.nan, dtype=float)
    u_iso = np.zeros(n, dtype=float)

    for i in range(n):
        neigh = indices[i]
        g_local = grant_dollars[neigh]
        g_local = g_local[np.isfinite(g_local)]  # drop NaN
        # isolation always computable
        d = distances[i]
        u_iso[i] = float(np.mean(d[d > 0])) if d.size > 1 else 0.0

        if len(g_local) == 0:
            continue  # keep mu/sigma as NaN when no neighbor grants exist

        mu[i] = float(np.nanmedian(g_local))

        # risk: log-space variability mapped back (safe)
        lg = np.log(np.clip(g_local, 1.0, None))
        if len(lg) > 1:
            s_log = float(np.nanstd(lg, ddof=1))
            sigma_dollars[i] = float(np.nanmean(g_local) * s_log)
        else:
            sigma_dollars[i] = 0.0

    # winsorize only finite values (keep NaN as NaN)
    mu_f = mu[np.isfinite(mu)]
    sig_f = sigma_dollars[np.isfinite(sigma_dollars)]
    if len(mu_f) > 0:
        lo, hi = np.percentile(mu_f, 5), np.percentile(mu_f, 95)
        mu[np.isfinite(mu)] = np.clip(mu[np.isfinite(mu)], lo, hi)
    if len(sig_f) > 0:
        lo, hi = np.percentile(sig_f, 5), np.percentile(sig_f, 95)
        sigma_dollars[np.isfinite(sigma_dollars)] = np.clip(sigma_dollars[np.isfinite(sigma_dollars)], lo, hi)

    u_norm = (u_iso - np.min(u_iso)) / (np.ptp(u_iso) + 1e-9)
    return mu, sigma_dollars, u_norm, indices

def p_from_brief_similarity(texts, Z, brief_text, tfidf=None, svd=None):
    # if fallback embedding is used, embed brief in same space
    if (tfidf is not None) and (svd is not None):
        Xb = tfidf.transform([brief_text])
        Zb = svd.transform(Xb)
        Zb = Zb / (np.linalg.norm(Zb, axis=1, keepdims=True) + 1e-9)
        sim = cosine_similarity(Z, Zb).ravel()
    else:
        try:
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer(MODEL_NAME)
            vb = model.encode([brief_text], normalize_embeddings=True, show_progress_bar=False)[0]
            sim = cosine_similarity(Z, vb.reshape(1,-1)).ravel()
        except Exception:
            # last-resort: small independent TF-IDF space
            tfidf2 = TfidfVectorizer(ngram_range=(1,2), max_features=2000, min_df=1)
            Xc = tfidf2.fit_transform(texts + [brief_text])
            svd2 = TruncatedSVD(n_components=min(64, Xc.shape[1]-1), random_state=RANDOM_SEED)
            Zc = svd2.fit_transform(Xc); Zc /= (np.linalg.norm(Zc, axis=1, keepdims=True) + 1e-9)
            sim = cosine_similarity(Zc[:-1], Zc[-1:].reshape(1,-1)).ravel()
    # logistic of z-score for [0,1]-like probability
    z = (sim - sim.mean()) / (sim.std() + 1e-9)
    return 1.0 / (1.0 + np.exp(-z))

In [6]:
df = pd.read_csv(CSV_PATH)
org_col, text_cols, grant_cols, label_col = detect_columns(df)
if label_col is None:
    raise ValueError("No label column detected (look for 'label'/'response' etc).")
texts = build_texts(df, org_col, text_cols)

# Parse grant-like columns → row-wise median
grant_matrix = []
for c in grant_cols:
    if pd.api.types.is_numeric_dtype(df[c]):
        grant_matrix.append(df[c].astype(float).values)
    else:
        grant_matrix.append(df[c].apply(money_to_float).values)
grant_matrix = np.vstack(grant_matrix) if grant_matrix else np.zeros((1, len(df)))
grant_dollars = np.nanmedian(grant_matrix, axis=0) if grant_matrix.size else np.full(len(df), np.nan)

# keep missing as NaN (do NOT force 1 or median)
grant_dollars = grant_dollars.astype(float)
grant_missing = np.isnan(grant_dollars).astype(int)

print("Grant missing rate:", grant_missing.mean())

# Embeddings
Z, embed_note, tfidf, svd = sentence_transformer_embeddings(texts)
if Z is None:
    Z, embed_note, tfidf, svd = tfidf_svd_embeddings(texts)

# Unsupervised descriptors
mu_w, sigma_w, u_norm, neighbor_idx = knn_stats(Z, grant_dollars, k=K_NEIGHBORS)
mu_w = mu_w.astype(float)
sigma_w = sigma_w.astype(float)

cv = np.full_like(mu_w, np.nan, dtype=float)
mask = np.isfinite(mu_w) & (mu_w > 0) & np.isfinite(sigma_w)
cv[mask] = sigma_w[mask] / (mu_w[mask] + 1e-9)

# optional: for scoring, treat missing cv as median cv
cv_fill = np.nanmedian(cv[np.isfinite(cv)]) if np.isfinite(cv).any() else 0.0
cv_w = np.where(np.isfinite(cv), cv, cv_fill)
cv = np.nan_to_num(cv, nan=0.0, posinf=1.0, neginf=0.0)
cv_w = _winsorize(cv, 5, 95)

# Mission fit prior p_raw
p_raw = p_from_brief_similarity(texts, Z, BRIEF, tfidf=tfidf, svd=svd)

# Clusters (for diversity/features)
from sklearn.cluster import KMeans
K = 6
cluster_id = KMeans(n_clusters=K, n_init="auto", random_state=RANDOM_SEED).fit_predict(Z)

print("Embedding:", embed_note)
print("Detected label column:", label_col)

Grant missing rate: 0.027777777777777776




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding: Transformer embeddings: sentence-transformers/all-MiniLM-L6-v2
Detected label column: Category


In [7]:
# force exact label column name if you prefer:
LABEL_COL_OVERRIDE = None  # e.g., "outreach_result" or "label"

def detect_label_col(df):
    if LABEL_COL_OVERRIDE and LABEL_COL_OVERRIDE in df.columns:
        return LABEL_COL_OVERRIDE
    cands = [c for c in df.columns if re.search(r"(label|response|outcome|y|clicked|engaged|replied|positive)", c, re.I)]
    return cands[0] if cands else None

def coerce_binary(series: pd.Series) -> np.ndarray:
    if pd.api.types.is_numeric_dtype(series):
        s = series.fillna(0).astype(float)
        return np.clip(s, 0, 1).astype(int).values
    t = series.fillna("").astype(str).str.strip().str.lower()
    pos_tokens = {
        "1","1.0","yes","y","true","t","engaged","responded","positive",
        "interested","scheduled","accepted","warm","replied"
    }
    return t.isin(pos_tokens).astype(int).values

label_col = detect_label_col(df)
if label_col is None:
    raise ValueError("No label column found. Set LABEL_COL_OVERRIDE or add a 0/1 column.")

y_raw = coerce_binary(df[label_col])
print("Initial label counts:", {0:int((y_raw==0).sum()), 1:int((y_raw==1).sum())})

# Seed positives if needed (PU fallback)
all_zero = (y_raw.sum() == 0)
POSITIVE_ANCHORS = {
    "newschools venture fund", "at&t aspire", "schmidt futures"
    # add more if present in your dataset
}
org_names_lc = df[org_col].astype(str).str.lower().values
anchor_hits = np.array([n in POSITIVE_ANCHORS for n in org_names_lc], dtype=bool)

TOP_M_SEED = 10
y_seed = y_raw.copy()
if all_zero:
    if anchor_hits.sum() > 0:
        y_seed[anchor_hits] = 1
        print(f"Anchors found: {int(anchor_hits.sum())} seeded as positives.")
    else:
        top_idx = np.argsort(-p_raw)[:min(TOP_M_SEED, len(p_raw))]
        y_seed[top_idx] = 1
        print(f"No anchors present; seeded positives with top-{len(top_idx)} by mission fit.")
print("Final seed label counts:", {0:int((y_seed==0).sum()), 1:int((y_seed==1).sum())})

Initial label counts: {0: 216, 1: 0}
No anchors present; seeded positives with top-10 by mission fit.
Final seed label counts: {0: 206, 1: 10}


In [8]:
# Feature matrix: [p_raw, mu, CV, u, cluster one-hots, PCA(Z)]
pca_dim = int(min(16, Z.shape[1], max(2, Z.shape[0]-1)))
Zp  = PCA(n_components=pca_dim, random_state=RANDOM_SEED).fit_transform(Z)
cluster_onehots = pd.get_dummies(pd.Series(cluster_id), prefix="c", dtype=int).values

X = np.column_stack([
    p_raw.reshape(-1,1),
    mu_w.reshape(-1,1),
    cv_w.reshape(-1,1),
    u_norm.reshape(-1,1),
    cluster_onehots,
    Zp
]).astype(float)

# ---- PATCH: median-impute NaNs in X + add missingness indicator ----
col_median = np.nanmedian(X, axis=0)
col_median = np.where(np.isfinite(col_median), col_median, 0.0)

inds = np.where(~np.isfinite(X))
X[inds] = np.take(col_median, inds[1])

mu_missing = (~np.isfinite(mu_w)).astype(int).reshape(-1, 1)
X = np.column_stack([X, mu_missing])

scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

def supervised_prob(X, y):
    # choose between LogReg and RF by PR-AUC, adapt folds to minority size
    def adaptive_folds(y):
        m = int(min((y==0).sum(), (y==1).sum()))
        return 5 if m>=5 else (3 if m>=3 else (2 if m>=2 else 0))

    def eval_model(model, Xd, yd, use_scaled):
        n_splits = adaptive_folds(yd)
        if n_splits >= 2:
            skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
            aucs, aps = [], []
            for tr, va in skf.split(Xd, yd):
                Xm = X_scaled[tr] if use_scaled else Xd[tr]
                Xv = X_scaled[va] if use_scaled else Xd[va]
                model.fit(Xm, yd[tr])
                p = model.predict_proba(Xv)[:,1]
                aucs.append(roc_auc_score(yd[va], p))
                aps.append(average_precision_score(yd[va], p))
            return np.mean(aucs), np.mean(aps)
        else:
            X_tr, X_va, y_tr, y_va = train_test_split(Xd, yd, test_size=0.3, stratify=yd if yd.sum()>0 else None, random_state=RANDOM_SEED)
            Xm = X_scaled if use_scaled else X_tr
            Xv = X_scaled if use_scaled else X_va
            model.fit(Xm, y_tr)
            p = model.predict_proba(Xv)[:,1]
            return roc_auc_score(y_va, p), average_precision_score(y_va, p)

    lr = LogisticRegression(max_iter=200, C=1.0, class_weight="balanced", solver="liblinear")
    rf = RandomForestClassifier(n_estimators=400, max_depth=None, min_samples_leaf=2,
                                random_state=RANDOM_SEED, class_weight="balanced_subsample")

    auc_lr, ap_lr = eval_model(lr, X, y, use_scaled=True)
    auc_rf, ap_rf = eval_model(rf, X, y, use_scaled=False)

    use_rf = ap_rf >= ap_lr
    best = rf if use_rf else lr
    best.fit(X if use_rf else X_scaled, y)
    p_hat = best.predict_proba(X if use_rf else X_scaled)[:,1]
    print(f"[Supervised] Using {'RandomForest' if use_rf else 'LogReg'} | AUC={max(auc_lr, auc_rf):.3f} PR-AUC={max(ap_lr, ap_rf):.3f}")
    return p_hat

def pu_bagging_prob(X, y_seed, n_bags=50, neg_ratio=2.0):
    P = np.where(y_seed==1)[0]
    U = np.where(y_seed==0)[0]
    if len(P)==0:
        raise ValueError("PU bagging requires at least one seeded positive.")
    probs = np.zeros(len(y_seed), float)
    for b in range(n_bags):
        m_neg = max(1, int(len(P)*neg_ratio))
        N = np.random.choice(U, size=min(m_neg, len(U)), replace=False)
        idx = np.concatenate([P, N])
        yb  = np.concatenate([np.ones(len(P), int), np.zeros(len(N), int)])
        clf = LogisticRegression(max_iter=200, C=1.0, class_weight="balanced", solver="liblinear", random_state=RANDOM_SEED+b)
        clf.fit(X_scaled[idx], yb)   # scale helps linear model
        probs += clf.predict_proba(X_scaled)[:,1]
    probs /= n_bags
    print(f"[PU] bags={n_bags}, seeded_pos={len(P)}, unlabeled={len(U)}")
    return probs

# Pick mode
if y_raw.sum() > 0:
    p_hat = supervised_prob(X, y_raw)
else:
    p_hat = pu_bagging_prob(X, y_seed, n_bags=50, neg_ratio=2.0)

# Scores
mu_fill = np.nanmedian(mu_w[np.isfinite(mu_w)]) if np.isfinite(mu_w).any() else 0.0
mu_for_score = np.where(np.isfinite(mu_w), mu_w, mu_fill)

EV_sup      = p_hat * mu_for_score
RiskAdj_sup = EV_sup - LAMBDA_CV * cv_w * mu_for_score

# Bootstrap EV uncertainty from neighbor grants
rng = np.random.default_rng(RANDOM_SEED)
SE_EV = np.zeros_like(EV_sup, dtype=float)
lo_mu, hi_mu = np.percentile(mu_w[np.isfinite(mu_w)], 5), np.percentile(mu_w[np.isfinite(mu_w)], 95)
for i, neigh in enumerate(neighbor_idx):
    g_local = grant_dollars[neigh]
    g_local = g_local[np.isfinite(g_local)]
    if len(g_local) < 2:
        SE_EV[i] = 0.0
        continue
    boots = []
    for _ in range(BOOTSTRAPS):
        samp = rng.choice(g_local, size=len(g_local), replace=True)
        mu_b = float(np.clip(np.median(samp), lo_mu, hi_mu))
        boots.append(p_hat[i] * mu_b)
    SE_EV[i] = float(np.std(boots, ddof=1))

UCB_sup = RiskAdj_sup + BETA_UCB * SE_EV

def priority_index(score):
    ranks = pd.Series(score).rank(method="average", ascending=False)
    return 100.0 * (ranks.max() - ranks) / (ranks.max() - 1 + 1e-9)

PI_EV  = priority_index(EV_sup)
PI_RA  = priority_index(RiskAdj_sup)
PI_UCB = priority_index(UCB_sup)

# Label-aware diagnostics (uses y_raw; fine even if all zeros—shows 0s)
# ---- Correct ranking metrics (fixes NDCG bug + evaluates on real labels when available) ----
def rank_metrics(scores, y, ks=(5,10,20)):
    order = np.argsort(-scores)
    res = {}
    y = np.asarray(y).astype(int)

    for k in ks:
        idx = order[:k]
        prec = float(y[idx].mean()) if k > 0 else 0.0
        rec  = float(y[idx].sum() / (y.sum() + 1e-9))

        # Correct NDCG: relevance = y, predicted scores = scores
        ndcg = float(ndcg_score(y.reshape(1,-1), scores.reshape(1,-1), k=k))
        res[k] = (prec, rec, ndcg)
    return res

# Evaluate on true labels if we have them; otherwise show PU-proxy metrics on seeded positives.
y_eval = y_raw if y_raw.sum() > 0 else y_seed
tag = "" if y_raw.sum() > 0 else " [PU-proxy: seeded positives]"

print("\nPrecision/Recall/NDCG @k (EV, RiskAdj, UCB) — using" + (" real labels:" if tag=="" else " seeded PU labels:"))
for name, s in [("EV", EV_sup), ("RiskAdj", RiskAdj_sup), ("UCB", UCB_sup)]:
    metr = rank_metrics(s, y_eval, ks=(5,10,20))
    line = ", ".join([f"@{k} P={metr[k][0]:.2f} R={metr[k][1]:.2f} NDCG={metr[k][2]:.2f}" for k in metr])
    print(f"{name}{tag}: {line}")

# Classifier AUC/PR-AUC are only meaningful with real positives in y_raw.
if y_raw.sum() > 0:
    print(f"AUC={roc_auc_score(y_raw, p_hat):.3f}  PR-AUC={average_precision_score(y_raw, p_hat):.3f}")
else:
    print("AUC/PR-AUC skipped (no real positives in y_raw; collect outreach responses for true evaluation).")

[PU] bags=50, seeded_pos=10, unlabeled=206

Precision/Recall/NDCG @k (EV, RiskAdj, UCB) — using seeded PU labels:
EV [PU-proxy: seeded positives]: @5 P=0.00 R=0.00 NDCG=0.00, @10 P=0.00 R=0.00 NDCG=0.00, @20 P=0.00 R=0.00 NDCG=0.00
RiskAdj [PU-proxy: seeded positives]: @5 P=0.00 R=0.00 NDCG=0.00, @10 P=0.00 R=0.00 NDCG=0.00, @20 P=0.00 R=0.00 NDCG=0.00
UCB [PU-proxy: seeded positives]: @5 P=0.40 R=0.20 NDCG=0.55, @10 P=0.40 R=0.40 NDCG=0.49, @20 P=0.20 R=0.40 NDCG=0.49
AUC/PR-AUC skipped (no real positives in y_raw; collect outreach responses for true evaluation).


In [9]:
ranked = pd.DataFrame({
    "org_name": df[org_col].astype(str).values,
    "label_raw": y_raw,
    "label_seed": y_seed,            # useful if PU used
    "p_raw": np.round(p_raw, 4),
    "p_hat": np.round(p_hat, 4),
    "mu_capacity_$": np.round(mu_w, 0),
    "risk_cv": np.round(cv_w, 3),
    "u_iso": np.round(u_norm, 3),
    "SE_EV": np.round(SE_EV, 2),
    "EV_supervised": np.round(EV_sup, 2),
    "RiskAdj_supervised": np.round(RiskAdj_sup, 2),
    "UCB_supervised": np.round(UCB_sup, 2),
    "PI_EV": np.round(PI_EV, 1),
    "PI_RiskAdj": np.round(PI_RA, 1),
    "PI_UCB": np.round(PI_UCB, 1),
    "cluster": cluster_id
}).sort_values("PI_UCB", ascending=False).reset_index(drop=True)

csv_path = os.path.join(OUT_DIR, "ops_supervised_rankings.csv")
ranked = ranked[~ranked["org_name"].str.contains("IterLight", case=False, na=False)].reset_index(drop=True)
ranked.to_csv(csv_path, index=False)
print("Wrote:", csv_path)
display(ranked.head(12))

# Plot 1: Top-15 by Priority Index (UCB)
top15 = ranked.head(15).iloc[::-1]
plt.figure(figsize=(8,6))
plt.barh(top15["org_name"], top15["PI_UCB"])
plt.xlabel("Priority Index (UCB, 0–100)")
plt.title("Top 15 Outreach Priorities")
plt.tight_layout(); plt.savefig(os.path.join(OUT_DIR, "plot_top15_supervised.png"), dpi=150); plt.close()

# Plot 2: p_hat vs mu (log), bubble size = SE(EV)
mask = np.isfinite(mu_w)
plt.figure(figsize=(8,6))

se = np.nan_to_num(SE_EV, nan=0.0, posinf=0.0, neginf=0.0)
sizes = 30 + 150 * (se - se.min()) / (np.ptp(se) + 1e-9)

plt.scatter(p_hat[mask], mu_w[mask], s=sizes[mask], alpha=0.7)
plt.yscale("log")
plt.xlabel("p_hat (supervised/PU)")
plt.ylabel("mu_capacity_$ (log)")
plt.title("On-mission fit vs Capacity (known grant neighbors only)")
plt.tight_layout(); plt.savefig(os.path.join(OUT_DIR, "plot_phat_vs_mu_log.png"), dpi=150); plt.close()

Wrote: ops_supervised_outputs/ops_supervised_rankings.csv


Unnamed: 0,org_name,label_raw,label_seed,p_raw,p_hat,mu_capacity_$,risk_cv,u_iso,SE_EV,EV_supervised,RiskAdj_supervised,UCB_supervised,PI_EV,PI_RiskAdj,PI_UCB,cluster
0,U.S. Dept. of Education (EIR Program),0,1,0.9329,0.9406,0.0,0.0,0.805,2103.39,0.0,0.0,420.68,0.0,64.2,100.0,2
1,Austin FC - 4ATX Foundation,0,0,0.3524,0.0937,0.0,0.0,0.366,384.6,0.0,0.0,76.92,0.0,64.2,99.1,1
2,University of Florida Foundation,0,0,0.7788,0.5439,0.0,0.0,0.613,356.57,0.0,0.0,71.31,0.0,64.2,98.6,2
3,University of South Florida - Community Partne...,0,0,0.6379,0.442,0.0,0.0,0.471,289.8,0.0,0.0,57.96,0.0,64.2,98.1,2
4,University of Southern California (USC) Good N...,0,0,0.5891,0.3755,0.0,0.0,0.772,146.96,0.0,0.0,29.39,0.0,64.2,97.7,2
5,San Jose Earthquakes Community Fund,0,0,0.3982,0.0384,0.0,0.0,0.382,118.33,0.0,0.0,23.67,0.0,64.2,97.2,1
6,Virginia Tech - Outreach and International Aff...,0,0,0.848,0.8981,0.0,0.0,0.671,82.3,0.0,0.0,16.46,0.0,64.2,96.7,2
7,University of Colorado Boulder - Office for Ou...,0,1,0.9036,0.9329,0.0,0.0,0.543,52.23,0.0,0.0,10.45,0.0,64.2,96.3,2
8,Texas A&M University - Public Partnership & Ou...,0,1,0.8724,0.9265,0.0,0.0,0.535,51.9,0.0,0.0,10.38,0.0,64.2,95.8,2
9,Penn State University - Outreach & Engagement,0,0,0.8216,0.8861,0.0,0.0,0.583,49.56,0.0,0.0,9.91,0.0,64.2,95.3,2
