In [1]:
import re
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
import umap.umap_ as umap
import hdbscan

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration -----------------------------------------------------------
DATA_PATH = "../data/Capstone_Final_Dataset.xlsx"
MODEL_KEY = "bge_large"
# Supervised target; must exist in the spreadsheet (1/0 for contacted, NaN otherwise).
LABEL_COL = "Label"

# --- Load the workbook and discard columns this notebook never reads ---------
# errors="ignore" keeps the drop tolerant of sheets that lack some of these.
data_full = pd.read_excel(DATA_PATH).drop(
    columns=[
        'Category',
        'Geographic Scope',
        'DEI Priorities', 'Tier',
        'Verified Grant URL / Contact', 'Confirmed Application Deadlines',
        'Strategic Note / Pitch Angle', 'Partnership Potential',
        'Best Pitch Angle', 'Enhanced Notes', 'Contact Person', 'Typical Grant Size',
        'Application Process',
    ],
    errors="ignore",
)

# --- Keep only financial funders with both grant-size bounds present ---------
is_funder = data_full['Financial Funder'].eq(1)
has_min_grant = data_full['Typical Min. Grant Size'].notna()
has_max_grant = data_full['Typical Max. Grant Size'].notna()
mask = is_funder & has_min_grant & has_max_grant
data_full = data_full.loc[mask].copy()

# --- Locate the organisation-name column (first column as a fallback) --------
if "Organization" in data_full.columns:
    org_col = "Organization"
else:
    org_col = data_full.columns[0]

# The IterLight row is the reference point for the similarity features,
# so it must have survived the filter above.
iter_mask = data_full[org_col].astype(str).str.contains(
    r"\biter\s*light\b", case=False, na=False
)
assert iter_mask.any(), "IterLight row not found (needed for sim_to_iter)."

# Columns that later cells read directly.
for required_col in ("Mission / Stated Pillars", "Geo_Focus"):
    assert required_col in data_full.columns, f"Missing column: {required_col}"


In [3]:
# Place names removed from mission text by clean_mission().
_PLACES = [
    "los angeles", "new york", "north carolina", "south carolina", "fort worth",
    "tampa bay", "kansas city",
    "anaheim", "atlanta", "augusta", "baltimore", "berkeley", "bronx", "brooklyn",
    "durham", "england", "florida", "houston", "indiana", "jersey", "madison",
    "michigan", "midwest", "minnesota", "oakland", "ontario", "philadelphia",
    "pittsburgh", "south", "tampa", "tennessee", "texas", "utah", "waco",
    "western", "wisconsin", "york", "alberta",
    "la", "nyc", "kc", "nyy",
]
# Longest names first so that e.g. "new york" wins over "york" in the alternation.
_PLACES_RX = re.compile(
    r"\b(" + "|".join(map(re.escape, sorted(_PLACES, key=len, reverse=True))) + r")\b",
    re.IGNORECASE
)

URL_EMAIL_RX      = re.compile(r"https?://\S+|www\.\S+|\S+@\S+")
YEAR_RANGE_RX     = re.compile(r"\b(?:19|20)\d{2}\s*[-–—]\s*(?:19|20)\d{2}\b")
YEAR_RX           = re.compile(r"\b(?:19|20)\d{2}\b")
# The magnitude suffix (k/m/b) must be a standalone letter: the trailing \b stops
# the pattern from swallowing the first letter of the next word
# ("$5 million" previously became "illion", "$500 by" became "y").
MONEY_RX          = re.compile(r"\$\s?\d[\d,]*(?:\.\d+)?(?:\s*[kKmMbB]\b)?")
STANDALONE_NUM_RX = re.compile(r"(?<![A-Za-z-])\b\d+\b(?![A-Za-z-])")
ORDINAL_RX        = re.compile(r"\b\d+(?:st|nd|rd|th)\b", re.IGNORECASE)
PUNCT_RX          = re.compile(r"[\$&'()/,.:;]")

def clean_mission(text: str) -> str:
    """Normalise a mission statement into lowercase, noise-free text.

    Steps, in order: unify typographic quotes/dashes/ellipsis; blank out URLs and
    e-mail addresses, dollar amounts, year ranges, single years and known place
    names; strip selected punctuation and free-standing hyphens; remove standalone
    numbers and ordinals; lowercase; collapse whitespace.

    Parameters
    ----------
    text : str
        Raw mission text. Missing values (anything ``pd.isna`` flags) are accepted.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    s = str(text).strip()
    # Map typographic characters to ASCII so the later regexes see one form only.
    s = (s.replace("’","'").replace("‘","'")
           .replace("“",'"').replace("”",'"')
           .replace("—","-").replace("–","-").replace("…","."))

    # Remove whole tokens first, while their punctuation is still intact.
    s = URL_EMAIL_RX.sub(" ", s)
    s = MONEY_RX.sub(" ", s)
    s = YEAR_RANGE_RX.sub(" ", s)
    s = YEAR_RX.sub(" ", s)
    s = _PLACES_RX.sub(" ", s)

    s = PUNCT_RX.sub(" ", s)
    # Drop hyphens that are not joining two word characters (keeps "after-school").
    s = re.sub(r"(?<!\w)-(?!\w)", " ", s)

    s = STANDALONE_NUM_RX.sub(" ", s)
    s = ORDINAL_RX.sub(" ", s)

    s = s.lower()
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Clean every mission statement, then hand the embedder a plain list of strings.
data_full["mission_clean"] = data_full["Mission / Stated Pillars"].map(clean_mission)
texts = [str(mission) for mission in data_full["mission_clean"]]


In [4]:
# Load the sentence encoder and embed the cleaned missions.
# normalize_embeddings=True asks the encoder for unit-length vectors.
st_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
E = st_model.encode(
    texts,
    show_progress_bar=True,
    normalize_embeddings=True,
)


Batches: 100%|██████████| 5/5 [00:00<00:00,  5.84it/s]


In [5]:
# --- Tunable clustering parameters -------------------------------------------
UMAP_NEIGHBORS = 10
UMAP_COMPONENTS_FOR_CLUSTER = 15
UMAP_MIN_DIST = 0.1

HDB_MIN_CLUSTER_SIZE = 5
HDB_SELECTION = "eom"

# --- Step 1: reduce the embeddings with UMAP (cosine metric, fixed seed) ------
um_cluster = umap.UMAP(
    metric="cosine",
    random_state=42,
    n_neighbors=UMAP_NEIGHBORS,
    min_dist=UMAP_MIN_DIST,
    n_components=UMAP_COMPONENTS_FOR_CLUSTER,
)
X_umap = um_cluster.fit_transform(E)

# --- Step 2: cluster the reduced points with HDBSCAN --------------------------
# Later cells treat a label of -1 as "noise / unassigned".
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=HDB_MIN_CLUSTER_SIZE,
    cluster_selection_method=HDB_SELECTION,
    metric="euclidean",
)
labels = clusterer.fit_predict(X_umap)

# Persist the raw assignment next to each organisation.
data_full[f"cluster_raw__{MODEL_KEY}"] = labels


  warn(


In [6]:
# Organisation-name patterns that flag a row as a "proxy success".
proxy_patterns = [
    r"\breach\b", r"\breach capital\b", r"\bnational science foundation\b", r"\bnsf\b",
    r"\bsolve\b", r"\by combinator\b|\byc\b", r"\bskydeck\b",
    r"\binstitute of education sciences\b|\bies\b"
]

# A row is a proxy if ANY pattern matches, so OR the patterns into one regex.
# Non-capturing groups keep each pattern's own alternation self-contained.
any_proxy_rx = "|".join(f"(?:{pattern})" for pattern in proxy_patterns)
proxy_mask = (
    data_full[org_col]
    .astype(str)
    .str.contains(any_proxy_rx, case=False, na=False)
    .to_numpy()
)

# Target cluster = most common non-noise cluster among proxy rows.
# With no proxy inside a real cluster, use the IterLight row's own label instead.
proxy_cluster_labels = labels[proxy_mask & (labels != -1)]
if len(proxy_cluster_labels) > 0:
    TARGET_CLUSTER = pd.Series(proxy_cluster_labels).mode()[0]
else:
    TARGET_CLUSTER = labels[iter_mask][0]

data_full["proxy_success"] = proxy_mask.astype(int)


In [7]:
# --- Centroid of every real (non-noise) cluster, in embedding space ----------
valid = labels != -1
cluster_ids = sorted(np.unique(labels[valid]))
assert len(cluster_ids) > 0, "All points were labeled noise (-1). Adjust HDBSCAN params."

centroids = {cid: E[(labels == cid) & valid].mean(axis=0) for cid in cluster_ids}
C = np.vstack([centroids[cid] for cid in cluster_ids])

# --- Resolve the target cluster ----------------------------------------------
# TARGET_CLUSTER can be -1 when the IterLight row itself was labelled noise and no
# proxy organisation landed in a real cluster. -1 is never in cluster_ids, so
# cluster_ids.index() would raise ValueError; fall back to the cluster whose
# centroid is most similar to the IterLight embedding.
if TARGET_CLUSTER in cluster_ids:
    target_cid = TARGET_CLUSTER
else:
    iter_vec = E[np.asarray(iter_mask, dtype=bool)][0]
    target_cid = cluster_ids[int(cosine_similarity(iter_vec.reshape(1, -1), C).argmax())]

# --- Rank clusters by centroid similarity to the target cluster --------------
t_idx = cluster_ids.index(target_cid)
sim_mat = cosine_similarity(C)
order = np.argsort(sim_mat[t_idx])[::-1]  # most similar first
ordered_cids = [cluster_ids[i] for i in order]

# Weight by rank; clusters beyond the listed ranks reuse the last weight.
base_weights = [1.0, 0.5, 0.25, 0.15, 0.10]
weights_map = {}
for rank, cid in enumerate(ordered_cids):
    weights_map[cid] = base_weights[rank] if rank < len(base_weights) else base_weights[-1]

# --- Soft assignment: give noise points their most similar centroid's cluster -
sim_to_centroids = cosine_similarity(E, C)
nearest_idx = sim_to_centroids.argmax(axis=1)
nearest_cid = np.array([cluster_ids[i] for i in nearest_idx])

labels_soft = labels.copy()
labels_soft[labels == -1] = nearest_cid[labels == -1]

# soft_fit is each row's best similarity to ANY centroid (not necessarily its own).
soft_fit = sim_to_centroids.max(axis=1)
cluster_w = np.array([weights_map.get(c, 0.0) for c in labels_soft])

data_full[f"cluster_soft__{MODEL_KEY}"] = labels_soft
data_full[f"soft_fit__{MODEL_KEY}"] = soft_fit
data_full[f"cluster_weight__{MODEL_KEY}"] = cluster_w


In [8]:
# Embedding of the first row matching IterLight; used as the reference vector.
iter_emb = E[iter_mask][0]

# Cosine similarity of every organisation to that reference vector.
sim_to_iter = cosine_similarity(E, iter_emb.reshape(1, -1)).ravel()

# Unsupervised priority: similarity scaled by cluster weight and centroid fit.
priority_score = sim_to_iter * cluster_w * soft_fit

for col_prefix, col_values in (
    ("sim_to_iter", sim_to_iter),
    ("priority_score", priority_score),
):
    data_full[f"{col_prefix}__{MODEL_KEY}"] = col_values


In [9]:
# Geographic weight: 0.7 when Geo_Focus is exactly 1, otherwise 0.3.
in_geo_focus = data_full["Geo_Focus"] == 1
geo = np.where(in_geo_focus, 0.7, 0.3)
p_x = priority_score * geo

# Midpoint of the typical grant range, per organisation.
grant_bounds = data_full[["Typical Min. Grant Size", "Typical Max. Grant Size"]]
mean_typical_grant = grant_bounds.mean(axis=1).to_numpy()
expected_value = p_x * mean_typical_grant

# Store every derived quantity; insertion order fixes the column order.
derived_columns = {
    "geo_weight": geo,
    "p_x_geo": p_x,
    "mean_typical_grant": mean_typical_grant,
    "log_grant": np.log1p(mean_typical_grant),
    "expected_value": expected_value,
}
for col_prefix, col_values in derived_columns.items():
    data_full[f"{col_prefix}__{MODEL_KEY}"] = col_values


In [10]:
# Rows whose organisation name matches IterLight are left out of the rankings.
org_names = data_full[org_col].astype(str)
exclude_mask = org_names.str.contains(r"\biter\s*light\b", case=False, na=False)

# Baseline ranking: highest expected value first.
candidates = data_full.loc[~exclude_mask].copy()
baseline_rank = candidates.sort_values(
    by=f"expected_value__{MODEL_KEY}",
    ascending=False,
)

baseline_rank.to_excel("baseline_rank_bgelarge_cosine_original.xlsx", index=False)


In [11]:
assert LABEL_COL in data_full.columns, (
    f"Missing '{LABEL_COL}' column. Add it to the Excel file: "
    "1/0 for contacted orgs, NaN for not contacted."
)

# Engineered features fed to the supervised model.
feat_cols = [
    f"sim_to_iter__{MODEL_KEY}",
    f"cluster_weight__{MODEL_KEY}",
    f"soft_fit__{MODEL_KEY}",
    "Geo_Focus",
    f"log_grant__{MODEL_KEY}",
]

# Training rows: everything except IterLight itself, restricted to labeled rows.
train_df = data_full.loc[~exclude_mask].copy()
train_df = train_df.loc[train_df[LABEL_COL].notna()].copy()

# Check before casting: astype(int) would silently truncate values such as 0.5.
assert train_df[LABEL_COL].isin([0, 1]).all(), (
    f"'{LABEL_COL}' must contain only 1/0 (or NaN for not contacted)."
)
train_df[LABEL_COL] = train_df[LABEL_COL].astype(int)

X_train = train_df[feat_cols].astype(float).to_numpy()
y_train = train_df[LABEL_COL].to_numpy()

# minimal sanity
assert len(train_df) >= 10, "Too few labeled rows to train anything meaningful."
assert y_train.sum() >= 1, "Need at least one positive label=1 to train."
# A classifier cannot be fitted on a single class, so negatives are required too.
assert (y_train == 0).sum() >= 1, "Need at least one negative label=0 to train."
# Surface missing/infinite feature values here instead of inside the model fit.
assert np.isfinite(X_train).all(), (
    "Training features contain NaN/inf; check: " + ", ".join(feat_cols)
)


In [12]:
# L2-regularised logistic regression; class_weight="balanced" reweights classes.
log_reg = LogisticRegression(
    solver="liblinear",
    penalty="l2",
    C=1.0,
    class_weight="balanced",
    max_iter=2000,
    random_state=42,
)

# Standardise the features, then classify.
clf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("lr", log_reg),
])

clf.fit(X_train, y_train)


0,1,2
,steps,"[('scaler', ...), ('lr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,2000


In [13]:
# Score every organisation except IterLight with the fitted pipeline.
score_df = data_full.loc[~exclude_mask].copy()
X_all = score_df[feat_cols].astype(float).to_numpy()

# Column 1 of predict_proba is the probability of the positive class.
p_hat = clf.predict_proba(X_all)[:, 1]
p_hat_col = f"p_hat_supervised__{MODEL_KEY}"
score_df[p_hat_col] = p_hat

# Highest predicted probability first, then export.
sup_rank = score_df.sort_values(by=p_hat_col, ascending=False).copy()
sup_rank.to_excel("supervised_rank_engineered_features.xlsx", index=False)


In [14]:
# Number of organisations to export for the next outreach round.
K = 30

# A row counts as already contacted when it carries any label (1 or 0).
contacted_mask = sup_rank[LABEL_COL].notna()
next_outreach = sup_rank.loc[~contacted_mask].copy()

# sup_rank is already sorted by predicted probability, so the first K rows are the top K.
next_outreach_topK = next_outreach.iloc[:K].copy()
next_outreach_topK.to_excel("next_outreach_top30_supervised.xlsx", index=False)
