# Phase 1: Data Preprocessing and Feature Engineering

Step 1: Load → Target → Clean names

In [3]:
"""
STEP 1:
* Builds is_south_asian using all present SA indicator columns.
* Normalizes names (unicode, spacing, case) without throwing information away.
* Extracts first_clean and last_clean so we can handle mixed-origin names later.
* Adds simple diagnostics (share of SA, avg lengths).

"""

import re, unicodedata, sqlite3
from pathlib import Path
import pandas as pd

# ======= CONFIG =======
CSV_PATH   = Path("/mnt/data/americans_by_descent.csv")   # update path if needed
DB_PATH    = Path("./sa_names.db")                        # SQLite file
CHUNK_SIZE = 50_000                                       # adjust to your RAM (e.g., 20k–200k)
# ======================

# 1) Peek the header to select only needed columns
hdr = pd.read_csv(CSV_PATH, nrows=0, encoding="ISO-8859-1", low_memory=False)
all_cols = list(map(str, hdr.columns))

south_asian_candidates = [
    "South Asian",
    "Bangladeshi","Bhutanese","Indian","Nepali","Pakistani","Sri Lankan",
    "Bengali","Gujarati","Kashmiri","Malayali","Punjabi","Sindhi",
    "Sinhala","Sri Lankan Tamil","Tamil","Telugu","Marathi","Kannadiga",
    "Tuluva","Rajasthani","Bihari","Awadhi","Bhojpuri","Maithil","Haryanvi",
]
present_sa_cols = [c for c in south_asian_candidates if c in all_cols]

usecols = [c for c in ["id","name"] if c in all_cols] + present_sa_cols
assert "name" in usecols, "The CSV must contain a 'name' column."

# 2) Cleaning helpers
def normalize_unicode(s: str) -> str:
    if not isinstance(s, str): return ""
    s = (s.replace("\u2018", "'").replace("\u2019", "'")
           .replace("\u201C", '"').replace("\u201D", '"')
           .replace("\u2013", "-").replace("\u2014", "-")
           .replace("\u00A0", " "))
    return unicodedata.normalize("NFKC", s)

def ascii_fold(s: str) -> str:
    return unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")

def clean_name(raw: str) -> str:
    if not isinstance(raw, str): return ""
    s = normalize_unicode(raw).strip().lower()
    s = re.sub(r"[,\u200b]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def split_first_last(cleaned: str):
    if not cleaned: return "", ""
    parts = cleaned.split()
    if len(parts) == 1: return parts[0], ""
    return parts[0], parts[-1]

# 3) SQLite: create table with UNIQUE(name_ascii) to dedup at ingest
con = sqlite3.connect(DB_PATH)
cur = con.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS names_preprocessed (
  id                TEXT,
  name              TEXT NOT NULL,
  name_clean        TEXT NOT NULL,
  name_ascii        TEXT NOT NULL,
  first_clean       TEXT,
  last_clean        TEXT,
  is_south_asian    INTEGER NOT NULL,
  name_len          INTEGER,
  word_count        INTEGER,
  PRIMARY KEY (name_ascii)                -- unique key for dedup
);
""")
con.commit()

# 4) Stream the CSV in chunks
total_rows = 0
inserted_rows = 0
for chunk in pd.read_csv(CSV_PATH, usecols=usecols, chunksize=CHUNK_SIZE,
                         encoding="ISO-8859-1", low_memory=True):
    total_rows += len(chunk)

    # build target
    if present_sa_cols:
        chunk["is_south_asian"] = (chunk[present_sa_cols].fillna(0).sum(axis=1) > 0).astype("int8")
        # free unneeded SA columns immediately to save RAM
        chunk.drop(columns=present_sa_cols, inplace=True, errors="ignore")
    else:
        chunk["is_south_asian"] = 0

    # clean + structure
    chunk["name_clean"] = chunk["name"].map(clean_name)
    fl = chunk["name_clean"].map(split_first_last)
    chunk["first_clean"] = fl.map(lambda t: t[0])
    chunk["last_clean"]  = fl.map(lambda t: t[1])
    chunk["name_ascii"]  = chunk["name_clean"].map(ascii_fold)
    chunk["name_len"]    = chunk["name_clean"].str.len().astype("Int32")
    chunk["word_count"]  = chunk["name_clean"].str.split().str.len().astype("Int32")

    # keep only columns we store
    keep = [c for c in ["id","name","name_clean","name_ascii","first_clean","last_clean",
                        "is_south_asian","name_len","word_count"] if c in chunk.columns]
    chunk = chunk[keep]

    # insert with IGNORE to skip duplicates by PRIMARY KEY(name_ascii)
    # (executemany is fastest; wrap in transaction)
    recs = list(chunk.itertuples(index=False, name=None))
    cur.executemany(f"""
        INSERT OR IGNORE INTO names_preprocessed
        ({",".join(keep)}) VALUES ({",".join(["?"]*len(keep))})
    """, recs)
    con.commit()

    # stats
    # SQLite change count isn’t trivial, so we can estimate on next loop if desired
    # (or query COUNT(*) occasionally)
    print(f"Ingested chunk: {len(recs):,} rows | Total read: {total_rows:,}")

# final count
cnt = cur.execute("SELECT COUNT(*) FROM names_preprocessed").fetchone()[0]
print(f"SQLite rows (deduped): {cnt:,}")
con.close()

  for chunk in pd.read_csv(CSV_PATH, usecols=usecols, chunksize=CHUNK_SIZE,


Ingested chunk: 28,694 rows | Total read: 28,694
SQLite rows (deduped): 26,646


Step 2 — Exact Dedup + Stratified Splits

In [4]:
# STEP 2: We deduplicate by a normalized ASCII name to prevent duplicate names leaking across splits (optimistic test scores). Then we build 70/15/15 stratified splits on is_south_asian.s

import sqlite3

DB_PATH = "./sa_names.db"
con = sqlite3.connect(DB_PATH)
cur = con.cursor()

# Make sure we can use window functions efficiently
cur.execute("PRAGMA journal_mode=WAL;")
cur.execute("PRAGMA temp_store=MEMORY;")
cur.execute("PRAGMA synchronous=NORMAL;")

# 1) Add a persistent random key for deterministic partitioning (only fills if missing)
try:
    cur.execute("ALTER TABLE names_preprocessed ADD COLUMN rnd TEXT;")
    con.commit()
except sqlite3.OperationalError:
    # column exists
    pass

cur.execute("""
UPDATE names_preprocessed
SET rnd = COALESCE(rnd, lower(hex(randomblob(16))));
""")
con.commit()

# 2) Build splits table with ~70/15/15 per class (stratified by is_south_asian)
cur.execute("DROP TABLE IF EXISTS names_splits;")
cur.execute("""
CREATE TABLE names_splits AS
WITH ranked AS (
  SELECT
    name_ascii,
    is_south_asian,
    rnd,
    ROW_NUMBER() OVER (PARTITION BY is_south_asian ORDER BY rnd) AS rnk,
    COUNT(*)    OVER (PARTITION BY is_south_asian)               AS n
  FROM names_preprocessed
),
assigned AS (
  SELECT
    name_ascii, is_south_asian, rnd, rnk, n,
    CASE
      WHEN rnk <= 0.70*n THEN 'train'
      WHEN rnk <= 0.85*n THEN 'val'
      ELSE 'test'
    END AS split
  FROM ranked
)
SELECT * FROM assigned;
""")
con.commit()

# 3) Quick sanity: counts by split and class
def split_stats(tag):
    cnt = cur.execute(
        "SELECT COUNT(*) FROM names_splits WHERE split=?;", (tag,)
    ).fetchone()[0]
    sa  = cur.execute(
        "SELECT COALESCE(SUM(is_south_asian),0) FROM names_splits WHERE split=?;", (tag,)
    ).fetchone()[0]
    return cnt, sa, cnt-sa

train_cnt, train_sa, train_non = split_stats("train")
val_cnt,   val_sa,   val_non   = split_stats("val")
test_cnt,  test_sa,  test_non  = split_stats("test")

print({
    "split_sizes": {
        "train": train_cnt, "val": val_cnt, "test": test_cnt
    },
    "class_balance": {
        "train": {"sa": train_sa, "non_sa": train_non},
        "val":   {"sa": val_sa,   "non_sa": val_non},
        "test":  {"sa": test_sa,  "non_sa": test_non},
    }
})

# 4) Optional: verify no overlap between splits
overlap = cur.execute("""
SELECT COUNT(*) FROM (
  SELECT name_ascii, COUNT(DISTINCT split) AS k
  FROM names_splits
  GROUP BY name_ascii
  HAVING k > 1
);
""").fetchone()[0]
print({"name_overlap_across_splits": overlap})

con.close()

{'split_sizes': {'train': 18651, 'val': 3997, 'test': 3998}, 'class_balance': {'train': {'sa': 1696, 'non_sa': 16955}, 'val': {'sa': 364, 'non_sa': 3633}, 'test': {'sa': 364, 'non_sa': 3634}}}
{'name_overlap_across_splits': 0}


Step 3 — Build TF-IDF char n-gram features, then transform Val/Test Set

In [5]:
# STEP 3: TF-IDF char n-grams for full/first/last views (fit on train only)

"""
* Loads the train/val/test splits directly from SQLite (low memory).
* Fits three TF-IDF vectorizers (char n-grams 2–5) on train only.
* Transforms val/test with those exact vectorizers.
* Saves vectorizers and matrices to disk for modeling in the next step.
"""

from pathlib import Path
import sqlite3, json, joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

DB_PATH = "./sa_names.db"
ART_DIR = Path("./artifacts_tfidf")
ART_DIR.mkdir(parents=True, exist_ok=True)

# ---------- helpers ----------
def fetch_split(con, split: str, cols=('name_ascii','first_clean','last_clean','is_south_asian')):
    q = f"""
    SELECT {",".join(cols)}
    FROM names_preprocessed
    WHERE name_ascii IN (
        SELECT name_ascii FROM names_splits WHERE split=?
    );
    """
    return pd.read_sql_query(q, con, params=(split,))

def fit_vec(train_series):
    # Char n-grams capture name sub-structure well; min_df=2 reduces noise.
    vec = TfidfVectorizer(
        analyzer="char",
        ngram_range=(2,5),
        min_df=2,
        lowercase=False,   # we already normalized to lowercase
        norm="l2"
    )
    return vec.fit(train_series)

def transform_to_npz(vec, series, path_npz):
    X = vec.transform(series.fillna(""))
    sparse.save_npz(path_npz, X)
    return X.shape

# ---------- load splits ----------
con = sqlite3.connect(DB_PATH)
train_df = fetch_split(con, "train")
val_df   = fetch_split(con, "val")
test_df  = fetch_split(con, "test")
con.close()

# Sanity checks
assert {"name_ascii","first_clean","last_clean","is_south_asian"} <= set(train_df.columns)
print({
    "train_rows": len(train_df),
    "val_rows": len(val_df),
    "test_rows": len(test_df),
    "sa_rate_train_pct": round(train_df["is_south_asian"].mean()*100, 3),
    "sa_rate_val_pct": round(val_df["is_south_asian"].mean()*100, 3),
    "sa_rate_test_pct": round(test_df["is_south_asian"].mean()*100, 3),
})

# ---------- fit vectorizers on TRAIN only ----------
vec_full  = fit_vec(train_df["name_ascii"])
vec_first = fit_vec(train_df["first_clean"])
vec_last  = fit_vec(train_df["last_clean"])

# Save vectorizers
joblib.dump(vec_full,  ART_DIR / "vec_full_tfidf.joblib")
joblib.dump(vec_first, ART_DIR / "vec_first_tfidf.joblib")
joblib.dump(vec_last,  ART_DIR / "vec_last_tfidf.joblib")

# ---------- transform & save matrices ----------
shapes = {}

# full
shapes["Xtr_full"] = transform_to_npz(vec_full,  train_df["name_ascii"], ART_DIR / "Xtr_full.npz")
shapes["Xva_full"] = transform_to_npz(vec_full,  val_df["name_ascii"],   ART_DIR / "Xva_full.npz")
shapes["Xte_full"] = transform_to_npz(vec_full,  test_df["name_ascii"],  ART_DIR / "Xte_full.npz")

# first
shapes["Xtr_first"] = transform_to_npz(vec_first, train_df["first_clean"], ART_DIR / "Xtr_first.npz")
shapes["Xva_first"] = transform_to_npz(vec_first, val_df["first_clean"],   ART_DIR / "Xva_first.npz")
shapes["Xte_first"] = transform_to_npz(vec_first, test_df["first_clean"],  ART_DIR / "Xte_first.npz")

# last
shapes["Xtr_last"] = transform_to_npz(vec_last,  train_df["last_clean"], ART_DIR / "Xtr_last.npz")
shapes["Xva_last"] = transform_to_npz(vec_last,  val_df["last_clean"],   ART_DIR / "Xva_last.npz")
shapes["Xte_last"] = transform_to_npz(vec_last,  test_df["last_clean"],  ART_DIR / "Xte_last.npz")

report = {
    "tfidf_vocab_sizes": {
        "full":  len(vec_full.vocabulary_),
        "first": len(vec_first.vocabulary_),
        "last":  len(vec_last.vocabulary_),
    },
    "matrix_shapes": shapes
}
print(report)

{'train_rows': 18651, 'val_rows': 3997, 'test_rows': 3998, 'sa_rate_train_pct': np.float64(9.093), 'sa_rate_val_pct': np.float64(9.107), 'sa_rate_test_pct': np.float64(9.105)}
{'tfidf_vocab_sizes': {'full': 57322, 'first': 10835, 'last': 23277}, 'matrix_shapes': {'Xtr_full': (18651, 57322), 'Xva_full': (3997, 57322), 'Xte_full': (3998, 57322), 'Xtr_first': (18651, 10835), 'Xva_first': (3997, 10835), 'Xte_first': (3998, 10835), 'Xtr_last': (18651, 23277), 'Xva_last': (3997, 23277), 'Xte_last': (3998, 23277)}}


Step 4: Logistic Regression Baseline + Evaluation Harness

In [6]:
# STEP 4: Logistic Regression baselines for full / first / last + fusion

"""
* Memory-safe: loads splits from SQLite
* Class imbalance handled via class_weight="balanced"
* Reports PR-AUC, ROC-AUC, and thresholded metrics (max-F1, high-precision, high-recall)
* Saves models/vectorizers to ./artifacts_baseline
"""

from pathlib import Path
import json
import sqlite3
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score, roc_auc_score,
    precision_recall_curve, classification_report, confusion_matrix
)
from scipy import sparse

# -------------------
# Config
# -------------------
DB_PATH = "./sa_names.db"
ART_DIR = Path("./artifacts_baseline")
ART_DIR.mkdir(parents=True, exist_ok=True)

NGRAM_RANGE = (2, 5)
MIN_DF_DEFAULT = 2      # reduce noise, consistent with prior step
MIN_DF_FALLBACK = 1     # fallback if a view gets empty vocab
MAX_ITER = 400
SEED = 42

# -------------------
# Helpers
# -------------------
def fetch_split(con, split: str, cols=('name_ascii','first_clean','last_clean','is_south_asian')):
    q = f"""
    SELECT {",".join(cols)}
    FROM names_preprocessed
    WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split=?);
    """
    return pd.read_sql_query(q, con, params=(split,))

def mk_vectorizer(min_df):
    return TfidfVectorizer(
        analyzer="char",
        ngram_range=NGRAM_RANGE,
        min_df=min_df,
        lowercase=False,
        norm="l2"
    )

def fit_vec_safe(train_series, min_df=MIN_DF_DEFAULT):
    """Fit TF-IDF, falling back to lower min_df if needed."""
    try:
        vec = mk_vectorizer(min_df=min_df).fit(train_series)
    except ValueError as e:
        # Empty vocabulary likely; retry with fallback
        vec = mk_vectorizer(min_df=MIN_DF_FALLBACK).fit(train_series)
    return vec

def train_logreg(X, y):
    # Class-weighted logistic regression for imbalance
    lr = LogisticRegression(
        penalty="l2",
        solver="saga",
        class_weight="balanced",
        max_iter=MAX_ITER,
        n_jobs=-1,
        random_state=SEED,
        verbose=0,
    )
    lr.fit(X, y)
    return lr

def eval_probs(y_true, p_scores, label="model"):
    """Return metrics dict + threshold table entries for max-F1, R>=0.90, P>=0.90."""
    # Scores
    pr_auc = float(average_precision_score(y_true, p_scores))
    try:
        roc = float(roc_auc_score(y_true, p_scores))
    except ValueError:
        roc = float("nan")

    # Precision-Recall curve and best-F1 threshold
    precision, recall, thresholds = precision_recall_curve(y_true, p_scores)
    # precision_recall_curve returns n+1 points; make thresholds align
    thr = np.concatenate([thresholds, [1.0]])

    f1 = np.where((precision + recall) > 0, 2 * precision * recall / (precision + recall), 0.0)
    best_idx = int(np.nanargmax(f1))
    best = {
        "threshold": float(thr[best_idx]),
        "precision": float(precision[best_idx]),
        "recall": float(recall[best_idx]),
        "f1": float(f1[best_idx]),
    }

    # Threshold achieving at least target recall (choose highest precision under that constraint)
    def threshold_for_target_recall(target=0.90):
        ok = np.where(recall >= target)[0]
        if len(ok) == 0:
            return None
        i = ok[np.argmax(precision[ok])]  # among those, best precision
        return {"threshold": float(thr[i]), "precision": float(precision[i]), "recall": float(recall[i]), "f1": float(f1[i])}

    # Threshold achieving at least target precision (choose highest recall under that constraint)
    def threshold_for_target_precision(target=0.90):
        ok = np.where(precision >= target)[0]
        if len(ok) == 0:
            return None
        i = ok[np.argmax(recall[ok])]
        return {"threshold": float(thr[i]), "precision": float(precision[i]), "recall": float(recall[i]), "f1": float(f1[i])}

    at_r90 = threshold_for_target_recall(0.90)
    at_p90 = threshold_for_target_precision(0.90)

    # Confusion matrix at best-F1 threshold
    y_hat = (p_scores >= best["threshold"]).astype(int)
    cm = confusion_matrix(y_true, y_hat, labels=[0,1])
    report = classification_report(y_true, y_hat, labels=[0,1], target_names=["non-SA","SA"], output_dict=True)

    return {
        "label": label,
        "pr_auc": pr_auc,
        "roc_auc": roc,
        "best_f1": best,
        "p_at_recall_0.90": at_r90,
        "r_at_precision_0.90": at_p90,
        "confusion_matrix": {"tn": int(cm[0,0]), "fp": int(cm[0,1]), "fn": int(cm[1,0]), "tp": int(cm[1,1])},
        "classification_report_macro_f1": float(report["macro avg"]["f1-score"]),
        "classification_report_SA_f1": float(report["SA"]["f1-score"]),
    }

def to_csr(vec, series):
    return vec.transform(series.fillna(""))

# -------------------
# Load splits
# -------------------
con = sqlite3.connect(DB_PATH)
train_df = fetch_split(con, "train")
val_df   = fetch_split(con, "val")
test_df  = fetch_split(con, "test")
con.close()

y_tr = train_df["is_south_asian"].values.astype(int)
y_va = val_df["is_south_asian"].values.astype(int)
y_te = test_df["is_south_asian"].values.astype(int)

# -------------------
# Vectorize three views (fit on TRAIN only)
# -------------------
vec_full  = fit_vec_safe(train_df["name_ascii"])
vec_first = fit_vec_safe(train_df["first_clean"])
vec_last  = fit_vec_safe(train_df["last_clean"])

# Save vectorizers
joblib.dump(vec_full,  ART_DIR / "vec_full_tfidf.joblib")
joblib.dump(vec_first, ART_DIR / "vec_first_tfidf.joblib")
joblib.dump(vec_last,  ART_DIR / "vec_last_tfidf.joblib")

# Transform
Xtr_full,  Xva_full,  Xte_full  = to_csr(vec_full,  train_df["name_ascii"]), to_csr(vec_full,  val_df["name_ascii"]), to_csr(vec_full,  test_df["name_ascii"])
Xtr_first, Xva_first, Xte_first = to_csr(vec_first, train_df["first_clean"]), to_csr(vec_first, val_df["first_clean"]), to_csr(vec_first, test_df["first_clean"])
Xtr_last,  Xva_last,  Xte_last  = to_csr(vec_last,  train_df["last_clean"]),  to_csr(vec_last,  val_df["last_clean"]),  to_csr(vec_last,  test_df["last_clean"])

print({
    "vocab_sizes": {
        "full":  len(vec_full.vocabulary_),
        "first": len(vec_first.vocabulary_),
        "last":  len(vec_last.vocabulary_),
    },
    "train_shapes": {
        "full":  Xtr_full.shape,
        "first": Xtr_first.shape,
        "last":  Xtr_last.shape,
    }
})

# -------------------
# Train models
# -------------------
lr_full  = train_logreg(Xtr_full,  y_tr)
lr_first = train_logreg(Xtr_first, y_tr)
lr_last  = train_logreg(Xtr_last,  y_tr)

# Save models
joblib.dump(lr_full,  ART_DIR / "lr_full.joblib")
joblib.dump(lr_first, ART_DIR / "lr_first.joblib")
joblib.dump(lr_last,  ART_DIR / "lr_last.joblib")

# -------------------
# Validation metrics
# -------------------
p_full_va  = lr_full.predict_proba(Xva_full)[:,1]
p_first_va = lr_first.predict_proba(Xva_first)[:,1]
p_last_va  = lr_last.predict_proba(Xva_last)[:,1]

# Fusion: 1 - (1-p_first)*(1-p_last)
p_fusion_va = 1.0 - (1.0 - p_first_va) * (1.0 - p_last_va)

metrics = {
    "full":   eval_probs(y_va, p_full_va,  label="logreg_full"),
    "first":  eval_probs(y_va, p_first_va, label="logreg_first"),
    "last":   eval_probs(y_va, p_last_va,  label="logreg_last"),
    "fusion": eval_probs(y_va, p_fusion_va,label="fusion_first_last"),
}

print(json.dumps(metrics, indent=2))

# Persist metrics for traceability
with open(ART_DIR / "val_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# -------------------
# OPTIONAL: quick test-set snapshot at the chosen val thresholds
# We apply each model's "best F1" threshold from validation to the test set
# (This is only a sanity peek; the real selection happens after you review val metrics.)
# -------------------
def apply_threshold(y_true, p_scores, thr):
    y_hat = (p_scores >= thr).astype(int)
    cm = confusion_matrix(y_true, y_hat, labels=[0,1])
    return {
        "tn": int(cm[0,0]), "fp": int(cm[0,1]),
        "fn": int(cm[1,0]), "tp": int(cm[1,1])
    }

# Compute test probs
p_full_te  = lr_full.predict_proba(Xte_full)[:,1]
p_first_te = lr_first.predict_proba(Xte_first)[:,1]
p_last_te  = lr_last.predict_proba(Xte_last)[:,1]
p_fusion_te= 1.0 - (1.0 - p_first_te) * (1.0 - p_last_te)

test_snapshot = {}
for key, p in [("full", p_full_te), ("first", p_first_te), ("last", p_last_te), ("fusion", p_fusion_te)]:
    thr = metrics[key]["best_f1"]["threshold"]
    test_snapshot[key] = {
        "threshold_from_val_best_f1": thr,
        "confusion_matrix_on_test": apply_threshold(y_te, p, thr)
    }

with open(ART_DIR / "test_snapshot.json", "w") as f:
    json.dump(test_snapshot, f, indent=2)

print("Saved artifacts to:", str(ART_DIR.resolve()))

{'vocab_sizes': {'full': 57322, 'first': 10835, 'last': 23277}, 'train_shapes': {'full': (18651, 57322), 'first': (18651, 10835), 'last': (18651, 23277)}}




{
  "full": {
    "label": "logreg_full",
    "pr_auc": 0.8252703469956613,
    "roc_auc": 0.9651644116962037,
    "best_f1": {
      "threshold": 0.6819498025374165,
      "precision": 0.7647058823529411,
      "recall": 0.7857142857142857,
      "f1": 0.7750677506775069
    },
    "p_at_recall_0.90": {
      "threshold": 0.4506668394029149,
      "precision": 0.5578231292517006,
      "recall": 0.9010989010989011,
      "f1": 0.6890756302521008
    },
    "r_at_precision_0.90": {
      "threshold": 0.8530234181777896,
      "precision": 0.9004524886877828,
      "recall": 0.5467032967032966,
      "f1": 0.6803418803418803
    },
    "confusion_matrix": {
      "tn": 3545,
      "fp": 88,
      "fn": 78,
      "tp": 286
    },
    "classification_report_macro_f1": 0.8760950660774525,
    "classification_report_SA_f1": 0.7750677506775068
  },
  "first": {
    "label": "logreg_first",
    "pr_auc": 0.6816878126145355,
    "roc_auc": 0.915621984676485,
    "best_f1": {
      "threshold":



Step 5 — Calibration, threshold selection, and a production-style inference API

In [7]:
# STEP 5: Calibrate (isotonic), pick thresholds, add inference function with fusion + abstain

"""
* Fit isotonic calibration on validation scores (per view + fusion).
* Re-evaluate metrics on val after calibration; choose operating threshold:
* Evaluate on test at the chosen threshold.
* Save a lightweight inference module
"""

from pathlib import Path
import json
import joblib
import re, unicodedata, json, joblib, numpy as np
import pandas as pd
import sqlite3, re, unicodedata

from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_recall_curve, confusion_matrix, classification_report
)

ART_IN   = Path("./artifacts_baseline")   # from step 4
ART_OUT  = Path("./artifacts_calibrated")
ART_OUT.mkdir(parents=True, exist_ok=True)

# -------------------
# Load artifacts and data
# -------------------
vec_full  = joblib.load(ART_IN / "vec_full_tfidf.joblib")
vec_first = joblib.load(ART_IN / "vec_first_tfidf.joblib")
vec_last  = joblib.load(ART_IN / "vec_last_tfidf.joblib")

lr_full   = joblib.load(ART_IN / "lr_full.joblib")
lr_first  = joblib.load(ART_IN / "lr_first.joblib")
lr_last   = joblib.load(ART_IN / "lr_last.joblib")

# Fetch splits
con = sqlite3.connect("./sa_names.db")
def fetch_split(split):
    q = """
    SELECT name, name_ascii, first_clean, last_clean, is_south_asian
    FROM names_preprocessed
    WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split=?);
    """
    return pd.read_sql_query(q, con, params=(split,))
val_df  = fetch_split("val")
test_df = fetch_split("test")
con.close()

y_va = val_df["is_south_asian"].to_numpy(dtype=int)
y_te = test_df["is_south_asian"].to_numpy(dtype=int)

# -------------------
# Get raw (uncalibrated) probabilities for VAL/TEST
# -------------------
def probs_for(df):
    Xf  = vec_full.transform(df["name_ascii"].fillna(""))
    Xfi = vec_first.transform(df["first_clean"].fillna(""))
    Xla = vec_last.transform(df["last_clean"].fillna(""))

    p_full  = lr_full.predict_proba(Xf)[:,1]
    p_first = lr_first.predict_proba(Xfi)[:,1]
    p_last  = lr_last.predict_proba(Xla)[:,1]
    p_fusion = 1.0 - (1.0 - p_first) * (1.0 - p_last)
    return p_full, p_first, p_last, p_fusion

p_full_va, p_first_va, p_last_va, p_fusion_va = probs_for(val_df)
p_full_te, p_first_te, p_last_te, p_fusion_te = probs_for(test_df)

# -------------------
# Calibrate with isotonic regression on VAL
# -------------------
def fit_iso(y_true, p_scores):
    # Isotonic expects sorted by scores sometimes; but scikit handles internally.
    ir = IsotonicRegression(out_of_bounds="clip")
    ir.fit(p_scores, y_true)
    return ir

cal_full   = fit_iso(y_va, p_full_va)
cal_first  = fit_iso(y_va, p_first_va)
cal_last   = fit_iso(y_va, p_last_va)
cal_fusion = fit_iso(y_va, p_fusion_va)

# Apply calibration
def apply_cal(ir, p): return np.clip(ir.predict(p), 0.0, 1.0)

cp_full_va   = apply_cal(cal_full,   p_full_va)
cp_first_va  = apply_cal(cal_first,  p_first_va)
cp_last_va   = apply_cal(cal_last,   p_last_va)
cp_fusion_va = apply_cal(cal_fusion, p_fusion_va)

cp_full_te   = apply_cal(cal_full,   p_full_te)
cp_first_te  = apply_cal(cal_first,  p_first_te)
cp_last_te   = apply_cal(cal_last,   p_last_te)
cp_fusion_te = apply_cal(cal_fusion, p_fusion_te)

# -------------------
# Metric helper
# -------------------
def summarize(y_true, p, label):
    pr_auc = float(average_precision_score(y_true, p))
    try:
        roc = float(roc_auc_score(y_true, p))
    except ValueError:
        roc = float("nan")
    precision, recall, thresholds = precision_recall_curve(y_true, p)
    thr = np.concatenate([thresholds, [1.0]])
    f1 = np.where((precision+recall)>0, 2*precision*recall/(precision+recall), 0.0)
    best_i = int(np.nanargmax(f1))
    best = {"threshold": float(thr[best_i]), "precision": float(precision[best_i]), "recall": float(recall[best_i]), "f1": float(f1[best_i])}

    # P@R>=0.90 and R@P>=0.90
    def at_recall(target=0.90):
        ok = np.where(recall >= target)[0]
        if len(ok)==0: return None
        i = ok[np.argmax(precision[ok])]
        return {"threshold": float(thr[i]), "precision": float(precision[i]), "recall": float(recall[i]), "f1": float(f1[i])}
    def at_precision(target=0.90):
        ok = np.where(precision >= target)[0]
        if len(ok)==0: return None
        i = ok[np.argmax(recall[ok])]
        return {"threshold": float(thr[i]), "precision": float(precision[i]), "recall": float(recall[i]), "f1": float(f1[i])}

    return {
        "label": label,
        "pr_auc": pr_auc,
        "roc_auc": roc,
        "best_f1": best,
        "p_at_recall_0.90": at_recall(0.90),
        "r_at_precision_0.90": at_precision(0.90),
    }

metrics_val_cal = {
    "full":   summarize(y_va, cp_full_va,   "cal_full"),
    "first":  summarize(y_va, cp_first_va,  "cal_first"),
    "last":   summarize(y_va, cp_last_va,   "cal_last"),
    "fusion": summarize(y_va, cp_fusion_va, "cal_fusion"),
}
print("CALIBRATED VAL METRICS:\n", json.dumps(metrics_val_cal, indent=2))

# -------------------
# Choose operating threshold from calibrated VAL (edit policy here)
# For directory completeness, prefer RECALL≥0.90 on fusion; fallback to best_F1 if not available.
# -------------------
op = metrics_val_cal["fusion"]["p_at_recall_0.90"] or metrics_val_cal["fusion"]["best_f1"]
OP_THRESHOLD = float(op["threshold"])
print("Chosen operating threshold (fusion, calibrated):", OP_THRESHOLD)

# -------------------
# Test-set evaluation at the chosen threshold
# -------------------
def confusion(y_true, p, thr):
    y_hat = (p >= thr).astype(int)
    cm = confusion_matrix(y_true, y_hat, labels=[0,1])
    rpt = classification_report(y_true, y_hat, labels=[0,1], target_names=["non-SA","SA"], output_dict=True)
    return {
        "threshold": thr,
        "tn": int(cm[0,0]), "fp": int(cm[0,1]),
        "fn": int(cm[1,0]), "tp": int(cm[1,1]),
        "sa_f1": float(rpt["SA"]["f1-score"]),
        "macro_f1": float(rpt["macro avg"]["f1-score"])
    }

test_eval = confusion(y_te, cp_fusion_te, OP_THRESHOLD)
print("TEST EVAL @ chosen threshold:\n", json.dumps(test_eval, indent=2))

# -------------------
# Save calibrators + config
# -------------------
joblib.dump(cal_full,   ART_OUT / "cal_full_isotonic.joblib")
joblib.dump(cal_first,  ART_OUT / "cal_first_isotonic.joblib")
joblib.dump(cal_last,   ART_OUT / "cal_last_isotonic.joblib")
joblib.dump(cal_fusion, ART_OUT / "cal_fusion_isotonic.joblib")

cfg = {
    "threshold": OP_THRESHOLD,
    "policy": "p_at_recall_0.90 on calibrated fusion if available else best_f1",
    "abstain_band": [0.45, 0.65]  # tweak after looking at calibration curves
}
with open(ART_OUT / "inference_config.json", "w") as f:
    json.dump(cfg, f, indent=2)

print("Saved calibrated artifacts to:", str(ART_OUT.resolve()))

# -------------------
# Inference API (single-name)
# -------------------
# Same cleaning as training (minimal: lower, unicode norm, collapse spaces)
HONORIFICS = r"\b(dr|mr|mrs|ms|miss|mx|prof|sir|madam|lady|lord)\b\.?"
SUFFIXES   = r"\b(jr|sr|ii|iii|iv|phd|md|esq)\b\.?"

def normalize_unicode(s: str) -> str:
    if not isinstance(s, str): return ""
    s = (s.replace("\u2018","'").replace("\u2019","'")
           .replace("\u201C",'"').replace("\u201D",'"')
           .replace("\u2013","-").replace("\u2014","-")
           .replace("\u00A0"," "))
    return unicodedata.normalize("NFKC", s)

def clean_name(raw: str) -> str:
    s = normalize_unicode(raw).strip().lower()
    s = re.sub(r"[,\u200b]", " ", s)
    s = re.sub(rf"^\s*{HONORIFICS}\s+", "", s)
    s = re.sub(rf"\s+{SUFFIXES}\s*$", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def split_first_last(cleaned: str):
    if not cleaned: return "", ""
    parts = cleaned.split()
    if len(parts) == 1: return parts[0], ""
    return parts[0], parts[-1]

# Load for inference
CAL_FULL   = joblib.load(ART_OUT / "cal_full_isotonic.joblib")
CAL_FIRST  = joblib.load(ART_OUT / "cal_first_isotonic.joblib")
CAL_LAST   = joblib.load(ART_OUT / "cal_last_isotonic.joblib")
CAL_FUSION = joblib.load(ART_OUT / "cal_fusion_isotonic.joblib")
CFG        = json.load(open(ART_OUT / "inference_config.json"))
THR        = CFG["threshold"]
ABSTAIN_L, ABSTAIN_H = CFG["abstain_band"]

def predict_is_south_asian(name: str):
    name_c = clean_name(name)
    first, last = split_first_last(name_c)

    Xf  = vec_full.transform([name_c])
    Xfi = vec_first.transform([first])
    Xla = vec_last.transform([last])

    p_full   = lr_full.predict_proba(Xf)[:,1]
    p_first  = lr_first.predict_proba(Xfi)[:,1]
    p_last   = lr_last.predict_proba(Xla)[:,1]
    p_fusion = 1.0 - (1.0 - p_first) * (1.0 - p_last)

    # Calibrated (index [0] to avoid NumPy deprecation)
    cp_full   = float(np.clip(CAL_FULL.predict(p_full)[0],    0, 1))
    cp_first  = float(np.clip(CAL_FIRST.predict(p_first)[0],  0, 1))
    cp_last   = float(np.clip(CAL_LAST.predict(p_last)[0],    0, 1))
    cp_fusion = float(np.clip(CAL_FUSION.predict(p_fusion)[0],0, 1))

    # Decision with abstain band
    if cp_fusion < ABSTAIN_L:
        decision = "non_sa"
    elif cp_fusion > ABSTAIN_H:
        decision = "sa"
    else:
        decision = "abstain"

    hard = "sa" if cp_fusion >= THR else "non_sa"

    return {
        "input": name,
        "clean": name_c,
        "first": first,
        "last": last,
        "probabilities": {
            "full": cp_full,
            "first": cp_first,
            "last": cp_last,
            "fusion": cp_fusion
        },
        "decision_abstain_band": decision,
        "hard_decision_at_threshold": {"threshold": THR, "label": hard}
    }

# Quick smoke test
for ex in ["A. R. Rahman", "Daniel Singh", "Mary Thomas", "Noah Patel", "Kevin Johnson"]:
    print(ex, "->", predict_is_south_asian(ex))

CALIBRATED VAL METRICS:
 {
  "full": {
    "label": "cal_full",
    "pr_auc": 0.8186329985855617,
    "roc_auc": 0.9686349639900425,
    "best_f1": {
      "threshold": 0.45454545454545453,
      "precision": 0.7647058823529411,
      "recall": 0.7857142857142857,
      "f1": 0.7750677506775069
    },
    "p_at_recall_0.90": {
      "threshold": 0.1357142857142857,
      "precision": 0.554806070826307,
      "recall": 0.9038461538461539,
      "f1": 0.6875653082549633
    },
    "r_at_precision_0.90": {
      "threshold": 0.8055555555555556,
      "precision": 0.9036697247706422,
      "recall": 0.5412087912087912,
      "f1": 0.6769759450171821
    }
  },
  "first": {
    "label": "cal_first",
    "pr_auc": 0.6704845171367658,
    "roc_auc": 0.9203020692492204,
    "best_f1": {
      "threshold": 0.4318181818181818,
      "precision": 0.6505376344086021,
      "recall": 0.6648351648351648,
      "f1": 0.657608695652174
    },
    "p_at_recall_0.90": {
      "threshold": 0.029478458049

Step 5a: Precision-first threshold

In [8]:
# STEP 5a: Use calibrated fusion threshold at R@P>=0.90, update config & re-eval test
import json, joblib, numpy as np, sqlite3, pandas as pd
from sklearn.metrics import precision_recall_curve, confusion_matrix, classification_report

ART_IN  = "./artifacts_calibrated"   # from Step 5
ART_BL  = "./artifacts_baseline"     # from Step 4

# Load calibrator (fusion), models, vectorizers
cal_fusion = joblib.load(f"{ART_IN}/cal_fusion_isotonic.joblib")
vec_full   = joblib.load(f"{ART_BL}/vec_full_tfidf.joblib")
vec_first  = joblib.load(f"{ART_BL}/vec_first_tfidf.joblib")
vec_last   = joblib.load(f"{ART_BL}/vec_last_tfidf.joblib")
lr_full    = joblib.load(f"{ART_BL}/lr_full.joblib")
lr_first   = joblib.load(f"{ART_BL}/lr_first.joblib")
lr_last    = joblib.load(f"{ART_BL}/lr_last.joblib")

def fetch(split: str):
    con = sqlite3.connect("./sa_names.db")
    q = """
    SELECT name_ascii, first_clean, last_clean, is_south_asian
    FROM names_preprocessed
    WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split=?);
    """
    df = pd.read_sql_query(q, con, params=(split,))
    con.close()
    return df

val_df  = fetch("val")
test_df = fetch("test")
y_va = val_df["is_south_asian"].to_numpy(int)
y_te = test_df["is_south_asian"].to_numpy(int)

def calibrated_fusion_probs(df: pd.DataFrame) -> np.ndarray:
    Xf  = vec_full.transform(df["name_ascii"].fillna(""))
    Xfi = vec_first.transform(df["first_clean"].fillna(""))
    Xla = vec_last.transform(df["last_clean"].fillna(""))

    p_first = lr_first.predict_proba(Xfi)[:,1]
    p_last  = lr_last.predict_proba(Xla)[:,1]
    p_fus   = 1.0 - (1.0 - p_first) * (1.0 - p_last)
    cp_fus  = np.clip(cal_fusion.predict(p_fus), 0, 1)
    return cp_fus

cp_fus_va = calibrated_fusion_probs(val_df)
cp_fus_te = calibrated_fusion_probs(test_df)

# Choose threshold achieving R@P >= 0.90 on VAL; fallback to best-F1 if none
prec, rec, thr = precision_recall_curve(y_va, cp_fus_va)
thr = np.concatenate([thr, [1.0]])
mask = np.where(prec >= 0.90)[0]
if len(mask) == 0:
    # fallback: best F1
    f1 = np.where((prec + rec) > 0, 2 * prec * rec / (prec + rec), 0.0)
    i = int(np.argmax(f1))
    thr_star = float(thr[i])
    policy = "best_f1_fallback"
else:
    i = mask[np.argmax(rec[mask])]  # within P>=0.90, take highest recall
    thr_star = float(thr[i])
    policy = "r_at_precision_0.90"

# Evaluate on TEST at the chosen threshold
y_hat = (cp_fus_te >= thr_star).astype(int)
cm = confusion_matrix(y_te, y_hat, labels=[0,1])
rpt = classification_report(y_te, y_hat, labels=[0,1], target_names=["non-SA","SA"], output_dict=True)

report = {
    "policy": policy,
    "threshold": thr_star,
    "test_confusion": {"tn": int(cm[0,0]), "fp": int(cm[0,1]), "fn": int(cm[1,0]), "tp": int(cm[1,1])},
    "test_sa_f1": float(rpt["SA"]["f1-score"]),
    "test_macro_f1": float(rpt["macro avg"]["f1-score"]),
}
print(json.dumps(report, indent=2))

# Update inference_config.json with new threshold and tighter abstain band
cfg_path = f"{ART_IN}/inference_config.json"
cfg = json.load(open(cfg_path))
cfg["threshold"] = thr_star
cfg["policy"] = "fusion_calibrated_r_at_precision_0.90"
cfg["abstain_band"] = [max(0.0, thr_star - 0.10), min(1.0, thr_star + 0.10)]
json.dump(cfg, open(cfg_path, "w"), indent=2)
print("Updated config:", cfg_path, "->", cfg)

{
  "policy": "r_at_precision_0.90",
  "threshold": 0.8055555555555556,
  "test_confusion": {
    "tn": 3613,
    "fp": 21,
    "fn": 146,
    "tp": 218
  },
  "test_sa_f1": 0.7230514096185738,
  "test_macro_f1": 0.8502312370695331
}
Updated config: ./artifacts_calibrated/inference_config.json -> {'threshold': 0.8055555555555556, 'policy': 'fusion_calibrated_r_at_precision_0.90', 'abstain_band': [0.7055555555555556, 0.9055555555555556]}


Step 6: Baseline tightening

In [9]:
# Step 6: SGDClassifier (logistic) sweep + early stopping + calibration + precision-first operating point

from pathlib import Path
import json, sqlite3, joblib, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_recall_curve,
    classification_report, confusion_matrix
)
from sklearn.isotonic import IsotonicRegression
from sklearn.utils.class_weight import compute_class_weight
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

DB_PATH = "./sa_names.db"
ART_OUT = Path("./artifacts_tuned_sgd")
ART_OUT.mkdir(parents=True, exist_ok=True)
SEED = 42
rng = np.random.default_rng(SEED)

# ----------------- data helpers -----------------
def fetch_split(split: str, cols=('name_ascii','first_clean','last_clean','is_south_asian')):
    con = sqlite3.connect(DB_PATH)
    q = f"""
    SELECT {",".join(cols)}
    FROM names_preprocessed
    WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split=?);
    """
    df = pd.read_sql_query(q, con, params=(split,))
    con.close()
    return df

train_df = fetch_split("train")
val_df   = fetch_split("val")
test_df  = fetch_split("test")

y_tr = train_df["is_south_asian"].to_numpy(int)
y_va = val_df["is_south_asian"].to_numpy(int)
y_te = test_df["is_south_asian"].to_numpy(int)

# sample weights for imbalance (like class_weight='balanced')
classes = np.array([0,1], dtype=int)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_tr)
w_map = {0: cw[0], 1: cw[1]}
sw_tr = np.array([w_map[y] for y in y_tr], dtype=float)

# ----------------- vectorizers (fit on train only) -----------------
def fit_vec(series, ngram=(2,5), min_df=2):
    return TfidfVectorizer(analyzer="char", ngram_range=ngram, min_df=min_df, lowercase=False, norm="l2").fit(series.fillna(""))

vecs = {
    "full":  fit_vec(train_df["name_ascii"]),
    "first": fit_vec(train_df["first_clean"]),
    "last":  fit_vec(train_df["last_clean"]),
}
def X_of(vec, series): return vec.transform(series.fillna(""))

Xtr = {
    "full":  X_of(vecs["full"],  train_df["name_ascii"]),
    "first": X_of(vecs["first"], train_df["first_clean"]),
    "last":  X_of(vecs["last"],  train_df["last_clean"]),
}
Xva = {
    "full":  X_of(vecs["full"],  val_df["name_ascii"]),
    "first": X_of(vecs["first"], val_df["first_clean"]),
    "last":  X_of(vecs["last"],  val_df["last_clean"]),
}
Xte = {
    "full":  X_of(vecs["full"],  test_df["name_ascii"]),
    "first": X_of(vecs["first"], test_df["first_clean"]),
    "last":  X_of(vecs["last"],  test_df["last_clean"]),
}

# ----------------- fast sweep with SGDClassifier -----------------
grid_alpha = [1e-5, 3e-5, 1e-4, 3e-4, 1e-3]
grid_l1    = [0.0, 0.3, 0.5]   # 0.0 ~ pure L2

def train_sgd(Xtr, ytr, Xva, yva, alpha, l1_ratio):
    # Early stopping uses Xva/yva internally
    clf = SGDClassifier(
        loss="log_loss",
        penalty="elasticnet" if l1_ratio>0 else "l2",
        l1_ratio=l1_ratio if l1_ratio>0 else 0.0,
        alpha=alpha,
        max_iter=2000,
        tol=1e-4,
        early_stopping=True,
        n_iter_no_change=5,
        validation_fraction=0.1,   # still keeps our separate val for model selection; early stopping uses internal split of train
        random_state=SEED,
        n_jobs=-1
    )
    clf.fit(Xtr, ytr, sample_weight=sw_tr)  # handle imbalance
    # Evaluate on our external validation
    p_va = clf.predict_proba(Xva)[:,1]
    pr = float(average_precision_score(yva, p_va))
    try:
        roc = float(roc_auc_score(yva, p_va))
    except ValueError:
        roc = float("nan")
    prec, rec, thr = precision_recall_curve(yva, p_va)
    thr = np.concatenate([thr, [1.0]])
    f1 = np.where((prec+rec)>0, 2*prec*rec/(prec+rec), 0.0)
    best_f1 = float(np.nanmax(f1))
    return clf, p_va, {"pr_auc": pr, "roc_auc": roc, "best_f1": best_f1}

results = {"full": [], "first": [], "last": []}
val_scores_cache = {"full": None, "first": None, "last": None}

for view in ["full","first","last"]:
    for alpha in grid_alpha:
        for l1 in grid_l1:
            clf, pva, scores = train_sgd(Xtr[view], y_tr, Xva[view], y_va, alpha, l1)
            results[view].append({
                "view": view, "alpha": alpha, "l1_ratio": l1,
                "pr_auc": scores["pr_auc"], "roc_auc": scores["roc_auc"], "best_f1": scores["best_f1"],
                "model": clf
            })

# pick best per view by PR-AUC, tie-break by best_F1
best = {}
for view in ["full","first","last"]:
    cand = sorted(results[view], key=lambda d: (d["pr_auc"], d["best_f1"]), reverse=True)[0]
    best[view] = cand
    print(f"Best {view}: alpha={cand['alpha']} l1={cand['l1_ratio']} | PR-AUC={cand['pr_auc']:.3f} F1*={cand['best_f1']:.3f}")

# ----------------- calibrate best models (isotonic on external VAL) -----------------
def fit_iso(y, p):
    ir = IsotonicRegression(out_of_bounds="clip")
    ir.fit(p, y)
    return ir

cal = {}
for view in ["full","first","last"]:
    model = best[view]["model"]
    p_va = model.predict_proba(Xva[view])[:,1]
    cal[view] = fit_iso(y_va, p_va)

def calibrated_probs(view, split):
    if split=="val":
        X = Xva[view]
    elif split=="test":
        X = Xte[view]
    else:
        raise ValueError
    model = best[view]["model"]
    raw = model.predict_proba(X)[:,1]
    return np.clip(cal[view].predict(raw), 0, 1)

cp_first_va = calibrated_probs("first","val")
cp_last_va  = calibrated_probs("last","val")
cp_fusion_va = 1.0 - (1.0 - cp_first_va) * (1.0 - cp_last_va)

# precision-first threshold on VAL (R@P>=0.90), fallback to best F1
prec, rec, thr = precision_recall_curve(y_va, cp_fusion_va)
thr = np.concatenate([thr, [1.0]])
mask = np.where(prec >= 0.90)[0]
if len(mask)==0:
    f1 = np.where((prec+rec)>0, 2*prec*rec/(prec+rec), 0.0)
    i = int(np.argmax(f1))
    thr_star = float(thr[i]); policy = "best_f1_fallback"
else:
    i = mask[np.argmax(rec[mask])]
    thr_star = float(thr[i]); policy = "r_at_precision_0.90"
print("Chosen policy:", policy, "| threshold:", thr_star)

# evaluate on TEST
cp_first_te = calibrated_probs("first","test")
cp_last_te  = calibrated_probs("last","test")
cp_fusion_te = 1.0 - (1.0 - cp_first_te) * (1.0 - cp_last_te)

y_hat = (cp_fusion_te >= thr_star).astype(int)
cm = confusion_matrix(y_te, y_hat, labels=[0,1])
rpt = classification_report(y_te, y_hat, labels=[0,1], target_names=["non-SA","SA"], output_dict=True)

summary = {
    "test_confusion": {"tn": int(cm[0,0]), "fp": int(cm[0,1]), "fn": int(cm[1,0]), "tp": int(cm[1,1])},
    "test_SA_f1": float(rpt["SA"]["f1-score"]),
    "test_macro_f1": float(rpt["macro avg"]["f1-score"]),
}
print(json.dumps(summary, indent=2))

# ----------------- save tuned artifacts -----------------
# vectorizers
joblib.dump(vecs["full"],  ART_OUT / "vec_full_tfidf.joblib")
joblib.dump(vecs["first"], ART_OUT / "vec_first_tfidf.joblib")
joblib.dump(vecs["last"],  ART_OUT / "vec_last_tfidf.joblib")

# models
joblib.dump(best["full"]["model"],  ART_OUT / "sgd_full_tuned.joblib")
joblib.dump(best["first"]["model"], ART_OUT / "sgd_first_tuned.joblib")
joblib.dump(best["last"]["model"],  ART_OUT / "sgd_last_tuned.joblib")

# calibrators
joblib.dump(cal["full"],  ART_OUT / "cal_full_isotonic.joblib")
joblib.dump(cal["first"], ART_OUT / "cal_first_isotonic.joblib")
joblib.dump(cal["last"],  ART_OUT / "cal_last_isotonic.joblib")

# config
cfg = {
    "threshold": thr_star,
    "policy": policy,
    "abstain_band": [max(0.0, thr_star-0.10), min(1.0, thr_star+0.10)],
    "best_params": {
        "full":  {"alpha": float(best["full"]["alpha"]),  "l1_ratio": float(best["full"]["l1_ratio"])},
        "first": {"alpha": float(best["first"]["alpha"]), "l1_ratio": float(best["first"]["l1_ratio"])},
        "last":  {"alpha": float(best["last"]["alpha"]),  "l1_ratio": float(best["last"]["l1_ratio"])},
    }
}
json.dump(cfg, open(ART_OUT / "inference_config.json","w"), indent=2)
print("Saved tuned artifacts to:", str(ART_OUT.resolve()))

  f1 = np.where((prec+rec)>0, 2*prec*rec/(prec+rec), 0.0)
  f1 = np.where((prec+rec)>0, 2*prec*rec/(prec+rec), 0.0)


Best full: alpha=1e-05 l1=0.0 | PR-AUC=0.851 F1*=0.798
Best first: alpha=1e-05 l1=0.0 | PR-AUC=0.688 F1*=0.663
Best last: alpha=1e-05 l1=0.0 | PR-AUC=0.728 F1*=0.718
Chosen policy: r_at_precision_0.90 | threshold: 0.8443938901527462
{
  "test_confusion": {
    "tn": 3597,
    "fp": 37,
    "fn": 123,
    "tp": 241
  },
  "test_SA_f1": 0.7507788161993769,
  "test_macro_f1": 0.8645109745941133
}
Saved tuned artifacts to: /content/artifacts_tuned_sgd


# Phase X: Creating Primary

In [10]:
# STEP 7: Char-BiLSTM (two-tower: first + last), calibrated, precision-first threshold

"""
* Build a character vocab from train names.
* Encode first and last separately with shared Embedding + BiLSTM (two inputs).
* Concatenate towers → dense → sigmoid.
* Train with class weights, early stopping, ReduceLROnPlateau.
* Calibrate with isotonic on val.
* Choose precision-first threshold (R@P≥0.90) on the calibrated fusion (here the model already learns fusion, but we’ll still report).
* Evaluate on test and save artifacts.
"""

import sqlite3, json, re, unicodedata, numpy as np, pandas as pd, joblib, os
from pathlib import Path
import tensorflow as tf
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix, average_precision_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

tf.random.set_seed(42)
np.random.seed(42)

DB_PATH = "./sa_names.db"
ART_DIR = Path("./artifacts_char_bilstm")
ART_DIR.mkdir(parents=True, exist_ok=True)

# ---------- 1) Load splits ----------
def fetch_split(split: str):
    con = sqlite3.connect(DB_PATH)
    q = """
    SELECT first_clean, last_clean, is_south_asian
    FROM names_preprocessed
    WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split=?);
    """
    df = pd.read_sql_query(q, con, params=(split,))
    con.close()
    return df

train_df = fetch_split("train")
val_df   = fetch_split("val")
test_df  = fetch_split("test")

y_tr = train_df["is_south_asian"].to_numpy(np.int32)
y_va = val_df["is_south_asian"].to_numpy(np.int32)
y_te = test_df["is_south_asian"].to_numpy(np.int32)

# ---------- 2) Char vocab & encoding ----------
def build_char_vocab(series_list):
    chars = set()
    for s in series_list:
        for name in s.fillna(""):
            chars.update(list(name))
    # reserve 0 for PAD, 1 for OOV
    itos = ["<PAD>", "<OOV>"] + sorted(chars)
    stoi = {ch:i for i,ch in enumerate(itos)}
    return stoi, itos

stoi, itos = build_char_vocab([
    train_df["first_clean"], train_df["last_clean"]
])

MAX_LEN_FIRST = 20
MAX_LEN_LAST  = 25

def encode(name: str, max_len: int):
    idxs = []
    for ch in (name or "")[:max_len]:
        idxs.append(stoi.get(ch, 1))  # 1 = OOV
    if len(idxs) < max_len:
        idxs += [0]*(max_len-len(idxs))  # 0 = PAD
    return np.array(idxs, dtype=np.int32)

def encode_df(df: pd.DataFrame):
    X_first = np.stack([encode(x, MAX_LEN_FIRST) for x in df["first_clean"].fillna("")], axis=0)
    X_last  = np.stack([encode(x, MAX_LEN_LAST)  for x in df["last_clean"].fillna("")], axis=0)
    return X_first, X_last

Xtr_first, Xtr_last = encode_df(train_df)
Xva_first, Xva_last = encode_df(val_df)
Xte_first, Xte_last = encode_df(test_df)

# ---------- 3) Model (shared embedding + BiLSTM per tower) ----------
VOCAB_SIZE = len(itos)
EMB_DIM = 32
LSTM_UNITS = 64
DROPOUT = 0.25

first_in = tf.keras.Input(shape=(MAX_LEN_FIRST,), name="first")
last_in  = tf.keras.Input(shape=(MAX_LEN_LAST,),  name="last")

shared_emb = tf.keras.layers.Embedding(
    input_dim=VOCAB_SIZE, output_dim=EMB_DIM, mask_zero=True, name="char_emb"
)

def tower(x, name_prefix):
    x = shared_emb(x)
    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(LSTM_UNITS, return_sequences=False, dropout=DROPOUT, recurrent_dropout=0.0, use_cudnn=False),
        name=f"{name_prefix}_bilstm")(x)
    x = tf.keras.layers.Dense(64, activation="relu", name=f"{name_prefix}_dense")(x)
    x = tf.keras.layers.Dropout(DROPOUT, name=f"{name_prefix}_drop")(x)
    return x

t1 = tower(first_in, "first")
t2 = tower(last_in,  "last")

h = tf.keras.layers.Concatenate(name="concat")([t1, t2])
h = tf.keras.layers.Dense(64, activation="relu")(h)
h = tf.keras.layers.Dropout(DROPOUT)(h)
out = tf.keras.layers.Dense(1, activation="sigmoid", name="out")(h)

model = tf.keras.Model(inputs=[first_in, last_in], outputs=out)
model.summary()

# ---------- 4) Train setup ----------
# Class imbalance → class weights
classes = np.array([0,1], dtype=int)
cw = compute_class_weight("balanced", classes=classes, y=y_tr)
class_weight = {0: float(cw[0]), 1: float(cw[1])}

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-3),
    loss="binary_crossentropy",
    metrics=[tf.keras.metrics.AUC(curve="PR", name="pr_auc"), tf.keras.metrics.AUC(curve="ROC", name="roc_auc")]
)

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_pr_auc", mode="max", patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_pr_auc", mode="max", factor=0.5, patience=2, verbose=1, min_lr=1e-5)
]

history = model.fit(
    x={"first": Xtr_first, "last": Xtr_last}, y=y_tr,
    validation_data=({"first": Xva_first, "last": Xva_last}, y_va),
    epochs=25,
    batch_size=256,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1
)

# ---------- 5) Probabilities, calibration, threshold selection ----------
p_va_raw = model.predict({"first": Xva_first, "last": Xva_last}, batch_size=512).reshape(-1)
p_te_raw = model.predict({"first": Xte_first, "last": Xte_last}, batch_size=512).reshape(-1)

# Isotonic calibration on VAL
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(p_va_raw, y_va)

cp_va = np.clip(iso.predict(p_va_raw), 0, 1)
cp_te = np.clip(iso.predict(p_te_raw), 0, 1)

# Precision-first threshold (R@P>=0.90) on VAL
prec, rec, thr = precision_recall_curve(y_va, cp_va)
thr = np.concatenate([thr, [1.0]])
mask = np.where(prec >= 0.90)[0]
if len(mask) == 0:
    # fallback to best-F1
    f1 = np.where((prec+rec)>0, 2*prec*rec/(prec+rec), 0.0)
    i = int(np.argmax(f1)); thr_star = float(thr[i]); policy = "best_f1_fallback"
else:
    i = mask[np.argmax(rec[mask])]
    thr_star = float(thr[i]); policy = "r_at_precision_0.90"

# ---------- 6) Test evaluation ----------
y_hat = (cp_te >= thr_star).astype(int)
cm = confusion_matrix(y_te, y_hat, labels=[0,1])
rpt = classification_report(y_te, y_hat, labels=[0,1], target_names=["non-SA","SA"], output_dict=True)

report = {
    "policy": policy,
    "threshold": thr_star,
    "val_pr_auc": float(average_precision_score(y_va, cp_va)),
    "val_roc_auc": float(roc_auc_score(y_va, cp_va)),
    "test_confusion": {"tn": int(cm[0,0]), "fp": int(cm[0,1]), "fn": int(cm[1,0]), "tp": int(cm[1,1])},
    "test_SA_f1": float(rpt["SA"]["f1-score"]),
    "test_macro_f1": float(rpt["macro avg"]["f1-score"]),
}
print(json.dumps(report, indent=2))

# ---------- 7) Save artifacts ----------
model.save(ART_DIR / "char_bilstm_twintower.keras")
joblib.dump({"stoi": stoi, "itos": itos, "MAX_LEN_FIRST": MAX_LEN_FIRST, "MAX_LEN_LAST": MAX_LEN_LAST}, ART_DIR / "char_vocab.joblib")
joblib.dump(iso, ART_DIR / "cal_isotonic.joblib")
json.dump({"threshold": thr_star, "policy": policy, "abstain_band": [max(0.0, thr_star-0.10), min(1.0, thr_star+0.10)]}, open(ART_DIR / "inference_config.json","w"), indent=2)

print("Saved char-BiLSTM artifacts to:", str(ART_DIR.resolve()))

Epoch 1/25
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 302ms/step - loss: 0.5730 - pr_auc: 0.3386 - roc_auc: 0.7792 - val_loss: 0.3482 - val_pr_auc: 0.5690 - val_roc_auc: 0.9081 - learning_rate: 0.0030
Epoch 2/25
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 339ms/step - loss: 0.4124 - pr_auc: 0.5339 - roc_auc: 0.8959 - val_loss: 0.3692 - val_pr_auc: 0.6035 - val_roc_auc: 0.9189 - learning_rate: 0.0030
Epoch 3/25
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 303ms/step - loss: 0.3924 - pr_auc: 0.5825 - roc_auc: 0.9059 - val_loss: 0.4085 - val_pr_auc: 0.6256 - val_roc_auc: 0.9223 - learning_rate: 0.0030
Epoch 4/25
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 319ms/step - loss: 0.3651 - pr_auc: 0.6193 - roc_auc: 0.9189 - val_loss: 0.4188 - val_pr_auc: 0.6534 - val_roc_auc: 0.9266 - learning_rate: 0.0030
Epoch 5/25
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 334ms/step - loss: 0.3541 - pr_a

# Step 8: Boosting primary performance

In [11]:
# STEP 8.1: Train SentencePiece tokenizer (unigram) on train names only
!pip -q install sentencepiece==0.2.0 transformers==4.43.3 datasets==2.20.0 accelerate==0.32.1

import sqlite3, pandas as pd, sentencepiece as spm, os, json
from pathlib import Path

DB_PATH = "./sa_names.db"
TOK_DIR = Path("./artifacts_namebert_sa/tokenizer")
TOK_DIR.mkdir(parents=True, exist_ok=True)

def fetch_train_names():
    con = sqlite3.connect(DB_PATH)
    q = """
    SELECT first_clean, last_clean, name_ascii
    FROM names_preprocessed
    WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split='train');
    """
    df = pd.read_sql_query(q, con)
    con.close()
    return df

df = fetch_train_names()

# Build a simple training corpus (one name per line) with light augmentation
lines = []
for r in df.itertuples(index=False):
    first = (r.first_clean or "").strip()
    last  = (r.last_clean or "").strip()
    full  = (r.name_ascii or "").strip()
    if full:
        lines.append(full)
    if first:
        lines.append(first)
    if last:
        lines.append(last)
    # drop-view augmentations
    if first and last:
        lines.append(f"{first} <SEP> {last}")
        lines.append(first)  # simulate missing last
        lines.append(last)   # simulate missing first

corpus_path = TOK_DIR / "names_corpus.txt"
with open(corpus_path, "w", encoding="utf-8") as f:
    for ln in lines:
        f.write(ln.strip() + "\n")

# Train SentencePiece (unigram) — small vocab is good for names
spm.SentencePieceTrainer.Train(
    input=str(corpus_path),
    model_prefix=str(TOK_DIR / "spm_unigram"),
    vocab_size=800,
    model_type="unigram",
    character_coverage=1.0,
    user_defined_symbols=["<SEP>","<FIRST>","<LAST>"]
)

print("Tokenizer files:", list(TOK_DIR.iterdir()))

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [12]:
# Step: 8.2a Train a fast SentencePiece-Unigram tokenizer directly to tokenizer.json (no .model needed)

!pip -q install tokenizers==0.19.1

from pathlib import Path
from tokenizers import SentencePieceUnigramTokenizer

TOK_DIR = Path("./artifacts_namebert_sa/tokenizer")
TOK_DIR.mkdir(parents=True, exist_ok=True)

corpus_path = TOK_DIR / "names_corpus.txt"
assert corpus_path.exists(), f"Corpus not found at {corpus_path} (run Step 8.1 first)."

# Train directly to a fast tokenizer object
spu = SentencePieceUnigramTokenizer()
spu.train(
    files=[str(corpus_path)],
    vocab_size=800,
    unk_token="<unk>",
    special_tokens=["<s>", "</s>", "<pad>", "<unk>", "<SEP>", "<FIRST>", "<LAST>"]
)

# Save in fast-tokenizers JSON format
tok_json_path = TOK_DIR / "tokenizer.json"
spu.save(str(tok_json_path))

print("Fast tokenizer saved:", tok_json_path)


Fast tokenizer saved: artifacts_namebert_sa/tokenizer/tokenizer.json


In [13]:
# STEP 8.2: Dataset + tiny RoBERTa config
import sqlite3, pandas as pd, numpy as np, torch, os, json
from datasets import Dataset, DatasetDict
from transformers import PreTrainedTokenizerFast, RobertaConfig, RobertaForMaskedLM, RobertaForSequenceClassification
from pathlib import Path

ART_DIR = Path("./artifacts_namebert_sa")
ART_DIR.mkdir(parents=True, exist_ok=True)

# Load sentencepiece as HF tokenizer
TOK_DIR = Path("./artifacts_namebert_sa/tokenizer")
tok = PreTrainedTokenizerFast(
    tokenizer_file=str(TOK_DIR / "tokenizer.json"),
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)
tok.add_special_tokens({"additional_special_tokens": ["<SEP>", "<FIRST>", "<LAST>"]})

def fetch(split):
    con = sqlite3.connect("./sa_names.db")
    q = """
    SELECT first_clean, last_clean, is_south_asian
    FROM names_preprocessed
    WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split=?);
    """
    df = pd.read_sql_query(q, con, params=(split,))
    con.close()
    return df

def to_hf(split):
    df = fetch(split)
    def make_text(r):
        first = (r["first_clean"] or "").strip()
        last  = (r["last_clean"] or "").strip()
        return f"<FIRST> {first} <SEP> <LAST> {last}".strip()
    texts = df.apply(make_text, axis=1).tolist()
    labels = df["is_south_asian"].astype(int).tolist()
    return Dataset.from_dict({"text": texts, "label": labels})

ds = DatasetDict({
    "train": to_hf("train"),
    "validation": to_hf("val"),
    "test": to_hf("test"),
})

# Tokenize function
MAX_LEN = 48
def tok_fn(batch):
    enc = tok(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)
    enc["labels"] = batch["label"]
    return enc

ds_tok = ds.map(tok_fn, batched=True, remove_columns=["text","label"])

# Tiny RoBERTa configs
config_mlm = RobertaConfig(
    vocab_size=len(tok),
    max_position_embeddings=MAX_LEN+2,
    hidden_size=384,
    num_hidden_layers=6,
    num_attention_heads=6,
    intermediate_size=768,
    hidden_act="gelu",
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
    type_vocab_size=1,
    pad_token_id=tok.pad_token_id,
    bos_token_id=tok.convert_tokens_to_ids("<s>"),
    eos_token_id=tok.convert_tokens_to_ids("</s>")
)

config_cls = RobertaConfig.from_dict({**config_mlm.to_dict(), "num_labels": 1, "problem_type": "single_label_classification"})

with open(ART_DIR / "roberta_tiny_config.json", "w") as f:
    json.dump(config_cls.to_dict(), f, indent=2)

print("Tokenizer size:", len(tok), "| Max len:", MAX_LEN)

Map:   0%|          | 0/18651 [00:00<?, ? examples/s]

Map:   0%|          | 0/3997 [00:00<?, ? examples/s]

Map:   0%|          | 0/3998 [00:00<?, ? examples/s]

You passed along `num_labels=1` with an incompatible id to label map: {0: 'LABEL_0', 1: 'LABEL_1'}. The number of labels wil be overwritten to 2.


Tokenizer size: 800 | Max len: 48


In [14]:
# Step 8.3a: add <mask> token to the fast tokenizer and sync configs
from pathlib import Path
from transformers import PreTrainedTokenizerFast, RobertaConfig

TOK_DIR = Path("./artifacts_namebert_sa/tokenizer")

# 1) Load the fast tokenizer we trained to tokenizer.json
tok = PreTrainedTokenizerFast(
    tokenizer_file=str(TOK_DIR / "tokenizer.json"),
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)
# Keep our extra specials
tok.add_special_tokens({"additional_special_tokens": ["<SEP>", "<FIRST>", "<LAST>"]})

# 2) Add a proper mask token if missing
if tok.mask_token is None:
    tok.add_special_tokens({"mask_token": "<mask>"})   # ✅ set mask_token explicitly

# 3) Save updated tokenizer (vocab size may have grown by 1)
tok.save_pretrained("./artifacts_namebert_sa/tokenizer_hf")
print("Tokenizer saved with mask token. vocab_size:", len(tok), "| mask_id:", tok.mask_token_id)

# 4) If you already built configs earlier, reload & sync their vocab/IDs
# (If you don't have config_mlm/config_cls in memory, you can rebuild them now like this:)
MAX_LEN = 48  # use the same as before
config_mlm = RobertaConfig(
    vocab_size=len(tok),
    max_position_embeddings=MAX_LEN+2,
    hidden_size=384,
    num_hidden_layers=6,
    num_attention_heads=6,
    intermediate_size=768,
    hidden_act="gelu",
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
    type_vocab_size=1,
    pad_token_id=tok.pad_token_id,
    bos_token_id=tok.bos_token_id,
    eos_token_id=tok.eos_token_id,
)
# RobertaConfig doesn’t have a dedicated mask_token_id field, but the model will read it from the tokenizer.
# We'll still persist a classifier config for later:
from copy import deepcopy
config_cls = deepcopy(config_mlm)
config_cls.num_labels = 1
config_cls.problem_type = "single_label_classification"

# Save configs for clarity (optional)
import json, os
ART_DIR = Path("./artifacts_namebert_sa")
os.makedirs(ART_DIR, exist_ok=True)
with open(ART_DIR / "roberta_tiny_config.json", "w") as f:
    json.dump(config_cls.to_dict(), f, indent=2)

print("Configs synced. Hidden size/layers match earlier setup.")

Tokenizer saved with mask token. vocab_size: 801 | mask_id: 800
Configs synced. Hidden size/layers match earlier setup.


In [15]:
# STEP 8.3 (Trainer-free, patched): short MLM warmup for NameBERT-SA
import math, json, os, torch, numpy as np
from torch.utils.data import DataLoader
from transformers import RobertaForMaskedLM, DataCollatorForLanguageModeling, get_linear_schedule_with_warmup

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Reuse tok, config_mlm, MAX_LEN, ART_DIR, ds from previous steps
train_texts = [ex["text"] for ex in ds["train"]]

def encode_batch(texts):
    # returns dict of tensors (input_ids, attention_mask)
    return tok(texts, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt")

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts): self.texts = texts
    def __len__(self): return len(self.texts)
    def __getitem__(self, i): return self.texts[i]

raw_ds = TextDataset(train_texts)
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=True, mlm_probability=0.15)

def collate_fn(batch_texts):
    enc = encode_batch(batch_texts)
    # ✅ collator expects a list of examples; provide a list of per-row tensors
    examples = [enc["input_ids"][i] for i in range(enc["input_ids"].size(0))]
    return collator(examples)  # returns a dict: {'input_ids', 'labels', 'attention_mask'}

loader = DataLoader(raw_ds, batch_size=64, shuffle=True, drop_last=True, collate_fn=collate_fn)

# Model (vocab size must match tok)
from transformers import RobertaConfig, RobertaForMaskedLM
model_mlm = RobertaForMaskedLM(config=config_mlm).to(DEVICE)

# Optimizer/scheduler (use torch.optim.AdamW to avoid warning)
optimizer = torch.optim.AdamW(model_mlm.parameters(), lr=5e-4, weight_decay=0.01)

epochs = 2  # keep short; bump to 3–4 later if useful
num_update_steps = epochs * len(loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=max(50, int(0.1 * num_update_steps)),
    num_training_steps=num_update_steps
)

model_mlm.train()
global_step, running = 0, 0.0
for epoch in range(epochs):
    for batch in loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        out = model_mlm(**batch)
        loss = out.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_mlm.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        global_step += 1
        running += loss.item()
        if global_step % 100 == 0:
            print(f"step {global_step}/{num_update_steps} | loss {running/100:.4f}")
            running = 0.0

# Save MLM warmup
os.makedirs(ART_DIR / "mlm_manual", exist_ok=True)
model_mlm.save_pretrained(ART_DIR / "mlm_manual")
tok.save_pretrained(ART_DIR / "tokenizer_hf")
print("MLM warmup saved to:", str(ART_DIR / "mlm_manual"))

Device: cpu


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


step 100/582 | loss 4.6400
step 200/582 | loss 3.7798
step 300/582 | loss 3.6681
step 400/582 | loss 3.4929
step 500/582 | loss 3.4574
MLM warmup saved to: artifacts_namebert_sa/mlm_manual


In [None]:
# STEP 8.4 (fixed): Fine-tune classifier (2 logits) + calibration + precision-first threshold
import os, json, math, sqlite3, numpy as np, torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import (
    average_precision_score, roc_auc_score, precision_recall_curve,
    classification_report, confusion_matrix
)
from sklearn.isotonic import IsotonicRegression
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    PreTrainedTokenizerFast, RobertaConfig, RobertaForMaskedLM,
    RobertaForSequenceClassification, get_linear_schedule_with_warmup
)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

ART_DIR = Path("./artifacts_namebert_sa")
TOK_DIR = ART_DIR / "tokenizer_hf"          # from earlier steps
MLM_DIR = ART_DIR / "mlm_manual"            # from Step 8.3 (trainer-free)
CLS_DIR = ART_DIR / "cls_manual"
CLS_DIR.mkdir(parents=True, exist_ok=True)

# ---------- (Re)load tokenizer, config, datasets if needed ----------
try:
    tok
    ds_tok
    MAX_LEN
except NameError:
    from transformers import PreTrainedTokenizerFast
    import pandas as pd, datasets as hfds
    tok = PreTrainedTokenizerFast.from_pretrained(str(TOK_DIR))
    MAX_LEN = 48

    # Rebuild tokenized datasets quickly from SQLite
    def fetch(split):
        import sqlite3
        con = sqlite3.connect("./sa_names.db")
        q = """
        SELECT first_clean, last_clean, is_south_asian
        FROM names_preprocessed
        WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split=?);
        """
        df = pd.read_sql_query(q, con, params=(split,))
        con.close()
        return df

    def to_hf(split):
        df = fetch(split)
        texts = ("<FIRST> " + df["first_clean"].fillna("") +
                 " <SEP> <LAST> " + df["last_clean"].fillna("")).tolist()
        labels = df["is_south_asian"].astype(int).tolist()
        return hfds.Dataset.from_dict({"text": texts, "label": labels})

    ds = hfds.DatasetDict({"train": to_hf("train"),
                           "validation": to_hf("val"),
                           "test": to_hf("test")})

    def tok_fn(batch):
        enc = tok(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)
        enc["labels"] = batch["label"]
        return enc

    ds_tok = ds.map(tok_fn, batched=True, remove_columns=["text","label"])

# Two-logit classifier config (binary via softmax)
config_cls = RobertaConfig(
    vocab_size=len(tok),
    max_position_embeddings=MAX_LEN+2,
    hidden_size=384,
    num_hidden_layers=6,
    num_attention_heads=6,
    intermediate_size=768,
    hidden_act="gelu",
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1,
    type_vocab_size=1,
    pad_token_id=tok.pad_token_id,
    bos_token_id=tok.bos_token_id,
    eos_token_id=tok.eos_token_id,
    num_labels=2,
    problem_type="single_label_classification",
    id2label={0: "non_sa", 1: "sa"},
    label2id={"non_sa": 0, "sa": 1},
)

# ---------- Datasets -> PyTorch ----------
class HFDatasetWrapper(Dataset):
    def __init__(self, ds_split): self.ds = ds_split
    def __len__(self): return len(self.ds)
    def __getitem__(self, i):
        ex = self.ds[i]
        return {
            "input_ids": torch.tensor(ex["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(ex["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(ex["labels"], dtype=torch.long),   # int labels (0/1) for CE
        }

train_ds = HFDatasetWrapper(ds_tok["train"])
val_ds   = HFDatasetWrapper(ds_tok["validation"])
test_ds  = HFDatasetWrapper(ds_tok["test"])

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=128, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=128, shuffle=False)

# ---------- Class weights for imbalance ----------
y_tr = np.array([ds_tok["train"][i]["labels"] for i in range(len(ds_tok["train"]))], dtype=int)
cw = compute_class_weight("balanced", classes=np.array([0,1], int), y=y_tr)
w0, w1 = float(cw[0]), float(cw[1])
print("Class weights:", {"non_sa": w0, "sa": w1})
ce_weight = torch.tensor([w0, w1], dtype=torch.float, device=DEVICE)

# ---------- Model: init encoder from MLM warmup, 2-logit head ----------
mlm_model = RobertaForMaskedLM.from_pretrained(str(MLM_DIR))
model = RobertaForSequenceClassification(config=config_cls)
model.roberta.load_state_dict(mlm_model.roberta.state_dict(), strict=False)
del mlm_model
model.to(DEVICE)

# ---------- Optimizer & scheduler ----------
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
EPOCHS = 6
num_training_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=max(100, int(0.1 * num_training_steps)), num_training_steps=num_training_steps
)
criterion = torch.nn.CrossEntropyLoss(weight=ce_weight)

# ---------- Helper to evaluate probs ----------
def eval_probs(dloader):
    model.eval()
    probs, labels = [], []
    with torch.no_grad():
        for batch in dloader:
            inputs = {k: v.to(DEVICE) for k,v in batch.items() if k in ("input_ids","attention_mask")}
            y = batch["labels"].numpy()
            logits = model(**inputs).logits         # (B,2)
            p = torch.softmax(logits, dim=-1)[:,1]  # P(sa)
            probs.append(p.cpu().numpy()); labels.append(y)
    return np.concatenate(probs), np.concatenate(labels)

# ---------- Train with early stopping on val PR-AUC ----------
best_val_prauc = -1.0
best_state = None
patience, best_epoch = 3, -1
no_improve = 0

for epoch in range(1, EPOCHS+1):
    model.train()
    running = 0.0
    for batch in train_loader:
        inputs = {k: v.to(DEVICE) for k,v in batch.items() if k in ("input_ids","attention_mask")}
        labels = batch["labels"].to(DEVICE)         # long, shape (B,)
        logits = model(**inputs).logits             # (B,2)
        loss = criterion(logits, labels)            # CE with class weights
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step(); scheduler.step(); optimizer.zero_grad()
        running += loss.item()

    p_va, y_va = eval_probs(val_loader)
    pra = average_precision_score(y_va, p_va)
    try:
        roa = roc_auc_score(y_va, p_va)
    except:
        roa = float("nan")
    print(f"Epoch {epoch}/{EPOCHS}  train_loss={running/len(train_loader):.4f}  val_PR-AUC={pra:.4f}  val_ROC-AUC={roa:.4f}")
    if pra > best_val_prauc + 1e-4:
        best_val_prauc = pra
        best_state = {k: v.cpu().clone() for k,v in model.state_dict().items()}
        best_epoch = epoch
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print(f"Early stopping at epoch {epoch} (best epoch {best_epoch}, PR-AUC={best_val_prauc:.4f})")
            break

# Restore best
if best_state is not None:
    model.load_state_dict(best_state)

# ---------- Calibrate (isotonic) on VAL, choose precision-first threshold ----------
iso = IsotonicRegression(out_of_bounds="clip")
iso.fit(p_va, y_va)                         # use last computed p_va/y_va
cp_va = np.clip(iso.predict(p_va), 0, 1)

prec, rec, thr = precision_recall_curve(y_va, cp_va)
thr = np.concatenate([thr, [1.0]])
mask = np.where(prec >= 0.90)[0]
if len(mask)==0:
    f1 = np.where((prec+rec)>0, 2*prec*rec/(prec+rec), 0.0)
    i = int(np.argmax(f1)); thr_star = float(thr[i]); policy="best_f1_fallback"
else:
    i = mask[np.argmax(rec[mask])]; thr_star = float(thr[i]); policy="r_at_precision_0.90"

# ---------- Test eval ----------
p_te, y_te = eval_probs(test_loader)
cp_te = np.clip(iso.predict(p_te), 0, 1)
y_hat = (cp_te >= thr_star).astype(int)

cm = confusion_matrix(y_te, y_hat, labels=[0,1])
rpt = classification_report(y_te, y_hat, labels=[0,1], target_names=["non-SA","SA"], output_dict=True)
report = {
    "policy": policy,
    "threshold": thr_star,
    "val_pr_auc": float(average_precision_score(y_va, cp_va)),
    "val_roc_auc": float(roc_auc_score(y_va, cp_va)),
    "test_confusion": {"tn": int(cm[0,0]), "fp": int(cm[0,1]), "fn": int(cm[1,0]), "tp": int(cm[1,1])},
    "test_SA_f1": float(rpt["SA"]["f1-score"]),
    "test_macro_f1": float(rpt["macro avg"]["f1-score"]),
}
print(json.dumps(report, indent=2))

# ---------- Save artifacts ----------
torch.save(model.state_dict(), CLS_DIR / "roberta_cls_state.pt")
import joblib
joblib.dump(iso, ART_DIR / "cal_isotonic_cls.joblib")
with open(ART_DIR / "inference_config.json","w") as f:
    json.dump({"threshold": thr_star, "policy": policy,
               "abstain_band": [max(0.0, thr_star-0.10), min(1.0, thr_star+0.10)]}, f, indent=2)
print("Saved NameBERT-SA classifier + calibrator to:", str(ART_DIR.resolve()))

Device: cpu
Class weights: {'non_sa': 0.5500147449130051, 'sa': 5.498525943396227}


# Step 9: Creating a Combination Model

In [18]:
# STEP 9: Calibrated ensemble of tuned SGD (fusion) + NameBERT-SA classifier
import os, json, sqlite3, numpy as np, pandas as pd, torch, joblib
from pathlib import Path
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix, average_precision_score, roc_auc_score
from sklearn.isotonic import IsotonicRegression

# -------- Paths --------
DB_PATH   = "./sa_names.db"
SGD_DIR   = Path("./artifacts_tuned_sgd")       # from Step 6 (fast SGD)
NB_DIR    = Path("./artifacts_namebert_sa")     # from Step 8.4 (trainer-free)
ENS_DIR   = Path("./artifacts_ensemble")
ENS_DIR.mkdir(parents=True, exist_ok=True)

# -------- Load SGD artifacts --------
vec_full  = joblib.load(SGD_DIR / "vec_full_tfidf.joblib")
vec_first = joblib.load(SGD_DIR / "vec_first_tfidf.joblib")
vec_last  = joblib.load(SGD_DIR / "vec_last_tfidf.joblib")
sgd_full  = joblib.load(SGD_DIR / "sgd_full_tuned.joblib")
sgd_first = joblib.load(SGD_DIR / "sgd_first_tuned.joblib")
sgd_last  = joblib.load(SGD_DIR / "sgd_last_tuned.joblib")
cal_full  = joblib.load(SGD_DIR / "cal_full_isotonic.joblib")
cal_first = joblib.load(SGD_DIR / "cal_first_isotonic.joblib")
cal_last  = joblib.load(SGD_DIR / "cal_last_isotonic.joblib")

# -------- Load NameBERT-SA artifacts --------
from transformers import PreTrainedTokenizerFast, RobertaConfig, RobertaForSequenceClassification
tok = PreTrainedTokenizerFast.from_pretrained(str(NB_DIR / "tokenizer_hf"))
MAX_LEN = 48
config_cls = RobertaConfig(
    vocab_size=len(tok), max_position_embeddings=MAX_LEN+2,
    hidden_size=384, num_hidden_layers=6, num_attention_heads=6, intermediate_size=768,
    hidden_act="gelu", attention_probs_dropout_prob=0.1, hidden_dropout_prob=0.1,
    type_vocab_size=1, pad_token_id=tok.pad_token_id, bos_token_id=tok.bos_token_id, eos_token_id=tok.eos_token_id,
    num_labels=2, problem_type="single_label_classification", id2label={0:"non_sa",1:"sa"}, label2id={"non_sa":0,"sa":1}
)
nb_model = RobertaForSequenceClassification(config=config_cls)
state = torch.load(NB_DIR / "cls_manual/roberta_cls_state.pt", map_location="cpu")
nb_model.load_state_dict(state, strict=True)
nb_model.eval()
iso_nb = joblib.load(NB_DIR / "cal_isotonic_cls.joblib")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
nb_model.to(DEVICE)

# -------- Data helpers --------
def fetch_split(split: str):
    con = sqlite3.connect(DB_PATH)
    q = """
    SELECT name_ascii, first_clean, last_clean, is_south_asian
    FROM names_preprocessed
    WHERE name_ascii IN (SELECT name_ascii FROM names_splits WHERE split=?);
    """
    df = pd.read_sql_query(q, con, params=(split,))
    con.close()
    return df

val_df  = fetch_split("val")
test_df = fetch_split("test")
y_va = val_df["is_south_asian"].to_numpy(int)
y_te = test_df["is_south_asian"].to_numpy(int)

# -------- SGD calibrated fusion probs --------
def sgd_calibrated_fusion(df: pd.DataFrame) -> np.ndarray:
    Xf  = vec_full.transform(df["name_ascii"].fillna(""))
    Xfi = vec_first.transform(df["first_clean"].fillna(""))
    Xla = vec_last.transform(df["last_clean"].fillna(""))

    p_first = sgd_first.predict_proba(Xfi)[:,1]
    p_last  = sgd_last.predict_proba(Xla)[:,1]

    # Calibrate per-view
    cp_first = np.clip(cal_first.predict(p_first), 0, 1)
    cp_last  = np.clip(cal_last.predict(p_last),  0, 1)

    # Fusion: OR in probability space
    cp_fusion = 1.0 - (1.0 - cp_first) * (1.0 - cp_last)
    return cp_fusion

cp_sgd_va = sgd_calibrated_fusion(val_df)
cp_sgd_te = sgd_calibrated_fusion(test_df)

# -------- NameBERT-SA calibrated probs --------
import torch
from torch.utils.data import DataLoader, Dataset

class SimpleDS(Dataset):
    def __init__(self, first, last, labels=None):
        self.first, self.last, self.labels = list(first), list(last), labels
    def __len__(self): return len(self.first)
    def __getitem__(self, i):
        text = f"<FIRST> {self.first[i] or ''} <SEP> <LAST> {self.last[i] or ''}"
        enc = tok(text, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt")
        ex = {k: enc[k][0] for k in ("input_ids","attention_mask")}
        if self.labels is not None:
            ex["labels"] = torch.tensor(int(self.labels[i]), dtype=torch.long)
        return ex

def nb_probs(df: pd.DataFrame) -> np.ndarray:
    ds = SimpleDS(df["first_clean"], df["last_clean"], None)
    dl = DataLoader(ds, batch_size=128, shuffle=False)
    probs = []
    with torch.no_grad():
        for batch in dl:
            inputs = {k: v.to(DEVICE) for k,v in batch.items() if k in ("input_ids","attention_mask")}
            logits = nb_model(**inputs).logits              # (B,2)
            p = torch.softmax(logits, dim=-1)[:,1].cpu().numpy()
            probs.append(p)
    p = np.concatenate(probs)
    # Calibrate with isotonic fitted on val in Step 8.4
    cp = np.clip(iso_nb.predict(p), 0, 1)
    return cp

cp_nb_va = nb_probs(val_df)
cp_nb_te = nb_probs(test_df)

# -------- Ensemble & threshold selection --------
w_sgd, w_nb = 1.0, 1.0   # start with equal weights; you can tune later
cp_ens_va = (w_sgd*cp_sgd_va + w_nb*cp_nb_va) / (w_sgd + w_nb)
cp_ens_te = (w_sgd*cp_sgd_te + w_nb*cp_nb_te) / (w_sgd + w_nb)

# Precision-first threshold on validation (R@P >= 0.90), fallback best-F1
from sklearn.metrics import precision_recall_curve
prec, rec, thr = precision_recall_curve(y_va, cp_ens_va)
thr = np.concatenate([thr, [1.0]])
mask = np.where(prec >= 0.90)[0]
if len(mask)==0:
    f1 = np.where((prec+rec)>0, 2*prec*rec/(prec+rec), 0.0)
    i = int(np.argmax(f1)); thr_star = float(thr[i]); policy="best_f1_fallback"
else:
    i = mask[np.argmax(rec[mask])]; thr_star = float(thr[i]); policy="r_at_precision_0.90"

# -------- Test evaluation --------
from sklearn.metrics import classification_report, confusion_matrix
y_hat = (cp_ens_te >= thr_star).astype(int)
cm = confusion_matrix(y_te, y_hat, labels=[0,1])
rpt = classification_report(y_te, y_hat, labels=[0,1], target_names=["non-SA","SA"], output_dict=True)
report = {
    "policy": policy,
    "threshold": thr_star,
    "val_pr_auc": float(average_precision_score(y_va, cp_ens_va)),
    "val_roc_auc": float(roc_auc_score(y_va, cp_ens_va)),
    "test_confusion": {"tn": int(cm[0,0]), "fp": int(cm[0,1]), "fn": int(cm[1,0]), "tp": int(cm[1,1])},
    "test_SA_f1": float(rpt["SA"]["f1-score"]),
    "test_macro_f1": float(rpt["macro avg"]["f1-score"]),
}
print(json.dumps(report, indent=2))

# -------- Save inference helper (ensemble) --------
INF_PATH = ENS_DIR / "inference_ensemble.py"
with open(INF_PATH, "w") as f:
    f.write(r'''
import json, torch, numpy as np, joblib
from pathlib import Path
from transformers import PreTrainedTokenizerFast, RobertaConfig, RobertaForSequenceClassification

class EnsemblePredictor:
    def __init__(self, db_path=None):
        # Paths
        self.SGD_DIR = Path("./artifacts_tuned_sgd")
        self.NB_DIR  = Path("./artifacts_namebert_sa")
        self.ENS_DIR = Path("./artifacts_ensemble")

        # Load SGD vecs/models/calibrators
        self.vec_first = joblib.load(self.SGD_DIR / "vec_first_tfidf.joblib")
        self.vec_last  = joblib.load(self.SGD_DIR / "vec_last_tfidf.joblib")
        self.sgd_first = joblib.load(self.SGD_DIR / "sgd_first_tuned.joblib")
        self.sgd_last  = joblib.load(self.SGD_DIR / "sgd_last_tuned.joblib")
        self.cal_first = joblib.load(self.SGD_DIR / "cal_first_isotonic.joblib")
        self.cal_last  = joblib.load(self.SGD_DIR / "cal_last_isotonic.joblib")

        # Load NameBERT-SA
        self.tok = PreTrainedTokenizerFast.from_pretrained(str(self.NB_DIR / "tokenizer_hf"))
        cfg = RobertaConfig(
            vocab_size=len(self.tok), max_position_embeddings=50,
            hidden_size=384, num_hidden_layers=6, num_attention_heads=6, intermediate_size=768,
            hidden_act="gelu", attention_probs_dropout_prob=0.1, hidden_dropout_prob=0.1,
            type_vocab_size=1, pad_token_id=self.tok.pad_token_id, bos_token_id=self.tok.bos_token_id, eos_token_id=self.tok.eos_token_id,
            num_labels=2, problem_type="single_label_classification", id2label={0:"non_sa",1:"sa"}, label2id={"non_sa":0,"sa":1}
        )
        self.nb_model = RobertaForSequenceClassification(cfg)
        state = torch.load(self.NB_DIR / "cls_manual/roberta_cls_state.pt", map_location="cpu")
        self.nb_model.load_state_dict(state, strict=True)
        self.nb_model.eval()
        self.iso_nb = joblib.load(self.NB_DIR / "cal_isotonic_cls.joblib")

        # Ensemble config
        self.cfg = json.load(open(self.ENS_DIR / "inference_config.json"))
        self.THR = self.cfg["threshold"]
        self.ABSTAIN_L, self.ABSTAIN_H = self.cfg["abstain_band"]
        self.w_sgd = self.cfg.get("w_sgd", 1.0)
        self.w_nb  = self.cfg.get("w_nb", 1.0)

    @staticmethod
    def _split(name: str):
        name = (name or "").strip().lower()
        parts = name.split()
        if len(parts)==0: return "", ""
        if len(parts)==1: return parts[0], ""
        return parts[0], parts[-1]

    def _sgd_prob(self, name: str):
        first, last = self._split(name)
        Xfi = self.vec_first.transform([first])
        Xla = self.vec_last.transform([last])
        p_first = self.sgd_first.predict_proba(Xfi)[:,1]
        p_last  = self.sgd_last.predict_proba(Xla)[:,1]
        cp_first = np.clip(self.cal_first.predict(p_first), 0, 1)[0]
        cp_last  = np.clip(self.cal_last.predict(p_last),  0, 1)[0]
        return 1.0 - (1.0 - cp_first) * (1.0 - cp_last)

    def _nb_prob(self, name: str):
        first, last = self._split(name)
        text = f"<FIRST> {first} <SEP> <LAST> {last}"
        enc = self.tok(text, truncation=True, padding="max_length", max_length=48, return_tensors="pt")
        with torch.no_grad():
            logits = self.nb_model(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"]).logits
            p = torch.softmax(logits, dim=-1)[:,1].item()
        return float(np.clip(self.iso_nb.predict([p])[0], 0, 1))

    def predict(self, name: str):
        ps = self._sgd_prob(name)
        pn = self._nb_prob(name)
        p  = (self.w_sgd*ps + self.w_nb*pn) / (self.w_sgd + self.w_nb)

        if p < self.ABSTAIN_L: decision = "non_sa"
        elif p > self.ABSTAIN_H: decision = "sa"
        else: decision = "abstain"
        hard = "sa" if p >= self.THR else "non_sa"

        return {"name": name, "prob_sgd": ps, "prob_nb": pn, "prob_ens": p,
                "decision_abstain_band": decision,
                "hard_decision_at_threshold": {"threshold": self.THR, "label": hard}}
''')

# Save config for inference
cfg = {
    "threshold": thr_star,
    "policy": policy,
    "abstain_band": [max(0.0, thr_star-0.10), min(1.0, thr_star+0.10)],
    "w_sgd": float(w_sgd),
    "w_nb":  float(w_nb),
}
json.dump(cfg, open(ENS_DIR / "inference_config.json","w"), indent=2)

print("Saved ensemble artifacts to:", str(ENS_DIR.resolve()))

FileNotFoundError: [Errno 2] No such file or directory: 'artifacts_namebert_sa/cls_manual/roberta_cls_state.pt'

# CSV Scorer Script

In [None]:
# batch_score.py
# Score a CSV of names and write probabilities + decisions.
# Usage:
#   python batch_score.py --in input.csv --out scored.csv --name-col name
# Optional:
#   --threshold 0.59 --abstain-low 0.49 --abstain-high 0.69

import argparse
import csv
import json
from pathlib import Path
import numpy as np
import torch
import joblib
from transformers import PreTrainedTokenizerFast, RobertaConfig, RobertaForSequenceClassification

# Paths (adjust if needed)
ROOT      = Path(".")
SGD_DIR   = ROOT / "artifacts_tuned_sgd"
NB_DIR    = ROOT / "artifacts_namebert_sa"
ENS_DIR   = ROOT / "artifacts_ensemble"

class EnsemblePredictor:
    def __init__(self):
        self.vec_first = joblib.load(SGD_DIR / "vec_first_tfidf.joblib")
        self.vec_last  = joblib.load(SGD_DIR / "vec_last_tfidf.joblib")
        self.sgd_first = joblib.load(SGD_DIR / "sgd_first_tuned.joblib")
        self.sgd_last  = joblib.load(SGD_DIR / "sgd_last_tuned.joblib")
        self.cal_first = joblib.load(SGD_DIR / "cal_first_isotonic.joblib")
        self.cal_last  = joblib.load(SGD_DIR / "cal_last_isotonic.joblib")

        self.tok = PreTrainedTokenizerFast.from_pretrained(str(NB_DIR / "tokenizer_hf"))
        self.MAX_LEN = 48
        cfg = RobertaConfig(
            vocab_size=len(self.tok), max_position_embeddings=self.MAX_LEN + 2,
            hidden_size=384, num_hidden_layers=6, num_attention_heads=6, intermediate_size=768,
            hidden_act="gelu", attention_probs_dropout_prob=0.1, hidden_dropout_prob=0.1,
            type_vocab_size=1, pad_token_id=self.tok.pad_token_id, bos_token_id=self.tok.bos_token_id,
            eos_token_id=self.tok.eos_token_id, num_labels=2, problem_type="single_label_classification",
            id2label={0: "non_sa", 1: "sa"}, label2id={"non_sa": 0, "sa": 1},
        )
        self.nb_model = RobertaForSequenceClassification(cfg)
        state = torch.load(NB_DIR / "cls_manual" / "roberta_cls_state.pt", map_location="cpu")
        self.nb_model.load_state_dict(state, strict=True)
        self.nb_model.eval()
        self.iso_nb = joblib.load(NB_DIR / "cal_isotonic_cls.joblib")

        cfg_js = json.load(open(ENS_DIR / "inference_config.json"))
        self.default_thr = float(cfg_js["threshold"])
        self.abstain_l, self.abstain_h = [float(x) for x in cfg_js["abstain_band"]]
        self.w_sgd = float(cfg_js.get("w_sgd", 1.0))
        self.w_nb  = float(cfg_js.get("w_nb", 1.0))

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.nb_model.to(self.device)

    @staticmethod
    def _split(name: str):
        s = (name or "").strip().lower()
        parts = s.split()
        if len(parts) == 0: return "", ""
        if len(parts) == 1: return parts[0], ""
        return parts[0], parts[-1]

    def _sgd_prob(self, name: str) -> float:
        first, last = self._split(name)
        Xfi = self.vec_first.transform([first])
        Xla = self.vec_last.transform([last])
        p_first = self.sgd_first.predict_proba(Xfi)[:, 1]
        p_last  = self.sgd_last.predict_proba(Xla)[:, 1]
        cp_first = np.clip(self.cal_first.predict(p_first), 0, 1)[0]
        cp_last  = np.clip(self.cal_last.predict(p_last),  0, 1)[0]
        return float(1.0 - (1.0 - cp_first) * (1.0 - cp_last))

    def _nb_prob(self, name: str) -> float:
        first, last = self._split(name)
        text = f"<FIRST> {first} <SEP> <LAST> {last}"
        enc = self.tok(text, truncation=True, padding="max_length",
                       max_length=self.MAX_LEN, return_tensors="pt")
        with torch.no_grad():
            logits = self.nb_model(
                input_ids=enc["input_ids"].to(self.device),
                attention_mask=enc["attention_mask"].to(self.device)
            ).logits
            p = torch.softmax(logits, dim=-1)[:, 1].item()
        return float(np.clip(self.iso_nb.predict([p])[0], 0, 1))

    def predict(self, name: str, threshold: float, abstain_l: float, abstain_h: float):
        ps = self._sgd_prob(name)
        pn = self._nb_prob(name)
        p  = (self.w_sgd * ps + self.w_nb * pn) / (self.w_sgd + self.w_nb)

        if p < abstain_l: decision = "non_sa"
        elif p > abstain_h: decision = "sa"
        else: decision = "abstain"
        hard = "sa" if p >= threshold else "non_sa"

        return p, ps, pn, decision, hard

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--in", dest="inp", required=True, help="Input CSV path")
    ap.add_argument("--out", dest="out", required=True, help="Output CSV path")
    ap.add_argument("--name-col", dest="name_col", default="name", help="Column with full name")
    ap.add_argument("--threshold", type=float, default=None, help="Override decision threshold (0..1)")
    ap.add_argument("--abstain-low", type=float, default=None, help="Override abstain low")
    ap.add_argument("--abstain-high", type=float, default=None, help="Override abstain high")
    args = ap.parse_args()

    pred = EnsemblePredictor()
    thr = pred.default_thr if args.threshold is None else float(args.threshold)
    a_low = pred.abstain_l if args.abstain_low is None else float(args.abstain_low)
    a_high = pred.abstain_h if args.abstain_high is None else float(args.abstain_high)

    rows = []
    with open(args.inp, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        if args.name_col not in reader.fieldnames:
            raise ValueError(f"Column '{args.name_col}' not found. Available: {reader.fieldnames}")
        for r in reader:
            name = r[args.name_col]
            p, ps, pn, decision, hard = pred.predict(name, thr, a_low, a_high)
            r_out = dict(r)
            r_out.update({
                "prob_ensemble": f"{p:.6f}",
                "prob_sgd": f"{ps:.6f}",
                "prob_namebert": f"{pn:.6f}",
                "decision_abstain_band": decision,
                "hard_label": hard,
                "threshold_used": thr,
                "abstain_low": a_low,
                "abstain_high": a_high,
            })
            rows.append(r_out)

    out_fields = list(rows[0].keys()) if rows else None
    with open(args.out, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=out_fields)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Wrote {len(rows)} rows to {args.out}")

if __name__ == "__main__":
    main()

How to run it can be found below, as an example.

In [None]:
pip install torch transformers joblib numpy

# Example: score a CSV with a "name" column
python batch_score.py --in providers.csv --out providers_scored.csv --name-col name

# Optional policy overrides
python batch_score.py --in providers.csv --out providers_scored.csv \
  --name-col full_name --threshold 0.60 --abstain-low 0.50 --abstain-high 0.70

# Creating an API Service for the Model

In [None]:
# inference_service.py
# FastAPI service for South Asian name classification (calibrated ensemble).
# Endpoints:
#   GET  /health
#   POST /predict        -> single name
#   POST /predict_batch  -> list of names

import os
import json
from pathlib import Path
from typing import List, Optional

import numpy as np
import torch
import joblib
from fastapi import FastAPI
from pydantic import BaseModel, Field
from transformers import PreTrainedTokenizerFast, RobertaConfig, RobertaForSequenceClassification

# ---------- Paths (adjust if needed) ----------
ROOT      = Path(".")
SGD_DIR   = ROOT / "artifacts_tuned_sgd"
NB_DIR    = ROOT / "artifacts_namebert_sa"
ENS_DIR   = ROOT / "artifacts_ensemble"

# ---------- Ensemble Predictor ----------
class EnsemblePredictor:
    def __init__(self):
        # Load SGD artifacts
        self.vec_first = joblib.load(SGD_DIR / "vec_first_tfidf.joblib")
        self.vec_last  = joblib.load(SGD_DIR / "vec_last_tfidf.joblib")
        self.sgd_first = joblib.load(SGD_DIR / "sgd_first_tuned.joblib")
        self.sgd_last  = joblib.load(SGD_DIR / "sgd_last_tuned.joblib")
        self.cal_first = joblib.load(SGD_DIR / "cal_first_isotonic.joblib")
        self.cal_last  = joblib.load(SGD_DIR / "cal_last_isotonic.joblib")

        # Load NameBERT-SA
        self.tok = PreTrainedTokenizerFast.from_pretrained(str(NB_DIR / "tokenizer_hf"))
        self.MAX_LEN = 48
        cfg = RobertaConfig(
            vocab_size=len(self.tok),
            max_position_embeddings=self.MAX_LEN + 2,
            hidden_size=384,
            num_hidden_layers=6,
            num_attention_heads=6,
            intermediate_size=768,
            hidden_act="gelu",
            attention_probs_dropout_prob=0.1,
            hidden_dropout_prob=0.1,
            type_vocab_size=1,
            pad_token_id=self.tok.pad_token_id,
            bos_token_id=self.tok.bos_token_id,
            eos_token_id=self.tok.eos_token_id,
            num_labels=2,
            problem_type="single_label_classification",
            id2label={0: "non_sa", 1: "sa"},
            label2id={"non_sa": 0, "sa": 1},
        )
        self.nb_model = RobertaForSequenceClassification(cfg)
        state = torch.load(NB_DIR / "cls_manual" / "roberta_cls_state.pt", map_location="cpu")
        self.nb_model.load_state_dict(state, strict=True)
        self.nb_model.eval()
        self.iso_nb = joblib.load(NB_DIR / "cal_isotonic_cls.joblib")

        # Load ensemble config
        self.cfg = json.load(open(ENS_DIR / "inference_config.json"))
        self.threshold = float(self.cfg["threshold"])
        self.abstain_l, self.abstain_h = [float(x) for x in self.cfg["abstain_band"]]
        self.w_sgd = float(self.cfg.get("w_sgd", 1.0))
        self.w_nb  = float(self.cfg.get("w_nb", 1.0))

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.nb_model.to(self.device)

    @staticmethod
    def _split(name: str):
        s = (name or "").strip().lower()
        parts = s.split()
        if len(parts) == 0: return "", ""
        if len(parts) == 1: return parts[0], ""
        return parts[0], parts[-1]

    def _sgd_prob(self, name: str) -> float:
        first, last = self._split(name)
        Xfi = self.vec_first.transform([first])
        Xla = self.vec_last.transform([last])
        p_first = self.sgd_first.predict_proba(Xfi)[:, 1]
        p_last  = self.sgd_last.predict_proba(Xla)[:, 1]
        cp_first = np.clip(self.cal_first.predict(p_first), 0, 1)[0]
        cp_last  = np.clip(self.cal_last.predict(p_last),  0, 1)[0]
        # Probabilistic OR fusion
        return float(1.0 - (1.0 - cp_first) * (1.0 - cp_last))

    def _nb_prob(self, name: str) -> float:
        first, last = self._split(name)
        text = f"<FIRST> {first} <SEP> <LAST> {last}"
        enc = self.tok(text, truncation=True, padding="max_length",
                       max_length=self.MAX_LEN, return_tensors="pt")
        with torch.no_grad():
            logits = self.nb_model(
                input_ids=enc["input_ids"].to(self.device),
                attention_mask=enc["attention_mask"].to(self.device)
            ).logits
            p = torch.softmax(logits, dim=-1)[:, 1].item()
        # Isotonic calibration (fit on validation earlier)
        return float(np.clip(self.iso_nb.predict([p])[0], 0, 1))

    def predict_one(self, name: str,
                    override_threshold: Optional[float] = None,
                    override_abstain: Optional[List[float]] = None):
        ps = self._sgd_prob(name)
        pn = self._nb_prob(name)
        p  = (self.w_sgd * ps + self.w_nb * pn) / (self.w_sgd + self.w_nb)

        thr = self.threshold if override_threshold is None else float(override_threshold)
        if override_abstain is None:
            a_low, a_high = self.abstain_l, self.abstain_h
        else:
            a_low, a_high = float(override_abstain[0]), float(override_abstain[1])

        if p < a_low:
            decision = "non_sa"
        elif p > a_high:
            decision = "sa"
        else:
            decision = "abstain"

        hard = "sa" if p >= thr else "non_sa"
        return {
            "name": name,
            "prob_sgd": ps,
            "prob_nb": pn,
            "prob_ensemble": p,
            "decision_abstain_band": decision,
            "hard_decision_at_threshold": {"threshold": thr, "label": hard},
        }

# ---------- FastAPI app ----------
app = FastAPI(title="South Asian Name Classifier (Ensemble)",
              version="1.0.0",
              description="Calibrated ensemble of SGD fusion + NameBERT-SA with precision-first operating policy.")

PREDICTOR: Optional[EnsemblePredictor] = None

class PredictRequest(BaseModel):
    name: str = Field(..., description="Full name string")
    threshold: Optional[float] = Field(None, description="Override operating threshold (0..1)")
    abstain_band: Optional[List[float]] = Field(None, description="[low, high] override for abstain band")

class PredictBatchRequest(BaseModel):
    names: List[str]
    threshold: Optional[float] = None
    abstain_band: Optional[List[float]] = None

@app.on_event("startup")
def _load():
    global PREDICTOR
    PREDICTOR = EnsemblePredictor()

@app.get("/health")
def health():
    return {"status": "ok"}

@app.post("/predict")
def predict(req: PredictRequest):
    out = PREDICTOR.predict_one(req.name, req.threshold, req.abstain_band)
    return out

@app.post("/predict_batch")
def predict_batch(req: PredictBatchRequest):
    results = [PREDICTOR.predict_one(n, req.threshold, req.abstain_band) for n in req.names]
    return {"results": results}

How to run the service.

In [None]:
# (optional) create & activate a venv first
pip install fastapi uvicorn pydantic torch transformers joblib numpy

# from the directory that contains the artifacts_* folders and inference_service.py:
uvicorn inference_service:app --host 0.0.0.0 --port 8000

Examples of how to request jobs from the service can be found below.:

In [None]:
# Single prediction
curl -X POST http://localhost:8000/predict \
  -H "Content-Type: application/json" \
  -d '{"name": "Daniel Singh"}'

# Or with a batch of names
curl -X POST http://localhost:8000/predict_batch \
  -H "Content-Type: application/json" \
  -d '{"names": ["Mary Thomas", "Noah Patel", "Kevin Johnson"]}'

# Utilizing the Model

Download the batch_score.py file and The model artifacts folders: artifacts_tuned_sgd, artifacts_namebert_sa, artifacts_ensemble. Then in that directory run the following in the command line or terminal.

Or you could launch an API service.