<a href="https://colab.research.google.com/github/ArtDowdy/Scikit-learn/blob/main/talent_analytics_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# Talent Analytics (scikit-learn, Colab-ready)
# - Synthetic ATS/HRIS data generator
# - Candidate Success Classification (Calibrated, fairness audit)
# - Time-to-Hire Regression
# - Sourcing Channel ROI & Lift
# - Funnel Anomaly Detection (Isolation Forest)
# - Leakage-safe CV (GroupKFold by req_id)
# - ColumnTransformer + Pipeline + RandomizedSearchCV
# - Model export (joblib) + Model Card (JSON)
# ============================================================

import os, json, math, random, string, time
from dataclasses import dataclass
from typing import Tuple, Dict, List

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, IsolationForest
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss, f1_score, accuracy_score,
    mean_absolute_error, r2_score, confusion_matrix
)
from sklearn.inspection import permutation_importance
import joblib
# Seed both random sources used by this script (the stdlib `random` module and
# NumPy's global generator) so repeated runs draw the same synthetic data.
random.seed(42)
np.random.seed(42)

# Report the library versions used for this run.
versions = {
    "pandas": pd.__version__,
    "numpy": np.__version__,
}
print("Using:", versions)

# ----------------------------
# 1) Synthetic ATS/HRIS dataset
# ----------------------------
@dataclass
class GenCfg:
    """Settings for the synthetic ATS/HRIS generator: table sizes and category vocabularies."""

    # --- table sizes ---
    n_reqs: int = 220      # number of open requisitions to create
    n_cand: int = 12000    # number of applicants to create

    # --- category vocabularies ---
    # generate_data pairs levels, locations, sources and genders with
    # fixed-length probability vectors, so changing the length of one of these
    # tuples requires updating the matching vector there.
    roles: Tuple[str, ...] = (
        "ml_eng", "data_scientist", "analyst", "backend", "mobile", "designer",
    )
    levels: Tuple[str, ...] = ("IC1", "IC2", "IC3", "IC4")
    sources: Tuple[str, ...] = (
        "referral", "linkedin", "career_site", "agency", "event", "internal",
    )
    locations: Tuple[str, ...] = ("SF", "LA", "NY", "Remote", "Austin", "Seattle")
    genders: Tuple[str, ...] = ("F", "M", "NB", "Unknown")
    # Funnel stage names; not read by generate_data in this file.
    stages: Tuple[str, ...] = ("applied", "screen", "hm_interview", "loop", "offer")


# Default configuration instance used as generate_data's default argument.
CFG = GenCfg()

def random_id(prefix, n=8):
    """Return ``prefix``, an underscore, then ``n`` random characters drawn from a-z and 0-9.

    Characters come from the stdlib ``random`` module, so the result is
    reproducible only if that module has been seeded.
    """
    alphabet = string.ascii_lowercase + string.digits
    suffix = "".join(random.choices(alphabet, k=n))
    return "_".join([prefix, suffix])

def generate_data(cfg: GenCfg = CFG) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Simulate a requisition table and a candidate-level ATS/HRIS table.

    Creates ``cfg.n_reqs`` requisitions, then ``cfg.n_cand`` candidates, each
    attached to one requisition picked at random. For every candidate it draws
    profile attributes, simulates screen / hiring-manager / loop scores
    (NaN when the stage was not reached), and then draws offer, acceptance,
    the ``success_label`` and, for accepted candidates, ``time_to_hire_days``.

    All draws use the module-level ``random`` and ``np.random`` generators, so
    results are reproducible only if the caller seeded them beforehand.

    Args:
        cfg: Generator configuration. The probability vectors below are
            hard-coded, so ``cfg.levels``, ``cfg.locations``, ``cfg.sources``
            and ``cfg.genders`` must have 4, 6, 6 and 4 entries respectively.

    Returns:
        ``(df, reqs)``: ``df`` has one row per candidate (requisition
        attributes are copied onto each row); ``reqs`` has one row per
        requisition.
    """
    # requisitions with latent difficulty + urgency
    reqs = []
    for i in range(cfg.n_reqs):
        role = random.choice(cfg.roles)
        level = np.random.choice(cfg.levels, p=[0.25,0.35,0.25,0.15])
        loc = np.random.choice(cfg.locations, p=[.23,.15,.22,.25,.08,.07])
        diff = np.clip(np.random.beta(2,5), 0.05, 0.95)             # hiring difficulty (higher = harder)
        urgency = np.random.uniform(0.2, 1.2)                        # >1 means rush
        reqs.append({"req_id": random_id("REQ"), "role": role, "level": level, "loc": loc,
                     "req_difficulty": diff, "urgency": urgency})
    reqs = pd.DataFrame(reqs)

    # candidates
    rows = []
    for _ in range(cfg.n_cand):
        # Attach the candidate to one requisition row sampled from `reqs`.
        r = reqs.sample(1).iloc[0]
        source = np.random.choice(cfg.sources, p=[.22,.28,.25,.08,.07,.10])
        gender = np.random.choice(cfg.genders, p=[.32,.50,.03,.15])
        years_exp = np.clip(np.random.normal(5, 2.5), 0, 30)
        top_school = np.random.binomial(1, 0.20)
        skill_count = int(np.clip(np.random.normal(6, 2), 1, 20))
        assessment = np.clip(np.random.normal(0.0, 1.0), -3, 3)      # standardized pre-hire test
        recruiter_load = np.random.randint(8, 28)
        # Compensation expectation is scaled up for SF/NY/LA and down elsewhere.
        comp_expect = np.random.normal(160, 30) * (1.15 if r["loc"] in ["SF","NY","LA"] else 0.95)
        # interview stage scores (left as NaN if the candidate was filtered earlier)
        pass_screen = np.random.rand() < (0.55 + 0.1*top_school + 0.02*assessment - 0.25*r["req_difficulty"])
        hm_score = np.nan
        loop_score = np.nan
        if pass_screen:
            hm_score = np.clip(np.random.normal(2.7 + 0.25*assessment + 0.04*skill_count, 0.8), 1, 5)
            # The hiring-manager bar rises with requisition difficulty.
            pass_hm = hm_score > (3.1 + 0.3*r["req_difficulty"])
            if pass_hm:
                loop_score = np.clip(np.random.normal(3.0 + 0.3*assessment + 0.05*skill_count, 0.7), 1, 5)

        # Offer probability depends on interview scores, top_school, skill_count,
        # req difficulty, urgency and a referral bonus. Comparisons against a NaN
        # score are False, so a stage that was not reached contributes 0.
        # (`hm_score and ...` relies on NaN being truthy; hm_score is clipped to
        # >= 1 when present, so it is never a falsy 0.)
        # NOTE(review): an offer is drawn for every candidate, including those
        # who did not pass the screen -- confirm this is intended.
        offer_prob = 0.10 + 0.25*float(loop_score>3.2) + 0.12*float(hm_score and hm_score>3.2) \
                     + 0.05*top_school + 0.02*skill_count - 0.25*r["req_difficulty"] + 0.06*(r["urgency"]-0.8)
        offer_prob += 0.05 if source=="referral" else 0
        offer_prob = np.clip(offer_prob, 0.01, 0.95)
        offer = np.random.rand() < offer_prob

        # Acceptance probability: raised by having an offer and by referral,
        # lowered when the compensation expectation exceeds 170.
        # NOTE(review): acceptance is drawn regardless of `offer`, so rows with
        # accept=1 and offer=0 can occur -- confirm this is intended.
        accept_prob = 0.45 + 0.15*(offer) - 0.0015*max(comp_expect-170,0) + 0.05*(source=="referral")
        accept = (np.random.rand() < np.clip(accept_prob,0.02,0.95))
        # Success label (what we want to predict): depends on interview signals,
        # top_school, skill_count and req difficulty. No gender or source term
        # appears in this formula.
        success = (np.random.rand() <
                   np.clip(0.35 + 0.18*float(loop_score>3.3) + 0.12*float(hm_score and hm_score>3.4)
                           + 0.05*top_school + 0.01*skill_count - 0.15*r["req_difficulty"], 0.02, 0.98))
        # time-to-hire in days (if accepted), else NaN; depends on urgency, recruiter_load, source
        if accept:
            tth = np.random.gamma(shape=3.0, scale=5.5) * (1.2 + 0.01*recruiter_load) * (1.1 if source=="agency" else 0.95)
            tth *= (0.9 if r["urgency"]>1.0 else 1.1)
        else:
            tth = np.nan

        rows.append({
            "req_id": r["req_id"], "role": r["role"], "level": r["level"], "loc": r["loc"],
            "source": source, "gender": gender,
            "years_exp": years_exp, "top_school": top_school, "skill_count": skill_count,
            "assessment_z": assessment, "recruiter_load": recruiter_load,
            "comp_expect_k": comp_expect, "hm_score": hm_score, "loop_score": loop_score,
            "offer": int(offer), "accept": int(accept),
            "success_label": int(success), "time_to_hire_days": tth,
            "req_difficulty": r["req_difficulty"], "urgency": r["urgency"]
        })
    df = pd.DataFrame(rows)
    # Force the interview-score columns to float; NaN marks stages that were
    # not reached. No additional missingness is injected here.
    for col in ["hm_score","loop_score"]:
        df[col] = df[col].astype(float)
    return df, reqs

# Generate the synthetic tables once; the sections below all read `df` / `reqs`.
df, reqs = generate_data()
print("Data shape:", df.shape)
# A bare `df.head(3)` is only rendered when it is the last expression of a
# notebook cell; here it sits mid-cell, so print the preview explicitly.
print(df.head(3))

# -----------------------------------------
# 2) Candidate Success Classifier (sklearn)
# -----------------------------------------
TARGET = "success_label"
GROUP = "req_id"  # requisition id; splits are grouped on it to avoid leakage between train/test
CATS = ["role","level","loc","source","gender"]
NUMS = ["years_exp","top_school","skill_count","assessment_z","recruiter_load",
        "comp_expect_k","hm_score","loop_score","req_difficulty","urgency"]

# Imputation is done with SimpleImputer so that the fill values are learned in
# `fit` (training data only) and reused in `transform`. The previous ad-hoc
# imputer objects were stateless: they recomputed the median/mode from whatever
# data was being transformed, and being built from lambdas they could not be
# pickled when the fitted pipeline is exported with joblib.

# Numeric columns: median-impute (hm_score / loop_score are NaN for candidates
# who did not reach that stage), then standardize.
numeric_tf = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])
# Categorical columns: fill with the most frequent category, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
categorical_tf = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
pre = ColumnTransformer([
    ("num", numeric_tf, NUMS),
    ("cat", categorical_tf, CATS),
])

# Hold out 20% of requisitions *before* any tuning, so hyperparameter selection
# never sees the candidates the holdout metrics are computed on.
unique_reqs = df[GROUP].unique()
np.random.shuffle(unique_reqs)
split = int(0.8*len(unique_reqs))
req_train, req_test = unique_reqs[:split], unique_reqs[split:]
tr = df[df[GROUP].isin(req_train)]
te = df[df[GROUP].isin(req_test)]

base_clf = LogisticRegression(max_iter=200)  # placeholder base model for the calibrated wrapper

# `estimator=` replaces the `base_estimator=` keyword, which newer scikit-learn
# releases no longer accept. The inner cv=3 calibration split is not grouped
# by requisition.
pipe = Pipeline(steps=[
    ("pre", pre),
    ("clf", CalibratedClassifierCV(
        estimator=base_clf, method="sigmoid", cv=3  # Platt scaling
    ))
])

# Hyperparameter search on a *different* classifier inside the pipeline:
# every candidate swaps the wrapped estimator for a RandomForest via set_params.
rf_search_space = {
    "clf__estimator": [RandomForestClassifier(random_state=42)],
    "clf__estimator__n_estimators": [200, 400],
    "clf__estimator__max_depth": [None, 10, 20],
    "clf__estimator__min_samples_leaf": [1, 2, 4],
    "clf__estimator__class_weight": ["balanced", None],
}

# Leakage-safe CV: folds are grouped by req_id. The splitter is passed as-is
# and the groups are supplied to `fit`, rather than pre-computing a one-shot
# generator of splits.
gkf = GroupKFold(n_splits=5)
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=rf_search_space,
    n_iter=8,
    scoring="roc_auc",
    cv=gkf,
    verbose=1,
    n_jobs=-1,
    refit=True,
    random_state=42,  # make the sampled parameter settings reproducible
)
search.fit(tr[CATS+NUMS], tr[TARGET], groups=tr[GROUP])
clf = search.best_estimator_  # refit=True: already trained on all of `tr`
print("Best AUC (CV):", search.best_score_)
print("Best params:", search.best_params_)

# Evaluate on the untouched holdout requisitions.
probs = clf.predict_proba(te[CATS+NUMS])[:,1]
preds = (probs >= 0.5).astype(int)

auc = roc_auc_score(te[TARGET], probs)
ap = average_precision_score(te[TARGET], probs)
brier = brier_score_loss(te[TARGET], probs)
acc = accuracy_score(te[TARGET], preds)
f1 = f1_score(te[TARGET], preds)
print(f"[Holdout] AUC={auc:.3f}  AP={ap:.3f}  Brier={brier:.3f}  Acc={acc:.3f}  F1={f1:.3f}")

# Permutation importance on the holdout rows. The full pipeline is scored, so
# each *raw* input column (before scaling / one-hot encoding) is shuffled in turn.
raw_features = CATS + NUMS
perm = permutation_importance(
    clf, te[raw_features], te[TARGET],
    n_repeats=5, random_state=42, n_jobs=-1,
)
imp = pd.DataFrame({"feature": raw_features, "importance": perm.importances_mean})
imp = imp.sort_values("importance", ascending=False)
print("\nTop features (permutation):\n", imp.head(10))

# -----------------------------------------
# 3) Lightweight Fairness Audit (group metrics)
# -----------------------------------------
def fairness_report(y_true, y_prob, group, threshold=0.5):
    """Per-group selection rate, TPR and FPR at a probability threshold.

    Args:
        y_true: Binary labels (1 = positive), one per row.
        y_prob: Predicted positive-class probabilities, aligned with ``y_true``.
        group: Group membership per row (e.g. gender), aligned with ``y_true``.
        threshold: A row counts as "selected" when its probability is
            greater than or equal to this value.

    Returns:
        DataFrame sorted by ``group`` with columns ``group``,
        ``selection_rate``, ``TPR``, ``FPR``, ``count`` and three gap columns:
        ``sel_rate_gap`` (group rate minus the overall selection rate across
        all rows), ``tpr_gap_to_max`` (group TPR minus the highest group TPR)
        and ``fpr_gap_to_min`` (group FPR minus the lowest group FPR).
        A group with no positives (or no negatives) reports a TPR (or FPR)
        of 0 rather than dividing by zero.
    """
    dfm = pd.DataFrame({"y": y_true, "p": y_prob, "g": group})
    selected = dfm["p"] >= threshold
    is_pos = dfm["y"] == 1
    is_neg = dfm["y"] == 0

    out = []
    for g in dfm["g"].unique():
        in_group = dfm["g"] == g
        out.append({
            "group": g,
            "selection_rate": selected[in_group].mean(),
            # max(..., 1) guards the denominator when a group has no positives/negatives
            "TPR": (selected & is_pos & in_group).sum() / max((is_pos & in_group).sum(), 1),
            "FPR": (selected & is_neg & in_group).sum() / max((is_neg & in_group).sum(), 1),
            "count": int(in_group.sum()),
        })
    rep = pd.DataFrame(out)

    # Gap vs. the global selection rate over all rows. The unweighted mean of
    # the per-group rates would let a tiny group move the baseline as much as
    # a large one.
    base = selected.mean()
    rep["sel_rate_gap"] = rep["selection_rate"] - base
    rep["tpr_gap_to_max"] = rep["TPR"] - rep["TPR"].max()
    rep["fpr_gap_to_min"] = rep["FPR"] - rep["FPR"].min()
    return rep.sort_values("group")

fair = fairness_report(te[TARGET].values, probs, te["gender"].values, threshold=0.5)
print("\nFairness report by gender:\n", fair)

# Optional: tune threshold to reduce TPR disparity across groups (quick demo).
# Start from the default threshold and only move if a candidate is strictly
# better. Thresholds that select everyone or no one are skipped: they make all
# group TPRs equal (all 1 or all 0) and would otherwise always win with a
# meaningless disparity of 0.
best_tau = 0.5
best_disp = fair["TPR"].max() - fair["TPR"].min()
for tau in np.linspace(0.3, 0.7, 21):
    selected_share = (probs >= tau).mean()
    if not 0.0 < selected_share < 1.0:
        continue
    fr = fairness_report(te[TARGET].values, probs, te["gender"].values, threshold=tau)
    disp = fr["TPR"].max() - fr["TPR"].min()
    if disp < best_disp:
        best_disp, best_tau = disp, tau
print(f"\nSuggested threshold for smaller TPR disparity: tau≈{best_tau:.2f} (dispersion={best_disp:.3f})")

# -----------------------------------------
# 4) Time-to-Hire Regression (if hired)
# -----------------------------------------
# Only accepted candidates have a time_to_hire_days value.
hired = df[df["accept"]==1].copy()
REG_TARGET = "time_to_hire_days"
reg_cats = ["role","level","loc","source"]
reg_nums = ["years_exp","skill_count","assessment_z","recruiter_load","req_difficulty","urgency"]

# Reuse the requisition-level holdout from the classifier section so the
# reported error is measured on requisitions the regressor never saw.
# Scoring the model on its own training rows would overstate its accuracy.
hired_tr = hired[hired[GROUP].isin(req_train)]
hired_te = hired[hired[GROUP].isin(req_test)]

pre_reg = ColumnTransformer([
    ("num", StandardScaler(), reg_nums),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), reg_cats)
])
reg_pipe = Pipeline([
    ("pre", pre_reg),
    ("reg", GradientBoostingRegressor(random_state=42))
])
reg_param = {
    "reg__n_estimators": [200, 300],
    "reg__max_depth": [2,3],
    "reg__learning_rate": [0.05, 0.1],
}
# Group the CV folds by req_id, consistent with the classifier search, so
# candidates of one requisition never straddle a train/validation fold.
reg_search = RandomizedSearchCV(
    reg_pipe, reg_param, n_iter=4, cv=GroupKFold(n_splits=5),
    scoring="neg_mean_absolute_error", n_jobs=-1, random_state=42,
)
reg_search.fit(hired_tr[reg_cats+reg_nums], hired_tr[REG_TARGET], groups=hired_tr[GROUP])
reg = reg_search.best_estimator_  # refit on all of hired_tr

pred_tth = reg.predict(hired_te[reg_cats+reg_nums])
mae = mean_absolute_error(hired_te[REG_TARGET], pred_tth)
r2 = r2_score(hired_te[REG_TARGET], pred_tth)
print(f"\nTime-to-Hire  |  MAE={mae:.2f} days  R2={r2:.3f}  (holdout n={len(hired_te)}, train n={len(hired_tr)})")

# -----------------------------------------
# 5) Sourcing Channel ROI / Lift (simple)
# -----------------------------------------
# Per-source counts and rates on the holdout rows. Named aggregation keeps
# `n` an integer and avoids the deprecated groupby.apply-on-grouping-columns path.
channel = (
    te.groupby("source")
    .agg(
        n=("offer", "size"),
        offer_rate=("offer", "mean"),
        accept_rate=("accept", "mean"),
        success_rate=(TARGET, "mean"),
    )
    .reset_index()
)
# Lift = source success rate minus the overall holdout success rate.
channel["success_lift_vs_avg"] = channel["success_rate"] - te[TARGET].mean()
channel = channel.sort_values("success_rate", ascending=False)
print("\nChannel effectiveness (holdout):\n", channel)

# -----------------------------------------
# 6) Funnel Anomaly Detection (per req)
# -----------------------------------------
# Build per-req funnel metrics, then detect anomalies in conversion patterns
# One row per requisition: stage counts plus mean load / difficulty / urgency.
# "screens" and "loops" count candidates with a non-missing hm_score / loop_score.
funnel = df.groupby("req_id").agg(
    applied=("req_id","size"),
    screens=("hm_score", lambda s: s.notna().sum()),
    loops=("loop_score", lambda s: s.notna().sum()),
    offers=("offer","sum"),
    accepts=("accept","sum"),
    successes=(TARGET,"sum"),
    recruiter_load=("recruiter_load","mean"),
    diff=("req_difficulty","mean"),
    urg=("urgency","mean")
).reset_index()
# Stage-to-stage conversion rates; a zero denominator yields a rate of 0.0.
for a,b,name in [("screens","applied","screen_rate"),
                 ("loops","screens","loop_rate"),
                 ("offers","loops","offer_rate"),
                 ("accepts","offers","accept_rate"),
                 ("successes","accepts","success_rate")]:
    funnel[name] = np.where(funnel[b]>0, funnel[a]/funnel[b], 0.0)

iso = IsolationForest(random_state=42, contamination=0.08)
X_iso = funnel[["screen_rate","loop_rate","offer_rate","accept_rate","success_rate","recruiter_load","diff","urg"]]
# IsolationForest has no `fit_score` method: fit first, then score.
# `score_samples` is lower for more abnormal rows, so negate it.
funnel["anomaly_score"] = -iso.fit(X_iso).score_samples(X_iso)  # higher => more anomalous
sus = funnel.sort_values("anomaly_score", ascending=False).head(10)
print("\nPotential funnel anomalies (top 10 reqs):\n", sus[["req_id","screen_rate","loop_rate","offer_rate","accept_rate","success_rate","anomaly_score"]])

# -----------------------------------------
# 7) Export artifacts + model card
# -----------------------------------------
# Persist both fitted models under artifacts_talent/.
os.makedirs("artifacts_talent", exist_ok=True)
clf_path = "artifacts_talent/candidate_success_calibrated.joblib"
reg_path = "artifacts_talent/time_to_hire_regressor.joblib"
for fitted_model, target_path in ((clf, clf_path), (reg, reg_path)):
    joblib.dump(fitted_model, target_path)

# Assemble the model card section by section; key order is kept so the
# serialized JSON layout stays the same.
classification_summary = {
    "cv_best_auc": float(search.best_score_),
    "holdout": {
        "auc": float(auc),
        "ap": float(ap),
        "brier": float(brier),
        "accuracy": float(acc),
        "f1": float(f1),
    },
    "fairness": fair.to_dict(orient="records"),
    "suggested_threshold_for_tpr_balance": float(best_tau),
    "top_features_permutation": imp.head(10).to_dict(orient="records"),
    "model_path": clf_path,
}
regression_summary = {
    "mae_days": float(mae),
    "r2": float(r2),
    "model_path": reg_path,
}
task_section = {
    "candidate_success_classification": classification_summary,
    "time_to_hire_regression": regression_summary,
    "sourcing_channel_effectiveness": channel.to_dict(orient="records"),
    "funnel_anomalies_top10": sus[["req_id","anomaly_score"]].to_dict(orient="records"),
}
data_section = {
    "rows": int(len(df)),
    "requisitions": int(len(reqs)),
    "note": "All data is synthetic; no personal/sensitive real-world data.",
}
pipeline_section = {
    "leakage_avoidance": "GroupKFold by req_id; independent holdout by req_id",
    "preprocessing": {
        "numeric": ["median impute", "StandardScaler"],
        "categorical": ["mode impute", "OneHotEncoder(handle_unknown=ignore)"],
    },
    "calibration": "Platt scaling via CalibratedClassifierCV(cv=3)",
    "hyperparam_search": "RandomizedSearchCV over RandomForest inside calibrated wrapper",
}
card = {
    "task": task_section,
    "data": data_section,
    "pipeline": pipeline_section,
}
with open("artifacts_talent/model_card.json","w") as f:
    json.dump(card, f, indent=2)

# Summarize what was written.
print("\nSaved artifacts:")
for saved_path in (clf_path, reg_path):
    print(" -", saved_path)
print(" - artifacts_talent/model_card.json")

# ----------------------
