In [None]:
# CELL 1: install dependencies (once per session)
!pip install pyarrow fastparquet -q

In [123]:
import os
import pandas as pd

# Input CSV with the raw episode records and the location of the Parquet file
# produced by the feature-building cells below.
EPISODES_CSV_PATH = "/content/2025_HMIS_records_sheltered.csv"
OUTPUT_DIR = "/content/"
OUTPUT_PATH = os.path.join(OUTPUT_DIR, "episode_features.parquet")

# Configuration consumed by transform_episodes():
#   key_map      raw CSV column name -> column name used in the rest of the notebook
#   date_cols    (renamed) columns parsed with pd.to_datetime(errors="coerce")
#   numeric_cols columns parsed with pd.to_numeric(errors="coerce")
#   keep_cols    columns retained in the output frame; names that are not present
#                in the transformed frame are skipped
EPISODES_CONFIG = {
    "key_map": {
        "EnrollmentID_a": "episode_id",
        "hhid_a": "household_id",
        "ProgramID_a": "program_id",
        "ref_client_a": "client_id",
        "hh_entrydt": "hh_entry_date",
        "entrydt": "entry_date",
        "exitdt": "exit_date",
        "hmls_start_dt": "homeless_start_date",
        "lengthstay": "stay_length_days",
        "SPA": "spa",
        "cd": "council_district",
        "sd": "supervisorial_district",
        "ad": "assembly_district",
        "city_name": "city_name_raw",
        "city": "city",
        "CoC": "coc",
        "community": "community_name",
    },
    "date_cols": [
        "hh_entry_date",
        "entry_date",
        "exit_date",
        "homeless_start_date",
    ],
    "numeric_cols": [
        "age_mos",
        "age_yrs",
        "n_hhmem",
        "n_chronic",
        "n_vet",
        "n_und18",
        "n_1824",
        "n_25p",
        "stay_length_days",
        "hmls_past1yr_times",
        "hmls_past3yr_times",
        "hmls_past3yr_months",
        "since_entry_mo",
        "since_hmls_mo",
        "chronHealth_dur",
        "physical_dur",
        "mental_dur",
        "subsAbuse_dur",
        "any_kids" # coerced with pd.to_numeric like the other entries in this list
    ],
    "keep_cols": [
        # keys
        "episode_id",
        "household_id",
        "program_id",
        "client_id",
        # dates
        "hh_entry_date",
        "entry_date",
        "exit_date",
        "homeless_start_date",
        "stay_length_days",
        "since_entry_mo",
        "since_hmls_mo",
        # household
        "n_hhmem",
        "n_chronic",
        "n_vet",
        "hh_shelter_type",
        "n_und18",
        "n_1824",
        "n_25p",
        "any_kids",
        "any_child",
        "hhold_type",
        "youthHH",
        "chronicHH",
        "vetHH",
        # demographics
        "age_mos",
        "age_yrs",
        "age_cat",
        "age_catd",
        "age_catb",
        "ageband",
        # gender / orientation / race
        "gender_simple",
        "sexual_orientation",
        "race_simple",
        "is_hispanic",
        # health
        "has_physical_condition",
        "has_mental_condition",
        "has_chronic_health",
        "has_disability",
        "has_domestic_violence",
        "is_fleeing_violence",
        "uses_alcohol",
        "uses_drugs",
        "has_sud",
        "has_co_occuring_disorder",
        "physical_months",
        "mental_months",
        "sud_months",
        "chronic_health_months",
        # veteran status
        "is_veteran",
        "veteran_era",
        "veteran_discharge_status",
        # homelessness history
        "prior_hmls",
        "priorliving",
        "episodes_last_1y",
        "episodes_last_3y",
        "months_homeless_3y",
        "is_chronic_person",
        "from_street_flag",
        "chronlive",
        "chronicP",
        "chronicP_D",
        "dec2024_sheltered",
        "in_dedup",
        "in_enroll",
        "destination",
        # geography
        "spa",
        "council_district",
        "supervisorial_district",
        "assembly_district",
        "city_name_raw",
        "city",
        "coc",
        "community_name",
    ],
}

In [124]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): every path visible in this notebook lives directly under /content,
# not under /content/drive — confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [131]:
def load_raw(path: str) -> pd.DataFrame:
    """Read the raw episodes CSV at ``path`` into a DataFrame.

    ``low_memory=False`` makes pandas infer each column's dtype from the whole
    file instead of chunk by chunk. The output recorded for the cell that calls
    this function shows a warning pointing at the ``read_csv`` line (presumably
    the mixed-dtype ``DtypeWarning`` — confirm); single-pass inference avoids
    columns whose values end up with chunk-dependent types.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed frame, with pandas' default type inference per column.
    """
    return pd.read_csv(path, low_memory=False)


def apply_basic_mappings(
    df: pd.DataFrame, key_map: dict, date_cols: list, numeric_cols: list
) -> pd.DataFrame:
    """Rename columns and coerce date / numeric columns, leaving the input untouched.

    Args:
        df: Raw frame.
        key_map: ``{old_name: new_name}`` passed to ``DataFrame.rename``; may be
            empty or None.
        date_cols: Column names (after renaming) to parse with
            ``pd.to_datetime(errors="coerce")``; may be None.
        numeric_cols: Column names (after renaming) to parse with
            ``pd.to_numeric(errors="coerce")``; may be None.

    Returns:
        A new frame. Listed columns that are absent are skipped; values that
        cannot be parsed become NaT / NaN.
    """
    # rename() hands back a new frame; copy explicitly when there is nothing to
    # rename so the assignments below never write into the caller's frame.
    df = df.rename(columns=key_map) if key_map else df.copy()

    for c in date_cols or []:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors="coerce")

    for c in numeric_cols or []:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    return df


def add_health_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add boolean health flags and condition-duration columns to ``df`` in place.

    A flag is True where its raw source column equals 1. When a source column is
    absent the comparison collapses to a plain False, which is broadcast to every
    row. Duration columns are copies of the raw ``*_dur`` columns (None when the
    raw column is absent).

    NOTE(review): the sample outputs recorded in this notebook show these flags
    as all-zero while other raw columns carry labels such as "1.Yes" — confirm
    the raw flag columns really hold the number 1.

    Returns:
        The same frame object, with the new columns appended.
    """

    def equals_one(column: str):
        # df.get falls back to the scalar 0 for a missing column -> False
        return df.get(column, 0) == 1

    # target flag -> raw columns; the flag is the OR of "raw column == 1"
    flag_sources = (
        ("has_physical_condition", ("physical_yn",)),
        ("has_mental_condition", ("mental_yn",)),
        ("has_chronic_health", ("chronHealth",)),
        ("has_disability", ("rawDisabled", "develop")),
        ("has_domestic_violence", ("domViol",)),
        ("is_fleeing_violence", ("flee_viol", "flee_dv_hmls")),
        ("uses_alcohol", ("alcohol_yn",)),
        ("uses_drugs", ("drugs_yn",)),
    )
    for target, sources in flag_sources:
        hit = equals_one(sources[0])
        for extra in sources[1:]:
            hit = hit | equals_one(extra)
        df[target] = hit

    # Derived from the two substance-use flags created just above.
    df["has_sud"] = df["uses_alcohol"] | df["uses_drugs"]
    df["has_co_occuring_disorder"] = equals_one("chronCond") | equals_one("codisorder_LD")

    duration_sources = (
        ("physical_months", "physical_dur"),
        ("mental_months", "mental_dur"),
        ("sud_months", "subsAbuse_dur"),
        ("chronic_health_months", "chronHealth_dur"),
    )
    for target, source in duration_sources:
        df[target] = df.get(source)

    return df


def add_veteran_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``is_veteran``, ``veteran_era`` and ``veteran_discharge_status`` in place.

    ``veteran_era`` is the label of the first era column (in the order listed
    below) whose value equals 1, or the string "None" when no era column matches
    or none is present. The lookup is done column-wise instead of with a Python
    callback per row.

    Returns:
        The same frame object, with the new columns appended.
    """
    df["is_veteran"] = df.get("veteran", 0) == 1

    # Priority order: when several era flags are set on a row, the first one wins.
    eras = [
        ("WW2", "vet_1941_1945"),
        ("Korea", "vet_1950_1953"),
        ("Vietnam", "vet_1965_1973"),
        ("GulfWar", "vet_1991"),
        ("Afghanistan", "vet_Afghanistan"),
        ("IraqFreedom", "vet_IraqFreedom"),
        ("IraqDawn", "vet_IraqDawn"),
        ("Post2001", "vet_2001_2014"),
        ("Other", "vet_other"),
    ]

    era_labels = pd.Series("None", index=df.index)
    # Apply the lowest priority first so higher-priority eras overwrite it.
    for label, col in reversed(eras):
        if col in df.columns:
            era_labels = era_labels.mask(df[col] == 1, label)

    df["veteran_era"] = era_labels
    df["veteran_discharge_status"] = df.get("vetdischarge")

    return df


def add_gender_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add a single ``gender_simple`` label per row, in place.

    The label is taken from the first of ``hd_man``, ``hd_woman``, ``hd_trans``,
    ``hd_non_bin`` (in that order) whose value equals 1; rows with no match, or
    frames without those columns, get "other". The lookup is done column-wise
    instead of with a Python callback per row.

    NOTE(review): the sample outputs recorded in this notebook show only "other"
    — confirm the hd_* columns exist in the raw data and hold the number 1.

    Returns:
        The same frame object, with ``gender_simple`` appended.
    """
    # Priority order: the first matching column wins.
    rules = [
        ("man", "hd_man"),
        ("woman", "hd_woman"),
        ("trans", "hd_trans"),
        ("nonbinary", "hd_non_bin"),
    ]

    gender = pd.Series("other", index=df.index)
    # Apply the lowest priority first so higher-priority labels overwrite it.
    for label, col in reversed(rules):
        if col in df.columns:
            gender = gender.mask(df[col] == 1, label)

    df["gender_simple"] = gender
    return df


def add_sexual_orientation_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add a single ``sexual_orientation`` label per row, in place.

    The label comes from the first of ``straight``, ``gay``, ``bisexual`` (in
    that order) whose value equals 1; anything else becomes "other". The lookup
    is done column-wise instead of with a Python callback per row.

    Returns:
        The same frame object, with ``sexual_orientation`` appended.
    """
    # Priority order: the first matching column wins.
    rules = [
        ("straight", "straight"),
        ("gay_lesbian", "gay"),
        ("bisexual", "bisexual"),
    ]

    orientation = pd.Series("other", index=df.index)
    # Apply the lowest priority first so higher-priority labels overwrite it.
    for label, col in reversed(rules):
        if col in df.columns:
            orientation = orientation.mask(df[col] == 1, label)

    df["sexual_orientation"] = orientation
    return df


def add_race_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``is_hispanic`` and a single ``race_simple`` label per row, in place.

    ``race_simple`` is decided by the first rule that matches, in this order:
    white (and not hispanic), black, asian, native (``ai_an``), pacific
    (``nh_pi``), multiracial (``NH_multirace`` or ``multirace``); otherwise
    "other". A rule matches where the raw column equals 1; absent columns never
    match. The lookup is done column-wise instead of with a Python callback per row.

    Returns:
        The same frame object, with the two columns appended.
    """

    def flag(col: str) -> pd.Series:
        # Boolean Series aligned to df; all False when the column is absent.
        if col in df.columns:
            return df[col] == 1
        return pd.Series(False, index=df.index)

    hispanic = flag("hispanic")
    df["is_hispanic"] = hispanic

    # Priority order: the first matching rule wins.
    rules = [
        ("white", flag("white") & ~hispanic),
        ("black", flag("black")),
        ("asian", flag("asian")),
        ("native", flag("ai_an")),
        ("pacific", flag("nh_pi")),
        ("multiracial", flag("NH_multirace") | flag("multirace")),
    ]

    race = pd.Series("other", index=df.index)
    # Apply the lowest priority first so higher-priority labels overwrite it.
    for label, hit in reversed(rules):
        race = race.mask(hit, label)

    df["race_simple"] = race
    return df


def add_homeless_history_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add homelessness-history counts and flags to ``df`` in place.

    ``episodes_last_1y``, ``episodes_last_3y`` and ``months_homeless_3y`` are the
    numeric versions of ``hmls_past1yr_times``, ``hmls_past3yr_times`` and
    ``hmls_past3yr_months`` with unparseable or missing values replaced by 0.
    When a source column is absent the target column is filled with 0.0.

    Returns:
        The same frame object, with the new columns appended.
    """

    def numeric_or_zero(col: str):
        # pd.to_numeric returns a bare scalar for scalar input, which has no
        # .fillna; handle the missing-column case before converting.
        if col not in df.columns:
            return 0.0
        return pd.to_numeric(df[col], errors="coerce").fillna(0)

    df["episodes_last_1y"] = numeric_or_zero("hmls_past1yr_times")
    df["episodes_last_3y"] = numeric_or_zero("hmls_past3yr_times")
    df["months_homeless_3y"] = numeric_or_zero("hmls_past3yr_months")
    df["is_chronic_person"] = (df.get("chronicP", 0) == 1) | (
        df.get("chronCond", 0) == 1
    )
    df["from_street_flag"] = df.get("from_street", 0) == 1
    return df


def transform_episodes(df: pd.DataFrame, cfg: dict) -> pd.DataFrame:
    """Run the full episode feature pipeline described by ``cfg``.

    Steps, in order: rename / coerce columns (``key_map``, ``date_cols``,
    ``numeric_cols``), then add health, veteran, gender, sexual-orientation,
    race and homelessness-history features, and finally restrict the frame to
    the ``keep_cols`` entries that actually exist. When no ``keep_cols`` entry
    exists the frame is returned with all of its columns.
    """
    df = apply_basic_mappings(
        df,
        cfg.get("key_map", {}),
        cfg.get("date_cols", []),
        cfg.get("numeric_cols", []),
    )

    feature_steps = (
        add_health_features,
        add_veteran_features,
        add_gender_features,
        add_sexual_orientation_features,
        add_race_features,
        add_homeless_history_features,
    )
    for step in feature_steps:
        df = step(df)

    # Keep only the configured columns that are present after the feature steps.
    present = [name for name in cfg.get("keep_cols", []) if name in df.columns]
    return df[present] if present else df


def save_parquet(df: pd.DataFrame, path: str) -> None:
    """Write ``df`` to ``path`` as Parquet (without the index).

    The parent directory is created when ``path`` has one. A bare filename such
    as ``"out.parquet"`` has an empty dirname, and ``os.makedirs("")`` raises, so
    directory creation is skipped in that case.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_parquet(path, index=False)


In [132]:
# End-to-end feature build: read the raw CSV, derive the episode features,
# persist them as Parquet, then show the resulting shape and a preview.
df_raw = load_raw(EPISODES_CSV_PATH)
df_feat = transform_episodes(df_raw, EPISODES_CONFIG)
save_parquet(df_feat, OUTPUT_PATH)
print(df_feat.shape)
df_feat.head()

  return pd.read_csv(path)


(17847, 73)


Unnamed: 0,episode_id,household_id,program_id,client_id,hh_entry_date,entry_date,exit_date,homeless_start_date,stay_length_days,since_entry_mo,...,in_enroll,destination,spa,council_district,supervisorial_district,assembly_district,city_name_raw,city,coc,community_name
0,e81,H10,P206,r13045,2022-02-18,2022-02-18,NaT,NaT,,35,...,1.Yes,"2. Homeless: Emergency shelter, including hote...",San Fernando Valley,6,3,43,Los Angeles city,1.LA City,1,
1,e1409,H1000,P417,r8499,2024-12-11,2024-12-11,NaT,NaT,,1,...,1.Yes,.N - NULL,Metro (excl. Skid Row and Hollywood),14,1,57,Los Angeles city,1.LA City,1,
2,e12155,H10000,P176,r20595,2024-07-24,2024-07-24,NaT,NaT,,6,...,1.Yes,.N - NULL,South,9,2,57,Los Angeles city,1.LA City,1,
3,e12156,H10001,P176,r20698,2024-07-22,2024-07-22,NaT,NaT,,6,...,1.Yes,4. Temporary & Permanent: Transitional housing...,South,9,2,57,Los Angeles city,1.LA City,1,
4,e12159,H10002,P453,r8306,2024-07-30,2024-07-30,NaT,NaT,,6,...,1.Yes,"2. Homeless: Emergency shelter, including hote...",San Fernando Valley,6,3,43,Los Angeles city,1.LA City,1,


In [133]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors

# Parquet file read back by the next cell; this is the same location that
# OUTPUT_PATH (OUTPUT_DIR + "episode_features.parquet") resolves to.
DATA_PATH = "/content/episode_features.parquet"

In [134]:
# Load the engineered episode features from DATA_PATH and preview them.
df = pd.read_parquet(DATA_PATH)
print(df.shape)
df.head()


(17847, 73)


Unnamed: 0,episode_id,household_id,program_id,client_id,hh_entry_date,entry_date,exit_date,homeless_start_date,stay_length_days,since_entry_mo,...,in_enroll,destination,spa,council_district,supervisorial_district,assembly_district,city_name_raw,city,coc,community_name
0,e81,H10,P206,r13045,2022-02-18,2022-02-18,NaT,NaT,,35,...,1.Yes,"2. Homeless: Emergency shelter, including hote...",San Fernando Valley,6,3,43,Los Angeles city,1.LA City,1,
1,e1409,H1000,P417,r8499,2024-12-11,2024-12-11,NaT,NaT,,1,...,1.Yes,.N - NULL,Metro (excl. Skid Row and Hollywood),14,1,57,Los Angeles city,1.LA City,1,
2,e12155,H10000,P176,r20595,2024-07-24,2024-07-24,NaT,NaT,,6,...,1.Yes,.N - NULL,South,9,2,57,Los Angeles city,1.LA City,1,
3,e12156,H10001,P176,r20698,2024-07-22,2024-07-22,NaT,NaT,,6,...,1.Yes,4. Temporary & Permanent: Transitional housing...,South,9,2,57,Los Angeles city,1.LA City,1,
4,e12159,H10002,P453,r8306,2024-07-30,2024-07-30,NaT,NaT,,6,...,1.Yes,"2. Homeless: Emergency shelter, including hote...",San Fernando Valley,6,3,43,Los Angeles city,1.LA City,1,


In [135]:
# Column that identifies a candidate row in the recommender outputs.
candidate_id_col = "episode_id"

# Numeric features fed to the scaler.
feature_cols_num = [
    "age_yrs",
    "stay_length_days",
    "episodes_last_1y",
    "episodes_last_3y",
    "months_homeless_3y",
    "year_round_beds",
    "total_beds",
    "utilization_rate",
]

# Boolean flags; they are cast to float and scaled together with the numeric features.
feature_cols_bool = [
    "has_physical_condition",
    "has_mental_condition",
    "has_chronic_health",
    "has_disability",
    "has_domestic_violence",
    "is_fleeing_violence",
    "uses_alcohol",
    "uses_drugs",
    "has_sud",
    "has_co_occuring_disorder",
    "is_veteran",
    "is_chronic_person",
    "from_street_flag",
]

# Categorical features that are one-hot encoded.
feature_cols_cat = [
    "gender_simple",
    "race_simple",
    "sexual_orientation",
    "spa",
    "city",
    "coc",
    "project_type",
    "housing_type",
    "inventory_type",
    "target_population",
]

# Keep only the columns that actually exist in df. In the recorded output below,
# the bed/utilization columns and project_type, housing_type, inventory_type and
# target_population are absent, leaving 24 features.
feature_cols_num   = [c for c in feature_cols_num   if c in df.columns]
feature_cols_bool  = [c for c in feature_cols_bool  if c in df.columns]
feature_cols_cat   = [c for c in feature_cols_cat   if c in df.columns]

# Order matters: numeric, then boolean, then categorical.
feature_cols_all = feature_cols_num + feature_cols_bool + feature_cols_cat

len(feature_cols_all), feature_cols_all


(24,
 ['age_yrs',
  'stay_length_days',
  'episodes_last_1y',
  'episodes_last_3y',
  'months_homeless_3y',
  'has_physical_condition',
  'has_mental_condition',
  'has_chronic_health',
  'has_disability',
  'has_domestic_violence',
  'is_fleeing_violence',
  'uses_alcohol',
  'uses_drugs',
  'has_sud',
  'has_co_occuring_disorder',
  'is_veteran',
  'is_chronic_person',
  'from_street_flag',
  'gender_simple',
  'race_simple',
  'sexual_orientation',
  'spa',
  'city',
  'coc'])

In [136]:

# Cast the boolean flags to float (in place on df) so they can go through the
# numeric scaler together with the numeric features.
for c in feature_cols_bool:
    df[c] = df[c].astype(float)

# Module-level lists read by build_employer_vector().
# NOTE(review): a later cell in this notebook rebinds `numeric_features` and
# `categorical_features`; code that reads these globals sees the latest binding.
numeric_features = feature_cols_num + feature_cols_bool
categorical_features = feature_cols_cat

numeric_transformer = StandardScaler()
# handle_unknown="ignore": categories not seen during fit do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Standardize numeric/boolean columns, one-hot encode categorical ones, and
# drop every column that is not listed.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)


In [137]:

# Candidate table: the id column plus every feature column, copied from df.
df_candidates = df[[candidate_id_col] + feature_cols_all].copy()

# Treat +/-inf as missing.
df_candidates = df_candidates.replace([np.inf, -np.inf], np.nan)

X_raw = df_candidates[feature_cols_all]
# NOTE(review): fillna(0) is applied to every feature column, including the
# categorical ones, so a missing categorical value becomes the integer 0 —
# confirm this is intended for the string-valued categorical columns.
X_raw = X_raw.fillna(0)

preprocessor.fit(X_raw)

# Encoded feature matrix, one row per candidate, in df_candidates row order.
X_emb = preprocessor.transform(X_raw)

X_emb.shape


(17847, 32)

In [138]:

# Brute-force nearest-neighbour index over the encoded candidates using cosine
# distance; 20 is the default neighbour count when a query does not pass one.
nn_model = NearestNeighbors(
    n_neighbors=20,
    metric="cosine",
    algorithm="brute",
)
nn_model.fit(X_emb)

# Candidate ids in the same row order as X_emb / df_candidates.
candidate_ids = df_candidates[candidate_id_col].values


In [139]:
# CELL 7: helper that builds an employer's vector from a dict of requirements

def build_employer_vector(requirements: dict) -> np.ndarray:
    """Encode one employer-requirements dict with the fitted ``preprocessor``.

    Every column in ``feature_cols_all`` is filled from ``requirements`` when
    present. Unspecified categorical columns get the placeholder "missing";
    every other unspecified column gets the raw value 0.0 (before scaling).

    Args:
        requirements: ``{feature_name: value}``; keys outside
            ``feature_cols_all`` are ignored.

    Returns:
        The output of ``preprocessor.transform`` for the single query row.
    """
    # Decide the placeholder from the categorical list rather than from the
    # global `numeric_features`: a later cell in this notebook rebinds
    # `numeric_features` to a list without the boolean flags, after which those
    # flags would receive "missing" here and fail in the numeric scaler.
    categorical_cols = set(feature_cols_cat)

    row = {}
    for col in feature_cols_all:
        if col in requirements:
            row[col] = requirements[col]
        elif col in categorical_cols:
            row[col] = "missing"
        else:
            row[col] = 0.0

    df_emp = pd.DataFrame([row])
    # Same cleaning as the candidate table: inf -> NaN -> 0.
    df_emp = df_emp.replace([np.inf, -np.inf], np.nan).fillna(0)

    return preprocessor.transform(df_emp[feature_cols_all])


In [143]:
# CELL 8: main recommendation function

def recommend_employees(
    requirements: dict,
    top_k: int = 10,
) -> pd.DataFrame:
    """Return the ``top_k`` candidates nearest to an employer-requirements vector.

    The query is encoded with ``build_employer_vector`` and looked up in
    ``nn_model``. The result is the matching ``df_candidates`` rows plus a
    ``match_score`` column (``1 - cosine distance``), sorted best first.
    """
    query_vector = build_employer_vector(requirements)
    neighbor_dist, neighbor_pos = nn_model.kneighbors(query_vector, n_neighbors=top_k)

    # Single query row -> take the first (only) row of each result array.
    ranked = df_candidates.iloc[neighbor_pos[0]].copy()
    ranked["match_score"] = 1.0 - neighbor_dist[0]

    return ranked.sort_values("match_score", ascending=False)


In [142]:
# Example query.
# NOTE(review): the outputs recorded in this notebook show `spa` values such as
# "San Fernando Valley" and `city` values such as "1.LA City"; "4" and
# "LOS ANGELES" do not appear there. With handle_unknown="ignore" an unseen
# label does not raise, so these two entries may have no effect — confirm the
# intended labels.
employer_requirements = {
    "spa": "4",             # Service Planning Area
    "city": "LOS ANGELES",
    "is_veteran": 1,
    "has_sud": 0,
    "age_yrs": 30,
}

recommendations = recommend_employees(employer_requirements, top_k=15)
recommendations.head(15)


Unnamed: 0,episode_id,age_yrs,stay_length_days,episodes_last_1y,episodes_last_3y,months_homeless_3y,has_physical_condition,has_mental_condition,has_chronic_health,has_disability,...,is_veteran,is_chronic_person,from_street_flag,gender_simple,race_simple,sexual_orientation,spa,city,coc,match_score
12990,e7523,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Antelope Valley,2.LA County,1,0.562368
13101,e7634,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Antelope Valley,2.LA County,1,0.562368
9217,e3799,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Antelope Valley,2.LA County,1,0.562368
3859,e16569,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Metro (excl. Skid Row and Hollywood),1.LA City,1,0.562368
17688,e12003,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Antelope Valley,2.LA County,1,0.562368
11692,e6172,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,South,1.LA City,1,0.562368
17681,e11997,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Antelope Valley,2.LA County,1,0.562368
162,e12319,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Antelope Valley,2.LA County,1,0.562368
11854,e2561,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Antelope Valley,2.LA County,1,0.562368
11853,e2558,15.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,Antelope Valley,2.LA County,1,0.562368


In [144]:
import joblib
import numpy as np
import os

# Persist the fitted preprocessor, the neighbour index, the candidate ids and
# the feature-name lists so they can be reloaded without refitting.
ARTIFACTS_DIR = "/content/model_artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

PREPROCESSOR_PATH = os.path.join(ARTIFACTS_DIR, "preprocessor.pkl")
KNN_PATH          = os.path.join(ARTIFACTS_DIR, "nn_employees_knn.pkl")
CAND_IDS_PATH     = os.path.join(ARTIFACTS_DIR, "candidate_ids.npy")
FEATURES_META_PATH= os.path.join(ARTIFACTS_DIR, "features_meta.npz")

joblib.dump(preprocessor, PREPROCESSOR_PATH)
joblib.dump(nn_model, KNN_PATH)
np.save(CAND_IDS_PATH, candidate_ids)
# The name lists are stored as object arrays, so reading them back needs allow_pickle=True.
np.savez(
    FEATURES_META_PATH,
    feature_cols_all=np.array(feature_cols_all, dtype=object),
    numeric_features=np.array(numeric_features, dtype=object),
    categorical_features=np.array(categorical_features, dtype=object),
)

PREPROCESSOR_PATH, KNN_PATH, CAND_IDS_PATH, FEATURES_META_PATH

('/content/model_artifacts/preprocessor.pkl',
 '/content/model_artifacts/nn_employees_knn.pkl',
 '/content/model_artifacts/candidate_ids.npy',
 '/content/model_artifacts/features_meta.npz')

In [145]:
import joblib
import numpy as np

def load_artifacts(artifacts_dir: str = "/content/model_artifacts"):
    """Reload the saved recommender artifacts from ``artifacts_dir``.

    Reads ``preprocessor.pkl``, ``nn_employees_knn.pkl``, ``candidate_ids.npy``
    and ``features_meta.npz``.

    Returns:
        ``(preprocessor, knn_model, candidate_ids, feature_cols_all,
        numeric_features, categorical_features)``; the last three are lists.
    """
    # SECURITY: joblib.load and np.load(allow_pickle=True) unpickle arbitrary
    # objects. Only point this at artifact directories you produced yourself.
    preproc = joblib.load(os.path.join(artifacts_dir, "preprocessor.pkl"))
    knn = joblib.load(os.path.join(artifacts_dir, "nn_employees_knn.pkl"))
    cand_ids = np.load(os.path.join(artifacts_dir, "candidate_ids.npy"), allow_pickle=True)

    # np.load on an .npz returns a lazily-read archive that keeps the file open;
    # materialize the arrays inside a with-block so the handle is closed.
    meta_path = os.path.join(artifacts_dir, "features_meta.npz")
    with np.load(meta_path, allow_pickle=True) as meta:
        feature_cols_all = list(meta["feature_cols_all"])
        numeric_features = list(meta["numeric_features"])
        categorical_features = list(meta["categorical_features"])

    return preproc, knn, cand_ids, feature_cols_all, numeric_features, categorical_features

# Reload the artifacts from the default directory into *_loaded names.
# NOTE(review): none of the *_loaded objects is referenced again in the visible
# part of this notebook — confirm whether a later cell uses them.
preprocessor_loaded, nn_model_loaded, candidate_ids_loaded, feature_cols_all_loaded, numeric_features_loaded, categorical_features_loaded = load_artifacts()


In [146]:
def recommend_like_candidate(
    candidate_id,
    top_k: int = 10,
) -> pd.DataFrame:
    """Return up to ``top_k`` candidates most similar to an existing candidate.

    The first row whose id equals ``candidate_id`` is used as the query. That
    row itself is removed from the neighbours before truncating to ``top_k``.

    Raises:
        ValueError: if ``candidate_id`` is not among ``candidate_ids``.
    """
    id_lookup = np.array(candidate_ids)
    hits = np.where(id_lookup == candidate_id)[0]
    if hits.size == 0:
        raise ValueError(f"candidate_id {candidate_id} not found")
    anchor = hits[0]

    # Ask for one extra neighbour because the anchor is normally its own nearest match.
    neighbor_dist, neighbor_pos = nn_model.kneighbors(
        X_emb[anchor : anchor + 1], n_neighbors=top_k + 1
    )

    not_anchor = neighbor_pos[0] != anchor
    other_pos = neighbor_pos[0][not_anchor][:top_k]
    other_dist = neighbor_dist[0][not_anchor][:top_k]

    similar = df_candidates.iloc[other_pos].copy()
    similar["match_score"] = 1.0 - other_dist

    return similar.sort_values("match_score", ascending=False)


In [147]:
def recommend_employees_filtered(
    requirements: dict,
    top_k: int = 50,
    filter_spa: str | None = None,
    filter_city: str | None = None,
    filter_coc: str | None = None,
) -> pd.DataFrame:
    """Recommend candidates for ``requirements`` and optionally filter them.

    The ``top_k`` nearest candidates are retrieved first; the optional
    ``spa`` / ``city`` / ``coc`` filters are applied afterwards, so fewer than
    ``top_k`` rows (possibly none) may be returned. ``city`` is compared
    case-insensitively, ``spa`` and ``coc`` as exact strings.

    Returns:
        The matching ``df_candidates`` rows, extended with the remaining columns
        of ``df`` and a ``match_score`` column, sorted best first.
    """
    X_emp = build_employer_vector(requirements)
    distances, indices = nn_model.kneighbors(X_emp, n_neighbors=top_k)

    idxs = indices[0]
    sims = 1.0 - distances[0]

    results = df_candidates.iloc[idxs].copy()

    # Add the columns of `df` that the candidate table does not carry, matched
    # by row label. Merging the two frames on "episode_id" would instead suffix
    # every shared column (spa_x / spa_y, ...), leaving no plain "spa", "city"
    # or "coc" column for the filters below to act on.
    # Assumes df and df_candidates share the same unique row index — TODO confirm.
    extra_cols = [c for c in df.columns if c not in results.columns]
    if extra_cols:
        results = results.join(df[extra_cols])

    results["match_score"] = sims

    if filter_spa is not None and "spa" in results.columns:
        results = results[results["spa"].astype(str) == str(filter_spa)]

    if filter_city is not None and "city" in results.columns:
        results = results[results["city"].astype(str).str.upper() == str(filter_city).upper()]

    if filter_coc is not None and "coc" in results.columns:
        results = results[results["coc"].astype(str) == str(filter_coc)]

    results = results.sort_values("match_score", ascending=False)

    return results


In [148]:
def match_employees(
    mode: str,
    *,
    candidate_id: str | int | None = None,
    requirements: dict | None = None,
    top_k: int = 10,
    filter_spa: str | None = None,
    filter_city: str | None = None,
    filter_coc: str | None = None,
) -> pd.DataFrame:
    if mode == "like_candidate":
        if candidate_id is None:
            raise ValueError("candidate_id is required when mode='like_candidate'")
        return recommend_like_candidate(candidate_id, top_k=top_k)

    if mode == "by_requirements":
        if requirements is None:
            raise ValueError("requirements is required when mode='by_requirements'")
        return recommend_employees_filtered(
            requirements=requirements,
            top_k=top_k,
            filter_spa=filter_spa,
            filter_city=filter_city,
            filter_coc=filter_coc,
        )

    raise ValueError(f"Unknown mode: {mode}")


In [149]:
# CELL: Wrap recommender logic into a reusable class

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


class EmployeeRecommender:
    """
    Simple employee recommender:
    - Uses tabular features with a ColumnTransformer (num + cat).
    - Uses NearestNeighbors with cosine distance.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        candidate_id_col: str,
        numeric_features: list,
        bool_features: list,
        categorical_features: list,
        n_neighbors_default: int = 20,
    ):
        self.df = df.copy()
        self.candidate_id_col = candidate_id_col

        # ensure boolean features are numeric (0/1)
        for c in bool_features:
            if c in self.df.columns:
                self.df[c] = self.df[c].astype(float)

        self.numeric_features = [c for c in numeric_features if c in self.df.columns]
        self.bool_features = [c for c in bool_features if c in self.df.columns]
        self.categorical_features = [
            c for c in categorical_features if c in self.df.columns
        ]

        self.feature_cols_all = (
            self.numeric_features + self.bool_features + self.categorical_features
        )

        self.numeric_transformer = StandardScaler()
        self.categorical_transformer = OneHotEncoder(
            handle_unknown="ignore", sparse_output=False
        )

        self.preprocessor = ColumnTransformer(
            transformers=[
                ("num", self.numeric_transformer, self.numeric_features + self.bool_features),
                ("cat", self.categorical_transformer, self.categorical_features),
            ],
            remainder="drop",
        )

        df_feats = self.df[[self.candidate_id_col] + self.feature_cols_all].copy()
        df_feats = df_feats.replace([np.inf, -np.inf], np.nan)
        X_raw = df_feats[self.feature_cols_all].fillna(0)

        self.preprocessor.fit(X_raw)
        self.X_emb = self.preprocessor.transform(X_raw)
        self.candidate_ids = df_feats[self.candidate_id_col].values
        self.df_candidates = df_feats

        self.nn_model = NearestNeighbors(
            n_neighbors=n_neighbors_default,
            metric="cosine",
            algorithm="brute",
        )
        self.nn_model.fit(self.X_emb)

    def _build_requirements_vector(self, requirements: dict) -> np.ndarray:
        """
        Build a feature vector for an employer from a dict of requirements.
        Keys must match self.feature_cols_all.
        """
        row = {}
        for col in self.feature_cols_all:
            if col in requirements:
                row[col] = requirements[col]
            else:
                if col in (self.numeric_features + self.bool_features):
                    row[col] = 0.0
                else:
                    row[col] = "missing"

        df_req = pd.DataFrame([row])
        df_req = df_req.replace([np.inf, -np.inf], np.nan).fillna(0)
        X = self.preprocessor.transform(df_req[self.feature_cols_all])
        return X

    def recommend_by_requirements(
        self,
        requirements: dict,
        top_k: int = 10,
        filters: dict | None = None,
    ) -> pd.DataFrame:
        """
        Recommend candidates based on employer requirements.
        filters: optional dict like {"spa": "4", "city": "LOS ANGELES"} applied after scoring.
        """
        X_req = self._build_requirements_vector(requirements)
        distances, indices = self.nn_model.kneighbors(X_req, n_neighbors=top_k * 3)

        dists = distances[0]
        idxs = indices[0]
        sims = 1.0 - dists

        results = self.df_candidates.iloc[idxs].copy()
        results["match_score"] = sims

        if filters:
            for col, value in filters.items():
                if col in results.columns:
                    results = results[
                        results[col].astype(str).str.upper()
                        == str(value).upper()
                    ]

        results = results.sort_values("match_score", ascending=False)
        return results.head(top_k)

    def recommend_like_candidate(
        self,
        candidate_id,
        top_k: int = 10,
    ) -> pd.DataFrame:
        """
        Recommend candidates similar to a given candidate (prototype search).
        """
        cand_ids_arr = np.array(self.candidate_ids)
        matches = np.where(cand_ids_arr == candidate_id)[0]
        if len(matches) == 0:
            raise ValueError(f"candidate_id {candidate_id} not found")

        idx = matches[0]
        X_proto = self.X_emb[idx : idx + 1]

        distances, indices = self.nn_model.kneighbors(X_proto, n_neighbors=top_k + 1)
        dists = distances[0]
        idxs = indices[0]

        # remove self
        mask_not_self = idxs != idx
        idxs = idxs[mask_not_self][:top_k]
        dists = dists[mask_not_self][:top_k]

        sims = 1.0 - dists
        results = self.df_candidates.iloc[idxs].copy()
        results["match_score"] = sims
        results = results.sort_values("match_score", ascending=False)

        return results

    def explain_similarity_pair(self, idx_a: int, idx_b: int) -> dict:
        """
        Simple similarity explanation between two candidates (by row index, not id).
        Returns cosine similarity and raw distance.
        """
        va = self.X_emb[idx_a : idx_a + 1]
        vb = self.X_emb[idx_b : idx_b + 1]

        # cosine similarity manually
        dot = float(np.dot(va, vb.T))
        norm_a = float(np.linalg.norm(va))
        norm_b = float(np.linalg.norm(vb))
        if norm_a == 0 or norm_b == 0:
            sim = 0.0
        else:
            sim = dot / (norm_a * norm_b)

        return {
            "idx_a": int(idx_a),
            "idx_b": int(idx_b),
            "candidate_id_a": self.candidate_ids[idx_a],
            "candidate_id_b": self.candidate_ids[idx_b],
            "cosine_similarity": sim,
            "cosine_distance": 1.0 - sim,
        }


In [152]:
# CELL: instantiate the recommender

# NOTE(review): the original note here said the input must be `df_join`
# (episode_program_features: episodes + HIC joined by program_id), loadable with
#   df_join = pd.read_parquet("/content/features/episode_program_features.parquet")
# but the code below passes `df`. The bed/utilization and project columns listed
# below are removed by the "filter only existing columns" step whenever `df`
# lacks them — confirm which frame is intended.
#
# NOTE(review): this cell rebinds the module-level names `numeric_features` and
# `categorical_features`; here `numeric_features` no longer includes the boolean
# flags, which matters to any earlier function that reads that global.

numeric_features = [
    "age_yrs",
    "stay_length_days",
    "episodes_last_1y",
    "episodes_last_3y",
    "months_homeless_3y",
    "year_round_beds",
    "total_beds",
    "utilization_rate",
]

bool_features = [
    "has_physical_condition",
    "has_mental_condition",
    "has_chronic_health",
    "has_disability",
    "has_domestic_violence",
    "is_fleeing_violence",
    "uses_alcohol",
    "uses_drugs",
    "has_sud",
    "has_co_occuring_disorder",
    "is_veteran",
    "is_chronic_person",
    "from_street_flag",
]

categorical_features = [
    "gender_simple",
    "race_simple",
    "sexual_orientation",
    "spa",
    "city",
    "coc",
    "project_type",
    "housing_type",
    "inventory_type",
    "target_population",
]

# filter only existing columns
numeric_features = [c for c in numeric_features if c in df.columns]
bool_features = [c for c in bool_features if c in df.columns]
categorical_features = [c for c in categorical_features if c in df.columns]

# Fits the preprocessor and the neighbour index on a copy of df.
recommender = EmployeeRecommender(
    df=df,
    candidate_id_col="episode_id",  # or "client_id"
    numeric_features=numeric_features,
    bool_features=bool_features,
    categorical_features=categorical_features,
    n_neighbors_default=50,
)

In [153]:
# CELL: example usage - recommend by requirements
#
# NOTE(review): the output recorded for this cell is an empty table. The other
# outputs in this notebook show `spa` values such as "San Fernando Valley" and
# `city` values such as "1.LA City", so the filter values "4" and "LOS ANGELES"
# match no row — confirm the intended labels.

example_requirements = {
    "spa": "4",
    "city": "LOS ANGELES",
    "is_veteran": 1,
    "has_sud": 0,
    "age_yrs": 30,
}

# Filters are applied after the nearest-neighbour lookup, on the retrieved rows only.
recs = recommender.recommend_by_requirements(
    requirements=example_requirements,
    top_k=15,
    filters={"spa": "4", "city": "LOS ANGELES"},
)

recs.head(15)


Unnamed: 0,episode_id,age_yrs,stay_length_days,episodes_last_1y,episodes_last_3y,months_homeless_3y,has_physical_condition,has_mental_condition,has_chronic_health,has_disability,...,is_veteran,is_chronic_person,from_street_flag,gender_simple,race_simple,sexual_orientation,spa,city,coc,match_score


In [154]:
# CELL: example usage - recommend like an existing candidate

# Use the first candidate in the recommender's table as the prototype.
example_candidate_id = recommender.candidate_ids[0]

similar_to_example = recommender.recommend_like_candidate(
    candidate_id=example_candidate_id,
    top_k=10,
)

similar_to_example


Unnamed: 0,episode_id,age_yrs,stay_length_days,episodes_last_1y,episodes_last_3y,months_homeless_3y,has_physical_condition,has_mental_condition,has_chronic_health,has_disability,...,is_veteran,is_chronic_person,from_street_flag,gender_simple,race_simple,sexual_orientation,spa,city,coc,match_score
12122,e6679,47.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.999792
9541,e4143,48.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.999176
2044,e14112,49.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.99816
1917,e13984,43.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.998088
15583,e10005,46.0,,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.998082
7255,e21055,44.0,,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.997149
16404,e10773,44.0,,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.997149
1838,e13920,48.0,,1.0,1.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.99705
16964,e11273,48.0,,1.0,1.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.99705
5460,e18722,50.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.996755


In [155]:
# CELL: example usage - inspect similarity explanation between two indices
# idx_a / idx_b are row positions in the recommender's candidate table, not candidate ids.

sim_info = recommender.explain_similarity_pair(idx_a=0, idx_b=1)
sim_info


  dot = float(np.dot(va, vb.T))


{'idx_a': 0,
 'idx_b': 1,
 'candidate_id_a': 'e81',
 'candidate_id_b': 'e1409',
 'cosine_similarity': 0.8178582281701983,
 'cosine_distance': 0.18214177182980174}

In [156]:
# CELL: save recommender artifacts (preprocessor, KNN, metadata)
#
# NOTE(review): the neighbour index is written to "nn_model.pkl" here, while
# load_artifacts() earlier in this notebook reads "nn_employees_knn.pkl";
# "preprocessor.pkl", "candidate_ids.npy" and "features_meta.npz" reuse the
# earlier file names and are overwritten — confirm which file names are intended.

import joblib

ARTIFACTS_DIR = "/content/model_artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

joblib.dump(recommender.preprocessor, os.path.join(ARTIFACTS_DIR, "preprocessor.pkl"))
joblib.dump(recommender.nn_model, os.path.join(ARTIFACTS_DIR, "nn_model.pkl"))
np.save(os.path.join(ARTIFACTS_DIR, "candidate_ids.npy"), recommender.candidate_ids)

# Feature-name lists are stored as object arrays (reading them back needs allow_pickle=True).
np.savez(
    os.path.join(ARTIFACTS_DIR, "features_meta.npz"),
    feature_cols_all=np.array(recommender.feature_cols_all, dtype=object),
    numeric_features=np.array(recommender.numeric_features, dtype=object),
    bool_features=np.array(recommender.bool_features, dtype=object),
    categorical_features=np.array(recommender.categorical_features, dtype=object),
)


In [157]:
# CELL: export recommendations to CSV for inspection / external tools

def export_recommendations_to_csv(
    recommender: EmployeeRecommender,
    requirements: dict,
    top_k: int,
    filters: dict | None,
    output_path: str,
) -> None:
    """
    Runs recommend_by_requirements and stores the result as CSV.

    Parameters
    ----------
    recommender : object exposing ``recommend_by_requirements``.
    requirements : forwarded unchanged as ``requirements``.
    top_k : forwarded unchanged as ``top_k``.
    filters : forwarded unchanged as ``filters`` (may be None).
    output_path : destination CSV path. Missing parent directories are created;
        a bare file name (no directory part) is written relative to the
        current working directory.

    The CSV is written without the DataFrame index.
    """
    recs = recommender.recommend_by_requirements(
        requirements=requirements,
        top_k=top_k,
        filters=filters,
    )

    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    recs.to_csv(output_path, index=False)


# example usage:
# export_recommendations_to_csv(
#     recommender=recommender,
#     requirements={"spa": "4", "city": "LOS ANGELES", "is_veteran": 1, "has_sud": 0},
#     top_k=50,
#     filters={"spa": "4"},
#     output_path="/content/recs_spa4_veterans.csv",
# )


In [158]:
# CELL: batch scoring of multiple employer requirement profiles

def batch_recommendations(
    recommender: EmployeeRecommender,
    requirements_list: list[dict],
    names: list[str] | None = None,
    top_k: int = 10,
    filters: dict | None = None,
) -> dict[str, pd.DataFrame]:
    """
    Takes a list of employer requirement dicts and returns a dict of DataFrames,
    keyed by profile name.

    Parameters
    ----------
    recommender : object exposing ``recommend_by_requirements``.
    requirements_list : one requirements dict per employer profile.
    names : optional profile names, one per entry of ``requirements_list``.
        Defaults to ``profile_0``, ``profile_1``, ...
    top_k, filters : forwarded unchanged to every ``recommend_by_requirements`` call.

    Raises
    ------
    ValueError
        If ``names`` is given and its length differs from ``requirements_list``.

    Note: if a name is repeated, the later profile's result replaces the earlier one.
    """
    if names is None:
        names = [f"profile_{i}" for i in range(len(requirements_list))]
    elif len(names) != len(requirements_list):
        # zip() would silently stop at the shorter sequence and drop profiles.
        raise ValueError(
            f"names has {len(names)} entries but requirements_list has "
            f"{len(requirements_list)}; they must match one-to-one"
        )

    return {
        name: recommender.recommend_by_requirements(
            requirements=req,
            top_k=top_k,
            filters=filters,
        )
        for name, req in zip(names, requirements_list)
    }


# example usage:
# profiles = [
#     {"spa": "4", "is_veteran": 1, "has_sud": 0},
#     {"spa": "6", "is_veteran": 0, "has_sud": 0, "age_yrs": 22},
# ]
# names = ["employer_A", "employer_B"]
# batch_res = batch_recommendations(recommender, profiles, names, top_k=5)
# for k, v in batch_res.items():
#     print("====", k, "====")
#     display(v.head())


In [159]:
# CELL: simple skill / flag extension hook for future features

def add_custom_binary_feature(
    df: pd.DataFrame,
    col_name: str,
    condition_fn,
) -> pd.DataFrame:
    """
    Adds a custom binary feature column based on a row-wise condition function.

    Parameters
    ----------
    df : input frame; it is not modified, a copy is returned.
    col_name : name of the column to add (or overwrite) on the copy.
    condition_fn : function(row: pd.Series) -> bool, evaluated once per row.

    Returns
    -------
    A copy of ``df`` with ``col_name`` holding ``bool(condition_fn(row))`` per row.
    For a frame with no rows the column is an empty boolean column and
    ``condition_fn`` is never called.
    """
    out = df.copy()

    if len(out.index) == 0:
        # DataFrame.apply(axis=1) has no rows to evaluate here: it probes the
        # function with a synthetic row and returns a non-boolean result, so
        # assign an explicitly typed empty column instead.
        out[col_name] = pd.Series([], index=out.index, dtype=bool)
        return out

    out[col_name] = out.apply(lambda row: bool(condition_fn(row)), axis=1)
    return out


# example usage (pseudo):
# df_extended = add_custom_binary_feature(
#     df_join,
#     col_name="ready_for_full_time",
#     condition_fn=lambda r: (r["has_sud"] == 0 and r["has_mental_condition"] == 0 and r["is_chronic_person"] == 0),
# )
# then rebuild the recommender using df_extended instead of df_join.


In [161]:
# CELL: quick diagnostics - feature coverage and missingness

from typing import List

def feature_coverage_report(
    df: pd.DataFrame,
    feature_cols: List[str],
    max_print: int = 50,
) -> pd.DataFrame:
    """
    Returns a small report with, per feature:
    - whether the column exists in ``df``
    - fraction of non-null values
    - number of unique non-null values (for sanity check)

    Parameters
    ----------
    df : frame to inspect.
    feature_cols : feature names to report on; names missing from ``df`` are
        reported with ``exists=False`` and zeros.
    max_print : only the first ``max_print`` entries of ``feature_cols`` are reported.

    Returns
    -------
    DataFrame with columns ``feature``, ``exists``, ``non_null_frac``, ``n_unique``,
    sorted with existing features first and then by descending ``non_null_frac``.
    An empty selection yields an empty frame with the same columns.
    """
    report_columns = ["feature", "exists", "non_null_frac", "n_unique"]

    report_rows = []
    for col in feature_cols[:max_print]:
        if col not in df.columns:
            report_rows.append(
                {
                    "feature": col,
                    "exists": False,
                    "non_null_frac": 0.0,
                    "n_unique": 0,
                }
            )
            continue

        series = df[col]
        report_rows.append(
            {
                "feature": col,
                "exists": True,
                "non_null_frac": float(series.notnull().mean()),
                "n_unique": int(series.nunique(dropna=True)),
            }
        )

    # Passing the columns explicitly keeps the schema when there are no rows;
    # otherwise the sort below raises KeyError on a column-less frame.
    report = pd.DataFrame(report_rows, columns=report_columns)
    return report.sort_values(["exists", "non_null_frac"], ascending=[False, False])


# example usage:
# coverage = feature_coverage_report(df_join, recommender.feature_cols_all)
# coverage

In [162]:
# CELL: scoring API — compute a normalized similarity score between an employer profile
# and ALL candidates (not just top-k). Useful for ranking, visualizations, or model ensembles.

def score_all_candidates(
    recommender: EmployeeRecommender,
    requirements: dict,
) -> pd.DataFrame:
    """
    Score every candidate against one employer requirements profile.

    The requirements are turned into a vector via
    ``recommender._build_requirements_vector`` and compared to each row of
    ``recommender.X_emb`` with cosine similarity.

    Returns a copy of ``recommender.df_candidates`` with an added
    ``match_score`` column, sorted by that column in descending order.
    """
    # Row 0 of the built requirements matrix is the query vector.
    query = recommender._build_requirements_vector(requirements)[0]
    embeddings = recommender.X_emb

    # cosine = dot / (|query| * |row|); the epsilon keeps zero-norm rows finite.
    numerators = embeddings @ query
    denominators = np.linalg.norm(query) * np.linalg.norm(embeddings, axis=1) + 1e-12

    scored = recommender.df_candidates.copy()
    scored["match_score"] = numerators / denominators
    return scored.sort_values("match_score", ascending=False)


In [163]:
# CELL: distance matrix generation (optional; heavy on RAM)
# Creates a small pairwise similarity matrix for deep analysis of clusters of candidates.

def build_similarity_matrix(
    recommender: EmployeeRecommender,
    sample_n: int = 500,
    seed: int = 42,
) -> pd.DataFrame:
    """
    Builds a cosine-similarity matrix for a random subset of candidates.
    Useful for cluster detection, heatmaps, anomaly detection.

    Parameters
    ----------
    recommender : object exposing ``X_emb`` and ``candidate_ids``.
    sample_n : number of candidates to sample without replacement; capped at
        the number of rows in ``X_emb``.
    seed : seed for the sampling; the same seed always selects the same subset.

    Returns
    -------
    Square DataFrame of pairwise cosine similarities whose index and columns
    are the sampled candidate ids.
    """
    n_total = len(recommender.X_emb)

    # A private legacy generator yields the same draws as
    # np.random.seed(seed) + np.random.choice, without reseeding the
    # process-wide RNG that other notebook cells may depend on.
    rng = np.random.RandomState(seed)
    idxs = rng.choice(n_total, size=min(sample_n, n_total), replace=False)
    X_sub = recommender.X_emb[idxs]

    # L2-normalize rows (epsilon avoids division by zero) so the Gram matrix
    # of the normalized rows is the cosine-similarity matrix.
    norms = np.linalg.norm(X_sub, axis=1, keepdims=True)
    X_norm = X_sub / (norms + 1e-12)

    sim_matrix = X_norm @ X_norm.T

    cand_ids = recommender.candidate_ids[idxs]
    return pd.DataFrame(sim_matrix, index=cand_ids, columns=cand_ids)


In [164]:
# CELL: Fairness audit placeholder — compute mean similarity score per demographic group

def fairness_audit(
    recommender: EmployeeRecommender,
    requirements: dict,
    group_col: str,
) -> pd.DataFrame:
    """
    Summarise match scores per value of ``group_col`` (gender, race, city, etc.).

    All candidates are scored with ``score_all_candidates`` and the
    ``match_score`` column is aggregated per group into ``mean``, ``std`` and
    ``count``. The result has one row per group, ordered by descending mean.

    Raises ValueError when ``group_col`` is not a column of the scored frame.

    Note: pandas ``groupby`` leaves out rows whose group value is missing by
    default, so such candidates are not represented in the report.
    """
    scored = score_all_candidates(recommender, requirements)

    if group_col in scored.columns:
        per_group = scored.groupby(group_col)["match_score"].agg(["mean", "std", "count"])
        flat = per_group.reset_index()
        return flat.sort_values("mean", ascending=False)

    raise ValueError(f"{group_col} not in dataframe")


In [165]:
# CELL: generate synthetic employer requirements — useful for testing

def generate_random_requirements(
    recommender: EmployeeRecommender,
    seed: int = 123,
) -> dict:
    """
    Produces a random requirement dictionary aligned with the model's feature space.
    Good for stress-testing the recommender.

    Parameters
    ----------
    recommender : object exposing ``df``, ``numeric_features``, ``bool_features``
        and ``categorical_features``.
    seed : seed for the draws; the same seed always yields the same dictionary.

    Returns
    -------
    dict mapping every numeric, boolean and categorical feature name to a value:
    - numeric: uniform draw between the 5th and 95th percentile of the observed
      non-null values (0.0 when the column has no non-null values)
    - boolean: 0.0 or 1.0
    - categorical: one of the observed non-null values as a string
      ("missing" when there are none)
    """
    # A private legacy generator produces the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but leaves the process-wide RNG untouched.
    rng = np.random.RandomState(seed)
    req = {}

    # numeric features: random value within the central observed range
    for col in recommender.numeric_features:
        vals = recommender.df[col].dropna().values
        if len(vals) > 0:
            lo, hi = np.percentile(vals, 5), np.percentile(vals, 95)
            req[col] = float(rng.uniform(lo, hi))
        else:
            req[col] = 0.0

    # boolean features: random 0/1
    for col in recommender.bool_features:
        req[col] = float(rng.choice([0, 1]))

    # categorical features: random observed category
    for col in recommender.categorical_features:
        cats = (
            recommender.df[col]
            .dropna()
            .astype(str)
            .unique()
            .tolist()
        )
        if len(cats) == 0:
            req[col] = "missing"
        else:
            req[col] = str(rng.choice(cats))

    return req


In [166]:
# CELL: stress test — benchmark model speed with random queries

import time

def benchmark_recommender(
    recommender: EmployeeRecommender,
    n_runs: int = 20,
    base_seed: int = 123,
):
    """
    Times ``recommender.recommend_by_requirements`` on randomly generated queries.

    Parameters
    ----------
    recommender : object exposing ``recommend_by_requirements`` and whatever
        ``generate_random_requirements`` reads from it.
    n_runs : number of timed queries; must be at least 1.
    base_seed : seed of the first query; run ``i`` uses ``base_seed + i``.

    Returns
    -------
    dict with ``avg_time``, ``min_time``, ``max_time`` (seconds) and ``samples``
    (the list of per-run durations).

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    times = []
    for run_idx in range(n_runs):
        # generate_random_requirements is deterministic for a given seed, so
        # calling it with its default seed would time the same query every run.
        req = generate_random_requirements(recommender, seed=base_seed + run_idx)
        # perf_counter is monotonic and meant for measuring short durations.
        t0 = time.perf_counter()
        _ = recommender.recommend_by_requirements(req, top_k=10)
        times.append(time.perf_counter() - t0)

    return {
        "avg_time": np.mean(times),
        "min_time": np.min(times),
        "max_time": np.max(times),
        "samples": times,
    }


In [167]:
# CELL: candidate-to-employer compatibility vector — returns raw embedding similarities

def compatibility_vector(
    recommender: EmployeeRecommender,
    requirements: dict,
) -> np.ndarray:
    """
    Returns the raw cosine-similarity array between an employer and all candidates.
    Useful for ensemble scoring or downstream ML.

    The array follows the row order of ``recommender.df_candidates`` (entry ``i``
    belongs to candidate row ``i``), not the score-sorted order in which
    ``score_all_candidates`` returns its frame.

    Raises
    ------
    ValueError
        If the index of ``recommender.df_candidates`` contains duplicate labels,
        because the scores cannot then be mapped back to rows unambiguously.
    """
    scored = score_all_candidates(recommender, requirements)

    candidate_index = recommender.df_candidates.index
    if not candidate_index.is_unique:
        raise ValueError(
            "df_candidates index must be unique to align scores with candidates"
        )

    # score_all_candidates sorts by match_score; restore the candidate row order
    # so the vector stays aligned with the candidate table.
    return scored["match_score"].reindex(candidate_index).to_numpy()


In [168]:
# CELL: Developer API — returns (embedding, meta) for integration with deep models

def export_embeddings(recommender: EmployeeRecommender):
    """
    Bundle copies of the recommender's embedding data into a dict.

    Keys of the returned dict:
      - "embeddings": copy of ``recommender.X_emb``
      - "candidate_ids": copy of ``recommender.candidate_ids``
      - "feature_columns": copy of ``recommender.feature_cols_all``

    Each value is produced with the attribute's own ``.copy()``.
    """
    # (output key, recommender attribute) pairs, in output order.
    attr_by_key = (
        ("embeddings", "X_emb"),
        ("candidate_ids", "candidate_ids"),
        ("feature_columns", "feature_cols_all"),
    )
    return {key: getattr(recommender, attr).copy() for key, attr in attr_by_key}


# example:
# exported = export_embeddings(recommender)
# X = exported["embeddings"]
# ids = exported["candidate_ids"]


In [169]:
# CELL: Build ensemble scoring — combine similarity with a custom weight system

def ensemble_score(
    recommender: EmployeeRecommender,
    requirements: dict,
    weights: dict,
) -> pd.DataFrame:
    """
    Combine the cosine match score with user-weighted raw feature values.

    Starting from the ``match_score`` produced by ``score_all_candidates``, each
    ``feature -> weight`` entry adds ``weight * value`` of that column, with
    missing values counted as 0 and values cast to float. Entries whose feature
    is not a column of the scored frame are skipped.

    weights example:
      {
        "is_veteran": 0.2,
        "age_yrs": -0.1,
        "has_sud": -0.5,
      }

    Returns the scored frame with an extra ``ensemble_score`` column, sorted by
    that column in descending order.
    """
    ranked = score_all_candidates(recommender, requirements)

    total = ranked["match_score"].copy()
    applicable = (
        (name, weight) for name, weight in weights.items() if name in ranked.columns
    )
    for name, weight in applicable:
        contribution = ranked[name].fillna(0).astype(float)
        total += weight * contribution

    ranked["ensemble_score"] = total
    return ranked.sort_values("ensemble_score", ascending=False)


In [170]:
# CELL: nearest negative examples (anti-recommendations)

def worst_matches(
    recommender: EmployeeRecommender,
    requirements: dict,
    bottom_k: int = 10,
) -> pd.DataFrame:
    """
    Return the last ``bottom_k`` rows of the frame produced by
    ``score_all_candidates``, i.e. the lowest-scoring candidates.

    Rows keep the descending score order of that frame, so the least
    compatible candidate comes last. Handy for debugging or spotting
    feature conflicts.
    """
    return score_all_candidates(recommender, requirements).tail(bottom_k)


In [171]:
# CELL: candidate feature summary for dashboards

def summarize_candidate(
    recommender: EmployeeRecommender,
    candidate_id,
    cols: list | None = None,
) -> pd.Series:
    """
    Returns the feature values of one candidate for dashboards or debugging.

    Parameters
    ----------
    recommender : object exposing ``df_candidates`` and ``candidate_id_col``.
    candidate_id : value looked up in the ``candidate_id_col`` column; when
        several rows match, the first one is used.
    cols : optional list of columns to keep; all columns when None.

    Returns
    -------
    pd.Series indexed by column name, also when a single column is selected.

    Raises
    ------
    ValueError
        If no row has the given ``candidate_id``.
    """
    df = recommender.df_candidates
    matches = df[df[recommender.candidate_id_col] == candidate_id]
    if matches.empty:
        raise ValueError(f"candidate_id {candidate_id} not found")

    if cols is not None:
        matches = matches[cols]

    # .iloc[0] always yields the first row as a Series; .squeeze() would
    # collapse a one-column selection to a bare scalar.
    return matches.iloc[0]


In [173]:
# CELL 2: declare the candidate feature names per type and keep only those that
# are actually columns of `df`

numeric_features = [
    name
    for name in (
        "age_yrs",
        "stay_length_days",
        "episodes_last_1y",
        "episodes_last_3y",
        "months_homeless_3y",
        "year_round_beds",
        "total_beds",
        "utilization_rate",
    )
    if name in df.columns
]

bool_features = [
    name
    for name in (
        "has_physical_condition",
        "has_mental_condition",
        "has_chronic_health",
        "has_disability",
        "has_domestic_violence",
        "is_fleeing_violence",
        "uses_alcohol",
        "uses_drugs",
        "has_sud",
        "has_co_occuring_disorder",
        "is_veteran",
        "is_chronic_person",
        "from_street_flag",
    )
    if name in df.columns
]

categorical_features = [
    name
    for name in (
        "gender_simple",
        "race_simple",
        "sexual_orientation",
        "spa",
        "city",
        "coc",
        "project_type",
        "housing_type",
        "inventory_type",
        "target_population",
    )
    if name in df.columns
]

# Show the three filtered lists as the cell output.
numeric_features, bool_features, categorical_features

(['age_yrs',
  'stay_length_days',
  'episodes_last_1y',
  'episodes_last_3y',
  'months_homeless_3y'],
 ['has_physical_condition',
  'has_mental_condition',
  'has_chronic_health',
  'has_disability',
  'has_domestic_violence',
  'is_fleeing_violence',
  'uses_alcohol',
  'uses_drugs',
  'has_sud',
  'has_co_occuring_disorder',
  'is_veteran',
  'is_chronic_person',
  'from_street_flag'],
 ['gender_simple', 'race_simple', 'sexual_orientation', 'spa', 'city', 'coc'])

In [175]:
# CELL 3: instantiate recommender (EmployeeRecommender must already be defined)
# Also needs `df` and the three feature lists (numeric / bool / categorical)
# to exist in the kernel before this cell runs.

recommender = EmployeeRecommender(
    df=df,
    candidate_id_col="episode_id",  # change to "client_id" if you prefer
    numeric_features=numeric_features,
    bool_features=bool_features,
    categorical_features=categorical_features,
    n_neighbors_default=50,
)

# Sanity check shown as cell output: (number of candidate ids, first rows of the candidate table).
len(recommender.candidate_ids), recommender.df_candidates.head()

(17847,
   episode_id  age_yrs  stay_length_days  episodes_last_1y  episodes_last_3y  \
 0        e81     46.0               NaN               1.0               1.0   
 1      e1409     48.0               NaN               0.0               0.0   
 2     e12155     54.0               NaN               0.0               0.0   
 3     e12156     35.0               NaN               0.0               2.0   
 4     e12159      3.0               NaN               1.0               0.0   
 
    months_homeless_3y  has_physical_condition  has_mental_condition  \
 0                 3.0                     0.0                   0.0   
 1                 0.0                     0.0                   0.0   
 2                 0.0                     0.0                   0.0   
 3                13.0                     0.0                   0.0   
 4                 0.0                     0.0                   0.0   
 
    has_chronic_health  has_disability  ...  has_co_occuring_disorder  \
 0 

In [176]:
# CELL 5: recommendation "like this candidate"
# Takes the first id in recommender.candidate_ids as the reference candidate and
# requests top_k=10 results from recommend_like_candidate.

example_candidate_id = recommender.candidate_ids[0]

similar_candidates = recommender.recommend_like_candidate(
    candidate_id=example_candidate_id,
    top_k=10,
)

# Bare last expression so the notebook renders the result table.
similar_candidates


Unnamed: 0,episode_id,age_yrs,stay_length_days,episodes_last_1y,episodes_last_3y,months_homeless_3y,has_physical_condition,has_mental_condition,has_chronic_health,has_disability,...,is_veteran,is_chronic_person,from_street_flag,gender_simple,race_simple,sexual_orientation,spa,city,coc,match_score
12122,e6679,47.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.999792
9541,e4143,48.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.999176
2044,e14112,49.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.99816
1917,e13984,43.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.998088
15583,e10005,46.0,,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.998082
7255,e21055,44.0,,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.997149
16404,e10773,44.0,,1.0,1.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.997149
1838,e13920,48.0,,1.0,1.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.99705
16964,e11273,48.0,,1.0,1.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.99705
5460,e18722,50.0,,1.0,1.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,other,other,other,San Fernando Valley,1.LA City,1,0.996755


In [177]:
# CELL 6: write the recommender artifacts to disk for later reuse (optional but recommended)
# NOTE: an earlier cell ("save recommender artifacts") performs the same writes.

import os

import joblib
import numpy as np

ARTIFACTS_DIR = "/content/model_artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Estimator objects are serialised with joblib.
joblib.dump(
    recommender.preprocessor,
    os.path.join(ARTIFACTS_DIR, "preprocessor.pkl"),
)
joblib.dump(
    recommender.nn_model,
    os.path.join(ARTIFACTS_DIR, "nn_model.pkl"),
)

# Candidate ids go into their own .npy file.
np.save(
    os.path.join(ARTIFACTS_DIR, "candidate_ids.npy"),
    recommender.candidate_ids,
)

# The feature-schema lists are bundled into one .npz archive, one object array
# per list, keyed by the recommender attribute name.
np.savez(
    os.path.join(ARTIFACTS_DIR, "features_meta.npz"),
    **{
        attr: np.array(getattr(recommender, attr), dtype=object)
        for attr in (
            "feature_cols_all",
            "numeric_features",
            "bool_features",
            "categorical_features",
        )
    },
)

# Echo the output directory as the cell result.
ARTIFACTS_DIR


'/content/model_artifacts'