# Supervision Matcher (Filters + ML + Capacity)

This notebook matches **Associates** to **Supervisors**.

**Pipeline**
1. **Deterministic filters (no ML):** `State` must match exactly, and Associate **License Type** must exist in Supervisor **Who can you supervise?`**.
2. **Availability similarity (ML):** TF‑IDF + cosine similarity using the free‑text `Availability` fields.
3. **Capacity‑aware assignment:** Greedy highest‑score allocation; no supervisor is over capacity.

**Inputs** (same folder as notebook)
- `Supervision Matching Program - Supervisors.csv`
- `Supervision Matching Program - Associates.csv`

**Outputs**
- `supervision_matches.csv`
- `associates_unassigned.csv`


In [1]:
# (Optional) install dependencies in this environment
# !pip install -q pandas numpy scikit-learn

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

# Paths (files must be in the same folder as this notebook)
SUPERVISORS_CSV = Path('Supervision Matching Program - Supervisors.csv')
ASSOCIATES_CSV  = Path('Supervision Matching Program - Associates.csv')

assert SUPERVISORS_CSV.exists(), f'Missing {SUPERVISORS_CSV.resolve()}'
assert ASSOCIATES_CSV.exists(), f'Missing {ASSOCIATES_CSV.resolve()}'

sup = pd.read_csv(SUPERVISORS_CSV)
assoc = pd.read_csv(ASSOCIATES_CSV)

print('Supervisors:', sup.shape, 'Associates:', assoc.shape)
display(sup.head(3))
display(assoc.head(3))

AssertionError: Missing /home/connor/Sophias_Mom/supervisors/Supervision Matching Program - Supervisors.csv

In [None]:
# ---- Normalization helpers
def normalize_state(s):
    return str(s).strip().upper()

def parse_license_list(s):
    if pd.isna(s):
        return set()
    s = str(s)
    parts = [p.strip().upper() for p in s.replace(';', ',').split(',') if p.strip()]
    return set(parts) if parts else {s.strip().upper()}

def coerce_int(x, default=1):
    try:
        v = int(x)
        return v if v >= 0 else default
    except Exception:
        return default

# ---- Normalize columns
sup = sup.copy()
assoc = assoc.copy()

sup['State_norm'] = sup['State'].map(normalize_state)
assoc['State_norm'] = assoc['State'].map(normalize_state)

sup['WhoSet'] = sup['Who can you supervise?'].map(parse_license_list)
assoc['License_norm'] = assoc['License Type'].astype(str).str.strip().str.upper()

sup['Capacity_int'] = sup['Capacity'].map(coerce_int)

display(sup[['Name','State','State_norm','Who can you supervise?','WhoSet','Capacity','Capacity_int']].head(5))
display(assoc[['Name','State','State_norm','License Type','License_norm']].head(5))

Unnamed: 0,Name,State,State_norm,Who can you supervise?,WhoSet,Capacity,Capacity_int
0,Test,"Illinois, Iowa, North Carolina, South Carolina","ILLINOIS, IOWA, NORTH CAROLINA, SOUTH CAROLINA",Psychologist,{PSYCHOLOGIST},2,2
1,Connor,"Illinois, New Mexico, North Carolina","ILLINOIS, NEW MEXICO, NORTH CAROLINA","Social Worker, Counselor","{COUNSELOR, SOCIAL WORKER}",4,4
2,Philip,"Illinois, New Mexico, North Carolina","ILLINOIS, NEW MEXICO, NORTH CAROLINA","Social Worker, Counselor","{COUNSELOR, SOCIAL WORKER}",4,4


Unnamed: 0,Name,State,State_norm,License Type,License_norm
0,Sophia Cauwels,North Carolina,NORTH CAROLINA,Counselor,COUNSELOR
1,HELLO,North Carolina,NORTH CAROLINA,Social Worker,SOCIAL WORKER
2,GOODBYE,California,CALIFORNIA,Social Worker,SOCIAL WORKER


## 1) Hard filter: exact State + License inclusion

In [None]:
# Build candidate pairs by exact criteria
candidates = []
for ai, a in assoc.iterrows():
    for sj, s in sup.iterrows():
        if a['State_norm'] != s['State_norm']:
            continue
        if a['License_norm'] not in s['WhoSet']:
            continue
        candidates.append({
            'assoc_idx': ai,
            'sup_idx': sj,
            'Associate': a['Name'],
            'Associate Email': a['Email Address'],
            'Associate State': a['State'],
            'Associate License': a['License Type'],
            'Supervisor': s['Name'],
            'Supervisor Email': s['Email Address'],
            'Supervisor State': s['State'],
            'Who can you supervise?': s['Who can you supervise?'],
        })

cand_df = pd.DataFrame(candidates)
print(f'Candidate pairs after filter: {len(cand_df)}')
display(cand_df.head(10))

Candidate pairs after filter: 0


## 2) Availability similarity (TF‑IDF cosine)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

assoc_avail = assoc['Availability'].fillna('').astype(str).values
sup_avail   = sup['Availability'].fillna('').astype(str).values

vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1,2))
X_assoc = vectorizer.fit_transform(assoc_avail)
X_sup   = vectorizer.transform(sup_avail)

scores = []
for _, row in cand_df.iterrows():
    ai = int(row['assoc_idx'])
    sj = int(row['sup_idx'])
    sim = cosine_similarity(X_assoc[ai], X_sup[sj]).ravel()[0]
    scores.append(sim)
cand_df['availability_score'] = scores

display(cand_df.sort_values('availability_score', ascending=False).head(10))

Unnamed: 0,availability_score


## 3) Capacity‑aware greedy assignment

In [None]:
sup_capacity_left = sup['Capacity_int'].to_dict()  # keyed by supervisor row index
assigned_assoc = set()
assignments = []

for _, row in cand_df.sort_values('availability_score', ascending=False).iterrows():
    ai = int(row['assoc_idx'])
    sj = int(row['sup_idx'])
    if ai in assigned_assoc:
        continue
    if sup_capacity_left.get(sj, 0) <= 0:
        continue
    # assign
    assigned_assoc.add(ai)
    sup_capacity_left[sj] = sup_capacity_left.get(sj, 0) - 1
    assignments.append(row)

assign_df = pd.DataFrame(assignments).reset_index(drop=True)
print(f'Assigned {len(assign_df)} of {len(assoc)} associates')
# Safe display (avoids KeyError if columns missing or no assignments)
cols = ['Associate','Associate License','Supervisor','availability_score']
display(assign_df.reindex(columns=[c for c in cols if c in assign_df.columns]).head(20))


NameError: name 'sup' is not defined

## 4) Save results

In [None]:
OUT_MATCHES = Path('supervision_matches.csv')
OUT_UNASSIGNED = Path('associates_unassigned.csv')

assign_df.to_csv(OUT_MATCHES, index=False)

assigned_set = set(assign_df['assoc_idx'].tolist())
unassigned = assoc.loc[~assoc.index.isin(assigned_set)].copy()
unassigned.to_csv(OUT_UNASSIGNED, index=False)

print('Wrote ->', OUT_MATCHES.resolve())
print('Wrote ->', OUT_UNASSIGNED.resolve())