Cell 1 – Setup

In [29]:
# 03_taxonomy_mapping.ipynb

import sys
from pathlib import Path

import numpy as np
import pandas as pd

PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import load_config
from src.data.loader import load_jobs
from src.taxonomy.mapping import load_onet, build_onet_embeddings, map_clusters_to_soc
from src.clustering.clustering import kmeans_clusters, hdbscan_clusters

cfg = load_config()

EMB_PATH = PROJECT_ROOT / "results" / "embeddings" / "embeddings_desc_clean_len50.npy"
LABELS_PATH = PROJECT_ROOT / "results" / "clusters" / "kmeans_labels.npy"

EMB_PATH, LABELS_PATH


(WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/embeddings/embeddings_desc_clean_len50.npy'),
 WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/clusters/kmeans_labels.npy'))

Cell 2 – Load embeddings, labels, and jobs

In [None]:
embeddings = np.load(EMB_PATH)
labels = np.load(LABELS_PATH)
jobs = load_jobs()

# Load embeddings + labels
embeddings = np.load(EMB_PATH)
labels = np.load(LABELS_PATH)

print("jobs:", jobs.shape)
print("embeddings:", embeddings.shape)
print("labels:", labels.shape)


jobs: (24164, 15)
embeddings: (24001, 384)
labels: (24001,)


Cell 3 – Compute cluster centers

In [31]:
# ignore negative labels
valid_mask = labels >= 0
X = embeddings[valid_mask]
y = labels[valid_mask]

n_clusters = len(np.unique(y))
print("Number of clusters:", n_clusters)

cluster_centers = np.zeros((n_clusters, embeddings.shape[1]), dtype=np.float32)
for c in range(n_clusters):
    cluster_centers[c] = X[y == c].mean(axis=0)

cluster_centers.shape

Number of clusters: 100


(100, 384)

Cell 4 – Load O*NET and build its embeddings

In [32]:
onet_df = load_onet()
print("O*NET shape:", onet_df.shape)
onet_df.head()
onet_df["Title"].unique()

O*NET shape: (1016, 3)


array(['Chief Executives', 'Chief Sustainability Officers',
       'General and Operations Managers', ..., 'Infantry',
       'Special Forces',
       'Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other'],
      shape=(1016,), dtype=object)

In [33]:
onet_emb = build_onet_embeddings(onet_df)
onet_emb.shape

Batches: 100%|██████████| 16/16 [00:10<00:00,  1.51it/s]


(1016, 384)

Cell 5 – Map clusters to SOC codes

In [34]:
mapping_df = map_clusters_to_soc(
    cluster_centers=cluster_centers,
    onet_df=onet_df,
    onet_emb=onet_emb,
)

mapping_df.head()


Unnamed: 0,cluster_id,soc_matches
0,0,"[(39-5094.00, 0.5522235035896301), (51-9082.00..."
1,1,"[(47-5013.00, 0.4926125407218933), (49-3091.00..."
2,2,"[(19-3033.00, 0.6344330906867981), (21-1014.00..."
3,3,"[(43-4141.00, 0.5950422286987305), (43-3031.00..."
4,4,"[(11-9141.00, 0.6407411694526672), (41-9021.00..."


Cell 6 – Expand mapping for readability

In [35]:
rows = []
for _, row in mapping_df.iterrows():
    cid = row["cluster_id"]
    matches = row["soc_matches"]
    if not matches:
        rows.append({"cluster_id": cid, "soc_code": None, "similarity": None})
    else:
        for code, score in matches:
            rows.append({"cluster_id": cid, "soc_code": code, "similarity": score})

mapping_long = pd.DataFrame(rows)
mapping_long.head()


Unnamed: 0,cluster_id,soc_code,similarity
0,0,39-5094.00,0.552224
1,0,51-9082.00,0.491284
2,0,29-2055.00,0.459426
3,1,47-5013.00,0.492613
4,1,49-3091.00,0.489898


Cell 7 – Join back to cluster examples

In [36]:
in_path = PROJECT_ROOT / "data" / "jobs_with_clusters.parquet"
jobs_with_labels = pd.read_parquet(in_path)


In [37]:
# pick one cluster and see its mapped SOC codes + example jobs
cluster_id = 0  # choose any id
cluster_mapping = mapping_long[mapping_long["cluster_id"] == cluster_id]
cluster_mapping


Unnamed: 0,cluster_id,soc_code,similarity
0,0,39-5094.00,0.552224
1,0,51-9082.00,0.491284
2,0,29-2055.00,0.459426


In [38]:
jobs_with_labels[jobs_with_labels["cluster"] == cluster_id][
    [cfg["jobs"]["id_column"], "speciality", "name", cfg["jobs"]["text_column"]]
].head(5)


Unnamed: 0,company_id,speciality,name,description
823,9060,Airway Management,Laerdal Medical,Laerdal Medical is a world leader in healthcar...
877,165340,Regenerative Medicine,Organogenesis,Organogenesis™ is a leading regenerative medic...
1438,9561,Robotics,Intuitive,"Intuitive (Nasdaq: ISRG), headquartered in Sun..."
1624,11073,audiology,Starkey Hearing,"Starkey is a privately held, global hearing te..."
1737,58360388,,Sandstone Medical Solutions,Sandstone Medical Solutions is a professional ...


In [39]:
from src.evaluation.metrics import internal_cluster_metrics, mapping_metrics

# --- 1) internal cluster quality ---
internal = internal_cluster_metrics(
    embeddings=embeddings,
    labels=labels,
    metric="cosine"  # recommended for sentence embeddings
)

# --- 2) taxonomy mapping quality ---
map_eval = mapping_metrics(
    mapping_df=mapping_df,             # must have: cluster_id, soc_matches
    labels=labels,
    similarity_threshold=0.4
)

print("Internal metrics:", internal)
print("Mapping metrics:", map_eval)


Internal metrics: {'silhouette': 0.03062870353460312, 'davies_bouldin': 4.476163984919504, 'n_clusters': 100}
Mapping metrics: {'n_clusters': 100, 'noise_rate': 0.0, 'coverage': 0.97, 'avg_top1_similarity': 0.5425968219324485, 'avg_top1_margin': 0.034263985636441605}


In [72]:
rows = []
for _, row in mapping_df.iterrows():
    cid = row["cluster_id"]
    matches = row["soc_matches"]
    if not matches:
        rows.append({"cluster_id": cid, "soc_code": None, "similarity": None})
    else:
        for code, score in matches:
            rows.append({"cluster_id": cid, "soc_code": code, "similarity": score})

mapping_long = pd.DataFrame(rows)
mapping_long.head()

Unnamed: 0,cluster_id,soc_code,similarity
0,0,39-5094.00,0.552224
1,0,51-9082.00,0.491284
2,0,29-2055.00,0.459426
3,1,47-5013.00,0.492613
4,1,49-3091.00,0.489898


Cell 9 – Save Mapping Results

In [59]:
jobs_used = jobs.iloc[:len(labels)].copy()   # 24001 rows
jobs_used["cluster_id"] = labels


In [60]:
sample_100 = jobs_used.sample(n=100, random_state=42)

sample_100[
    ["cluster_id", "speciality", "name", text_col]
].head(20)


Unnamed: 0,cluster_id,speciality,name,description
16031,65,Consulting,TIMBER,We Listen. We Act. We Succeed. \n\nTIMBER IT C...
21639,12,Cybersecurity,DOIT Security,OUR STORY\n\nSafety Made Simple\n\nOur Beginni...
7163,10,Eating Disorders,Walden Behavioral Care,Walden Behavioral Care's mission is to offer s...
16954,37,Recruiting,AUX Partners,Recruiting the Future\n\nAUX is a Full-Scale R...
2518,42,career preparation,Metropolitan Community College,Metropolitan Community College provides educat...
22862,24,Investment Management,Evolve Talent Partners,The new workforce is anything but traditional…...
18524,99,robotic process automation,Ashling Partners,Ashling Partners is a professional services & ...
23213,42,,accelant,We accelerate growth for our Customers – top l...
18039,26,,CAIRE Inc.,CAIRE Inc. leverages its rich legacy in oxygen...
16579,90,Government Contracting,RAZOR,RAZOR is a women-owned small business (WOSB) s...


In [73]:
# 1) onet_df 컬럼명 표준화
onet_std = onet_df.rename(columns={
    "O*NET-SOC Code": "soc_code",
    "Title": "title",
    "Description": "onet_description",
    "SOC Code": "soc_code",
    "SOC_CODE": "soc_code",
    "title": "title",
    "description": "onet_description",
}).copy()

# 혹시 soc_code가 숫자로 들어있으면 문자열로 통일
onet_std["soc_code"] = onet_std["soc_code"].astype(str)

# 2) mapping_long도 soc_code로 만들어줬으니 그대로 merge
mapping_long = mapping_long.merge(
    onet_std[["soc_code", "title", "onet_description"]],
    on="soc_code",
    how="left"
)


mapping_long.head()

Unnamed: 0,cluster_id,soc_code,similarity,title,onet_description
0,0,39-5094.00,0.552224,Skincare Specialists,Provide skincare treatments to face and body t...
1,0,51-9082.00,0.491284,Medical Appliance Technicians,"Construct, maintain, or repair medical support..."
2,0,29-2055.00,0.459426,Surgical Technologists,"Assist in operations, under the supervision of..."
3,1,47-5013.00,0.492613,"Service Unit Operators, Oil and Gas",Operate equipment to increase oil flow from pr...
4,1,49-3091.00,0.489898,Bicycle Repairers,Repair and service bicycles.


In [None]:
top1_soc = (
    mapping_long
    .sort_values(["cluster_id", "similarity"], ascending=[True, False])
    .groupby("cluster_id")
    .head(1)
)

top1_soc.head()

Unnamed: 0,cluster_id,soc_code,similarity,title,onet_description
0,0,39-5094.00,0.552224,Skincare Specialists,Provide skincare treatments to face and body t...
3,1,47-5013.00,0.492613,"Service Unit Operators, Oil and Gas",Operate equipment to increase oil flow from pr...
6,2,19-3033.00,0.634433,Clinical and Counseling Psychologists,"Assess, diagnose, and treat mental and emotion..."
9,3,43-4141.00,0.595042,New Accounts Clerks,Interview persons desiring to open accounts in...
12,4,11-9141.00,0.640741,"Property, Real Estate, and Community Associati...","Plan, direct, or coordinate the selling, buyin..."


In [68]:
# 1) jobs 쪽 cluster 컬럼 통일
if "cluster" in jobs_with_labels.columns:
    jobs_with_labels = jobs_with_labels.rename(columns={"cluster": "cluster_id"})

# 2) top1_soc 쪽도 cluster_id 맞추기 (이미 cluster_id면 그대로)
if "cluster" in top1_soc.columns:
    top1_soc = top1_soc.rename(columns={"cluster": "cluster_id"})

# 3) merge
sample_jobs = jobs_with_labels.sample(n=100, random_state=42)
explore_100 = sample_jobs.merge(top1_soc, on="cluster_id", how="left")



In [69]:
explore_100 = explore_100[
    ["cluster_id", "speciality", "description", "soc_code", "similarity", "title", "onet_description"]
].rename(columns={
    "description": "job_description"
})

explore_100.head()


Unnamed: 0,cluster_id,speciality,job_description,soc_code,similarity,title,onet_description
0,65,financial planning,SageSpring Wealth Partners is a firm of more t...,13-2052.00,0.66934,Personal Financial Advisors,Advise clients on financial plans using knowle...
1,12,,Doctors Without Borders/Médecins Sans Frontièr...,29-2099.08,0.595115,Patient Representatives,"Assist patients in obtaining services, underst..."
2,10,Staff augmentation services for IT & Non-IT ca...,"Rangam is a global, innovative, total talent m...",27-2012.04,0.549911,Talent Directors,Audition and interview performers to select mo...
3,37,Mobile Applications,Vision\nReimagine Illinois government through ...,15-1241.01,0.565313,Telecommunications Engineering Specialists,"Design or configure wired, wireless, and satel..."
4,42,Vacation Travel,Sun Country Airlines (SY and NASDAQ: SNCY) is ...,17-3021.00,0.580903,Aerospace Engineering and Operations Technolog...,"Operate, install, adjust, and maintain integra..."


In [71]:
out_path = PROJECT_ROOT / "results" / "reports" / "explore_100_jobs_with_onet.csv"
explore_100.to_csv(out_path, index=False)
print("Saved to:", out_path)


Saved to: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\explore_100_jobs_with_onet.csv
