Cell 1 – Setup

In [2]:
# 03_taxonomy_mapping.ipynb

import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Absolute path of the parent of the current working directory. It is put at
# the front of sys.path (once) so that the `src.*` imports below can resolve.
PROJECT_ROOT = Path("..").resolve()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import load_config
from src.data.loader import load_jobs
from src.taxonomy.mapping import load_onet, build_onet_embeddings, map_clusters_to_soc

# Project configuration; later cells read the jobs id/text column names from it.
cfg = load_config()

# Locations of the saved job embeddings and k-means cluster labels
# (presumably written by the earlier notebooks in this series - confirm).
EMB_PATH = PROJECT_ROOT.joinpath("results", "embeddings", "jobs_embeddings.npy")
LABELS_PATH = PROJECT_ROOT.joinpath("results", "clusters", "kmeans_labels.npy")

# Display both paths as the cell output.
(EMB_PATH, LABELS_PATH)


  from .autonotebook import tqdm as notebook_tqdm


(WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/embeddings/jobs_embeddings.npy'),
 WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/clusters/kmeans_labels.npy'))

Cell 2 – Load embeddings, labels, and jobs

In [3]:
# Load the saved job embeddings and cluster labels, plus the jobs table.
embeddings = np.load(EMB_PATH)
labels = np.load(LABELS_PATH)
jobs = load_jobs()

# The three artifacts are combined positionally further down (labels are used
# as a mask over the embeddings and attached to `jobs` as a column), so fail
# fast here if they are not row-aligned rather than producing wrong results.
assert embeddings.ndim == 2, f"expected 2-D embeddings, got shape {embeddings.shape}"
assert labels.ndim == 1, f"expected 1-D labels, got shape {labels.shape}"
assert len(embeddings) == len(labels) == len(jobs), (
    "embeddings, labels and jobs must have the same number of rows: "
    f"{len(embeddings)} / {len(labels)} / {len(jobs)}"
)

embeddings.shape, labels.shape, jobs.shape


((24164, 384), (24164,), (24164, 15))

Cell 3 – Compute cluster centers

In [None]:
# Ignore negative labels (presumably noise / unassigned points - confirm).
valid_mask = labels >= 0
X = embeddings[valid_mask]
y = labels[valid_mask]

# Work from the label values that actually occur instead of assuming they are
# exactly 0..n-1: with a gap in the ids, `X[y == c]` would be empty for the
# missing id and its mean would be an all-NaN center.
cluster_ids = np.unique(y)
n_clusters = len(cluster_ids)
print("Number of clusters:", n_clusters)

# Row i of `cluster_centers` is the mean embedding of cluster `cluster_ids[i]`.
cluster_centers = np.zeros((n_clusters, embeddings.shape[1]), dtype=np.float32)
for row, c in enumerate(cluster_ids):
    cluster_centers[row] = X[y == c].mean(axis=0)

# Later cells compare the mapping's `cluster_id` with the raw label values, which
# only lines up if a centroid's row position equals its label (assuming
# map_clusters_to_soc numbers clusters by row position - TODO confirm).
assert np.array_equal(cluster_ids, np.arange(n_clusters)), (
    f"cluster labels are not contiguous 0..{n_clusters - 1}: {cluster_ids.tolist()}"
)
assert np.isfinite(cluster_centers).all(), "cluster centers contain NaN/inf values"

cluster_centers.shape

Number of clusters: 50


(50, 384)

Cell 4 – Load O*NET and build its embeddings

In [5]:
# Load the O*NET table (title, description, O*NET-SOC code) and preview it.
onet_df = load_onet()
print(f"O*NET shape: {onet_df.shape}")
onet_df.head()

O*NET shape: (1016, 3)


Unnamed: 0,Title,Description,O*NET-SOC Code
0,Chief Executives,Determine and formulate policies and provide o...,11-1011.00
1,Chief Sustainability Officers,"Communicate and coordinate with management, sh...",11-1011.03
2,General and Operations Managers,"Plan, direct, or coordinate the operations of ...",11-1021.00
3,Legislators,"Develop, introduce, or enact laws and statutes...",11-1031.00
4,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici...",11-2011.00


In [None]:
# Embed the O*NET records. This is the slow step of the notebook (about 30 s
# in the saved run).
onet_emb = build_onet_embeddings(onet_df)

# The mapping step pairs these vectors with cluster centers built from the job
# embeddings, so check there is one vector per O*NET row and that both
# embedding sets have the same dimensionality.
assert onet_emb.shape[0] == len(onet_df), (
    f"expected {len(onet_df)} O*NET embeddings, got {onet_emb.shape[0]}"
)
assert onet_emb.shape[1] == embeddings.shape[1], (
    f"O*NET embedding dim {onet_emb.shape[1]} != job embedding dim {embeddings.shape[1]}"
)

onet_emb.shape

Batches: 100%|██████████| 16/16 [00:30<00:00,  1.92s/it]


(1016, 384)

Cell 5 – Map clusters to SOC codes

In [7]:
# Match the cluster centers against the O*NET embeddings. The result has one
# row per cluster with a `soc_matches` list of (SOC code, score) pairs.
mapping_inputs = {
    "cluster_centers": cluster_centers,
    "onet_df": onet_df,
    "onet_emb": onet_emb,
}
mapping_df = map_clusters_to_soc(**mapping_inputs)

mapping_df.head()


Unnamed: 0,cluster_id,soc_matches
0,0,"[(51-4111.00, 0.544289767742157), (17-2112.00,..."
1,1,"[(15-2099.01, 0.48080164194107056), (19-1029.0..."
2,2,"[(27-1021.00, 0.60793137550354), (27-1027.00, ..."
3,3,"[(27-2012.04, 0.506357729434967), (13-1161.00,..."
4,4,"[(11-9141.00, 0.6353990435600281), (41-9021.00..."


Cell 6 – Expand mapping for readability

In [8]:
# Flatten `soc_matches` into long format: one row per (cluster, SOC code) pair.
# A cluster with no matches keeps a single placeholder row whose code and
# similarity are empty, so every cluster id still appears in the output.
rows = []
for cid, matches in zip(mapping_df["cluster_id"], mapping_df["soc_matches"]):
    if matches:
        rows.extend(
            {"cluster_id": cid, "soc_code": code, "similarity": score}
            for code, score in matches
        )
    else:
        rows.append({"cluster_id": cid, "soc_code": None, "similarity": None})

mapping_long = pd.DataFrame(rows)
mapping_long.head()


Unnamed: 0,cluster_id,soc_code,similarity
0,0,51-4111.00,0.54429
1,0,17-2112.00,0.530414
2,0,47-5013.00,0.520483
3,1,15-2099.01,0.480802
4,1,19-1029.01,0.441202


Cell 7 – Join back to cluster examples

In [9]:
# Copy of the jobs table with each row's cluster label attached; `jobs` itself
# is left untouched.
jobs_with_labels = jobs.assign(cluster=labels)

# Inspect a single cluster: its mapped SOC codes here, example rows in the next cell.
cluster_id = 0  # choose any id
cluster_mapping = mapping_long.loc[mapping_long["cluster_id"].eq(cluster_id)]
cluster_mapping


Unnamed: 0,cluster_id,soc_code,similarity
0,0,51-4111.00,0.54429
1,0,17-2112.00,0.530414
2,0,47-5013.00,0.520483


In [10]:
# First five rows assigned to the selected cluster, restricted to the configured
# id and text columns plus "speciality" and "name".
jobs_with_labels.loc[
    jobs_with_labels["cluster"].eq(cluster_id),
    [cfg["jobs"]["id_column"], "speciality", "name", cfg["jobs"]["text_column"]],
].head(5)


Unnamed: 0,company_id,speciality,name,description
171,1906,Military systems,General Dynamics Land Systems,Innovation is in Our DNA \n\nGeneral Dynamics ...
195,2089,Marine and Industrial engines,Volvo Penta,Volvo Penta is a world-leading supplier of eng...
198,2096,Articulated Haulers,Volvo Construction Equipment,Volvo Construction Equipment (Volvo CE) is a g...
243,2425,Aircraft Engines,Pratt & Whitney,"Pratt & Whitney, an RTX business, is a global ..."
311,2722,Semiconductor Process Control,KLA,KLA develops industry-leading equipment and se...


In [11]:
from src.evaluation.metrics import internal_cluster_metrics, mapping_metrics

# Step 1: internal cluster quality, computed on the job embeddings.
# Cosine is the metric recommended for sentence embeddings.
internal = internal_cluster_metrics(embeddings=embeddings, labels=labels, metric="cosine")

# Step 2: quality of the cluster -> SOC mapping.
# `mapping_df` must provide the columns cluster_id and soc_matches.
map_eval = mapping_metrics(mapping_df=mapping_df, labels=labels, similarity_threshold=0.4)

print(f"Internal metrics: {internal}")
print(f"Mapping metrics: {map_eval}")


Internal metrics: {'silhouette': 0.029973605647683144, 'davies_bouldin': 4.660137987116893, 'n_clusters': 50}
Mapping metrics: {'n_clusters': 50, 'noise_rate': 0.0, 'coverage': 0.96, 'avg_top1_similarity': 0.5385381343464056, 'avg_top1_margin': 0.03573673301272922}


Cell 9 – Save Mapping Results (results/mappings/)

In [12]:
from pathlib import Path

# Output folder for mapping artifacts; created on demand, including parents.
MAPPINGS_DIR = PROJECT_ROOT.joinpath("results", "mappings")
MAPPINGS_DIR.mkdir(parents=True, exist_ok=True)

# Write the long-format cluster -> SOC mapping without the DataFrame index.
mapping_long_path = MAPPINGS_DIR.joinpath("cluster_soc_mapping.csv")
mapping_long.to_csv(mapping_long_path, index=False)

# Show where the file was written.
mapping_long_path


WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/mappings/cluster_soc_mapping.csv')

Cell 10 – Generate Summary Reports (results/reports/)

In [13]:
# Output folder for summary reports; created on demand, including parents.
REPORTS_DIR = PROJECT_ROOT / "results" / "reports"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Number of best-scoring SOC codes kept per cluster. It drives both the
# selection and the report file name, so the two cannot drift apart.
TOP_K = 3

# Order each cluster's rows from highest to lowest similarity and keep the
# first TOP_K rows per cluster.
topk = (
    mapping_long
    .sort_values(["cluster_id", "similarity"], ascending=[True, False])
    .groupby("cluster_id")
    .head(TOP_K)
)

# With TOP_K = 3 this is "cluster_top3_soc_codes.csv".
topk_path = REPORTS_DIR / f"cluster_top{TOP_K}_soc_codes.csv"
topk.to_csv(topk_path, index=False)

topk.head()


Unnamed: 0,cluster_id,soc_code,similarity
0,0,51-4111.00,0.54429
1,0,17-2112.00,0.530414
2,0,47-5013.00,0.520483
3,1,15-2099.01,0.480802
4,1,19-1029.01,0.441202
