Cell 1 – Setup

In [1]:
# 03_taxonomy_mapping.ipynb

import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Repository root is taken to be the parent of the current working directory.
# It is put at the front of sys.path (once) so the `src.*` imports that follow
# can be resolved.
PROJECT_ROOT = Path("..").resolve()
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import load_config
from src.data.loader import load_jobs
from src.taxonomy.mapping import load_onet, build_onet_embeddings, map_clusters_to_soc

# Locations of the saved arrays this notebook reads, relative to the project root.
EMB_PATH = PROJECT_ROOT.joinpath("results", "embeddings", "jobs_embeddings.npy")
LABELS_PATH = PROJECT_ROOT.joinpath("results", "clusters", "kmeans_labels.npy")

# Project configuration; used later to look up the jobs id/text column names.
cfg = load_config()

# Echo both paths as the cell output so they can be checked by eye.
EMB_PATH, LABELS_PATH


ModuleNotFoundError: No module named 'sentence_transformers'

Cell 2 – Load embeddings, labels, and jobs

In [None]:
# Read the two saved arrays (embeddings first, then labels) and the jobs table.
embeddings, labels = (np.load(array_path) for array_path in (EMB_PATH, LABELS_PATH))
jobs = load_jobs()

# Display the three shapes side by side as a quick sanity check.
tuple(loaded.shape for loaded in (embeddings, labels, jobs))


Cell 3 – Compute cluster centers

In [None]:
# Compute one centroid per cluster: the mean embedding of the cluster's members.
#
# Inputs (from earlier cells): `embeddings` (one row per item) and `labels`
# (one label per item). Outputs: `cluster_ids`, `n_clusters`, `cluster_centers`.

# The boolean mask below only makes sense when both arrays are row-aligned;
# fail early with a readable message instead of a cryptic indexing error.
if len(embeddings) != len(labels):
    raise ValueError(
        f"embeddings has {len(embeddings)} rows but labels has {len(labels)} entries"
    )

# ignore negative labels if you later use HDBSCAN
valid_mask = labels >= 0
X = embeddings[valid_mask]
y = labels[valid_mask]

# Use the label values that actually occur instead of assuming they are exactly
# 0..n_clusters-1. With gaps in the labels, range(n_clusters) would average an
# empty slice (NaN centroid) and never reach the highest labels.
cluster_ids = np.unique(y)  # sorted ascending
n_clusters = len(cluster_ids)
print("Number of clusters:", n_clusters)

# Row i of cluster_centers is the centroid of label cluster_ids[i]. When labels
# are contiguous from 0 this is the same layout as indexing by label directly.
cluster_centers = np.zeros((n_clusters, embeddings.shape[1]), dtype=np.float32)
for center_row, c in enumerate(cluster_ids):
    cluster_centers[center_row] = X[y == c].mean(axis=0)

cluster_centers.shape


Cell 4 – Load O*NET and build its embeddings

In [None]:
# Load the O*NET reference data through the project helper.
# NOTE(review): presumably a DataFrame of O*NET occupations with SOC codes —
# confirm the schema in src.taxonomy.mapping.load_onet.
onet_df = load_onet()
# Report the table size, then preview the first rows as the cell output.
print("O*NET shape:", onet_df.shape)
onet_df.head()


In [None]:
# Build embeddings for the O*NET data via the project helper.
# NOTE(review): presumably one vector per O*NET row, in the same embedding space
# as the job embeddings — confirm in src.taxonomy.mapping.build_onet_embeddings.
onet_emb = build_onet_embeddings(onet_df)
# Show the array shape as the cell output.
onet_emb.shape


Cell 5 – Map clusters to SOC codes

In [None]:
# Match the cluster centroids against the O*NET embeddings with the project helper.
# The result is used below as a table with "cluster_id" and "soc_matches" columns.
mapping_df = map_clusters_to_soc(cluster_centers=cluster_centers, onet_df=onet_df, onet_emb=onet_emb)

# Preview the first rows of the mapping.
mapping_df.head()


Cell 6 – Expand mapping for readability

In [None]:
# Flatten mapping_df into long form: one record per (cluster, SOC match) pair.
# Each entry of "soc_matches" is consumed as an iterable of (code, score) pairs.
rows = []
for cluster_key, soc_matches in zip(mapping_df["cluster_id"], mapping_df["soc_matches"]):
    if soc_matches:
        rows.extend(
            {"cluster_id": cluster_key, "soc_code": code, "similarity": score}
            for code, score in soc_matches
        )
    else:
        # Keep clusters without any match visible as a single placeholder record.
        rows.append({"cluster_id": cluster_key, "soc_code": None, "similarity": None})

mapping_long = pd.DataFrame(rows)
mapping_long.head()


Cell 7 – Join back to cluster examples

In [None]:
# Work on a copy of the jobs table with each row's cluster label attached.
jobs_with_labels = jobs.assign(cluster=labels)

# Pick one cluster and list the SOC codes it was mapped to.
cluster_id = 0  # choose any id
cluster_mapping = mapping_long.loc[mapping_long["cluster_id"].eq(cluster_id)]
cluster_mapping


In [None]:
# Show up to five example jobs from the selected cluster, restricted to the
# configured id/text columns plus the title and company columns.
preview_columns = [cfg["jobs"]["id_column"], "title", "company", cfg["jobs"]["text_column"]]
in_selected_cluster = jobs_with_labels["cluster"] == cluster_id
jobs_with_labels.loc[in_selected_cluster, preview_columns].head(5)
