Cell 1 – Setup

In [5]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

# Resolve the parent of the current working directory to an absolute path and
# put it at the front of sys.path (once) so the imports below can find `src`.
PROJECT_ROOT = Path("..").resolve()
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

# Project-local imports from the `src` package.
from src.config import load_config
# NOTE(review): `load_jobs` is not used in any cell visible in this notebook.
from src.data.loader import load_jobs , load_onet
from src.data.preprocess import clean_text
# NOTE(review): `load_onet` is imported a second time on the next line, so that
# binding replaces the one from src.data.loader. Confirm which implementation is
# intended and drop the other. `map_clusters_to_soc` is also unused in the
# cells visible here.
from src.taxonomy.mapping import load_onet, build_onet_embeddings, map_clusters_to_soc
from src.taxonomy.medoids import pick_medoids_simple, map_medoids_to_onet

# Project configuration, read through the project's own loader.
cfg = load_config()

# Version tag embedded in the embedding and jobs artifact file names.
CLEAN_VERSION = "boiler_v1_len50_cap3000"

# Artifact locations under <PROJECT_ROOT>/results.
EMB_PATH = PROJECT_ROOT.joinpath("results", "embeddings", f"embeddings_{CLEAN_VERSION}.npy")
JOBS_PATH = PROJECT_ROOT.joinpath("results", "embeddings", f"jobs_{CLEAN_VERSION}.parquet")
# NOTE(review): the labels file name carries no CLEAN_VERSION tag — confirm it
# was produced from the same embeddings version.
LABELS_PATH = PROJECT_ROOT.joinpath("results", "clusters", "kmeans_labels.npy")

# Cell output: the three resolved paths.
(EMB_PATH, JOBS_PATH, LABELS_PATH)



(WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/embeddings/embeddings_boiler_v1_len50_cap3000.npy'),
 WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/embeddings/jobs_boiler_v1_len50_cap3000.parquet'),
 WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/clusters/kmeans_labels.npy'))

Cell 2 – Load embeddings, labels, and jobs

In [6]:
# Cell 2 — Read the embedding matrix, the k-means labels and the jobs table,
# then confirm that all three have the same number of rows.

embeddings = np.load(EMB_PATH)
km_labels = np.load(LABELS_PATH)

# Jobs come from the parquet stored next to the embeddings (same CLEAN_VERSION
# tag). Only row counts are verified below, not row order.
jobs = pd.read_parquet(JOBS_PATH)

print(f"Embeddings shape: {embeddings.shape}")
print(f"Labels shape: {km_labels.shape}")
print(f"Jobs shape: {jobs.shape}")

# All three sources must agree on the number of rows.
row_counts = (embeddings.shape[0], len(km_labels), len(jobs))
assert len(set(row_counts)) == 1, "❌ Row mismatch: embeddings, labels, and jobs are not aligned"
print("✅ Alignment OK")


Embeddings shape: (23839, 384)
Labels shape: (23839,)
Jobs shape: (23839, 17)
✅ Alignment OK


Cell 3 — Load O*NET data

In [7]:
# Load the O*NET table through the project loader and report its size.
onet_df = load_onet()
print("O*NET shape:", onet_df.shape)

# Only the last expression of a cell is rendered automatically, so the preview
# must be displayed explicitly; as a bare mid-cell expression it was discarded.
display(onet_df.head())

# Cell output: the distinct values of the "Title" column.
onet_df["Title"].unique()

O*NET shape: (1016, 3)


array(['Chief Executives', 'Chief Sustainability Officers',
       'General and Operations Managers', ..., 'Infantry',
       'Special Forces',
       'Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other'],
      shape=(1016,), dtype=object)

Cell 4 — Apply clean_text to the O*NET titles and descriptions

In [8]:
# Column names: values under cfg["taxonomy"] win, otherwise use the defaults.
taxonomy_cfg = cfg["taxonomy"] if "taxonomy" in cfg else {}
title_col = taxonomy_cfg.get("title_column", "Title")
desc_col = taxonomy_cfg.get("desc_column", "Description")

# Work on a copy so the frame loaded earlier stays untouched.
onet_clean = onet_df.copy()

# Apply the shared cleaning routine to both text columns.
# Missing values are replaced with "" first: a bare astype(str) would turn them
# into the literal text "nan", which would then pass the empty-string filter.
onet_clean[title_col] = (
    onet_clean[title_col]
    .fillna("")
    .astype(str)
    .map(lambda x: clean_text(x, mode="onet_title"))
)
onet_clean[desc_col] = (
    onet_clean[desc_col]
    .fillna("")
    .astype(str)
    .map(lambda x: clean_text(x, mode="onet_desc"))
)

# Keep only rows whose title AND description are non-blank after cleaning.
has_title = onet_clean[title_col].str.strip().ne("")
has_desc = onet_clean[desc_col].str.strip().ne("")
onet_clean = onet_clean.loc[has_title & has_desc].reset_index(drop=True)

print("✅ Cleaned O*NET shape:", onet_clean.shape)
print("Sample O*NET title:", onet_clean[title_col].iloc[0])
print("Sample O*NET desc:", onet_clean[desc_col].iloc[0][:300])


✅ Cleaned O*NET shape: (1016, 3)
Sample O*NET title: Chief Executives
Sample O*NET desc: Determine and formulate policies and provide overall direction of companies or private and public sector organizations within guidelines set up by a board of directors or similar governing body. Plan, direct, or coordinate operational activities at the highest level of management with the help of su


Cell 5  — Build & Save O*NET embeddings (cleaned)

In [9]:
# Embed the CLEANED O*NET frame from Cell 4. The raw `onet_df` was being
# embedded, which left `onet_clean` unused despite the "(cleaned)" cell title
# and the "*_clean_v1" output file names.
onet_emb = build_onet_embeddings(onet_clean)

# Later cells pass `onet_emb` together with `onet_df` (presumably matched row
# by row), so the cleaning step must not have removed any rows.
assert len(onet_clean) == len(onet_df), (
    "Cleaning dropped O*NET rows: onet_emb would no longer line up with onet_df"
)
print("O*NET embeddings shape:", onet_emb.shape)


Batches: 100%|██████████| 16/16 [00:23<00:00,  1.47s/it]

O*NET embeddings shape: (1016, 384)





In [10]:
# Output locations for the O*NET embeddings and the frame they belong to.
ONET_EMB_PATH = PROJECT_ROOT / "results" / "embeddings" / "onet_embeddings_clean_v1.npy"
ONET_DF_PATH  = PROJECT_ROOT / "results" / "embeddings" / "onet_df_clean_v1.parquet"
ONET_EMB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Validate BEFORE writing, so mismatched artifacts never reach disk.
assert onet_emb.shape[0] == len(onet_clean), (
    "O*NET embeddings and cleaned O*NET frame have different row counts"
)

np.save(ONET_EMB_PATH, onet_emb)
# The file name promises the cleaned frame, so persist `onet_clean` rather
# than the raw `onet_df`.
onet_clean.to_parquet(ONET_DF_PATH, index=False)
print("✅ Saved:", ONET_EMB_PATH)
print("✅ Saved:", ONET_DF_PATH)

✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\embeddings\onet_embeddings_clean_v1.npy
✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\embeddings\onet_df_clean_v1.parquet


In [11]:
# Sanity check: the embedding matrix has one row per O*NET record.
assert len(onet_df) == onet_emb.shape[0]

Cell 6 — Map clusters to SOC codes using O*NET embeddings

In [12]:
# Select representative rows via the project helper (the recorded output shows
# 100 rows, presumably one per cluster). The text column name comes from the
# project configuration.
job_text_col = cfg["jobs"]["text_column"]
medoids_df = pick_medoids_simple(
    jobs=jobs, embeddings=embeddings, labels=km_labels, text_col=job_text_col
)

print("medoids_df:", medoids_df.shape)
medoids_df.head()


medoids_df: (100, 21)


Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,...,speciality,employee_count,follower_count,time_recorded,cluster,speciality_clean,cluster_id,job_row_index,sim_to_centroid,rep_text
0,65935,Virtelligence,Virtelligence is a privately held Global Healt...,3.0,Minnesota,US,Minneapolis,55304,6216 Baker Rd,https://www.linkedin.com/company/virtelligence,...,Integration and Technical Services,112,49848,1713285360,0,Integration and Technical Services,0,4357,0.735529,Virtelligence is a privately held Global Healt...
1,53179204,Likely Inc.,Fueling our passion for sustainable packaging ...,,0,US,Los Angeles,0,0,https://www.linkedin.com/company/likely-inc,...,,12,633,1713493510,1,Unknown,1,19673,0.841559,Fueling our passion for sustainable packaging ...
2,79836060,ABA Centers of Florida,We are a company founded by a mother of two da...,1.0,Florida,US,Fort Pierce,0,0,https://www.linkedin.com/company/aba-centers-o...,...,Autism,57,1102,1713407603,2,Autism,2,21627,0.667679,We are a company founded by a mother of two da...
3,3598340,BOWA Construction,We are a General Contractor and Construction M...,2.0,Illinois,US,Chicago,60601,180 N Stetson Ave,https://www.linkedin.com/company/the-bowa-grou...,...,General Contracting,68,9635,1712406114,3,General Contracting,3,13304,0.815158,We are a General Contractor and Construction M...
4,2857693,Empowered Staffing,Our recruiting team specializes in Digital Mar...,1.0,Illinois,US,Evanston,60201,"600 W Davis Street, Suite 3W",https://www.linkedin.com/company/empoweredstaf...,...,Recruiting,10,13405,1712351533,4,Recruiting,4,12484,0.879051,Our recruiting team specializes in Digital Mar...


In [13]:
# Row positions of the medoids, as an integer array.
medoid_idx = np.asarray(medoids_df["job_row_index"].astype(int))
# Gather the matching rows of the embedding matrix.
medoid_emb = np.take(embeddings, medoid_idx, axis=0)

print("medoid_emb:", medoid_emb.shape)  # (n_clusters, D) -> ex: (100, 384)


medoid_emb: (100, 384)


In [14]:
# Match every medoid against the O*NET embeddings via the project helper,
# requesting the 3 best matches per medoid (long format).
onet_match_kwargs = dict(
    medoids_df=medoids_df,
    medoid_emb=medoid_emb,
    onet_df=onet_df,
    onet_emb=onet_emb,
    top_k=3,
)
mapping_medoid_long = map_medoids_to_onet(**onet_match_kwargs)

print("mapping_medoid_long:", mapping_medoid_long.shape)  #  100*3=300
mapping_medoid_long.head()


mapping_medoid_long: (300, 8)


Unnamed: 0,cluster_id,rank,similarity,soc_code,soc_title,onet_description,job_id,rep_text
0,0,1,0.339622,13-1199.00,"Business Operations Specialists, All Other",All business operations specialists not listed...,,Virtelligence is a privately held Global Healt...
1,0,2,0.332897,13-2099.00,"Financial Specialists, All Other",All financial specialists not listed separately.,,Virtelligence is a privately held Global Healt...
2,0,3,0.314453,43-4141.00,New Accounts Clerks,Interview persons desiring to open accounts in...,,Virtelligence is a privately held Global Healt...
3,1,1,0.494696,53-7064.00,"Packers and Packagers, Hand",Pack or package by hand a wide variety of prod...,,Fueling our passion for sustainable packaging ...
4,1,2,0.449849,27-1021.00,Commercial and Industrial Designers,"Design and develop manufactured products, such...",,Fueling our passion for sustainable packaging ...


Cell 7 — Attach each cluster's top-1 SOC match to every job and save the result (Top1)

In [15]:
# Order the matches by cluster, best similarity first ...
matches_best_first = mapping_medoid_long.sort_values(
    by=["cluster_id", "similarity"], ascending=[True, False]
)
# ... then keep the first (highest-similarity) row of every cluster.
top1_soc = (
    matches_best_first
    .groupby("cluster_id", as_index=False)
    .head(1)
    .reset_index(drop=True)
)

print("top1_soc:", top1_soc.shape)  #  (100, ?)
top1_soc.head()


top1_soc: (100, 8)


Unnamed: 0,cluster_id,rank,similarity,soc_code,soc_title,onet_description,job_id,rep_text
0,0,1,0.339622,13-1199.00,"Business Operations Specialists, All Other",All business operations specialists not listed...,,Virtelligence is a privately held Global Healt...
1,1,1,0.494696,53-7064.00,"Packers and Packagers, Hand",Pack or package by hand a wide variety of prod...,,Fueling our passion for sustainable packaging ...
2,2,1,0.287339,21-2021.00,"Directors, Religious Activities and Education",Coordinate or design programs and conduct outr...,,We are a company founded by a mother of two da...
3,3,1,0.526783,11-9021.00,Construction Managers,"Plan, direct, or coordinate, usually through s...",,We are a General Contractor and Construction M...
4,4,1,0.441725,27-2012.04,Talent Directors,Audition and interview performers to select mo...,,Our recruiting team specializes in Digital Mar...


In [16]:
# Copy the jobs table so the original frame is not modified.
jobs_labeled = jobs.copy()

# Attach the k-means labels unless the table already carries a cluster_id column.
if "cluster_id" not in jobs_labeled.columns:
    jobs_labeled["cluster_id"] = km_labels

# Left-join the top-1 SOC match of each cluster onto every job.
# validate="many_to_one" makes pandas raise if `top1_soc` ever holds more than
# one row per cluster_id, which would silently duplicate job rows.
soc_columns = ["cluster_id", "soc_code", "soc_title", "similarity", "onet_description"]
jobs_with_soc = jobs_labeled.merge(
    top1_soc[soc_columns],
    on="cluster_id",
    how="left",
    validate="many_to_one",
)

print("jobs_with_soc shape:", jobs_with_soc.shape)  # rows should equal len(jobs)
jobs_with_soc.head()



jobs_with_soc shape: (23839, 22)


Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,...,employee_count,follower_count,time_recorded,cluster,speciality_clean,cluster_id,soc_code,soc_title,similarity,onet_description
0,1009,IBM,"At IBM, we do more than work. We create. We cr...",7.0,NY,US,"Armonk, New York",10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm,...,314102,16253625,1712378162,36,Cloud,36,15-1211.01,Health Informatics Specialists,0.348848,Apply knowledge of nursing and informatics to ...
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,0,US,Chicago,0,-,https://www.linkedin.com/company/gehealthcare,...,56873,2185368,1712382540,66,Healthcare,66,15-1211.01,Health Informatics Specialists,0.514157,Apply knowledge of nursing and informatics to ...
2,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...,...,79528,3586194,1712870106,79,Unknown,79,15-1243.01,Data Warehousing Specialists,0.391347,"Design, model, or implement corporate data war..."
3,1028,Oracle,We’re a cloud technology company that provides...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle,...,192099,9465968,1712642952,79,enterprise,79,15-1243.01,Data Warehousing Specialists,0.391347,"Design, model, or implement corporate data war..."
4,1033,Accenture,Accenture is a leading global professional ser...,7.0,0,IE,Dublin 2,0,Grand Canal Harbour,https://www.linkedin.com/company/accenture,...,574664,11864908,1712641699,36,Management Consulting,36,15-1211.01,Health Informatics Specialists,0.348848,Apply knowledge of nursing and informatics to ...


In [17]:
# Write the job-level top-1 SOC table to results/reports as parquet and CSV.
OUT_PARQUET = PROJECT_ROOT.joinpath("results", "reports", "jobs_with_soc_top1.parquet")
OUT_CSV = OUT_PARQUET.with_suffix(".csv")  # same directory and stem, .csv extension
OUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)

jobs_with_soc.to_parquet(OUT_PARQUET, index=False)
jobs_with_soc.to_csv(OUT_CSV, index=False)

for saved_path in (OUT_PARQUET, OUT_CSV):
    print("✅ Saved:", saved_path)


✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\jobs_with_soc_top1.parquet
✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\jobs_with_soc_top1.csv


Cell 8 — Attach each cluster's top-3 SOC matches to every job and save the result (Top3)

In [19]:
import pandas as pd
from datetime import datetime


def _as_list(values):
    """Return one group's column values as a plain Python list."""
    return values.tolist()


# One row per cluster, holding the SOC codes, titles and similarities of its
# matches as lists ordered by rank.
# Named aggregation replaces groupby.apply with a Series-returning lambda: it
# never touches the grouping column, so it avoids the warning apply emits, and
# it states the output columns explicitly.
cluster_top3 = (
    mapping_medoid_long
    .sort_values(["cluster_id", "rank"])
    .groupby("cluster_id")
    .agg(
        soc_codes_top3=("soc_code", _as_list),
        soc_titles_top3=("soc_title", _as_list),
        soc_sims_top3=("similarity", _as_list),
    )
    .reset_index()
)

print("cluster_top3:", cluster_top3.shape)
cluster_top3.head()


cluster_top3: (100, 4)


  .apply(lambda g: pd.Series({


Unnamed: 0,cluster_id,soc_codes_top3,soc_titles_top3,soc_sims_top3
0,0,"[13-1199.00, 13-2099.00, 43-4141.00]","[Business Operations Specialists, All Other, F...","[0.3396218717098236, 0.33289700746536255, 0.31..."
1,1,"[53-7064.00, 27-1021.00, 43-5011.00]","[Packers and Packagers, Hand, Commercial and I...","[0.49469608068466187, 0.44984930753707886, 0.3..."
2,2,"[21-2021.00, 41-9041.00, 45-1011.00]","[Directors, Religious Activities and Education...","[0.28733888268470764, 0.27596431970596313, 0.2..."
3,3,"[11-9021.00, 47-2031.00, 17-3022.00]","[Construction Managers, Carpenters, Civil Engi...","[0.5267828702926636, 0.4581689238548279, 0.421..."
4,4,"[27-2012.04, 13-1161.00, 13-1011.00]","[Talent Directors, Market Research Analysts an...","[0.44172537326812744, 0.42309820652008057, 0.4..."


In [20]:
# Copy the jobs table and attach the k-means labels unless a cluster_id column
# is already present.
jobs_labeled = jobs.copy()
if "cluster_id" not in jobs_labeled.columns:
    jobs_labeled["cluster_id"] = km_labels

# Left-join the per-cluster top-3 SOC lists onto every job.
# validate="many_to_one" makes pandas raise if `cluster_top3` ever contains a
# duplicated cluster_id, which would silently multiply job rows.
jobs_with_soc_top3 = jobs_labeled.merge(
    cluster_top3,
    on="cluster_id",
    how="left",
    validate="many_to_one",
)

print("jobs_with_soc_top3 shape:", jobs_with_soc_top3.shape)  # number of rows should equal jobs
jobs_with_soc_top3[["cluster_id", "soc_codes_top3", "soc_sims_top3"]].head()


jobs_with_soc_top3 shape: (23839, 21)


Unnamed: 0,cluster_id,soc_codes_top3,soc_sims_top3
0,36,"[15-1211.01, 17-2112.01, 15-1244.00]","[0.3488484025001526, 0.34169721603393555, 0.32..."
1,66,"[15-1211.01, 29-9021.00, 29-2099.08]","[0.5141565799713135, 0.5084431767463684, 0.485..."
2,79,"[15-1243.01, 15-2051.00, 15-2051.01]","[0.3913465738296509, 0.37844517827033997, 0.36..."
3,79,"[15-1243.01, 15-2051.00, 15-2051.01]","[0.3913465738296509, 0.37844517827033997, 0.36..."
4,36,"[15-1211.01, 17-2112.01, 15-1244.00]","[0.3488484025001526, 0.34169721603393555, 0.32..."


In [21]:
# Persist the top-3 table as parquet and CSV. Each run gets fresh, timestamped
# file names, so an earlier output is never overwritten (the original note
# cites avoiding PermissionError).
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
OUT_TOP3_PARQ = PROJECT_ROOT.joinpath("results", "reports", f"jobs_with_soc_top3_{stamp}.parquet")
OUT_TOP3_CSV = OUT_TOP3_PARQ.with_suffix(".csv")  # same directory and stem, .csv extension
OUT_TOP3_PARQ.parent.mkdir(parents=True, exist_ok=True)

jobs_with_soc_top3.to_parquet(OUT_TOP3_PARQ, index=False)
jobs_with_soc_top3.to_csv(OUT_TOP3_CSV, index=False)

for saved_path in (OUT_TOP3_PARQ, OUT_TOP3_CSV):
    print("✅ Saved:", saved_path)

✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\jobs_with_soc_top3_20260114_015651.parquet
✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\jobs_with_soc_top3_20260114_015651.csv


Cell 9 — Build a watchlist of the 15 clusters with the lowest top-1 similarity

In [22]:
# Best (highest-similarity) O*NET match of every cluster.
matches_by_similarity = mapping_medoid_long.sort_values(
    by=["cluster_id", "similarity"], ascending=[True, False]
)
cluster_top1 = (
    matches_by_similarity
    .groupby("cluster_id", as_index=False)
    .head(1)
    .reset_index(drop=True)
)

# Number of jobs assigned to each cluster.
cluster_sizes = (
    jobs_labeled
    .groupby("cluster_id")
    .size()
    .rename("cluster_size")
    .reset_index()
)

# Top-1 match plus cluster size, one row per cluster.
cluster_watch_base = pd.merge(cluster_top1, cluster_sizes, on="cluster_id", how="left")

# The 15 clusters with the lowest top-1 similarity; ties go to the smaller cluster.
watchlist_15 = (
    cluster_watch_base
    .sort_values(by=["similarity", "cluster_size"])
    .head(15)
    .reset_index(drop=True)
)

print("watchlist_15:", watchlist_15.shape)
watchlist_15.loc[:, ["cluster_id", "soc_code", "soc_title", "similarity", "cluster_size"]]


watchlist_15: (15, 9)


Unnamed: 0,cluster_id,soc_code,soc_title,similarity,cluster_size
0,71,15-1243.00,Database Architects,0.234149,203
1,2,21-2021.00,"Directors, Religious Activities and Education",0.287339,294
2,74,39-6012.00,Concierges,0.305731,234
3,84,15-1253.00,Software Quality Assurance Analysts and Testers,0.324978,302
4,45,29-1051.00,Pharmacists,0.327307,251
5,41,25-9031.00,Instructional Coordinators,0.333435,239
6,8,49-3091.00,Bicycle Repairers,0.339261,244
7,0,13-1199.00,"Business Operations Specialists, All Other",0.339622,122
8,92,13-1199.00,"Business Operations Specialists, All Other",0.344641,211
9,36,15-1211.01,Health Informatics Specialists,0.348848,282


In [23]:
# Write the watchlist to a CSV whose name carries the current timestamp, so
# every run produces a new file.
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
OUT_WATCH = PROJECT_ROOT.joinpath("results", "reports", f"cluster_watchlist_low_sim_15_{stamp}.csv")
OUT_WATCH.parent.mkdir(parents=True, exist_ok=True)

watchlist_15.to_csv(OUT_WATCH, index=False)
print("✅ Saved:", OUT_WATCH)


✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\cluster_watchlist_low_sim_15_20260114_015656.csv
