Cell 1 – Setup

In [25]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import load_config
from src.data.loader import load_jobs
from src.taxonomy.mapping import load_onet, build_onet_embeddings, map_clusters_to_soc

cfg = load_config()

# ✅ 버전 고정 (02에서 만든 것과 동일해야 함)
CLEAN_VERSION = "boiler_v1_len50_cap3000"

EMB_PATH   = PROJECT_ROOT / "results" / "embeddings" / f"embeddings_{CLEAN_VERSION}.npy"
JOBS_PATH  = PROJECT_ROOT / "results" / "embeddings" / f"jobs_{CLEAN_VERSION}.parquet"
LABELS_PATH = PROJECT_ROOT / "results" / "clusters" / "kmeans_labels.npy"

EMB_PATH, JOBS_PATH, LABELS_PATH



(WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/embeddings/embeddings_boiler_v1_len50_cap3000.npy'),
 WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/embeddings/jobs_boiler_v1_len50_cap3000.parquet'),
 WindowsPath('C:/Users/hisuk/labor-market-nlp-prototype/results/clusters/kmeans_labels.npy'))

Cell 2 – Load embeddings, labels, and jobs

In [26]:
# Cell 2 — Load embeddings, labels, and jobs (and validate alignment)

embeddings = np.load(EMB_PATH)
km_labels  = np.load(LABELS_PATH)

# jobs는 반드시 parquet로 저장한 clean 버전 사용 (row alignment 안전)
jobs = pd.read_parquet(JOBS_PATH)

print("embeddings:", embeddings.shape)
print("labels:", km_labels.shape)
print("jobs:", jobs.shape)

assert embeddings.shape[0] == len(km_labels) == len(jobs), "❌ Mismatch: rows not aligned"
print("✅ Alignment OK")


embeddings: (23838, 384)
labels: (23838,)
jobs: (23838, 17)
✅ Alignment OK


In [27]:
onet_df = load_onet()
print("O*NET shape:", onet_df.shape)
onet_df.head()
onet_df["Title"].unique()

O*NET shape: (1016, 3)


array(['Chief Executives', 'Chief Sustainability Officers',
       'General and Operations Managers', ..., 'Infantry',
       'Special Forces',
       'Military Enlisted Tactical Operations and Air/Weapons Specialists and Crew Members, All Other'],
      shape=(1016,), dtype=object)

In [28]:
import re

def normalize_text(s: str) -> str:
    if s is None:
        return ""
    s = str(s)
    s = (s.replace("â€™", "'")
           .replace("â€œ", '"')
           .replace("â€", '"')
           .replace("â€“", "-")
           .replace("â€”", "-"))
    s = re.sub(r"\s+", " ", s).strip()
    return s

# O*NET용: 너무 공격적으로 자르지 말고, 흔한 잡음만 제거
ONET_BOILER_PATTERNS = [
    r"\bexamples? of\b.*",                # "Examples of..." 뒤에 길게 붙는 경우
    r"\bmay include\b.*",                 # "may include..." 뒤에 길게
    r"\brelated occupations?\b.*",
    r"\bfor more information\b.*",
    r"\bvisit\b.*\bwww\..*",              # URL 라인
    r"https?://\S+",
]

ONET_BOILER_REGEX = re.compile("|".join(f"(?:{p})" for p in ONET_BOILER_PATTERNS), flags=re.IGNORECASE)

def clean_onet_description(desc: str, cap: int = 2000) -> str:
    t = normalize_text(desc)
    t = ONET_BOILER_REGEX.sub(" ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t[:cap]

def clean_onet_title(title: str) -> str:
    t = normalize_text(title)
    # 괄호 안 보조설명 제거 (너무 과하면 주석 처리)
    t = re.sub(r"\s*\([^)]*\)\s*", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

# 컬럼명은 네 onet_df에 맞춰서 확인 (대문자 Title/Description 가정)
# cfg["taxonomy"]에 title_column/desc_column 있으면 그걸 쓰자
title_col = cfg["taxonomy"].get("title_column", "Title") if "taxonomy" in cfg else "Title"
desc_col  = cfg["taxonomy"].get("desc_column", "Description") if "taxonomy" in cfg else "Description"

onet_df = onet_df.copy()
onet_df[title_col] = onet_df[title_col].astype(str).map(clean_onet_title)
onet_df[desc_col]  = onet_df[desc_col].astype(str).map(clean_onet_description)

# 결측/빈 문자열 제거
mask = onet_df[title_col].str.strip().ne("") & onet_df[desc_col].str.strip().ne("")
onet_df = onet_df.loc[mask].reset_index(drop=True)

print("✅ Cleaned O*NET shape:", onet_df.shape)
print("Sample O*NET title:", onet_df[title_col].iloc[0])
print("Sample O*NET desc:", onet_df[desc_col].iloc[0][:300])


✅ Cleaned O*NET shape: (1016, 3)
Sample O*NET title: Chief Executives
Sample O*NET desc: Determine and formulate policies and provide overall direction of companies or private and public sector organizations within guidelines set up by a board of directors or similar governing body. Plan, direct, or coordinate operational activities at the highest level of management with the help of su


In [29]:
# Cell 3 — Build O*NET embeddings (cleaned)

onet_emb = build_onet_embeddings(onet_df)
print("O*NET embeddings shape:", onet_emb.shape)


Batches: 100%|██████████| 16/16 [00:10<00:00,  1.55it/s]

O*NET embeddings shape: (1016, 384)





In [30]:
ONET_EMB_PATH = PROJECT_ROOT / "results" / "embeddings" / "onet_embeddings_clean_v1.npy"
ONET_DF_PATH  = PROJECT_ROOT / "results" / "embeddings" / "onet_df_clean_v1.parquet"
ONET_EMB_PATH.parent.mkdir(parents=True, exist_ok=True)

np.save(ONET_EMB_PATH, onet_emb)
onet_df.to_parquet(ONET_DF_PATH, index=False)
print("✅ Saved:", ONET_EMB_PATH)
print("✅ Saved:", ONET_DF_PATH)

✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\embeddings\onet_embeddings_clean_v1.npy
✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\embeddings\onet_df_clean_v1.parquet


In [31]:
assert onet_emb.shape[0] == len(onet_df)


In [44]:
import numpy as np
import pandas as pd

def pick_medoids_simple(
    jobs: pd.DataFrame,
    embeddings: np.ndarray,
    labels: np.ndarray,
    text_col: str = "description"
) -> pd.DataFrame:
    """
    Pick 1 medoid (closest job to centroid) per cluster.
    """
    rows = []

    for cid in np.unique(labels):
        idx = np.where(labels == cid)[0]
        if len(idx) == 0:
            continue

        emb_c = embeddings[idx]
        centroid = emb_c.mean(axis=0)
        centroid = centroid / np.linalg.norm(centroid)

        sims = emb_c @ centroid
        best = idx[np.argmax(sims)]

        row = jobs.iloc[best].to_dict()
        row.update({
            "cluster_id": int(cid),
            "job_row_index": int(best),
            "sim_to_centroid": float(np.max(sims)),
            "rep_text": jobs.iloc[best][text_col]
        })
        rows.append(row)

    return pd.DataFrame(rows)


In [45]:
medoids_df = pick_medoids_simple(
    jobs=jobs,
    embeddings=embeddings,
    labels=km_labels,
    text_col=cfg["jobs"]["text_column"]
)

print("medoids_df:", medoids_df.shape)
medoids_df.head()


medoids_df: (100, 21)


Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,...,speciality,employee_count,follower_count,time_recorded,cluster,speciality_clean,cluster_id,job_row_index,sim_to_centroid,rep_text
0,74519793,Viva Tech Solutions,We are an IT consulting firm with a combined i...,1.0,Ontario,CA,Toronto,M5B 2H1,220 Yonge St,https://www.linkedin.com/company/viva-tech-sol...,...,Training And Deployment,37,34007,1712363140,0,Training And Deployment,0,20973,0.812216,We are an IT consulting firm with a combined i...
1,53179204,Likely Inc.,Fueling our passion for sustainable packaging ...,,0,US,Los Angeles,0,0,https://www.linkedin.com/company/likely-inc,...,,12,633,1713493510,1,Unknown,1,19674,0.836923,Fueling our passion for sustainable packaging ...
2,19647,Technologent,"Technologent is a women-owned, WBENC-certified...",3.0,California,US,Irvine,92618,100 Spectrum Center Dr,https://www.linkedin.com/company/technologent,...,Data Center Integration,399,12089,1712346769,2,Data Center Integration,2,2373,0.762745,"Technologent is a women-owned, WBENC-certified..."
3,95294887,NeueHealth,experience that maximizes value for health con...,5.0,Florida,US,Doral,33178,0,https://www.linkedin.com/company/neuehealth,...,Value-based care,658,24779,1713555347,3,Value-based care,3,22990,0.744237,experience that maximizes value for health con...
4,11494597,BioTalent,"We are BioTalent, the life sciences recruitmen...",2.0,England,GB,London,E1 6EG,10 Bishops Square,https://www.linkedin.com/company/bio-talent,...,Quality,72,167731,1712652038,4,Quality,4,16319,0.825447,"We are BioTalent, the life sciences recruitmen..."


In [54]:
def map_medoids_to_onet(
    medoids_df: pd.DataFrame,
    medoid_emb: np.ndarray,
    onet_df: pd.DataFrame,
    onet_emb: np.ndarray,
    top_k: int = 3
) -> pd.DataFrame:
    rows = []

    for i, row in medoids_df.iterrows():
        sims = onet_emb @ medoid_emb[i]
        top_idx = np.argsort(sims)[::-1][:top_k]

        for rank, j in enumerate(top_idx):
            rows.append({
                "cluster_id": int(row["cluster_id"]),
                "rank": rank + 1,
                "similarity": float(sims[j]),
                "soc_code": onet_df.iloc[j]["O*NET-SOC Code"],
                "soc_title": onet_df.iloc[j]["Title"],
                "onet_description": onet_df.iloc[j]["Description"],
                "job_id": row.get("job_id"),
                "rep_text": row["rep_text"],
            })

    return pd.DataFrame(rows)


In [55]:
medoid_idx = medoids_df["job_row_index"].astype(int).to_numpy()
medoid_emb = embeddings[medoid_idx]

print("medoid_emb:", medoid_emb.shape)  # (n_clusters, D) -> 예: (100, 384)


medoid_emb: (100, 384)


In [56]:
mapping_medoid_long = map_medoids_to_onet(
    medoids_df=medoids_df,
    medoid_emb=medoid_emb,
    onet_df=onet_df,
    onet_emb=onet_emb,
    top_k=3
)

print("mapping_medoid_long:", mapping_medoid_long.shape)  # 보통 100*3=300
mapping_medoid_long.head()


mapping_medoid_long: (300, 8)


Unnamed: 0,cluster_id,rank,similarity,soc_code,soc_title,onet_description,job_id,rep_text
0,0,1,0.40614,11-3021.00,Computer and Information Systems Managers,"Plan, direct, or coordinate activities in such...",,We are an IT consulting firm with a combined i...
1,0,2,0.390967,15-1299.09,Information Technology Project Managers,"Plan, initiate, and manage information technol...",,We are an IT consulting firm with a combined i...
2,0,3,0.377403,21-1012.00,"Educational, Guidance, and Career Counselors a...",Advise and assist students and provide educati...,,We are an IT consulting firm with a combined i...
3,1,1,0.494696,53-7064.00,"Packers and Packagers, Hand",Pack or package by hand a wide variety of prod...,,Fueling our passion for sustainable packaging ...
4,1,2,0.449849,27-1021.00,Commercial and Industrial Designers,"Design and develop manufactured products, such...",,Fueling our passion for sustainable packaging ...


In [57]:
top1_soc = (
    mapping_medoid_long
    .sort_values(["cluster_id", "similarity"], ascending=[True, False])
    .groupby("cluster_id", as_index=False)
    .head(1)
    .reset_index(drop=True)
)

print("top1_soc:", top1_soc.shape)  # 보통 (100, ?)
top1_soc.head()


top1_soc: (100, 8)


Unnamed: 0,cluster_id,rank,similarity,soc_code,soc_title,onet_description,job_id,rep_text
0,0,1,0.40614,11-3021.00,Computer and Information Systems Managers,"Plan, direct, or coordinate activities in such...",,We are an IT consulting firm with a combined i...
1,1,1,0.494696,53-7064.00,"Packers and Packagers, Hand",Pack or package by hand a wide variety of prod...,,Fueling our passion for sustainable packaging ...
2,2,1,0.455119,29-2055.00,Surgical Technologists,"Assist in operations, under the supervision of...",,"Technologent is a women-owned, WBENC-certified..."
3,3,1,0.503097,29-2099.08,Patient Representatives,"Assist patients in obtaining services, underst...",,experience that maximizes value for health con...
4,4,1,0.441985,19-1042.00,"Medical Scientists, Except Epidemiologists",Conduct research dealing with the understandin...,,"We are BioTalent, the life sciences recruitmen..."


In [58]:
jobs_labeled = jobs.copy()

if "cluster_id" not in jobs_labeled.columns:
    jobs_labeled["cluster_id"] = km_labels

# merge (job row 수 유지)
jobs_with_soc = jobs_labeled.merge(
    top1_soc[["cluster_id", "soc_code", "soc_title", "similarity", "onet_description"]],
    on="cluster_id",
    how="left"
)

print("jobs_with_soc:", jobs_with_soc.shape)  # rows = len(jobs)
jobs_with_soc.head()


jobs_with_soc: (23838, 22)


Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,...,employee_count,follower_count,time_recorded,cluster,speciality_clean,cluster_id,soc_code,soc_title,similarity,onet_description
0,1009,IBM,"At IBM, we do more than work. We create. We cr...",7.0,NY,US,"Armonk, New York",10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm,...,314102,16253625,1712378162,54,Cloud,54,15-1221.00,Computer and Information Research Scientists,0.372814,Conduct research into fundamental computer and...
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,0,US,Chicago,0,-,https://www.linkedin.com/company/gehealthcare,...,56873,2185368,1712382540,72,Healthcare,72,15-1211.01,Health Informatics Specialists,0.514157,Apply knowledge of nursing and informatics to ...
2,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...,...,79528,3586194,1712870106,81,Unknown,81,15-1299.07,Blockchain Engineers,0.362875,Maintain and support distributed and decentral...
3,1028,Oracle,We’re a cloud technology company that provides...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle,...,192099,9465968,1712642952,81,enterprise,81,15-1299.07,Blockchain Engineers,0.362875,Maintain and support distributed and decentral...
4,1033,Accenture,"experience, functional expertise and global de...",7.0,0,IE,Dublin 2,0,Grand Canal Harbour,https://www.linkedin.com/company/accenture,...,574664,11864908,1712641699,83,Management Consulting,83,41-3091.00,"Sales Representatives of Services, Except Adve...",0.460762,Sell services to individuals or businesses. Ma...


In [59]:
OUT_PARQUET = PROJECT_ROOT / "results" / "reports" / "jobs_with_soc_top1.parquet"
OUT_CSV     = PROJECT_ROOT / "results" / "reports" / "jobs_with_soc_top1.csv"
OUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)

jobs_with_soc.to_parquet(OUT_PARQUET, index=False)
jobs_with_soc.to_csv(OUT_CSV, index=False)

print("✅ Saved:", OUT_PARQUET)
print("✅ Saved:", OUT_CSV)


✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\jobs_with_soc_top1.parquet
✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\jobs_with_soc_top1.csv


In [61]:
SAMPLES_PER_CLUSTER = 5
TEXT_COL = cfg["jobs"]["text_column"]

sample_rows = (
    jobs_with_soc
    .groupby("cluster_id", group_keys=False)
    .apply(lambda g: g.sample(n=min(SAMPLES_PER_CLUSTER, len(g)), random_state=42))
    .reset_index(drop=True)
)

# 사람이 체크할 컬럼만 남기기
keep_cols = [c for c in ["cluster_id","soc_code","soc_title","similarity",TEXT_COL] if c in sample_rows.columns]
sample_rows["human_judgement"] = ""
sample_rows["notes"] = ""

VAL_PATH = PROJECT_ROOT / "results" / "reports" / "cluster_validation_sample.csv"
sample_rows[keep_cols + ["human_judgement","notes"]].to_csv(VAL_PATH, index=False)
print("✅ Saved validation sample:", VAL_PATH)
sample_rows.head()


✅ Saved validation sample: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\cluster_validation_sample.csv


  .apply(lambda g: g.sample(n=min(SAMPLES_PER_CLUSTER, len(g)), random_state=42))


Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url,...,time_recorded,cluster,speciality_clean,cluster_id,soc_code,soc_title,similarity,onet_description,human_judgement,notes
0,41771,"Akraya, Inc.",Akraya stands as a preeminent provider of IT c...,3.0,California,US,San Francisco,94114,2261 Market St,https://www.linkedin.com/company/akraya-inc,...,1712643099,0,Systems Integration,0,11-3021.00,Computer and Information Systems Managers,0.40614,"Plan, direct, or coordinate activities in such...",,
1,1097607,"ADR Application Development Resources, Inc.",We Find The Right Fit. ADR connects the right ...,3.0,0,US,Alpharetta,GA,3480 Preston Ridge Road,https://www.linkedin.com/company/adr-applicati...,...,1713491923,0,Time and Materials Contracting Services,0,11-3021.00,Computer and Information Systems Managers,0.40614,"Plan, direct, or coordinate activities in such...",,
2,75472485,AGPROfessionals,AGPROfessionals is a comprehensive agricultura...,1.0,CO,US,Greeley,80634,3050 67th Avenue,https://www.linkedin.com/company/agprofessionals,...,1713565370,0,Consulting,0,11-3021.00,Computer and Information Systems Managers,0.40614,"Plan, direct, or coordinate activities in such...",,
3,87467766,FUSTIS LLC,Fustis is a company that connects with IT cons...,1.0,California,US,Sacramento,95825,3400 Cottage Way,https://www.linkedin.com/company/fustisllc,...,1712861331,0,Unknown,0,11-3021.00,Computer and Information Systems Managers,0.40614,"Plan, direct, or coordinate activities in such...",,
4,3779159,"IdeaHelix, Inc",Salesforce Industries consulting company deliv...,3.0,CA,US,Fremont,94538,0,https://www.linkedin.com/company/ideahelix-inc,...,1713290448,0,Salesforce,0,11-3021.00,Computer and Information Systems Managers,0.40614,"Plan, direct, or coordinate activities in such...",,


In [62]:
import pandas as pd
from datetime import datetime

# 1) cluster_id별 top3를 "리스트"로 묶기
cluster_top3 = (
    mapping_medoid_long
    .sort_values(["cluster_id", "rank"])
    .groupby("cluster_id")
    .apply(lambda g: pd.Series({
        "soc_codes_top3": g["soc_code"].tolist(),
        "soc_titles_top3": g["soc_title"].tolist(),
        "soc_sims_top3": g["similarity"].tolist(),
    }))
    .reset_index()
)

print("cluster_top3:", cluster_top3.shape)
cluster_top3.head()


cluster_top3: (100, 4)


  .apply(lambda g: pd.Series({


Unnamed: 0,cluster_id,soc_codes_top3,soc_titles_top3,soc_sims_top3
0,0,"[11-3021.00, 15-1299.09, 21-1012.00]","[Computer and Information Systems Managers, In...","[0.4061403274536133, 0.3909672498703003, 0.377..."
1,1,"[53-7064.00, 27-1021.00, 43-5011.00]","[Packers and Packagers, Hand, Commercial and I...","[0.49469608068466187, 0.44984930753707886, 0.3..."
2,2,"[29-2055.00, 15-1299.02, 29-9021.00]","[Surgical Technologists, Geographic Informatio...","[0.45511913299560547, 0.39804160594940186, 0.3..."
3,3,"[29-2099.08, 11-9111.00, 21-1094.00]","[Patient Representatives, Medical and Health S...","[0.5030971169471741, 0.42231374979019165, 0.42..."
4,4,"[19-1042.00, 19-4021.00, 13-2099.00]","[Medical Scientists, Except Epidemiologists, B...","[0.44198518991470337, 0.4296797215938568, 0.41..."


In [63]:
# 2) jobs에 cluster_id가 없으면 만들기
jobs_labeled = jobs.copy()
if "cluster_id" not in jobs_labeled.columns:
    jobs_labeled["cluster_id"] = km_labels

# 3) jobs에 top3 리스트 붙이기
jobs_with_soc_top3 = jobs_labeled.merge(cluster_top3, on="cluster_id", how="left")

print("jobs_with_soc_top3:", jobs_with_soc_top3.shape)  # rows = jobs rows 유지
jobs_with_soc_top3[["cluster_id", "soc_codes_top3", "soc_sims_top3"]].head()


jobs_with_soc_top3: (23838, 21)


Unnamed: 0,cluster_id,soc_codes_top3,soc_sims_top3
0,54,"[15-1221.00, 15-1212.00, 15-1299.09]","[0.3728139102458954, 0.36662083864212036, 0.35..."
1,72,"[15-1211.01, 29-9021.00, 29-2099.08]","[0.5141565799713135, 0.5084431767463684, 0.485..."
2,81,"[15-1299.07, 15-1243.00, 15-1243.01]","[0.3628748059272766, 0.3615328371524811, 0.320..."
3,81,"[15-1299.07, 15-1243.00, 15-1243.01]","[0.3628748059272766, 0.3615328371524811, 0.320..."
4,83,"[41-3091.00, 43-4051.00, 41-4011.07]","[0.4607622027397156, 0.3933505415916443, 0.387..."


In [64]:
# 4) 저장 (PermissionError 방지: timestamp)
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
OUT_TOP3_PARQ = PROJECT_ROOT / "results" / "reports" / f"jobs_with_soc_top3_{stamp}.parquet"
OUT_TOP3_CSV  = PROJECT_ROOT / "results" / "reports" / f"jobs_with_soc_top3_{stamp}.csv"
OUT_TOP3_PARQ.parent.mkdir(parents=True, exist_ok=True)

jobs_with_soc_top3.to_parquet(OUT_TOP3_PARQ, index=False)
jobs_with_soc_top3.to_csv(OUT_TOP3_CSV, index=False)

print("✅ Saved:", OUT_TOP3_PARQ)
print("✅ Saved:", OUT_TOP3_CSV)


✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\jobs_with_soc_top3_20260109_012840.parquet
✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\jobs_with_soc_top3_20260109_012840.csv


In [65]:
# cluster별 top1 similarity 테이블
cluster_top1 = (
    mapping_medoid_long
    .sort_values(["cluster_id", "similarity"], ascending=[True, False])
    .groupby("cluster_id", as_index=False)
    .head(1)
    .reset_index(drop=True)
)

cluster_sizes = (
    jobs_labeled.groupby("cluster_id").size().reset_index(name="cluster_size")
)

cluster_watch_base = cluster_top1.merge(cluster_sizes, on="cluster_id", how="left")

watchlist_15 = (
    cluster_watch_base
    .sort_values(["similarity", "cluster_size"], ascending=[True, True])
    .head(15)
    .reset_index(drop=True)
)

print("watchlist_15:", watchlist_15.shape)
watchlist_15[["cluster_id","soc_code","soc_title","similarity","cluster_size"]]


watchlist_15: (15, 9)


Unnamed: 0,cluster_id,soc_code,soc_title,similarity,cluster_size
0,15,15-1243.00,Database Architects,0.234149,243
1,42,21-2021.00,"Directors, Religious Activities and Education",0.287339,347
2,11,13-1161.00,Market Research Analysts and Marketing Special...,0.287931,148
3,64,49-3091.00,Bicycle Repairers,0.297698,232
4,13,19-3032.00,Industrial-Organizational Psychologists,0.318746,295
5,58,11-9033.00,"Education Administrators, Postsecondary",0.320566,48
6,22,29-1051.00,Pharmacists,0.327307,274
7,91,25-9031.00,Instructional Coordinators,0.333435,173
8,48,13-1199.00,"Business Operations Specialists, All Other",0.334827,239
9,56,13-1199.00,"Business Operations Specialists, All Other",0.344641,131


In [66]:
# 저장 (timestamp)
stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
OUT_WATCH = PROJECT_ROOT / "results" / "reports" / f"cluster_watchlist_low_sim_15_{stamp}.csv"
OUT_WATCH.parent.mkdir(parents=True, exist_ok=True)

watchlist_15.to_csv(OUT_WATCH, index=False)
print("✅ Saved:", OUT_WATCH)


✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\cluster_watchlist_low_sim_15_20260109_012902.csv
