In [7]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Put the parent of the current working directory on sys.path so that
# project-local modules can be imported.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

FINAL_PATH = PROJECT_ROOT / "results/reports/jobs_with_soc_top3_20260109_012840.csv"
# Raise explicitly instead of using `assert`: assertions are removed when
# Python runs with -O, which would silently skip this check.
if not FINAL_PATH.exists():
    raise FileNotFoundError(f"File not found: {FINAL_PATH}")

df = pd.read_csv(FINAL_PATH)

# ---- choose text column ----
# Prefer "speciality_clean"; fall back to "description".
TEXT_COL = next(
    (col for col in ("speciality_clean", "description") if col in df.columns),
    None,
)
if TEXT_COL is None:
    raise ValueError("Need a text column: speciality_clean or description")

# Fail with a readable message rather than a bare KeyError on the selection below.
if "cluster_id" not in df.columns:
    raise ValueError("Need a cluster_id column to group rows by cluster")

# Keep only the grouping key and the text, coerce the text to str, and drop
# rows whose text is empty or whitespace-only.
work = df[["cluster_id", TEXT_COL]].copy()
work[TEXT_COL] = work[TEXT_COL].fillna("").astype(str)
# reset_index(drop=True) makes the index labels equal to row positions, which
# the later per-cluster row selection on the TF-IDF matrix depends on.
work = work[work[TEXT_COL].str.strip().ne("")].reset_index(drop=True)

# ---- custom stopwords ----
# Extra words to drop in addition to scikit-learn's built-in English stop
# words; add more here if needed.
custom_stop = {"unknown", "recruiting", "services", "staffing", "company", "team"}
# The union is a set; convert it to a list before handing it to the vectorizer.
stopwords = list(custom_stop.union(ENGLISH_STOP_WORDS))

# ---- TF-IDF ----
vec = TfidfVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z][a-zA-Z]+\b",  # alphabetic tokens, 2+ letters
    ngram_range=(1, 2),  # unigrams and bigrams
    stop_words=stopwords,
    max_df=0.8,  # ignore terms present in more than 80% of documents
    min_df=3,  # ignore terms present in fewer than 3 documents
)

# One row per row of `work`, one column per entry of `terms`.
X = vec.fit_transform(work[TEXT_COL])
terms = np.array(vec.get_feature_names_out())

# ---- top TF-IDF terms per cluster ----
TOP_K = 30  # maximum number of terms reported per cluster

rows = []
# `work` was re-indexed with reset_index(drop=True), so the index labels in
# each group are also the matching row positions in X.
for cid, idx in work.groupby("cluster_id").groups.items():
    # Mean TF-IDF weight of every term over the rows of this cluster.
    cluster_mean = X[list(idx)].mean(axis=0)
    scores = np.asarray(cluster_mean).ravel()

    # Rank terms by descending score; the stable sort keeps ties in a
    # deterministic (vocabulary) order.
    ranked = np.argsort(-scores, kind="stable")[:TOP_K]
    # Drop zero-score terms so that a cluster with fewer than TOP_K scoring
    # terms is not padded with terms that never occur in it. This also yields
    # an empty list when every score is zero.
    ranked = ranked[scores[ranked] > 0]
    top_terms = terms[ranked].tolist()

    rows.append({
        "cluster_id": cid,
        "n_rows": int(len(idx)),
        "top_terms_tfidf_30": ", ".join(top_terms),
    })

# Explicit columns keep sort_values from raising KeyError when there are no clusters.
out = (
    pd.DataFrame(rows, columns=["cluster_id", "n_rows", "top_terms_tfidf_30"])
    .sort_values("cluster_id")
    .reset_index(drop=True)
)

# Timestamped file name so repeated runs do not overwrite earlier reports.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
OUT_PATH = PROJECT_ROOT / f"results/reports/cluster_top_terms_tfidf_30_{ts}.csv"
out.to_csv(OUT_PATH, index=False)

print("✅ Saved:", OUT_PATH)
out.head(10)


✅ Saved: C:\Users\hisuk\labor-market-nlp-prototype\results\reports\cluster_top_terms_tfidf_30_20260110_055942.csv


Unnamed: 0,cluster_id,n_rows,top_terms_tfidf_30
0,0,280,"consulting, technology, management, engineerin..."
1,1,164,"packaging, printing, plastic, print, bags, pap..."
2,2,247,"engineering, consulting, technology, software,..."
3,3,271,"healthcare, care, health, insurance, health ca..."
4,4,157,"biotechnology, clinical, pharmaceutical, pharm..."
5,5,278,"recruitment, accounting, executive, engineerin..."
6,6,272,"water, environmental, control, restoration, wa..."
7,7,395,"marketing, advertising, strategy, brand, media..."
8,8,270,"temporary, accounting, recruitment, technology..."
9,9,460,"construction, engineering, civil, management, ..."
