Cell 1 – Setup imports + project root

In [None]:
# 02_embeddings_clustering.ipynb

import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans

# Absolute path of the directory one level above the current working directory.
# It is pushed to the front of sys.path (once) so the `src.*` imports resolve.
PROJECT_ROOT = Path("..").resolve()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import load_config
from src.data.loader import load_jobs
from src.embeddings.embedder import embed_texts, build_and_save_embeddings
from src.evaluation.metrics import internal_cluster_metrics

# Plot styling. `sns.set` is only an alias of `set_theme` and is documented as
# possibly going away, so call the preferred name directly.
sns.set_theme(style="whitegrid")

# Project configuration; the name of the free-text column is read from it.
cfg = load_config()
TEXT_COL = cfg["jobs"]["text_column"]

# Artifact locations under <PROJECT_ROOT>/results: the cached embedding matrix
# and the KMeans label vector written later in this notebook.
EMB_PATH = PROJECT_ROOT / "results" / "embeddings" / "jobs_embeddings.npy"
LABELS_PATH = PROJECT_ROOT / "results" / "clusters" / "kmeans_labels.npy"

# Bare last expression: render the loaded config for reference.
cfg


Cell 2 – Load jobs & build embeddings (or reuse cached)

In [None]:
# Load the jobs data via the project loader; later cells attach labels to it.
jobs = load_jobs()
print("Shape:", jobs.shape)

EMB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Reuse the cached matrix only if it still has one row per entry in `jobs`.
# A cache written for a different dataset would otherwise be clustered as-is,
# and the mismatch would surface only later when labels are attached to `jobs`.
# NOTE: this checks the row count only; a same-length cache built from
# different content is not detected.
embeddings = None
if EMB_PATH.exists():
    print("Loading cached embeddings from:", EMB_PATH)
    cached = np.load(EMB_PATH)
    if cached.shape[0] == len(jobs):
        embeddings = cached
    else:
        print(
            f"Cached embeddings have {cached.shape[0]} rows but there are "
            f"{len(jobs)} jobs; rebuilding."
        )

if embeddings is None:
    print("Building embeddings…")
    embeddings = build_and_save_embeddings(jobs, EMB_PATH)

# Fail here, next to the cause, rather than in a downstream cell.
if embeddings.shape[0] != len(jobs):
    raise ValueError(
        f"Embeddings have {embeddings.shape[0]} rows but jobs has {len(jobs)} rows"
    )

embeddings.shape


Cell 3 – KMeans clustering

In [None]:
# The cluster count is taken from the project config rather than set here.
n_clusters = cfg["clustering"]["n_clusters"]
print("n_clusters:", n_clusters)

# Fixed seed for the estimator; n_init="auto" leaves the number of restarts
# to scikit-learn.
kmeans_kwargs = {"n_clusters": n_clusters, "random_state": 42, "n_init": "auto"}
kmeans = KMeans(**kmeans_kwargs)
labels = kmeans.fit_predict(embeddings)

# Write the label vector to disk, creating the output folder on first use.
labels_dir = LABELS_PATH.parent
labels_dir.mkdir(parents=True, exist_ok=True)
np.save(LABELS_PATH, labels)

# Peek at the first few assignments.
labels[:10]


Cell 4 – Cluster size distribution

In [None]:
# Count how many rows were assigned to each cluster id, ordered by id.
label_series = pd.Series(labels)
cluster_sizes = label_series.value_counts(sort=False).sort_index()
cluster_sizes


In [None]:
# Bar chart of cluster sizes, drawn on an explicit Axes instead of the
# implicit "current figure".
fig, ax = plt.subplots(figsize=(10, 4))
cluster_sizes.plot(kind="bar", ax=ax)
ax.set(
    title="Cluster size distribution",
    xlabel="Cluster ID",
    ylabel="Number of jobs",
)
fig.tight_layout()
plt.show()


Cell 5 – Internal cluster metrics

In [None]:
# Score the clustering with the project's internal-metrics helper, passing the
# embedding matrix and the KMeans labels computed earlier in this notebook.
# NOTE(review): which metrics are returned is defined in src.evaluation.metrics
# and is not visible from this notebook — confirm there.
metrics = internal_cluster_metrics(embeddings, labels)
metrics  # bare last expression so the notebook renders the result


Cell 6 – Attach labels back to jobs and inspect

In [None]:
# New frame = `jobs` plus a "cluster" column holding the KMeans labels;
# `jobs` itself is left untouched.
jobs_with_labels = jobs.assign(cluster=labels)

jobs_with_labels.head()


In [None]:
# For every cluster, list its three most frequent job titles.
# value_counts() orders titles by frequency within each cluster, so taking the
# head of each level-0 group keeps the top three.
title_counts = jobs_with_labels.groupby("cluster")["title"].value_counts()
top_titles = title_counts.groupby(level=0).head(3)
top_titles


Cell 7 – Inspect a single cluster in detail

In [None]:
cluster_id = 0  # change as you like

# First ten rows that were assigned to the chosen cluster.
in_cluster = jobs_with_labels["cluster"] == cluster_id
cluster_sample = jobs_with_labels.loc[in_cluster].head(10)

# Show only the identifying columns plus the text column named in the config.
display_cols = [cfg["jobs"]["id_column"], "title", "company", TEXT_COL]
cluster_sample.loc[:, display_cols]
