# Clustering Evaluation (Steps 1–4)


In [11]:
from pathlib import Path
import json
import zipfile
import numpy as np
import pandas as pd

from sklearn.metrics import silhouette_score

try:
    from IPython.display import display, HTML
    IPY_OK = True
except Exception:
    IPY_OK = False



## 1) Locate / unzip outputs

In [12]:
# Exploratory check: list the sub-folders of ./outputs whose names start with "outputs".
[p for p in (Path(".").resolve()/"outputs").iterdir() if p.is_dir() if p.name.startswith("outputs")]

[PosixPath('/config/projects/lang-recognition/outputs/outputs_20260215_190441')]

In [13]:
def find_base_dir() -> Path:
    """Find the directory that contains the four step folders.

    Search order (first match wins):

    1. the current working directory;
    2. sub-folders of ``./outputs`` whose names start with ``outputs``,
       in reverse name order (so with timestamp-suffixed names the most
       recent run is tried first);
    3. ``./outputs``, ``./_outputs``, ``./outputs_unzipped`` and the parent
       of the working directory;
    4. ``./outputs.zip``, extracted into ``./_outputs``; the step folders may
       sit directly in the archive or inside one top-level folder.

    Returns:
        The first directory that contains every required step folder.

    Raises:
        FileNotFoundError: If no candidate qualifies and no usable
            ``outputs.zip`` is present.
    """
    here = Path(".").resolve()
    required = (
        "step1_raw_and_optimized_data",
        "step2_kmeans_compare",
        "step3_dbscan_compare",
        "step4_optics_compare",
    )

    def has_all_steps(folder: Path) -> bool:
        # A usable base directory holds every step folder.
        return folder.is_dir() and all((folder / r).exists() for r in required)

    def subdirs(root: Path, prefix: str = ""):
        # A missing root yields no candidates instead of raising from iterdir(),
        # which previously aborted the search before the zip fallback ran.
        if not root.is_dir():
            return []
        found = [p for p in root.iterdir() if p.is_dir() and p.name.startswith(prefix)]
        return sorted(found, reverse=True)

    candidates = [
        here,
        *subdirs(here / "outputs", prefix="outputs"),
        here / "outputs",
        here / "_outputs",
        here / "outputs_unzipped",
        here.parent,
    ]
    for c in candidates:
        if has_all_steps(c):
            return c

    # If not found, try unzipping outputs.zip if present
    zpath = here / "outputs.zip"
    if zpath.is_file():
        out = here / "_outputs"
        out.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(zpath, "r") as z:
            z.extractall(out)
        # The step folders are either directly in out/ or one level below it
        # when the archive wraps everything in a single top-level folder.
        for c in [out, *subdirs(out)]:
            if has_all_steps(c):
                return c

    raise FileNotFoundError(
        "Could not find step folders. Put this notebook next to the step folders "
        "OR place outputs.zip next to the notebook and re-run."
    )

# Resolve the base directory once at cell execution; find_base_dir() raises
# FileNotFoundError when the step folders cannot be located.
BASE_DIR = find_base_dir()
print("BASE_DIR =", BASE_DIR)


BASE_DIR = /config/projects/lang-recognition/outputs/outputs_20260215_190441


## 2) Helpers: loading data + purity + composition tables

In [14]:
RANDOM_SEED = 42  # random_state handed to silhouette_score in compute_silhouette
SIL_SAMPLE_SIZE = 1000  # sampling for speed on large sets

# Sub-folder names; data and results are addressed as <step>/<kind>/<variant>.
KINDS = ["no_augmentation", "augmented"]
VARIANTS = ["raw", "optimized"]

# Step folders under BASE_DIR that the evaluation functions read from.
STEP1 = BASE_DIR / "step1_raw_and_optimized_data"
STEP2 = BASE_DIR / "step2_kmeans_compare"
STEP3 = BASE_DIR / "step3_dbscan_compare"
STEP4 = BASE_DIR / "step4_optics_compare"

# Root folder for everything the evaluation writes; created when this cell runs.
OUT_EVAL = BASE_DIR / "evaluation_outputs"
OUT_EVAL.mkdir(parents=True, exist_ok=True)

def ensure_dirs(p: Path):
    """Create directory *p* with any missing parents; an existing directory is fine."""
    p.mkdir(exist_ok=True, parents=True)

def load_step1(kind: str, variant: str):
    """Load the Step-1 training arrays stored under ``STEP1/<kind>/<variant>``.

    Args:
        kind: Data kind sub-folder name.
        variant: Variant sub-folder name.

    Returns:
        Tuple ``(X, y)``: the contents of ``X_train.npy`` and the contents of
        ``y_train.npy`` converted to strings.
    """
    data_dir = STEP1.joinpath(kind, variant)
    features = np.load(data_dir / "X_train.npy")
    # NOTE(review): allow_pickle=True can execute code while unpickling;
    # only point this at label files you produced yourself.
    targets = np.load(data_dir / "y_train.npy", allow_pickle=True)
    return features, targets.astype(str)

def purity_overall_and_per_cluster(cluster_labels: np.ndarray, true_labels: np.ndarray, ignore_noise: bool):
    """Compute cluster purity overall and for every individual cluster.

    A cluster's purity is the share of its members carrying the cluster's most
    frequent true label; the overall purity is the sum of those majority
    counts divided by the number of points considered.

    Args:
        cluster_labels: Cluster id per sample.
        true_labels: Ground-truth label per sample.
        ignore_noise: When true, samples with cluster id ``-1`` are dropped
            before anything is counted.

    Returns:
        Tuple ``(overall, table)``. ``table`` has the columns ``cluster``,
        ``size``, ``top_label``, ``top_count`` and ``purity_cluster``, sorted
        by cluster. If no samples remain, ``overall`` is NaN and the table is
        empty.
    """
    frame = pd.DataFrame({"cluster": cluster_labels, "true": true_labels})
    if ignore_noise:
        frame = frame[frame["cluster"] != -1]

    n_points = len(frame)
    if n_points == 0:
        empty = pd.DataFrame(columns=["cluster", "size", "top_label", "top_count", "purity_cluster"])
        return float("nan"), empty

    records = []
    for cluster_id, members in frame.groupby("cluster"):
        # value_counts() sorts by frequency, so its first entry is the majority label.
        label_counts = members["true"].value_counts()
        majority_label, majority_count = next(iter(label_counts.items()))
        majority_count = int(majority_count)
        cluster_size = int(len(members))
        records.append({
            "cluster": int(cluster_id),
            "size": cluster_size,
            "top_label": str(majority_label),
            "top_count": majority_count,
            "purity_cluster": float(majority_count / cluster_size),
        })

    n_majority = sum(rec["top_count"] for rec in records)
    table = pd.DataFrame(records).sort_values("cluster")
    return float(n_majority / n_points), table

def composition_tables(cluster_labels: np.ndarray, true_labels: np.ndarray, include_noise: bool):
    """Cross-tabulate clusters against true labels as counts and row shares.

    Args:
        cluster_labels: Cluster id per sample.
        true_labels: Ground-truth label per sample.
        include_noise: When false, samples with cluster id ``-1`` are dropped.

    Returns:
        Tuple ``(counts, props)``. Both have one row per cluster, a leading
        ``cluster`` column and one column per true label; ``props`` divides
        each row of ``counts`` by its row total.
    """
    pairs = pd.DataFrame({"cluster": cluster_labels, "true": true_labels})
    if not include_noise:
        pairs = pairs.loc[pairs["cluster"] != -1]

    count_matrix = pd.crosstab(pairs["cluster"], pairs["true"])
    row_totals = count_matrix.sum(axis=1)
    share_matrix = count_matrix.div(row_totals, axis=0).fillna(0.0)
    return count_matrix.reset_index(), share_matrix.reset_index()

def compute_silhouette(X: np.ndarray, labels: np.ndarray, ignore_noise: bool):
    """Return the silhouette score of a labelling, or NaN when it is not computable.

    Args:
        X: Feature matrix, indexed row-wise by the same positions as ``labels``.
        labels: Cluster id per sample.
        ignore_noise: When true, samples labelled ``-1`` are excluded.

    Returns:
        The score as a float. NaN is returned when fewer than two distinct
        labels or fewer than 10 samples remain. If more than
        ``SIL_SAMPLE_SIZE`` samples remain, the score is computed on a random
        subsample of that size seeded with ``RANDOM_SEED``.
    """
    labels = np.asarray(labels)
    keep = (labels != -1) if ignore_noise else np.ones(len(labels), dtype=bool)
    kept_labels = labels[keep]

    too_few_clusters = len(np.unique(kept_labels)) < 2
    too_few_points = keep.sum() < 10
    if too_few_clusters or too_few_points:
        return float("nan")

    kept_X = X[keep]
    n_rows = kept_X.shape[0]
    # Subsample only when there are more rows than the configured cap.
    subsample = None if n_rows <= SIL_SAMPLE_SIZE else SIL_SAMPLE_SIZE
    score = silhouette_score(
        kept_X, kept_labels, sample_size=subsample, random_state=RANDOM_SEED
    )
    return float(score)

def show_side_by_side(df_left, df_right, title_left="RAW", title_right="OPT"):
    """Show two DataFrames next to each other, each under its own title.

    When IPython is available (``IPY_OK``), both tables are rendered as HTML
    in a two-column flex layout; otherwise titles and tables are printed one
    after the other.

    Args:
        df_left: Table shown in the left column.
        df_right: Table shown in the right column.
        title_left: Heading for the left table.
        title_right: Heading for the right table.
    """
    if not IPY_OK:
        for heading, table in ((title_left, df_left), (title_right, df_right)):
            print(heading)
            print(table)
        return

    left_table = df_left.to_html(index=False)
    right_table = df_right.to_html(index=False)
    html = f"""
    <div style="display:flex; gap:24px; align-items:flex-start;">
      <div style="flex:1;">
        <h4 style="margin:0 0 8px 0;">{title_left}</h4>
        {left_table}
      </div>
      <div style="flex:1;">
        <h4 style="margin:0 0 8px 0;">{title_right}</h4>
        {right_table}
      </div>
    </div>
    """
    display(HTML(html))



## 3) Evaluation: K-Means (Step2)

In [15]:
def eval_kmeans(kind: str):
    """Evaluate the saved K-Means labels of every variant for one data kind.

    For each entry of ``VARIANTS`` this loads the Step-1 arrays, reads
    ``tables/labels_best.csv`` (column ``cluster``) and ``meta/summary.json``
    from ``STEP2/<kind>/<variant>``, recomputes silhouette and purity, and
    writes the labels, the per-cluster purity, both composition tables and
    ``metrics.json`` to ``OUT_EVAL/kmeans/<kind>/<variant>``. Afterwards the
    summary and the RAW/OPT comparisons are displayed.

    Args:
        kind: Data kind sub-folder name.

    Returns:
        DataFrame with one metrics row per variant.
    """
    metric_rows = []
    purity_by_variant = {}
    counts_by_variant = {}

    for variant in VARIANTS:
        X, y = load_step1(kind, variant)
        step_dir = STEP2 / kind / variant

        cluster_ids = pd.read_csv(step_dir / "tables" / "labels_best.csv")["cluster"].to_numpy()
        with open(step_dir / "meta" / "summary.json", "r", encoding="utf-8") as fh:
            step_meta = json.load(fh)

        # No noise filtering here: every label is kept for all three measures.
        silhouette = compute_silhouette(X, cluster_ids, ignore_noise=False)
        overall_purity, purity_table = purity_overall_and_per_cluster(cluster_ids, y, ignore_noise=False)
        count_table, share_table = composition_tables(cluster_ids, y, include_noise=True)

        target_dir = OUT_EVAL / "kmeans" / kind / variant
        ensure_dirs(target_dir)

        # Persist the tables next to the metrics file.
        pd.DataFrame({"cluster": cluster_ids}).to_csv(target_dir / "labels_best.csv", index=False)
        purity_table.to_csv(target_dir / "purity_per_cluster.csv", index=False)
        count_table.to_csv(target_dir / "composition_counts.csv", index=False)
        share_table.to_csv(target_dir / "composition_props.csv", index=False)

        metrics = {
            "kind": kind,
            "variant": variant,
            "best_k_from_step2": int(step_meta["best"]["k"]),
            "silhouette_recomputed": silhouette,
            "purity_overall": overall_purity,
        }
        with open(target_dir / "metrics.json", "w", encoding="utf-8") as fh:
            json.dump(metrics, fh, ensure_ascii=False, indent=2)

        metric_rows.append(metrics)
        purity_by_variant[variant] = purity_table
        counts_by_variant[variant] = count_table

    # Summary of both variants.
    df_sum = pd.DataFrame(metric_rows)
    print("=== KMEANS SUMMARY:", kind, "===")
    if IPY_OK:
        display(df_sum)
    else:
        print(df_sum)

    # RAW vs. OPT: purity per cluster.
    show_side_by_side(
        purity_by_variant["raw"],
        purity_by_variant["optimized"],
        title_left=f"KMeans Purity per Cluster — RAW ({kind})",
        title_right=f"KMeans Purity per Cluster — OPT ({kind})",
    )

    # RAW vs. OPT: composition counts.
    show_side_by_side(
        counts_by_variant["raw"],
        counts_by_variant["optimized"],
        title_left=f"KMeans Composition (counts) — RAW ({kind})",
        title_right=f"KMeans Composition (counts) — OPT ({kind})",
    )

    return df_sum

# Run the K-Means evaluation for every data kind, keeping each per-kind summary frame.
kmeans_summaries = []
for kind in KINDS:
    kmeans_summaries.append(eval_kmeans(kind))

print("✅ KMeans evaluation done. Outputs in:", OUT_EVAL / "kmeans")


=== KMEANS SUMMARY: no_augmentation ===


Unnamed: 0,kind,variant,best_k_from_step2,silhouette_recomputed,purity_overall
0,no_augmentation,raw,4,0.193831,0.618056
1,no_augmentation,optimized,4,0.214798,0.618056


cluster,size,top_label,top_count,purity_cluster
0,296,korean,142,0.47973
1,71,spanish,71,1.0
2,142,italian,76,0.535211
3,67,italian,67,1.0

cluster,size,top_label,top_count,purity_cluster
0,296,korean,142,0.47973
1,142,italian,76,0.535211
2,71,spanish,71,1.0
3,67,italian,67,1.0


true,cluster,german,italian,korean,spanish
,0,105,1,142,48
,1,0,0,0,71
,2,39,76,2,25
,3,0,67,0,0

true,cluster,german,italian,korean,spanish
,0,105,1,142,48
,1,39,76,2,25
,2,0,0,0,71
,3,0,67,0,0


=== KMEANS SUMMARY: augmented ===


Unnamed: 0,kind,variant,best_k_from_step2,silhouette_recomputed,purity_overall
0,augmented,raw,5,0.145978,0.548322
1,augmented,optimized,4,0.158302,0.462674


cluster,size,top_label,top_count,purity_cluster
0,1340,korean,671,0.500746
1,355,spanish,355,1.0
2,363,italian,341,0.939394
3,842,italian,384,0.456057
4,556,korean,144,0.258993

cluster,size,top_label,top_count,purity_cluster
0,1641,korean,673,0.410116
1,394,spanish,355,0.901015
2,858,italian,426,0.496503
3,563,korean,145,0.257549


true,cluster,german,italian,korean,spanish
,0,452,5,671,212
,1,0,0,0,355
,2,16,341,2,4
,3,259,384,47,152
,4,137,134,144,141

true,cluster,german,italian,korean,spanish
,0,470,259,673,239
,1,0,39,0,355
,2,257,426,46,129
,3,137,140,145,141


✅ KMeans evaluation done. Outputs in: /config/projects/lang-recognition/outputs/outputs_20260215_190441/evaluation_outputs/kmeans


## 4) Evaluation: DBSCAN (Step3)

In [16]:
def eval_dbscan(kind: str):
    """Evaluate the saved DBSCAN labels of every variant for one data kind.

    For each entry of ``VARIANTS`` this loads the Step-1 arrays, reads
    ``tables/labels_best.csv`` (column ``cluster``) and
    ``meta/best_params.json`` from ``STEP3/<kind>/<variant>``, and recomputes
    silhouette and purity with noise points (label ``-1``) excluded. The
    composition tables keep the noise row. Results are written to
    ``OUT_EVAL/dbscan/<kind>/<variant>`` and the RAW/OPT comparisons are
    displayed.

    Args:
        kind: Data kind sub-folder name.

    Returns:
        DataFrame with one metrics row per variant.
    """
    metric_rows = []
    purity_by_variant = {}
    counts_by_variant = {}

    for variant in VARIANTS:
        X, y = load_step1(kind, variant)
        step_dir = STEP3 / kind / variant

        cluster_ids = pd.read_csv(step_dir / "tables" / "labels_best.csv")["cluster"].to_numpy()
        with open(step_dir / "meta" / "best_params.json", "r", encoding="utf-8") as fh:
            params = json.load(fh)

        silhouette = compute_silhouette(X, cluster_ids, ignore_noise=True)
        overall_purity, purity_table = purity_overall_and_per_cluster(cluster_ids, y, ignore_noise=True)
        count_table, share_table = composition_tables(cluster_ids, y, include_noise=True)

        noise_share = float((cluster_ids == -1).mean())

        target_dir = OUT_EVAL / "dbscan" / kind / variant
        ensure_dirs(target_dir)

        purity_table.to_csv(target_dir / "purity_per_cluster_excl_noise.csv", index=False)
        count_table.to_csv(target_dir / "composition_counts_incl_noise.csv", index=False)
        share_table.to_csv(target_dir / "composition_props_incl_noise.csv", index=False)

        # Fallback when best_params.json has no "n_clusters": distinct labels minus noise.
        observed_clusters = len(set(cluster_ids))
        if -1 in cluster_ids:
            observed_clusters -= 1

        metrics = {
            "kind": kind,
            "variant": variant,
            "eps": float(params.get("eps", float("nan"))),
            "min_samples": int(params.get("min_samples", -1)),
            "n_clusters": int(params.get("n_clusters", observed_clusters)),
            "noise_ratio": noise_share,
            "silhouette_excl_noise_recomputed": silhouette,
            "purity_excl_noise": overall_purity,
        }
        with open(target_dir / "metrics.json", "w", encoding="utf-8") as fh:
            json.dump(metrics, fh, ensure_ascii=False, indent=2)

        metric_rows.append(metrics)
        purity_by_variant[variant] = purity_table
        counts_by_variant[variant] = count_table

    df_sum = pd.DataFrame(metric_rows)
    print("=== DBSCAN SUMMARY:", kind, "===")
    if IPY_OK:
        display(df_sum)
    else:
        print(df_sum)

    show_side_by_side(
        purity_by_variant["raw"],
        purity_by_variant["optimized"],
        title_left=f"DBSCAN Purity per Cluster (excl. noise) — RAW ({kind})",
        title_right=f"DBSCAN Purity per Cluster (excl. noise) — OPT ({kind})",
    )
    show_side_by_side(
        counts_by_variant["raw"],
        counts_by_variant["optimized"],
        title_left=f"DBSCAN Composition (incl. noise) — RAW ({kind})",
        title_right=f"DBSCAN Composition (incl. noise) — OPT ({kind})",
    )

    return df_sum

# Run the DBSCAN evaluation for every data kind, keeping each per-kind summary frame.
dbscan_summaries = []
for kind in KINDS:
    dbscan_summaries.append(eval_dbscan(kind))

print("✅ DBSCAN evaluation done. Outputs in:", OUT_EVAL / "dbscan")


=== DBSCAN SUMMARY: no_augmentation ===


Unnamed: 0,kind,variant,eps,min_samples,n_clusters,noise_ratio,silhouette_excl_noise_recomputed,purity_excl_noise
0,no_augmentation,raw,10.000703,5,4,0.017361,0.175462,0.29682
1,no_augmentation,optimized,6.917895,5,4,0.140625,0.213785,0.426263


cluster,size,top_label,top_count,purity_cluster
0,541,korean,143,0.264325
1,10,italian,10,1.0
2,6,italian,6,1.0
3,9,italian,9,1.0

cluster,size,top_label,top_count,purity_cluster
0,420,korean,136,0.32381
1,10,italian,10,1.0
2,58,spanish,58,1.0
3,7,spanish,7,1.0


true,cluster,german,italian,korean,spanish
,-1,4,2,1,3
,0,140,117,143,141
,1,0,10,0,0
,2,0,6,0,0
,3,0,9,0,0

true,cluster,german,italian,korean,spanish
,-1,16,36,8,21
,0,128,98,136,58
,1,0,10,0,0
,2,0,0,0,58
,3,0,0,0,7


=== DBSCAN SUMMARY: augmented ===


Unnamed: 0,kind,variant,eps,min_samples,n_clusters,noise_ratio,silhouette_excl_noise_recomputed,purity_excl_noise
0,augmented,raw,6.54053,15,4,0.233796,0.161666,0.405967
1,augmented,optimized,5.655716,15,4,0.233796,0.157077,0.405211


cluster,size,top_label,top_count,purity_cluster
0,2341,korean,768,0.328065
1,44,italian,44,1.0
2,233,spanish,233,1.0
3,30,spanish,30,1.0

cluster,size,top_label,top_count,purity_cluster
0,2340,korean,765,0.326923
1,45,italian,45,1.0
2,233,spanish,233,1.0
3,30,spanish,30,1.0


true,cluster,german,italian,korean,spanish
,-1,192,326,96,194
,0,672,494,768,407
,1,0,44,0,0
,2,0,0,0,233
,3,0,0,0,30

true,cluster,german,italian,korean,spanish
,-1,194,312,99,203
,0,670,507,765,398
,1,0,45,0,0
,2,0,0,0,233
,3,0,0,0,30


✅ DBSCAN evaluation done. Outputs in: /config/projects/lang-recognition/outputs/outputs_20260215_190441/evaluation_outputs/dbscan


## 5) Evaluation: OPTICS (Step4)

In [17]:
def eval_optics(kind: str):
    """Evaluate the saved OPTICS labels of every variant for one data kind.

    For each entry of ``VARIANTS`` this loads the Step-1 arrays, reads
    ``tables/labels_best.csv`` (column ``cluster``) and
    ``meta/best_params.json`` from ``STEP4/<kind>/<variant>``, and recomputes
    silhouette and purity with noise points (label ``-1``) excluded. The
    composition tables keep the noise row. Results are written to
    ``OUT_EVAL/optics/<kind>/<variant>`` and the RAW/OPT comparisons are
    displayed.

    Args:
        kind: Data kind sub-folder name.

    Returns:
        DataFrame with one metrics row per variant.
    """
    metric_rows = []
    purity_by_variant = {}
    counts_by_variant = {}

    for variant in VARIANTS:
        X, y = load_step1(kind, variant)
        step_dir = STEP4 / kind / variant

        cluster_ids = pd.read_csv(step_dir / "tables" / "labels_best.csv")["cluster"].to_numpy()
        with open(step_dir / "meta" / "best_params.json", "r", encoding="utf-8") as fh:
            params = json.load(fh)

        silhouette = compute_silhouette(X, cluster_ids, ignore_noise=True)
        overall_purity, purity_table = purity_overall_and_per_cluster(cluster_ids, y, ignore_noise=True)
        count_table, share_table = composition_tables(cluster_ids, y, include_noise=True)

        noise_share = float((cluster_ids == -1).mean())
        # Fallback when best_params.json has no "n_clusters": distinct labels minus noise.
        observed_clusters = len(set(cluster_ids))
        if -1 in cluster_ids:
            observed_clusters -= 1

        target_dir = OUT_EVAL / "optics" / kind / variant
        ensure_dirs(target_dir)

        purity_table.to_csv(target_dir / "purity_per_cluster_excl_noise.csv", index=False)
        count_table.to_csv(target_dir / "composition_counts_incl_noise.csv", index=False)
        share_table.to_csv(target_dir / "composition_props_incl_noise.csv", index=False)

        metrics = {
            "kind": kind,
            "variant": variant,
            "min_samples": int(params.get("min_samples", -1)),
            "xi": float(params.get("xi", float("nan"))),
            "min_cluster_size": float(params.get("min_cluster_size", float("nan"))),
            "n_clusters": int(params.get("n_clusters", observed_clusters)),
            "noise_ratio": noise_share,
            "silhouette_excl_noise_recomputed": silhouette,
            "purity_excl_noise": overall_purity,
        }
        with open(target_dir / "metrics.json", "w", encoding="utf-8") as fh:
            json.dump(metrics, fh, ensure_ascii=False, indent=2)

        metric_rows.append(metrics)
        purity_by_variant[variant] = purity_table
        counts_by_variant[variant] = count_table

    df_sum = pd.DataFrame(metric_rows)
    print("=== OPTICS SUMMARY:", kind, "===")
    if IPY_OK:
        display(df_sum)
    else:
        print(df_sum)

    show_side_by_side(
        purity_by_variant["raw"],
        purity_by_variant["optimized"],
        title_left=f"OPTICS Purity per Cluster (excl. noise) — RAW ({kind})",
        title_right=f"OPTICS Purity per Cluster (excl. noise) — OPT ({kind})",
    )
    show_side_by_side(
        counts_by_variant["raw"],
        counts_by_variant["optimized"],
        title_left=f"OPTICS Composition (incl. noise) — RAW ({kind})",
        title_right=f"OPTICS Composition (incl. noise) — OPT ({kind})",
    )

    return df_sum

# Run the OPTICS evaluation for every data kind, keeping each per-kind summary frame.
optics_summaries = []
for kind in KINDS:
    optics_summaries.append(eval_optics(kind))

print("✅ OPTICS evaluation done. Outputs in:", OUT_EVAL / "optics")


=== OPTICS SUMMARY: no_augmentation ===


Unnamed: 0,kind,variant,min_samples,xi,min_cluster_size,n_clusters,noise_ratio,silhouette_excl_noise_recomputed,purity_excl_noise
0,no_augmentation,raw,15,0.08,0.03,4,0.814236,0.429759,1.0
1,no_augmentation,optimized,20,0.05,0.03,4,0.756944,0.557768,0.992857


cluster,size,top_label,top_count,purity_cluster
0,30,italian,30,1.0
1,29,german,29,1.0
2,29,spanish,29,1.0
3,19,korean,19,1.0

cluster,size,top_label,top_count,purity_cluster
0,37,italian,37,1.0
1,47,korean,46,0.978723
2,36,spanish,36,1.0
3,20,german,20,1.0


true,cluster,german,italian,korean,spanish
,-1,115,114,125,115
,0,0,30,0,0
,1,29,0,0,0
,2,0,0,0,29
,3,0,0,19,0

true,cluster,german,italian,korean,spanish
,-1,123,107,98,108
,0,0,37,0,0
,1,1,0,46,0
,2,0,0,0,36
,3,20,0,0,0


=== OPTICS SUMMARY: augmented ===


Unnamed: 0,kind,variant,min_samples,xi,min_cluster_size,n_clusters,noise_ratio,silhouette_excl_noise_recomputed,purity_excl_noise
0,augmented,raw,5,0.03,0.03,2,0.924769,0.486912,1.0
1,augmented,optimized,5,0.03,0.03,2,0.884259,0.470375,0.4125


cluster,size,top_label,top_count,purity_cluster
0,113,korean,113,1.0
1,147,spanish,147,1.0

cluster,size,top_label,top_count,purity_cluster
0,255,korean,118,0.462745
1,145,italian,47,0.324138


true,cluster,german,italian,korean,spanish
,-1,864,864,751,717
,0,0,0,113,0
,1,0,0,0,147

true,cluster,german,italian,korean,spanish
,-1,807,703,727,819
,0,23,114,118,0
,1,34,47,19,45


✅ OPTICS evaluation done. Outputs in: /config/projects/lang-recognition/outputs/outputs_20260215_190441/evaluation_outputs/optics


## 6) Final: One combined table

In [18]:
def _algo_from_metrics_path(metrics_path: Path, root: Path, marker: str = "evaluation_outputs") -> str:
    """Infer the algorithm folder name for one ``metrics.json`` path."""
    dirs = metrics_path.parent.parts
    # Scan right-to-left so an unrelated folder of the same name higher up
    # in the absolute path cannot shadow the real output root.
    for i in range(len(dirs) - 1, -1, -1):
        if dirs[i] == marker and i + 1 < len(dirs):
            return dirs[i + 1]
    # No marker folder in the path: fall back to the first folder below root.
    try:
        below_root = metrics_path.parent.relative_to(root).parts
    except ValueError:
        return "unknown"
    return below_root[0] if below_root else "unknown"


def load_metrics_tree(root: Path):
    """Collect every ``metrics.json`` below *root* into one DataFrame.

    The expected layout is ``.../evaluation_outputs/<algo>/<kind>/<variant>/metrics.json``.
    Each file's JSON object becomes one row, extended by an ``algo`` column
    taken from the folder that follows ``evaluation_outputs`` in the path.
    If the path has no such folder, the first folder below *root* is used;
    if that is not available either, ``algo`` is ``"unknown"``.

    Args:
        root: Directory that is searched recursively.

    Returns:
        DataFrame with one row per metrics file, in sorted path order. It is
        empty (no columns) when no file is found.
    """
    rows = []
    # sorted(): rglob order depends on the file system; keep rows reproducible.
    for p in sorted(root.rglob("metrics.json")):
        with open(p, "r", encoding="utf-8") as f:
            obj = json.load(f)
        obj["algo"] = _algo_from_metrics_path(p, root)
        rows.append(obj)
    return pd.DataFrame(rows)

# Gather every metrics.json written under OUT_EVAL and order the rows by
# algorithm, kind and variant.
df_all = load_metrics_tree(OUT_EVAL)
df_all = df_all.sort_values(["algo","kind","variant"]).reset_index(drop=True)

print("=== COMBINED METRICS TABLE ===")
display(df_all) if IPY_OK else print(df_all)

# Persist the combined table next to the per-algorithm output folders.
df_all.to_csv(OUT_EVAL / "combined_metrics.csv", index=False)
print("✅ Saved:", OUT_EVAL / "combined_metrics.csv")


=== COMBINED METRICS TABLE ===


Unnamed: 0,kind,variant,best_k_from_step2,silhouette_recomputed,purity_overall,algo,eps,min_samples,n_clusters,noise_ratio,silhouette_excl_noise_recomputed,purity_excl_noise,xi,min_cluster_size
0,augmented,optimized,,,,dbscan,5.655716,15.0,4.0,0.233796,0.157077,0.405211,,
1,augmented,raw,,,,dbscan,6.54053,15.0,4.0,0.233796,0.161666,0.405967,,
2,no_augmentation,optimized,,,,dbscan,6.917895,5.0,4.0,0.140625,0.213785,0.426263,,
3,no_augmentation,raw,,,,dbscan,10.000703,5.0,4.0,0.017361,0.175462,0.29682,,
4,augmented,optimized,4.0,0.158302,0.462674,kmeans,,,,,,,,
5,augmented,raw,5.0,0.145978,0.548322,kmeans,,,,,,,,
6,no_augmentation,optimized,4.0,0.214798,0.618056,kmeans,,,,,,,,
7,no_augmentation,raw,4.0,0.193831,0.618056,kmeans,,,,,,,,
8,augmented,optimized,,,,optics,,5.0,2.0,0.884259,0.470375,0.4125,0.03,0.03
9,augmented,raw,,,,optics,,5.0,2.0,0.924769,0.486912,1.0,0.03,0.03


✅ Saved: /config/projects/lang-recognition/outputs/outputs_20260215_190441/evaluation_outputs/combined_metrics.csv
