In [1]:
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score

import umap


In [2]:
# Load the pre-computed embeddings and their labels from the .npz archive.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code -- only load this archive from a trusted source.
# Presumably it is needed because some arrays are stored with object dtype;
# TODO confirm, and switch to allow_pickle=False if it is not.
with np.load("data/news_dataset_normalized.npz", allow_pickle=True) as data:
    # The arrays are read inside the `with` block so the archive's file
    # handle is released as soon as they are in memory.
    X = data["emb"]        # shape: (n_samples, embedding_dim)
    y = data["y"]          # ground truth labels (ints)
    labels = data["labels"]  # class names (strings)

print(X.shape)


(2800, 768)


In [3]:
# Density-based clustering of the embeddings with Euclidean distance.
# The fitted estimator is kept in `dbscan`; the per-sample cluster ids
# (with -1 for points assigned to no cluster) go to `db_labels`.
dbscan = DBSCAN(eps=0.3, min_samples=10, metric="euclidean")
db_labels = dbscan.fit_predict(X)


In [4]:
# Summarise the DBSCAN result: label -1 is treated as noise and every other
# distinct label counts as one cluster.
unique_labels = set(db_labels)
noise_mask = db_labels == -1
n_noise = np.sum(noise_mask)
n_clusters = len(unique_labels - {-1})

print("Number of clusters:", n_clusters)
print("Number of noise points:", n_noise)


Number of clusters: 8
Number of noise points: 1629


In [5]:
# Silhouette is computed on clustered points only: noise points (label -1)
# are dropped first, so the score says nothing about how many samples were
# left unclustered and should be read together with the noise count.
mask = db_labels != -1
n_found = len(set(db_labels[mask]))

# silhouette_score needs at least two distinct labels; with zero clusters
# (everything is noise) or a single cluster it is undefined.
if n_found > 1:
    sil_score = silhouette_score(X[mask], db_labels[mask])
    print("Silhouette Score:", sil_score)
else:
    print(f"Silhouette Score not defined (needs at least 2 clusters, found {n_found})")


Silhouette Score: 0.7090961933135986


In [6]:
# Agreement between the DBSCAN labelling and the ground truth.
# db_labels is passed whole, so noise points (-1) are scored as one more label.
ari = adjusted_rand_score(labels_true=y, labels_pred=db_labels)
print("Adjusted Rand Index (DBSCAN):", ari)


Adjusted Rand Index (DBSCAN): 0.11454810837580294


In [7]:
# Project the embeddings to 2-D with UMAP; `X_2d` holds the fitted
# coordinates. random_state is fixed so the layout is repeatable.
umap_params = dict(
    n_neighbors=15,
    min_dist=1.0,
    metric="euclidean",
    n_components=2,
    random_state=42,
)
umap_reducer = umap.UMAP(**umap_params)

umap_reducer.fit(X)
X_2d = umap_reducer.embedding_



  warn(


In [8]:
# Scatter of the UMAP projection coloured by the ground-truth class ids,
# written to disk instead of being displayed inline.
out_path = Path("plots/task3/umap_ground_truth.png")
# savefig does not create missing directories, so make sure the target exists.
out_path.parent.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap="tab10", s=5)
ax.set_title("UMAP Projection – Ground Truth Labels")
fig.tight_layout()
fig.savefig(out_path)
plt.close(fig)  # release the figure; it is only needed on disk


In [9]:
# Scatter of the UMAP projection coloured by the DBSCAN labels (noise points
# carry the label -1 and are coloured like any other label value),
# written to disk instead of being displayed inline.
out_path = Path("plots/task3/umap_dbscan.png")
# savefig does not create missing directories, so make sure the target exists.
out_path.parent.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(X_2d[:, 0], X_2d[:, 1], c=db_labels, cmap="tab10", s=5)
ax.set_title("UMAP Projection – DBSCAN Clusters")
fig.tight_layout()
fig.savefig(out_path)
plt.close(fig)  # release the figure; it is only needed on disk


In [10]:
# KMeans baseline: for each k, cluster the embeddings, report silhouette and
# ARI against the ground truth, and save a UMAP scatter coloured by cluster.
k_values = [4, 6, 8]

# savefig does not create missing directories; create the target once up front.
plot_dir = Path("plots/task3")
plot_dir.mkdir(parents=True, exist_ok=True)

for k in k_values:
    # Fixed random_state keeps the 10 initialisations repeatable.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    km_labels = kmeans.fit_predict(X)

    sil = silhouette_score(X, km_labels)
    # NOTE: `ari` is rebound on every iteration, so after this cell it holds
    # the value for the last k rather than the DBSCAN score.
    ari = adjusted_rand_score(y, km_labels)

    print(f"KMeans k={k}: Silhouette={sil:.3f}, ARI={ari:.3f}")

    fig, ax = plt.subplots(figsize=(6, 5))
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=km_labels, cmap="tab10", s=5)
    ax.set_title(f"UMAP Projection – KMeans (k={k})")
    fig.tight_layout()
    fig.savefig(plot_dir / f"umap_kmeans_{k}.png")
    plt.close(fig)  # avoid accumulating open figures across iterations


KMeans k=4: Silhouette=0.338, ARI=0.387
KMeans k=6: Silhouette=0.380, ARI=0.602
KMeans k=8: Silhouette=0.437, ARI=0.651
