<a href="https://colab.research.google.com/github/Anishgoswamicode/wikipedia-semantic-clustering/blob/main/wikipedia_semantic_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages the notebook imports below (quiet mode):
# wikipedia for fetching article summaries, sentence-transformers for the
# embeddings, umap-learn for the 2-D projection and keybert for keywords.
!pip install -q wikipedia
!pip install -q sentence-transformers umap-learn keybert


In [None]:
import wikipedia
import random

# Seed topics. Each one is used as a Wikipedia search query, and every article
# returned by that search is labelled with the topic that found it.
topics = ["Artificial intelligence", "Philosophy", "Genetics", "Climate change", "World War II", "Blockchain", "Economics", "Quantum mechanics", "Indian History", "Linguistics"]

summaries = []  # summary text of each collected article
labels = []     # seed topic for the article at the same index in `summaries`

# Searches for related topics can return the same article; remember every
# title already requested so it is fetched (and later embedded) only once.
seen_titles = set()
skipped_articles = 0

for topic in topics:
    try:
        related_pages = wikipedia.search(topic, results=100)
    except Exception as exc:
        # Collection is best-effort: a failed search drops this topic, but it
        # is reported instead of vanishing silently.
        print(f"Search failed for topic {topic!r}: {exc!r}")
        continue

    for title in related_pages:
        if title in seen_titles:
            continue
        seen_titles.add(title)

        try:
            # Search results are already exact page titles. The default
            # auto_suggest=True may rewrite the title and resolve to a
            # different article, so it is turned off here.
            summary = wikipedia.summary(title, auto_suggest=False)
        except Exception:
            # Disambiguation pages, missing pages and transient API errors are
            # skipped; the total is reported once the loop finishes.
            skipped_articles += 1
            continue

        summaries.append(summary)
        labels.append(topic)

print(f"Total articles collected: {len(summaries)}")
if skipped_articles:
    print(f"Articles skipped because their summary could not be fetched: {skipped_articles}")


In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence encoder once; `model` is reused by the keyword
# extraction cell further down.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every collected summary in a single batch call, with a progress bar.
embeddings = model.encode(
    summaries,
    show_progress_bar=True,
)


In [None]:
import umap.umap_ as umap

# Project the sentence embeddings down to two dimensions for clustering and
# plotting. random_state is pinned so the layout is repeatable between runs.
reducer = umap.UMAP(
    n_neighbors=10,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
embedding_2d = reducer.fit_transform(embeddings)


In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the 2-D projection. Points that belong to no
# cluster receive the label -1.
dbscan = DBSCAN(
    eps=0.5,
    min_samples=3,
)
cluster_labels = dbscan.fit_predict(embedding_2d)

# Count the distinct labels, leaving out the -1 noise label if it is present.
n_clusters = len(set(cluster_labels) - {-1})
print(f"Clusters found: {n_clusters}")


In [None]:
import matplotlib.pyplot as plt

# Scatter the 2-D projection, colouring each point by its DBSCAN label.
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    embedding_2d[:, 0],
    embedding_2d[:, 1],
    c=cluster_labels,
    cmap='tab20',
    s=40,
)
ax.set_title("Topic Topology of Wikipedia Summaries")
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
fig.colorbar(points, ax=ax, label="Cluster ID")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Inputs produced by earlier cells:
#   embedding_2d   -- UMAP result, shape [N, 2]
#   cluster_labels -- DBSCAN result, shape [N]

plt.figure(figsize=(12, 8))

# Draw every point, coloured by its cluster label.
xs, ys = embedding_2d[:, 0], embedding_2d[:, 1]
scatter = plt.scatter(xs, ys, c=cluster_labels, cmap='tab20', s=20, alpha=0.8)

plt.title("UMAP + DBSCAN Clustered Wikipedia Topics")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")

# Write each cluster's number at the per-axis median of its member points.
# The noise label (-1) is filtered out up front, so it gets no annotation.
label_box = dict(facecolor='white', alpha=0.7)
unique_clusters = np.unique(cluster_labels)
for cluster_id in unique_clusters[unique_clusters != -1]:
    members = embedding_2d[cluster_labels == cluster_id]
    centre_x, centre_y = np.median(members, axis=0)
    plt.text(
        centre_x,
        centre_y,
        f"Cluster {cluster_id}",
        fontsize=12,
        bbox=label_box,
        ha='center',
    )

plt.colorbar(scatter, label="Cluster ID")
plt.show()


In [None]:
from keybert import KeyBERT
from collections import defaultdict

# Reuse the sentence encoder loaded earlier as the KeyBERT backend.
kw_model = KeyBERT(model)

# Group the summaries by the cluster they were assigned to, dropping the
# points labelled as noise (-1).
cluster_to_summaries = defaultdict(list)
for label, text in zip(cluster_labels, summaries):
    if label != -1:
        cluster_to_summaries[label].append(text)

# For each cluster, extract the top five keywords from the concatenation of
# (at most) its first ten summaries and print them with their scores.
for cluster_id, cluster_texts in cluster_to_summaries.items():
    sample_text = " ".join(cluster_texts[:10])
    keywords = kw_model.extract_keywords(sample_text, top_n=5)

    print(f"\n🔹 Cluster {cluster_id}:")
    for kw in keywords:
        # Each entry is indexed as (keyword, score).
        print(f" - {kw[0]} ({kw[1]:.2f})")
