<a href="https://colab.research.google.com/github/BeatrixBlaine/DS-C1/blob/main/3_5_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Representative Selection Code
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity


In [13]:
# function

def clustering_representative_selection(
    embeddings,
    items,
    k=5,
    random_state=42
):
    """
    Select representative items by k-means clustering of their embeddings.

    The embeddings are partitioned into ``k`` clusters with scikit-learn's
    ``KMeans``. For every non-empty cluster, the item whose embedding has the
    highest cosine similarity to that cluster's centroid is returned.

    Parameters
    ----------
    embeddings : array-like of shape (N, d)
        One embedding vector per item. Anything ``np.asarray`` can turn into
        a 2-D array is accepted (ndarray, nested lists, ...).
    items : sequence of length N
        Data associated with each embedding (texts, examples, docs).
    k : int, default 5
        Number of clusters, i.e. number of representatives requested.
        Must satisfy ``1 <= k <= N``.
    random_state : default 42
        Forwarded unchanged to ``KMeans(random_state=...)``.

    Returns
    -------
    list
        Representatives ordered by cluster label. Contains ``k`` elements
        unless k-means leaves a cluster without members, in which case that
        cluster contributes nothing.

    Raises
    ------
    AssertionError
        If ``embeddings`` and ``items`` have different lengths.
    ValueError
        If ``embeddings`` is not 2-D or ``k`` is outside ``[1, N]``.
    """
    assert len(embeddings) == len(items), (
        "embeddings and items must have the same length"
    )

    # Fancy indexing below (embeddings[cluster_idx]) needs a real ndarray;
    # a plain list of vectors would otherwise fail after clustering succeeded.
    embeddings = np.asarray(embeddings)
    if embeddings.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D with shape (N, d), got ndim={embeddings.ndim}"
        )

    n_samples = embeddings.shape[0]
    if not 1 <= k <= n_samples:
        # Fail early with a clear message instead of deep inside KMeans.
        raise ValueError(
            f"k must be between 1 and the number of items ({n_samples}), got {k}"
        )

    kmeans = KMeans(
        n_clusters=k,
        random_state=random_state,
        n_init=10
    )
    labels = kmeans.fit_predict(embeddings)
    centroids = kmeans.cluster_centers_

    representatives = []

    for i in range(k):
        cluster_idx = np.where(labels == i)[0]
        if cluster_idx.size == 0:
            # No sample carries this label, so there is nothing to pick from;
            # argmax over an empty similarity vector would raise.
            continue

        # NOTE(review): clusters are formed with Euclidean distance but the
        # representative is chosen by cosine similarity; the two agree only
        # for length-normalised embeddings. Kept as in the original design.
        sims = cosine_similarity(
            embeddings[cluster_idx],
            centroids[i].reshape(1, -1)
        ).flatten()

        rep_idx = cluster_idx[np.argmax(sims)]
        representatives.append(items[rep_idx])

    return representatives


In [14]:
# Example Usage

# Dummy documents (toy corpus for the demo)

import numpy as np

# Toy corpus: seven short sentences used as the items to choose from.
documents = [
    "In-context learning improves with diverse examples.",
    "Few-shot prompting relies on representative samples.",
    "Retrieval augmented generation uses external knowledge.",
    "Entropy is a measure of uncertainty in predictions.",
    "Clustering helps summarize large datasets.",
    "Diversity improves generalization in machine learning.",
    "Iterative reasoning refines answers step by step.",
]

# Placeholder embeddings: seeded Gaussian noise, one 128-dimensional row per
# document. They are NOT derived from the text, so the clusters found later
# carry no semantic meaning; the fixed seed only makes the demo repeatable.
np.random.seed(0)
embeddings = np.random.standard_normal((len(documents), 128))


In [15]:
# Select Representative

# Cluster the demo embeddings into three groups and keep, for each group,
# the document closest (by cosine similarity) to the group's centroid.
selected_docs = clustering_representative_selection(embeddings, documents, k=3)

In [20]:
# Header followed by a blank line, then a 1-based numbered list of the picks.
print("Selected Representative Documents:\n")
for i, doc in enumerate(selected_docs, start=1):
    print(f"{i}. {doc}")


Selected Representative Documents:

1. In-context learning improves with diverse examples.
2. Retrieval augmented generation uses external knowledge.
3. Diversity improves generalization in machine learning.
