<a href="https://colab.research.google.com/github/BeatrixBlaine/DS-C1/blob/main/3_5_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
class GeneratedExample:
    """Record holding one generated input/output pair plus its scoring signals."""

    def __init__(self, input_text, output_text, probs, embedding):
        # The texts are stored under the shorter attribute names `input` / `output`.
        self.input, self.output = input_text, output_text
        # probs: class probabilities or token probs; embedding: semantic embedding.
        self.probs, self.embedding = probs, embedding


In [3]:
def confidence_score(probs):
    """
    Confidence of a probability vector: one minus its normalized entropy.

    probs: sequence or array of probabilities, expected to be 1-D. Values are
           clipped to [1e-12, 1.0] so log(0) is never evaluated; the vector
           is not re-normalized.

    Returns roughly 1.0 for a one-hot vector and roughly 0.0 for a uniform
    one. A vector with fewer than two entries carries no uncertainty and
    scores 1.0.
    """
    probs = np.clip(np.asarray(probs, dtype=float), 1e-12, 1.0)
    n_outcomes = len(probs)
    # The normalizer log(n) is 0 for n == 1 (0/0 -> NaN) and -inf for n == 0,
    # so short-circuit instead of dividing.
    if n_outcomes < 2:
        return 1.0
    entropy = -np.sum(probs * np.log(probs))
    return 1.0 - entropy / np.log(n_outcomes)


In [4]:
def diversity_score(embedding, selected_embeddings):
    """
    Diversity of `embedding` relative to the already selected embeddings.

    embedding: array with a `reshape` method; it is flattened to a single row.
    selected_embeddings: list (or array) of embeddings chosen so far; may be
                         empty or None.

    Returns 1.0 when nothing has been selected yet, otherwise one minus the
    highest cosine similarity between `embedding` and any selected embedding.
    """
    # Explicit None/length check instead of `not selected_embeddings`: the
    # truth value of a multi-element ndarray raises ValueError, so the plain
    # truthiness test only worked for lists.
    if selected_embeddings is None or len(selected_embeddings) == 0:
        return 1.0
    sims = cosine_similarity(
        embedding.reshape(1, -1),
        np.vstack(selected_embeddings),
    )
    return 1.0 - np.max(sims)


In [5]:
def sge_score(example, selected_embeddings,
              alpha=0.7, beta=0.3):
    """
    Weighted sum of an example's confidence and diversity scores.

    alpha: weight applied to the confidence term (from example.probs)
    beta : weight applied to the diversity term (from example.embedding
           versus selected_embeddings)
    """
    weighted_confidence = alpha * confidence_score(example.probs)
    weighted_diversity = beta * diversity_score(
        example.embedding, selected_embeddings
    )
    return weighted_confidence + weighted_diversity


In [6]:
def select_self_generated_examples(
    generated_examples,
    k=3,
    alpha=0.7,
    beta=0.3
):
    """
    Greedily pick up to k examples, re-scoring the remaining pool each round.

    generated_examples: iterable of examples exposing `probs` and `embedding`.
                        It is copied, so the caller's list is left intact.
    k: maximum number of examples to pick; fewer are returned when the pool
       is smaller than k.
    alpha, beta: confidence / diversity weights forwarded to sge_score.

    Returns the picked examples in selection order.
    """
    # Work on a shallow copy: popping from the caller's list would shrink it
    # on every call, so a second call would see a different pool.
    candidates = list(generated_examples)
    selected = []
    selected_embeddings = []

    # Cap the rounds at the pool size; np.argmax raises ValueError on an
    # empty score list once the pool is exhausted.
    for _ in range(min(k, len(candidates))):
        scores = [
            sge_score(ex, selected_embeddings, alpha, beta)
            for ex in candidates
        ]
        best = candidates.pop(int(np.argmax(scores)))

        selected.append(best)
        # Diversity in later rounds is measured against everything picked so far.
        selected_embeddings.append(best.embedding)

    return selected


In [7]:
# Fixed seed so the synthetic examples are the same on every run.
np.random.seed(42)

generated = []
for i in range(6):
    # Draw order matters for reproducibility: probabilities first, then embedding.
    fake_probs = np.random.dirichlet(np.ones(4))   # fake model confidence
    fake_embedding = np.random.randn(128)
    ex = GeneratedExample(
        input_text=f"Question {i}",
        output_text=f"Answer {i}",
        probs=fake_probs,
        embedding=fake_embedding,
    )
    generated.append(ex)


In [8]:
# Pass a copy of the pool so the selection can never consume `generated`;
# re-running this cell then yields the same picks instead of drawing from
# a pool that earlier runs have already shrunk.
selected_examples = select_self_generated_examples(
    generated_examples=list(generated),
    k=3
)


In [9]:
# Assemble a few-shot prompt: one Q/A block per selected example, followed
# by the new question with its answer left open.
prompt_parts = []
for ex in selected_examples:
    prompt_parts.append(f"Q: {ex.input}\nA: {ex.output}\n\n")
prompt_parts.append("Q: New question\nA:")

prompt = "".join(prompt_parts)
print(prompt)


Q: Question 5
A: Answer 5

Q: Question 3
A: Answer 3

Q: Question 0
A: Answer 0

Q: New question
A:
