In [1]:
# Install dependencies
%pip install dspy pandas python-dotenv -q


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Minimal GEPA optimization quickstart
# This notebook shows the simplest possible end-to-end example of optimizing a tiny DSPy program with GEPA.
# It follows the style of the quickstart notebook and borrows the feedback-augmented metric pattern from comedian-agent.

import dspy
import os
from typing import List
from dotenv import load_dotenv

# Load environment variables (expects OPENAI_API_KEY to be set in .env)
load_dotenv()

# Configure a single LM globally for simplicity
# Use the same model family as the quickstart for consistency
lm = dspy.LM("openai/gpt-5-mini", api_key=os.getenv("OPENAI_API_KEY"), temperature=1, max_tokens=16000)
dspy.configure(lm=lm)


In [3]:
# Define a tiny QA signature and program
class QASignature(dspy.Signature):
    """
    Answer a job interview question.
    """
    question: str = dspy.InputField(description="Job interview question")
    answer: str = dspy.OutputField(description="Candidate's answer to the question")

qa_program = dspy.Predict(QASignature)

# Quick sanity check
print(qa_program(question="What is your greatest strength?").answer)


My greatest strength is structured problem-solving combined with a bias for action — I can quickly break down complex problems, identify the highest-impact levers, and coordinate the work needed to deliver results.

For example, in my last role we had a recurring product release delay caused by handoffs between engineering, QA, and documentation. I mapped the end-to-end process, identified three bottlenecks (unclear acceptance criteria, a single QA gate, and manual doc generation), and proposed concrete fixes: clearer user-story templates, parallelized QA stages, and a small automation script for docs. I led the implementation across teams, and within two release cycles we cut average release time by 35% and reduced post-release bugs by 20%.

That combination of analytical thinking, prioritization, and follow-through is how I approach work, and it helps me move projects forward reliably while keeping stakeholders aligned and focused on impact.


In [4]:
# Create a tiny train/test split of QA examples
# Job interview questions with natural, straightforward answers
examples = [
    dspy.Example(
        question="What is your greatest strength?",
        answer="I'm good at working with others and finding practical solutions to problems. I communicate well and can adapt to different situations."
    ).with_inputs("question"),
    dspy.Example(
        question="Where do you see yourself in five years?",
        answer="I'd like to grow into a leadership role where I can mentor others and take on more responsibility. I want to keep learning and contributing to meaningful projects."
    ).with_inputs("question"),
    dspy.Example(
        question="How do you handle workplace challenges?",
        answer="I try to stay calm and break problems down into smaller parts. I ask for help when I need it and focus on finding solutions rather than dwelling on what went wrong."
    ).with_inputs("question"),
    dspy.Example(
        question="What makes you a good fit for this role?",
        answer="My experience matches what you're looking for, and I'm genuinely interested in this type of work. I learn quickly and I'm motivated to do a good job."
    ).with_inputs("question"),
    dspy.Example(
        question="How do you prioritize tasks?",
        answer="I look at what's most urgent and important, then make a list. I focus on high-impact work first and try to be realistic about what I can accomplish in a day."
    ).with_inputs("question"),
]

# Deterministic tiny split (keep it simple)
trainset = examples[:3]
testset = examples[3:]

print(f"Train: {len(trainset)}, Test: {len(testset)}")


Train: 3, Test: 2


In [5]:
# Metric with feedback compatible with GEPA
# Checks for absence of corporate buzzwords in the answer

def qa_metric_with_feedback(example: dspy.Example, pred: dspy.Prediction, trace=None, pred_name=None, pred_trace=None):
    """
    Returns a score based on how few corporate buzzwords appear in the prediction.
    Provides feedback to encourage natural, straightforward language during optimization.
    """
    # Common corporate buzzwords to avoid
    corporate_buzzwords = [
        "synergy", "leverage", "paradigm", "disruptive", "innovative", "strategic",
        "scalable", "optimize", "streamline", "proactive", "dynamic", "collaborative",
        "best practices", "core competencies", "value-added", "stakeholder", "bandwidth",
        "circle back", "touch base", "move the needle", "low-hanging fruit", "actionable",
        "deliverables", "key performance indicators", "kpi", "roi", "growth mindset",
        "thought leader", "game changer", "win-win", "deep dive", "drill down",
        "bias for action", "levers", "deliver results", "end-to-end",
        "bottlenecks", "acceptance criteria", "parallelized", "automation", "implementation",
        "release cycles", "analytical thinking", "prioritization", "follow-through",
        "stakeholders", "aligned", "impact", "problem-solving",
        "break down", "complex", "coordinate", "concrete", "post-release", "problem solving"
    ]
    
    guess = (getattr(pred, "answer", "") or "").strip().lower()
    
    # Count how many buzzwords appear in the prediction
    found_buzzwords = [word for word in corporate_buzzwords if word in guess]
    
    # Score based on absence of buzzwords (fewer is better)
    # Perfect score (1.0) = no buzzwords, score decreases with each buzzword found
    score = max(1.0 - (len(found_buzzwords) * 0.33), 0.0)
    
    if pred_name is None:
        return score
    else:
        if score < 1.0:
            feedback = f"Pred: {guess}\nFound buzzwords: {', '.join(found_buzzwords)}\nTip: Avoid corporate jargon. Use natural, straightforward language instead."
        else:
            feedback = f"Great! No corporate buzzwords detected. Natural language used."
        return dspy.Prediction(score=score, feedback=feedback)

# Quick smoke test of metric
print("Metric smoke test (with buzzwords):", qa_metric_with_feedback(trainset[0], dspy.Prediction(answer="I leverage synergy and strategic thinking to optimize outcomes")))
print("Metric smoke test (no buzzwords):", qa_metric_with_feedback(trainset[0], dspy.Prediction(answer="I'm good at working with others")))


Metric smoke test (with buzzwords): 0.0
Metric smoke test (no buzzwords): 1.0


In [6]:
# Baseline evaluation on the tiny test set
evaluate = dspy.Evaluate(
    devset=testset,
    metric=qa_metric_with_feedback,
    num_threads=4,
    display_table=True,
    display_progress=True,
)

baseline_eval_score = evaluate(qa_program)
print("Baseline done.")


Average Metric: 0.00 / 2 (0.0%): 100%|██████████| 2/2 [00:00<00:00, 2522.13it/s]

2025/10/03 16:15:11 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 2 (0.0%)





Unnamed: 0,question,example_answer,pred_answer,qa_metric_with_feedback
0,What makes you a good fit for this role?,"My experience matches what you're looking for, and I'm genuinely i...",There are three clear reasons I’m a strong fit for this role: rele...,
1,How do you prioritize tasks?,"I look at what's most urgent and important, then make a list. I fo...","I prioritize tasks by combining clear goal alignment, a simple fra...",


Baseline done.


In [8]:
# Run GEPA to optimize the program
from dspy import GEPA

optimizer = GEPA(
    metric=qa_metric_with_feedback,
    max_full_evals=2,  # keep it small/fast for a quickstart demo
    num_threads=8,
    track_stats=True,
    use_merge=False,
    reflection_lm=dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000)
)

optimized_qa = optimizer.compile(
    qa_program,
    trainset=trainset,
    valset=testset,
)

print("GEPA optimization complete.")


2025/10/03 16:16:24 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 10 metric calls of the program. This amounts to 2.00 full evals on the train+val set.
2025/10/03 16:16:24 INFO dspy.teleprompt.gepa.gepa: Using 2 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/10 [00:00<?, ?rollouts/s]2025/10/03 16:16:24 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 2 (0.0%)
2025/10/03 16:16:24 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.0
2025/10/03 16:16:24 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.0


Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:00<00:00, 1105.80it/s]

2025/10/03 16:16:24 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)
2025/10/03 16:16:24 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for self: You are answering job interview questions.

Input format:
- You will receive an object with a single field: question: <the interviewer’s question as plain text>.

Output format:
- Return only the answer, as plain text. No headings or labels.
- Keep it concise (about 110–220 words).
- Bullets are allowed for short step lists; avoid heavy formatting.

How to answer:
- Start with a clear 1–2 sentence direct answer to the question.
- If helpful, add a brief 3–5 bullet list of your approach or steps in plain language.
- Include one short example story using STAR (Situation, Task, Action, Result). Keep it realistic, use first person, and add a simple metric or observable outcome when possible.
- End with a one‑line takeaway that ties back to the role’s value.
- If the question is future‑focused (e.g., “five years”), emph


GEPA optimization complete.





In [9]:
optimized_score = evaluate(optimized_qa)

Average Metric: 1.67 / 2 (83.5%): 100%|██████████| 2/2 [00:00<00:00, 2387.20it/s]

2025/10/03 16:17:37 INFO dspy.evaluate.evaluate: Average Metric: 1.67 / 2 (83.5%)





Unnamed: 0,question,example_answer,pred_answer,qa_metric_with_feedback
0,What makes you a good fit for this role?,"My experience matches what you're looking for, and I'm genuinely i...","I'm a good fit because I combine practical problem‑solving, depend...",✔️ [1.000]
1,How do you prioritize tasks?,"I look at what's most urgent and important, then make a list. I fo...","I prioritize tasks by assessing urgency, importance, and effort so...",✔️ [0.670]


In [11]:
print(optimized_qa.signature.instructions)

You are answering job interview questions.

Input format:
- You will receive an object with a single field: question: <the interviewer’s question as plain text>.

Output format:
- Return only the answer, as plain text. No headings or labels.
- Keep it concise (about 110–220 words).
- Bullets are allowed for short step lists; avoid heavy formatting.

How to answer:
- Start with a clear 1–2 sentence direct answer to the question.
- If helpful, add a brief 3–5 bullet list of your approach or steps in plain language.
- Include one short example story using STAR (Situation, Task, Action, Result). Keep it realistic, use first person, and add a simple metric or observable outcome when possible.
- End with a one‑line takeaway that ties back to the role’s value.
- If the question is future‑focused (e.g., “five years”), emphasize learning, contribution, and flexibility.
- If no role/domain is given, keep examples broadly applicable (e.g., office, tech, operations) and avoid niche acronyms. Use d