<a href="https://colab.research.google.com/github/G10hdz/promptimprover/blob/main/prompt_improver_multiAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# promptsmith_unified_final.py
from __future__ import annotations
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
import asyncio
import json
import datetime

# ─────────────────────────────────────────────────────────────────────────────
# Try to import your orchestration primitives and runtime agents.
# If not available (e.g., in Colab/VS Code sandbox), we provide drop-in shims.
# ─────────────────────────────────────────────────────────────────────────────
try:
    from agents import Agent, Runner, trace  # Provided by your stack
except Exception:
    # Minimal fallbacks so the file can import and RUN without the real stack.
    from contextlib import contextmanager

    class Agent:  # type: ignore
        """Plain record of an agent's configuration; it performs no work itself."""

        def __init__(self, name: str, model: str, output_type: Any, instructions: str = ""):
            self.name = name
            self.model = model
            self.output_type = output_type
            self.instructions = instructions

    class _DummyRes:
        """Result wrapper exposing the `final_output` attribute callers read."""

        def __init__(self, out: Any):
            self.final_output = out

    @contextmanager
    def trace(name: str):  # type: ignore
        """No-op stand-in for the tracing context manager."""
        yield

    class Runner:  # type: ignore
        """Offline runner that fabricates outputs instead of calling a model."""

        @staticmethod
        async def run(agent: Agent, payload: Any):
            """
            Produce outputs consistent with the agent's declared output_type.
            This dummy lets the workflow run end-to-end without external APIs.
            """
            ot = agent.output_type
            # Simulate issues so the rewrite and evaluation paths are exercised.
            if ot is Issues:
                has = "format" in agent.name or "contradiction" in agent.name
                return _DummyRes(Issues(issues=(["dummy-issue"] if has else []), has_issues=has))
            if ot is FewShotIssues:
                return _DummyRes(FewShotIssues.no_issues())
            if ot is DevRewriteOutput:
                # `text` is always a str here: either the payload itself or its JSON dump.
                text = payload if isinstance(payload, str) else json.dumps(payload)[:400]
                msg = f"[REWRITTEN by {agent.name}/{agent.model}] " + text
                return _DummyRes(DevRewriteOutput(new_developer_message=msg))
            if ot is MessagesOutput:
                try:
                    data = json.loads(payload) if isinstance(payload, str) else payload
                except (TypeError, ValueError):
                    data = {"ORIGINAL_MESSAGES": []}
                # Valid JSON may still be a list/str/number; only a dict carries the key.
                msgs = data.get("ORIGINAL_MESSAGES", []) if isinstance(data, dict) else []
                return _DummyRes(MessagesOutput(messages=msgs))
            if ot is EvaluationOutput:
                # Simulate a "reasonable" score.
                return _DummyRes(EvaluationOutput(score_improvement=8, is_improvement=True, explanation="Simulated OK"))
            # Default: return the payload text
            return _DummyRes(f"[{agent.name}:{agent.model}] {str(payload)[:200]}")


# Pydantic models for outputs used by agents.
# These (and the default agents below) are defined regardless of which branch
# above ran: previously they existed only in the fallback branch, so a
# successful `agents` import left them undefined unless a project runtime
# supplied its own.
class ChatMessage(BaseModel):
    # One conversation turn: the speaker (`role`) and the text (`content`).
    role: str
    content: str


class Issues(BaseModel):
    # Checker verdict: the problems found plus a flag that triggers a rewrite.
    issues: List[str] = Field(default_factory=list)
    has_issues: bool = False

    def model_dump(self):
        return {"issues": self.issues, "has_issues": self.has_issues}


class FewShotIssues(Issues):
    @classmethod
    def no_issues(cls):
        # Convenience constructor for the "nothing to fix" verdict.
        return cls(issues=[], has_issues=False)


class DevRewriteOutput(BaseModel):
    # Output of the developer-message rewriter.
    new_developer_message: str


class MessagesOutput(BaseModel):
    # Output of the few-shot rewriter: a list of {"role", "content"} dicts.
    messages: List[Dict[str, str]]


class EvaluationOutput(BaseModel):
    # Output of the evaluator; the defaults describe a passing evaluation.
    score_improvement: int = 8
    is_improvement: bool = True
    explanation: str = "OK"


def _normalize_messages(msgs: List[Any]):  # noqa
    """Return `msgs` as a list of ``{"role", "content"}`` dicts.

    Accepts objects exposing `.role` / `.content` as well as plain dicts; the
    workflow passes dicts once the few-shot rewriter has replaced the messages.
    A dict lacking a key yields an empty string for that key.
    """
    normalized = []
    for m in msgs:
        if isinstance(m, dict):
            normalized.append({"role": str(m.get("role", "")), "content": str(m.get("content", ""))})
        else:
            normalized.append({"role": m.role, "content": m.content})
    return normalized


# Define default agents (names and models kept to your spec)
dev_contradiction_checker = Agent(
    name="contradiction_detector", model="gpt-5-mini", output_type=Issues,
    instructions="Detect contradictions and return Issues JSON."
)
format_checker = Agent(
    name="format_checker", model="gpt-5-mini", output_type=Issues,
    instructions="Detect format/spec problems and return Issues JSON."
)
fewshot_consistency_checker = Agent(
    name="fewshot_consistency_checker", model="gpt-5-mini", output_type=FewShotIssues,
    instructions="Check few-shot consistency and return FewShotIssues JSON."
)
dev_rewriter = Agent(
    name="dev_rewriter", model="gpt-5", output_type=DevRewriteOutput,
    instructions="Rewrite developer message to fix issues; return new_developer_message."
)
fewshot_rewriter = Agent(
    name="fewshot_rewriter", model="gpt-5", output_type=MessagesOutput,
    instructions="Rewrite few-shot messages; return messages list."
)
prompt_evaluator_agent = Agent(
    name="prompt_evaluator", model="gpt-5", output_type=EvaluationOutput,
    instructions="Evaluate improvement; output score_improvement, is_improvement, explanation."
)

# If your runtime exists, import it (overrides shims above).
# A successful import rebinds the agent objects, the message/issue models and
# `_normalize_messages` to the project's own implementations.
try:
    from main_shared_runtime import (  # <- change path if needed
        dev_contradiction_checker,
        format_checker,
        fewshot_consistency_checker,
        dev_rewriter,
        fewshot_rewriter,
        prompt_evaluator_agent,
        ChatMessage, Issues, FewShotIssues,
        _normalize_messages,
    )
except Exception:
    # Keep using whatever was defined above. Any failure while importing the
    # runtime (not just a missing module) is swallowed silently here.
    pass

# ─────────────────────────────────────────────────────────────────────────────
# Multi-platform output schema (OpenAI · Gemini · Claude)
# ─────────────────────────────────────────────────────────────────────────────
# Result container returned by create_multi_optimized_outputs: one rewritten
# prompt per platform plus the list of practices applied for each.
class MultiOptimizedOutput(BaseModel):
    """Platform-specific optimized prompts"""
    # Rewritten prompt text, one field per target platform.
    openai_optimized_prompt: str = Field(description="Prompt optimized for OpenAI (GPT-5, GPT-4o, o1)")
    gemini_optimized_prompt: str = Field(description="Prompt optimized for Google Gemini")
    claude_optimized_prompt: str = Field(description="Prompt optimized for Anthropic Claude")
    # Human-readable practice labels; all three fields are required (no default).
    openai_best_practices: List[str] = Field(description="OpenAI-specific practices applied")
    gemini_best_practices: List[str] = Field(description="Gemini-specific practices applied")
    claude_best_practices: List[str] = Field(description="Claude-specific practices applied")

# Platform optimizers (these RUN with GPT-5 to WRITE optimized prompts)
# Each optimizer receives the unified prompt as its input and is declared with
# output_type=str, i.e. it is expected to return the rewritten prompt as text.

# Optimizer targeting OpenAI models; the instruction text below is sent to the
# agent verbatim (raw string, so backslashes are not interpreted).
openai_optimizer = Agent(
    name="openai_optimizer", model="gpt-5", output_type=str,
    instructions=r"""
You are an OpenAI Prompt Optimization Specialist.
Task: Transform UNIFIED_PROMPT into an OpenAI-optimized prompt (GPT-5, GPT-4o, o1).
Apply these practices:
- System/User separation
- Numbered steps for complex tasks
- Output format specified at the top
- Clear delimiters (### or ```), no markdown fences around JSON when in JSON mode
- 1–2 few-shot examples only if format is complex
- Temperature guidance (0 for factual, ~0.7 creative)
- Token/length constraints when needed
- Start with a specific role ("You are a …")
- State constraints BEFORE instructions
- If structured output, explicitly request JSON and give a minimal schema
Input: UNIFIED_PROMPT
Output: ONLY the OpenAI-optimized prompt as plain text.
"""
)

# Optimizer targeting Google Gemini. It runs on the "gpt-5" model and is
# declared with output_type=str; the raw-string instructions are passed as-is.
gemini_optimizer = Agent(
    name="gemini_optimizer", model="gpt-5", output_type=str,
    instructions=r"""
You are a Google Gemini Prompt Optimization Specialist.
Transform UNIFIED_PROMPT into a Gemini-optimized prompt.
Apply these practices:
- Conversational, context-rich framing
- Multi-turn awareness (assume prior chat)
- Encourage explicit reasoning ("think through")
- Markdown for structure and scannability
- Multimodal hints when applicable (images/docs)
- Positive framing (avoid excessive "don't")
- Safety/educational intent explicit
- Dialogue-style examples (user/assistant) if using few-shot
Input: UNIFIED_PROMPT
Output: ONLY the Gemini-optimized prompt as plain text.
"""
)

# Optimizer targeting Anthropic Claude. It runs on the "gpt-5" model and is
# declared with output_type=str; the raw-string instructions are passed as-is.
claude_optimizer = Agent(
    name="claude_optimizer", model="gpt-5", output_type=str,
    instructions=r"""
You are an Anthropic Claude Prompt Optimization Specialist.
Transform UNIFIED_PROMPT into a Claude-optimized prompt.
Claude best practices to apply:
- Keep core instructions in a single, strong system-style preamble.
- Prefer explicit sections with XML-like tags or clear headers:
  <role>…</role>, <task>…</task>, <context>…</context>, <constraints>…</constraints>, <output>…</output>
- Set safety/ethics intent and audience explicitly (HHH: helpful, honest, harmless).
- Use precise guardrails ("Do not reveal chain-of-thought; return only final answer or brief rationale").
- If structured output: ask for strict JSON with a minimal schema and an example; forbid extra prose.
- Use few-shot exemplars only when critical; show input/output pairs; keep them compact.
- Provide length/format limits (words, bullets, tables) early.
- Prefer positive phrasing; list what to include before what to avoid.
- If stepwise reasoning is required, put it in a hidden scratchpad section and instruct not to expose it.
- Be deterministic: recommend low temperature for factual tasks.
Input: UNIFIED_PROMPT
Output: ONLY the Claude-optimized prompt as plain text.
"""
)

async def create_multi_optimized_outputs(unified_prompt: str) -> MultiOptimizedOutput:
    """Run the three platform optimizers concurrently on one prompt.

    Args:
        unified_prompt: The platform-neutral prompt handed to every optimizer.

    Returns:
        A MultiOptimizedOutput holding each optimizer's `final_output` (an
        empty string when a result has no such attribute) together with fixed
        lists of the practices each optimizer is instructed to apply.

    Raises:
        Whatever any of the optimizer runs raises; the gather is not guarded.
    """
    print("  → Generating platform-specific optimizations (OpenAI/Gemini/Claude)…")

    # Coroutines are created in OpenAI, Gemini, Claude order and awaited together.
    optimizers = (openai_optimizer, gemini_optimizer, claude_optimizer)
    openai_res, gemini_res, claude_res = await asyncio.gather(
        *[Runner.run(optimizer, unified_prompt) for optimizer in optimizers]
    )

    openai_practices = [
        "System/User separation",
        "Numbered steps",
        "Output format first",
        "Delimiters for sections",
        "Temperature guidance",
        "JSON mode with schema when needed",
    ]
    gemini_practices = [
        "Conversational, context-rich framing",
        "Multi-turn awareness",
        "Markdown structure",
        "Positive framing",
        "Explicit reasoning prompts",
    ]
    claude_practices = [
        "Single strong preamble (system-style)",
        "XML-like tagged sections",
        "HHH intent & audience set",
        "Hide chain-of-thought; final answer only",
        "Strict JSON with schema + example",
        "Concise few-shot IO pairs only when needed",
        "Early length/format limits",
        "Positive phrasing + explicit inclusions",
        "Optional hidden scratchpad with non-exposure rule",
        "Low temperature for factual tasks",
    ]

    return MultiOptimizedOutput(
        openai_optimized_prompt=getattr(openai_res, "final_output", ""),
        gemini_optimized_prompt=getattr(gemini_res, "final_output", ""),
        claude_optimized_prompt=getattr(claude_res, "final_output", ""),
        openai_best_practices=openai_practices,
        gemini_best_practices=gemini_practices,
        claude_best_practices=claude_practices,
    )

# ─────────────────────────────────────────────────────────────────────────────
# Unified optimize wrapper (adds multi-platform outputs after rewrite)
# ─────────────────────────────────────────────────────────────────────────────
async def optimize_prompt_parallel_with_platforms(
    developer_message: str,
    messages: List["ChatMessage"],
    *, create_platform_outputs: bool = True,
) -> Dict[str, Any]:
    """
    Check, rewrite, and evaluate a developer prompt, then add per-platform versions.

    Wraps your existing optimize steps and appends platform-specific outputs
    when a rewrite occurs. Works with GPT-5-mini (checkers) + GPT-5 (writer/judge).

    Args:
        developer_message: The prompt to inspect and possibly rewrite.
        messages: Few-shot messages; each must expose `.role` and `.content`.
            When empty, the few-shot consistency check is skipped.
        create_platform_outputs: When False, the per-platform optimizers are
            never run.

    Returns:
        A dict with keys ``changes_made``, ``unified_prompt``,
        ``openai_optimized`` / ``gemini_optimized`` / ``claude_optimized``
        (equal to ``unified_prompt`` when no platform output was produced),
        the three ``*_best_practices`` lists, ``new_messages`` and
        ``evaluation`` (``None`` unless the prompt was rewritten).

    Raises:
        Errors from the checker or rewriter runs propagate; only failures of
        the platform fan-out and the evaluation step are caught.
    """
    with trace("optimize_prompt_workflow"):
        # 1) Run checkers in parallel
        tasks = [
            Runner.run(dev_contradiction_checker, developer_message),
            Runner.run(format_checker, developer_message),
        ]
        if messages:
            fs_input = {
                "DEVELOPER_MESSAGE": developer_message,
                "USER_EXAMPLES": [m.content for m in messages if m.role == "user"],
                "ASSISTANT_EXAMPLES": [m.content for m in messages if m.role == "assistant"],
            }
            tasks.append(Runner.run(fewshot_consistency_checker, json.dumps(fs_input)))

        results = await asyncio.gather(*tasks)
        cd_issues: Issues = results[0].final_output
        fi_issues: Issues = results[1].final_output
        # The third result exists only when the few-shot checker was scheduled.
        fs_issues: FewShotIssues = (results[2].final_output if len(results) > 2 else FewShotIssues.no_issues())

        # 2) Rewrite if needed
        final_prompt = developer_message
        prompt_was_rewritten = False
        if getattr(cd_issues, "has_issues", False) or getattr(fi_issues, "has_issues", False):
            pr_input = {
                "ORIGINAL_DEVELOPER_MESSAGE": developer_message,
                "CONTRADICTION_ISSUES": cd_issues.model_dump(),
                "FORMAT_ISSUES": fi_issues.model_dump(),
            }
            pr_res = await Runner.run(dev_rewriter, json.dumps(pr_input))
            # The rewriter may answer with a model instance or a plain dict.
            new_msg = getattr(pr_res.final_output, "new_developer_message", None)
            if new_msg is None and isinstance(pr_res.final_output, dict):
                new_msg = pr_res.final_output.get("new_developer_message", developer_message)
            final_prompt = new_msg or developer_message
            if final_prompt != developer_message:
                prompt_was_rewritten = True

        # Few-shot rewrite if needed
        final_messages: List[ChatMessage] | List[Dict[str, str]] = messages
        few_shots_were_rewritten = False
        if getattr(fs_issues, "has_issues", False):
            mr_input = {
                "NEW_DEVELOPER_MESSAGE": final_prompt,
                "ORIGINAL_MESSAGES": _normalize_messages(messages),
                "FEW_SHOT_ISSUES": fs_issues.model_dump(),
            }
            mr_res = await Runner.run(fewshot_rewriter, json.dumps(mr_input))
            msgs = getattr(mr_res.final_output, "messages", None)
            if msgs is None and isinstance(mr_res.final_output, dict):
                msgs = mr_res.final_output.get("messages")
            # Only adopt (and report) a rewrite when the rewriter actually
            # produced messages; otherwise keep the originals instead of
            # returning None while claiming a change was made.
            if msgs is not None:
                final_messages = msgs
                few_shots_were_rewritten = True

        # 3) Platform-specific outputs
        multi_output = None
        if prompt_was_rewritten and create_platform_outputs:
            try:
                multi_output = await create_multi_optimized_outputs(final_prompt)
                print("  ✓ Platform-specific versions (OpenAI/Gemini/Claude) generated")
            except Exception as e:
                # Best effort: fall back to the unified prompt for every platform.
                print(f"  ⚠ Error creating platform outputs: {e}")
                multi_output = None

        # 4) Evaluate improvement (for unified only; per-platform eval is below in run loop)
        evaluation_results = None
        if prompt_was_rewritten:
            try:
                eval_input = {"ORIGINAL_PROMPT": developer_message, "NEW_PROMPT": final_prompt}
                eval_res = await Runner.run(prompt_evaluator_agent, json.dumps(eval_input))
                payload = getattr(eval_res, "final_output", {})
                if hasattr(payload, "model_dump"):
                    evaluation_results = payload.model_dump()
                else:
                    evaluation_results = payload
            except Exception as e:
                evaluation_results = {"error": str(e)}

        # 5) Final payload
        changes_made = prompt_was_rewritten or few_shots_were_rewritten
        return {
            "changes_made": changes_made,
            "unified_prompt": final_prompt,
            "openai_optimized": (multi_output.openai_optimized_prompt if multi_output else final_prompt),
            "gemini_optimized": (multi_output.gemini_optimized_prompt if multi_output else final_prompt),
            "claude_optimized": (multi_output.claude_optimized_prompt if multi_output else final_prompt),
            "openai_best_practices": (multi_output.openai_best_practices if multi_output else []),
            "gemini_best_practices": (multi_output.gemini_best_practices if multi_output else []),
            "claude_best_practices": (multi_output.claude_best_practices if multi_output else []),
            "new_messages": _normalize_messages(final_messages) if isinstance(final_messages, list) else final_messages,
            "evaluation": evaluation_results,
        }

# ─────────────────────────────────────────────────────────────────────────────
# Pretty table helpers
# ─────────────────────────────────────────────────────────────────────────────
async def _evaluate_platforms(original_prompt: str, out: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """
    Score each prompt version in `out` against the original prompt.

    The versions (unified, openai, gemini, claude) are evaluated one after
    another with prompt_evaluator_agent. Returns a dict keyed by version name
    whose values hold `score`, `is_improvement` and `explanation`; a version
    whose evaluation raises gets score 0 and an "error: ..." explanation.
    """

    async def _judge(candidate: Any) -> Dict[str, Any]:
        # Evaluate a single candidate; never raises, so one failure does not
        # prevent the remaining versions from being scored.
        try:
            request = json.dumps({"ORIGINAL_PROMPT": original_prompt, "NEW_PROMPT": candidate})
            verdict = await Runner.run(prompt_evaluator_agent, request)
            payload = getattr(verdict, "final_output", {})
            if hasattr(payload, "model_dump"):
                payload = payload.model_dump()
            return {
                "score": payload.get("score_improvement", 0),
                "is_improvement": payload.get("is_improvement", False),
                "explanation": payload.get("explanation", "N/A"),
            }
        except Exception as e:
            return {"score": 0, "is_improvement": False, "explanation": f"error: {e}"}

    # All candidates are looked up before the first evaluation starts.
    candidates = {
        "unified": out.get("unified_prompt", ""),
        "openai": out.get("openai_optimized", ""),
        "gemini": out.get("gemini_optimized", ""),
        "claude": out.get("claude_optimized", ""),
    }

    scored: Dict[str, Dict[str, Any]] = {}
    for label, text in candidates.items():
        scored[label] = await _judge(text)
    return scored

def _print_attempt_table(attempt_idx: int, eval_map: Dict[str, Dict[str, Any]]) -> None:
    print(f"\n   Attempt {attempt_idx}: scores by platform")
    print("   " + "-" * 58)
    print(f"   {'Platform':<12} {'Improved':<10} {'Score':<8} Explanation")
    print("   " + "-" * 58)
    for name in ["unified", "openai", "gemini", "claude"]:
        data = eval_map.get(name, {"score": 0, "is_improvement": False, "explanation": "N/A"})
        improved = "✓" if data.get("is_improvement") else "✗"
        score = data.get("score", 0)
        expl = str(data.get("explanation", ""))[:60]
        print(f"   {name:<12} {improved:<10} {score:<8} {expl}")
    print("   " + "-" * 58)

# ─────────────────────────────────────────────────────────────────────────────
# Test suite — 6 impressive, realistic cases
# ─────────────────────────────────────────────────────────────────────────────
# Each case is a dict with an "id", the "prompt" to optimize, and an initially
# empty "messages" list (a separate list object per case).
TEST_CASES = [
    {"id": case_id, "prompt": case_prompt, "messages": []}
    for case_id, case_prompt in (
        (
            "medical_guideline_explainer",
            "Explain hypertension management for doctors vs patients using clear structure and empathy.",
        ),
        (
            "financial_summary_generator",
            "Summarize quarterly earnings for a fintech startup highlighting trends and growth metrics for investors.",
        ),
        (
            "educational_curriculum_builder",
            "Design a 4-week AI ethics curriculum for high school students blending philosophy and technology.",
        ),
        (
            "creative_story_corporate_metaphor",
            "Write a short allegorical story about burnout recovery using an office as metaphor for a phoenix nest.",
        ),
        (
            "software_doc_refactor",
            "Refactor documentation for an open-source project to improve onboarding clarity and code examples.",
        ),
        (
            "data_analysis_causal_reasoning",
            "Analyze customer churn dataset explaining causal insights rather than correlations, with clear reasoning.",
        ),
    )
]

# Number of attempts per test case (lets you see how results evolve across runs)
NUM_TRIALS = 3

# ─────────────────────────────────────────────────────────────────────────────
# Main runner: executes several attempts per test case
# ─────────────────────────────────────────────────────────────────────────────
async def run_all_tests() -> List[Dict[str, Any]]:
    """Run every case in TEST_CASES NUM_TRIALS times and report the scores.

    For each attempt the prompt is optimized, each platform version is
    evaluated, and a score table is printed. A failed optimization is reported
    and that attempt is skipped. After all cases a summary table is printed
    and the results are written to a timestamped JSON file in the current
    directory (a write failure is reported, not raised).

    Returns:
        One dict per test case with the attempt history, the best attempt
        (by average score across platforms) and the unified score of the last
        successful attempt.
    """
    heavy_rule = "=" * 90
    light_rule = "-" * 90
    platforms = ("unified", "openai", "gemini", "claude")

    print("\n" + heavy_rule)
    print("🚀 Running PromptSmith Unified (GPT-5-mini core + Multi-Platform Outputs)")
    print(heavy_rule, flush=True)

    results: List[Dict[str, Any]] = []
    total = len(TEST_CASES)
    start_time = datetime.datetime.now()

    for position, case in enumerate(TEST_CASES, start=1):
        print(f"\n[{position}/{total}] ▶ {case['id']}")
        print(light_rule)
        print(f"🧩 Original prompt:\n{case['prompt']}\n", flush=True)

        attempts_history = []
        best_score_overall = -1.0
        best_attempt_idx = 0

        for attempt in range(1, NUM_TRIALS + 1):
            try:
                # The prompt doubles as a single user few-shot message when a
                # ChatMessage type is available in this module.
                if 'ChatMessage' in globals():
                    few_shots = [ChatMessage(role="user", content=case["prompt"])]
                else:
                    few_shots = []
                out = await optimize_prompt_parallel_with_platforms(
                    developer_message=case["prompt"],
                    messages=few_shots,
                )
            except Exception as e:
                print(f"⚠️ Error in test {case['id']} attempt {attempt}: {e}", flush=True)
                continue

            # Evaluate each platform version
            eval_map = await _evaluate_platforms(case["prompt"], out)

            # Print a preview plus whether each platform version differs
            unified_text = out.get("unified_prompt", "") or ""
            print(f"\n🧠 Unified prompt (attempt {attempt} preview):")
            print(unified_text[:400] + "...\n")

            def _versus_unified(key: str) -> str:
                if out.get(key, "") == out.get("unified_prompt", ""):
                    return "Same as unified"
                return "OK"

            print(f"🤖 Platform-specific versions (attempt {attempt}):")
            print("  • OpenAI  →", _versus_unified("openai_optimized"))
            print("  • Gemini  →", _versus_unified("gemini_optimized"))
            print("  • Claude  →", _versus_unified("claude_optimized"))

            # Show the table for this attempt
            _print_attempt_table(attempt, eval_map)

            # Track the best attempt (average over the four versions)
            unified_s, openai_s, gemini_s, claude_s = [eval_map[p]["score"] for p in platforms]
            avg_score_this_attempt = (unified_s + openai_s + gemini_s + claude_s) / 4.0
            if avg_score_this_attempt > best_score_overall:
                best_score_overall = avg_score_this_attempt
                best_attempt_idx = attempt

            record = {"attempt": attempt}
            for p in platforms:
                record[p] = eval_map[p]
            record["unified_preview"] = unified_text[:200]
            attempts_history.append(record)

        # Per-test summary: the last successful attempt is the visible "final state"
        improvement_score = 0
        is_improvement = False
        if attempts_history:
            final_unified = attempts_history[-1]["unified"]
            improvement_score = final_unified["score"]
            is_improvement = final_unified["is_improvement"]

        results.append({
            "test_id": case["id"],
            "attempts": attempts_history,
            "best_attempt": best_attempt_idx,
            "best_avg_score": best_score_overall,
            "final_unified_score": improvement_score,
            "final_unified_is_improvement": is_improvement,
        })

        print(f"\n🏅 Best attempt for '{case['id']}': Attempt {best_attempt_idx} (avg score={best_score_overall:.1f})")
        print(heavy_rule, flush=True)

    print("\n🏁 All tests completed.")
    end_time = datetime.datetime.now()
    print(f"🕒 Duration: {end_time - start_time}")
    print(f"✅ Total processed: {len(results)}", flush=True)

    # Global summary (uses the best attempt of each test)
    print("\n" + heavy_rule)
    print("RESULTADOS FINALES — Mejor intento por test (avg score across platforms)")
    print(heavy_rule)
    print(f"{'Test Case':<35} {'BestAttempt':<12} {'BestAvgScore':<14}")
    print(light_rule)
    for entry in results:
        print(f"{entry['test_id']:<35} {entry['best_attempt']:<12} {entry['best_avg_score']:<14.1f}")
    print(heavy_rule)

    # Save results
    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    out_name = f"promptsmith_unified_results_{stamp}.json"
    try:
        with open(out_name, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\n💾 Results saved to: {out_name}", flush=True)
    except Exception as e:
        print(f"⚠️ Could not save results: {e}", flush=True)

    return results

# ─────────────────────────────────────────────────────────────────────────────
# Colab/VSCode entrypoint
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Decide up front whether an event loop is already running (Colab/Jupyter)
    # instead of calling asyncio.run() and catching RuntimeError afterwards.
    # The old form also caught any RuntimeError raised *inside* the test run
    # and then executed the whole suite a second time, and in a notebook it
    # created a coroutine that was never awaited.
    try:
        _running_loop = asyncio.get_running_loop()
    except RuntimeError:
        _running_loop = None

    if _running_loop is None:
        # Plain script: no loop yet, so asyncio.run() can own one.
        asyncio.run(run_all_tests())
    else:
        # Notebook: patch the live loop so it can be re-entered.
        import nest_asyncio
        nest_asyncio.apply(_running_loop)
        _running_loop.run_until_complete(run_all_tests())



🚀 Running PromptSmith Unified (GPT-5-mini core + Multi-Platform Outputs)

[1/6] ▶ medical_guideline_explainer
------------------------------------------------------------------------------------------
🧩 Original prompt:
Explain hypertension management for doctors vs patients using clear structure and empathy.

  → Generating platform-specific optimizations (OpenAI/Gemini/Claude)…
  ✓ Platform-specific versions (OpenAI/Gemini/Claude) generated

🧠 Unified prompt (attempt 1 preview):
[REWRITTEN by dev_rewriter/gpt-5] {"ORIGINAL_DEVELOPER_MESSAGE": "Explain hypertension management for doctors vs patients using clear structure and empathy.", "CONTRADICTION_ISSUES": {"issues": ["dummy-issue"], "has_issues": true}, "FORMAT_ISSUES": {"issues": ["dummy-issue"], "has_issues": true}}...

🤖 Platform-specific versions (attempt 1):
  • OpenAI  → OK
  • Gemini  → OK
  • Claude  → OK

   Attempt 1: scores by platform
   ----------------------------------------------------------
   Platform     Improv

  _asyncio.get_event_loop().run_until_complete(run_all_tests())
