In [None]:
# Start with imports - ask ChatGPT to explain any package that you don't know

import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from anthropic import Anthropic
from IPython.display import Markdown, display

In [None]:
# Always remember to do this!
# Load the variables from the .env file into the process environment.
# override=True lets values from .env take priority over variables that are already set.
load_dotenv(override=True)
# The next cell prints the key prefixes to help with any debugging

In [None]:
# Read every provider key from the environment; later cells reuse these names.
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')


def _report_key(provider, key, prefix_length, optional=True):
    """Print whether `key` is set, revealing only its first `prefix_length` characters."""
    if key:
        print(f"{provider} API Key exists and begins {key[:prefix_length]}")
        return
    suffix = " (and this is optional)" if optional else ""
    print(f"{provider} API Key not set{suffix}")


# Only the OpenAI key is reported without the "optional" note.
_report_key("OpenAI", openai_api_key, 8, optional=False)
_report_key("Anthropic", anthropic_api_key, 7)
_report_key("Google", google_api_key, 2)
_report_key("DeepSeek", deepseek_api_key, 3)
_report_key("Groq", groq_api_key, 4)

In [None]:
# Step 1: ask one model to invent the question that every competitor will answer.
request = "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
request += "Answer only with the question, no explanation."
# Kept under its own name so `messages` below only ever means "the question sent to competitors".
request_messages = [{"role": "user", "content": request}]

print("request:", request)

openai = OpenAI()
response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=request_messages,
)
question = response.choices[0].message.content
print(question)

# Step 2: reset the competition state. Re-running this cell starts a fresh round:
# both lists are emptied and `messages` carries the newly generated question.
competitors = []  # model names, in the order they answered
answers = []      # answers[i] is the reply from competitors[i]
messages = [{"role": "user", "content": question}]

print(messages)

In [None]:
# Competitor: OpenAI. Uses the `openai` client and the `messages` list created in the question cell.
model_name="gpt-4o-mini"

response = openai.chat.completions.create(model=model_name, messages=messages)
answer = response.choices[0].message.content

# Show the answer, then record it; competitors[i] and answers[i] stay aligned by position.
# NOTE: re-running this cell appends a duplicate entry - re-run the question cell first to reset both lists.
display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)

In [None]:
# Competitor: Anthropic. Its API differs slightly from OpenAI's: max_tokens is required,
# and the reply text is read from response.content[0].text instead of choices[0].message.content.
model_name = "claude-3-7-sonnet-latest"

claude = Anthropic()
response = claude.messages.create(model=model_name, messages=messages, max_tokens=1000)
answer = response.content[0].text

# Show the answer, then record it; competitors[i] and answers[i] stay aligned by position.
display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)

In [None]:
# Competitor: Gemini, reached through Google's OpenAI-compatible endpoint.
model_name = "gemini-2.0-flash"

if not google_api_key:
    # Never build the client with api_key=None: the OpenAI SDK would fall back to
    # OPENAI_API_KEY and send that key to Google's endpoint. The key is optional,
    # so skip this competitor instead of failing the whole run.
    print(f"Google API Key not set - skipping {model_name}")
else:
    gemini = OpenAI(api_key=google_api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")
    response = gemini.chat.completions.create(model=model_name, messages=messages)
    answer = response.choices[0].message.content

    # Show the answer, then record it; competitors[i] and answers[i] stay aligned by position.
    display(Markdown(answer))
    competitors.append(model_name)
    answers.append(answer)

In [None]:
# Competitor: Llama 3.3 70B served by Groq through its OpenAI-compatible endpoint.
model_name = "llama-3.3-70b-versatile"

if not groq_api_key:
    # Never build the client with api_key=None: the OpenAI SDK would fall back to
    # OPENAI_API_KEY and send that key to Groq's endpoint. The key is optional,
    # so skip this competitor instead of failing the whole run.
    print(f"Groq API Key not set - skipping {model_name}")
else:
    groq = OpenAI(api_key=groq_api_key, base_url="https://api.groq.com/openai/v1")
    response = groq.chat.completions.create(model=model_name, messages=messages)
    answer = response.choices[0].message.content

    # Show the answer, then record it; competitors[i] and answers[i] stay aligned by position.
    display(Markdown(answer))
    competitors.append(model_name)
    answers.append(answer)

In [None]:
# Competitor: a model served by Ollama, reached through the OpenAI client
# pointed at http://localhost:11434/v1.
# NOTE(review): api_key='ollama' is presumably a placeholder, since the client wants a
# non-empty key - confirm the local server ignores it.

ollama = OpenAI(base_url="http://localhost:11434/v1", api_key='ollama')
model_name = "llama3.2"

response = ollama.chat.completions.create(model=model_name, messages=messages)
answer = response.choices[0].message.content

# Show the answer, then record it; competitors[i] and answers[i] stay aligned by position.
display(Markdown(answer))
competitors.append(model_name)
answers.append(answer)

In [None]:
# So where are we?

# Dump the raw lists first, then print each model name next to its answer.
print(competitors)
print(answers)
# zip walks both lists in lockstep, pairing competitors[i] with answers[i]
for competitor, answer in zip(competitors, answers):
    print(f"Competitor: {competitor}\n\n{answer}")

In [None]:
# Let's bring this together: one numbered section per answer (enumerate counts from 1 here)
sections = [
    f"# Response from competitor {position}\n\n" + reply + "\n\n"
    for position, reply in enumerate(answers, start=1)
]
together = "".join(sections)  # stays an empty string when there are no answers
print(together)

In [None]:
# Evaluator-optimizer: another agentic pattern (a self-loop / feedback loop). The generated
# output is fed to an evaluator, which either approves or rejects it; a rejected response
# is then regenerated in improved form.

# --- Critic + Reflection (self-refine) block
# Critic pattern: each competitor answer is critiqued by another agent.

# Keep a copy of the original answers (so we can critique and then refine)
original_answers = answers.copy()

critiqued_answers = []  # original answer followed by its critique, one entry per competitor
critiques = []          # critique text only, aligned with `competitors`

# 1) Critic step: produce a critique for each answer
for competitor, answer in zip(competitors, original_answers):
    critique_prompt = f"""You are a critic. Please evaluate the following response from {competitor} for:
- Clarity
- Factual correctness
- Depth

Suggest concrete improvements, but do not rewrite the full response.

Response:
{answer}
"""

    critique_messages = [{"role": "user", "content": critique_prompt}]
    try:
        resp = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=critique_messages,
            temperature=0.2,
            seed=42 # Use an integer seed for reproducibility
        )
        critique = resp.choices[0].message.content
    except Exception as e:
        # Best effort: record the failure as the critique text so that `critiques`
        # stays the same length as `competitors` and later cells can still zip them.
        critique = f"ERROR (critique failed): {e}"

    critiques.append(critique)
    critiqued_answers.append(f"{answer}\n\n**Critique:** {critique}")

In [None]:
# print(critiqued_answers)

# show all critiques together (neater than showing only the last)
display(Markdown("\n\n---\n\n".join(f"### {c}\n\n{cr}" for c, cr in zip(competitors, critiques))))


In [None]:
# 2) Reflection / Self-refine step: improve each original answer using the critique.
# Iterate over original_answers (the list the critic step saw) rather than `answers`:
# a later cell rebinds `answers` to the refined list, so re-running this cell with
# `answers` would pair already-refined text with critiques of the originals.
refined_answers = []
for competitor, answer, critique in zip(competitors, original_answers, critiques):
    refine_prompt = f"""You wrote the response below. Improve it using the critique.
Keep the same stance, but fix clarity, correctness, and depth. Be concise.

Original response:
{answer}

Critique:
{critique}

Revised response (final):
"""
    try:
        response2 = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": refine_prompt}],
            temperature=0.2,
            seed=42
        )
        refined = response2.choices[0].message.content
    except Exception as e:
        # fallback - if refinement fails, keep the original answer and note the failure
        refined = answer
        refined += f"\n\n[Refinement failed: {e}]"

    refined_answers.append(refined)
    # show each refined answer right away for transparency
    display(Markdown(f"### Refined answer for {competitor}\n\n{refined}"))
    

In [None]:



# The API we know well

# model_name = "gpt-4o-mini"

# response = openai.chat.completions.create(model=model_name, messages=messages)
# answer = response.choices[0].message.content

# display(Markdown(answer))
# competitors.append(model_name)
# answers.append(answer)


# Anthropic has a slightly different API, and Max Tokens is required

# model_name = "claude-3-7-sonnet-latest"

# claude = Anthropic()
# response = claude.messages.create(model=model_name, messages=messages, max_tokens=1000)
# answer = response.content[0].text

# display(Markdown(answer))
# competitors.append(model_name)
# answers.append(answer)



# gemini = OpenAI(api_key=google_api_key, base_url="https://generativelanguage.googleapis.com/v1beta/openai/")
# model_name = "gemini-2.0-flash"

# response = gemini.chat.completions.create(model=model_name, messages=messages)
# answer = response.choices[0].message.content

# display(Markdown(answer))
# competitors.append(model_name)
# answers.append(answer)


# deepseek = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com/v1")
# model_name = "deepseek-chat"

# response = deepseek.chat.completions.create(model=model_name, messages=messages)
# answer = response.choices[0].message.content

# display(Markdown(answer))
# competitors.append(model_name)
# answers.append(answer)

# deepseek = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com/v1")
# model_name = "deepseek-chat"

# response = deepseek.chat.completions.create(model=model_name, messages=messages)
# answer = response.choices[0].message.content

# display(Markdown(answer))
# competitors.append(model_name)
# answers.append(answer)
# groq = OpenAI(api_key=groq_api_key, base_url="https://api.groq.com/openai/v1")
# model_name = "llama-3.3-70b-versatile"

# response = groq.chat.completions.create(model=model_name, messages=messages)
# answer = response.choices[0].message.content

# display(Markdown(answer))
# competitors.append(model_name)
# answers.append(answer)






# for competitor, answer in zip(competitors, answers):
#     print(f"Competitor: {competitor}\n\n{answer}")






In [None]:
# answers = critiqued_answers
# 3) Use the refined answers for judging.
# From here on `answers` refers to the refined list; the untouched originals stay in
# `original_answers`.
# NOTE: the critic cell sets original_answers = answers.copy(), so re-running it after
# this cell would overwrite the originals with the refined text.
answers = refined_answers

In [None]:
# Let's bring this together: one numbered section per answer (enumerate counts from 1 here)
sections = [
    f"# Response from competitor {position}\n\n" + reply + "\n\n"
    for position, reply in enumerate(answers, start=1)
]
together = "".join(sections)  # stays an empty string when there are no answers
print(together)

In [None]:
# judge = f"""You are judging a competition between {len(competitors)} competitors.
# Each model has been given this question:

# {question}

# Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
# Respond with JSON, and only JSON, with the following format:

# {{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

# Here are the responses from each competitor:

# {together}

# Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks."""

def _format_responses(answers):
    """Concatenate the answers into numbered '# Response from competitor N' sections."""
    return "".join(
        f"# Response from competitor {number}\n\n{answer}\n\n"
        for number, answer in enumerate(answers, start=1)
    )


def _strip_code_fence(text):
    """Return `text` without a surrounding ``` / ```json markdown fence, if it has one."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (it may carry a language tag such as "json").
        cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return cleaned.strip()


def _resolve_ranking(ranks, competitors):
    """Map the judge's 1-based competitor numbers to competitor names, validating each one."""
    ordered = []
    for rank in ranks:
        number = int(rank)  # the judge may answer with "2" or 2
        # Reject 0 and negatives explicitly: competitors[number - 1] would otherwise
        # wrap around and silently name the wrong competitor.
        if not 1 <= number <= len(competitors):
            raise IndexError(
                f"Judge returned competitor number {number}, expected 1..{len(competitors)}"
            )
        ordered.append(competitors[number - 1])
    return ordered


def run_judging(stage_name, answers, competitors, question):
    """Ask the judge model to rank one stage of answers and print the ranking.

    Uses the notebook-level `openai` client with the "o3-mini" model.

    Args:
        stage_name: Label for this round (e.g. "Original"); shown in the prompt and the output header.
        answers: Answer texts, where answers[i] belongs to competitors[i].
        competitors: Model names, in the same order as `answers`.
        question: The question all competitors were asked.

    Returns:
        The competitor names ordered from best to worst, as ranked by the judge.

    Raises:
        json.JSONDecodeError: The judge's reply is not valid JSON (after removing a code fence).
        KeyError: The JSON has no "results" key.
        ValueError: A ranked entry cannot be converted to an integer.
        IndexError: A ranked competitor number is outside 1..len(competitors).
    """
    together = _format_responses(answers)

    judge_prompt = f"""You are judging a competition between {len(competitors)} competitors.
Each model has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:

{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor (stage: {stage_name}):

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks.
"""
    judge_messages = [{"role": "user", "content": judge_prompt}]
    response = openai.chat.completions.create(
        model="o3-mini",
        messages=judge_messages
    )
    results = response.choices[0].message.content
    print(f"\n===== {stage_name} Judging Results =====")
    print(results)

    # Models sometimes wrap the JSON in a markdown fence despite the instruction not to.
    results_dict = json.loads(_strip_code_fence(results or ""))
    ranked = _resolve_ranking(results_dict["results"], competitors)
    for position, competitor in enumerate(ranked, start=1):
        print(f"Rank {position}: {competitor}")
    return ranked

In [None]:


# print(judge)
# judge_messages =  [{"role": "user", "content": judge}]
# # Judgement time!

# # openai = OpenAI()
# # response = openai.chat.completions.create(
# #     model="o3-mini",
# #     messages=judge_messages,
# # )
# # results = response.choices[0].message.content
# # print(results)

# openai = OpenAI()
# response = openai.chat.completions.create(
#     model="o3-mini",
#     messages=judge_messages
# )
# results = response.choices[0].message.content
# print(results)
# # # OK let's turn this into results!

# # results_dict = json.loads(results)
# # ranks = results_dict["results"]
# # for index, result in enumerate(ranks):
# #     competitor = competitors[int(result)-1]
# #     print(f"Rank {index+1}: {competitor}")

# results_dict = json.loads(results)
# ranks = results_dict["results"]
# for index, result in enumerate(ranks):
#     competitor = competitors[int(result)-1]
#     print(f"Rank {index +1}: {competitor}")

# # The project above demonstrates multi-agent evaluation: several agents are asked to perform the same exercise, and their responses are collected.
# # Then there is one more pattern: Judge / referee - a separate LLM ("o3-mini") acts as a judge that compares and ranks the responses.
# # And the Orchestration design pattern - the code coordinates different APIs (OpenAI, Gemini, Ollama, Anthropic, DeepSeek, Groq).

# # Evaluator-optimizer - another agentic pattern (a self-loop design pattern with a feedback loop): the generated output goes as input to the evaluator and is either rejected or approved by it. If rejected, the next response generated will be an improved one.


# Judge every stage of the pipeline in turn: raw answers, answers with critiques attached, refined answers
for stage_label, stage_answers in (
    ("Original", original_answers),
    ("Critiqued", critiqued_answers),
    ("Refined", refined_answers),
):
    run_judging(stage_label, stage_answers, competitors, question)
