In [45]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import display,Markdown
import json

In [2]:
# Run python-dotenv's loader before any os.getenv() lookup below, so that keys
# kept in a .env file are visible to this kernel.
# Left as a bare expression so the notebook shows the call's return value.
load_dotenv()


True

In [3]:
# Read each provider's API key from the environment.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# os.getenv() returns None for an unset variable, and that None would otherwise
# flow silently into the client constructors. Report missing keys here, where
# the cause is obvious, instead of failing later inside an API call.
# NOTE(review): the OpenAI SDK is understood to fall back to the OPENAI_API_KEY
# environment variable when api_key is None, which would send the OpenAI key to
# another provider's base_url - confirm, and consider raising here instead.
for _key_name, _key_value in (
    ("OPENAI_API_KEY", OPENAI_API_KEY),
    ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    ("GROQ_API_KEY", GROQ_API_KEY),
):
    if not _key_value:
        print(f"WARNING: {_key_name} is not set; the client built from it will not authenticate correctly.")


In [14]:
# Base URLs passed to the OpenAI SDK for the two non-OpenAI providers.
GOOGLE_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai"
GROQ_BASE_URL = "https://api.groq.com/openai/v1"

# One client per provider. All three use the same OpenAI class; only the key
# and (for Google and Groq) the base URL differ.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
google_client = OpenAI(base_url=GOOGLE_BASE_URL, api_key=GOOGLE_API_KEY)
groq_client = OpenAI(base_url=GROQ_BASE_URL, api_key=GROQ_API_KEY)


In [16]:
# Each competitor pairs a model name with the client used to call that model.
# The list order matters: later cells refer to answers by position.
competitors = [
    dict(name="gemini-2.0-flash", client=google_client),
    dict(name="gpt-4o-mini", client=openai_client),
    dict(name="llama-3.3-70b-versatile", client=groq_client),
    dict(name="o3-mini", client=openai_client),
]

# Holds one response text per competitor; filled in by the answer-collection cell.
answers = []

In [6]:
# Prompt used to have a model write the question the competitors will answer.
question_prompt = (
    "Generate a complex research question on explainable AI "
    "for different AI models to answer. "
    "Just generate the question, no other text."
)

In [None]:
# Single-turn chat payload carrying the question-generation prompt.
message = [dict(role="user", content=question_prompt)]

# Bare expression so the notebook displays the payload for inspection.
message

In [8]:
# Ask o3-mini to write the question that every competitor will answer.
response = openai_client.chat.completions.create(
    messages=message,
    model="o3-mini",
)

# Keep only the generated text from the first choice and show it.
debate_question = response.choices[0].message.content
print(debate_question)

How do the intrinsic design, learning mechanisms, and data representations across diverse AI models impact the formulation, effectiveness, and evaluation of explainable AI techniques, and what standardized methodologies can be developed to ensure consistent interpretability and actionable insights across these heterogeneous systems?


In [9]:
# Build the prompt that will be sent to every competitor.
#
# The instruction-wrapped text gets its own name instead of overwriting
# `debate_question`. Overwriting made this cell non-idempotent (each re-run
# nested another "you are a debate expert..." prefix) and caused any later use
# of `debate_question` to see the competitor instructions instead of the
# question itself.
debate_prompt = f"you are a debate expert. You are given a question and you need to answer it. Make sure to answer to the best of your ability. The question is: {debate_question}"

debate_message = [{
    "role": "user",
    "content": debate_prompt
}]

print(debate_message)

[{'role': 'user', 'content': 'you are a debate expert. You are given a question and you need to answer it. Make sure to answer to the best of your ability. The question is: How do the intrinsic design, learning mechanisms, and data representations across diverse AI models impact the formulation, effectiveness, and evaluation of explainable AI techniques, and what standardized methodologies can be developed to ensure consistent interpretability and actionable insights across these heterogeneous systems?'}]


In [28]:
# Start from an empty list every time this cell runs. Appending to a list that
# was created in another cell meant a re-run duplicated every answer, which
# breaks the position-based mapping between `answers` and `competitors`.
answers = []

# Send the same debate prompt to each competitor, in `competitors` order, and
# keep the text of the first choice so answers[i] belongs to competitors[i].
for competitor in competitors:
    print(f"Getting answer from {competitor['name']}")
    response = competitor["client"].chat.completions.create(model=competitor["name"], messages=debate_message)
    answers.append(response.choices[0].message.content)


Getting answer from gemini-2.0-flash
Getting answer from gpt-4o-mini
Getting answer from llama-3.3-70b-versatile
Getting answer from o3-mini


In [None]:
# Render each collected answer as Markdown.
# display() does the rendering itself and returns None, so wrapping it in
# print() only added a stray "None" line after every answer.
for answer in answers:
    display(Markdown(answer))


In [47]:
# Label each answer with its 1-based position; the judge refers to answers by
# this number and the ranking cell maps it back to `competitors`.
competitors_response = "".join(
    f"Answer {position}:\n{answer}\n\n"
    for position, answer in enumerate(answers, start=1)
)

# The rank range is taken from the number of answers actually shown to the
# judge, not hard-coded, so the instructions stay correct if the roster changes.
answer_count = len(answers)

judge_prompt = f"""
You are a debate judge. You are given a question and you need to judge the answers from the competitors. The question is: {debate_question}
You are given the following answers:
{competitors_response}

You need to rank the answers based on what you consider the best based on the question and the following criteria:
- The answer should be the most relevant to the question
- The answer should be the most accurate to the question
- The answer should be the most comprehensive to the question
- The answer should be the most persuasive to the question
- The answer should be the most logical to the question
- The answer should be the most consistent to the question

The rank should be from 1 to {answer_count}. Where 1 is the best and {answer_count} is the worst.

provide the answer in json format with the following structure:
{{
    "rank": [
        {{
            "index": "index of the answer",
            "score": "score of the answer"
        }}
    ]
}}

Do not include any other text in your response. And don't respond with markdown.
"""


In [None]:
# Show the fully rendered judge prompt so it can be checked before it is sent.
print(judge_prompt)

In [None]:
# Single-turn chat payload carrying the judging instructions and all answers.
judge_message = [dict(role="user", content=judge_prompt)]

# o3-mini acts as the judge; the raw reply is printed before any parsing.
judge_response = openai_client.chat.completions.create(
    messages=judge_message,
    model="o3-mini",
)
print(judge_response.choices[0].message.content)

In [None]:
# Parse the judge's reply as JSON. json.loads raises if the reply is not valid
# JSON (for example, if the model added extra text around it).
judge_text = judge_response.choices[0].message.content
results_dict = json.loads(judge_text)
print(results_dict)

In [57]:
# Print the ranking. The list position is used as the rank, and each entry's
# "index" is treated as the 1-based answer number, which maps back to
# `competitors` by position.
# NOTE(review): this assumes the judge returns entries ordered best-first - confirm.
for place, entry in enumerate(results_dict["rank"], start=1):
    ranked_competitor = competitors[int(entry["index"]) - 1]
    print(f"Rank {place}: {ranked_competitor['name']}")

Rank 1: gemini-2.0-flash
Rank 2: o3-mini
Rank 3: llama-3.3-70b-versatile
Rank 4: gpt-4o-mini
