In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

The code below is the implementation of the uni model for Gemini 2.5 Pro (preview 03-25), evaluated on the GSM8K dataset: n = 100, accuracy = 98%.
It was also tested with Gemini 1.5 Flash: n = 100, accuracy = 92%.

In [None]:
import google.generativeai as genai
import os
import re # For extracting numbers from the response
from google.colab import userdata
from datasets import load_dataset # Import the datasets library

# --- API Key and Model Configuration ---
# Read the Gemini key from Colab secrets and register it with the SDK.
try:
    gemini_api_key = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=gemini_api_key)
    print("Gemini API Key configured.")
except (KeyError, userdata.SecretNotFoundError) as e:
    # A missing Colab secret surfaces as userdata.SecretNotFoundError rather than
    # KeyError; KeyError is still caught so previous behaviour is a strict subset.
    print("Please configure your Gemini API key securely (e.g., environment variable or Colab secrets).")
    raise ValueError("API Key not configured.") from e

MODEL_NAME = 'gemini-2.5-pro-preview-03-25'
generation_config = {"temperature": 0.2} # Lower temperature for more deterministic math answers
safety_settings = [ # Standard safety settings
  {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
  {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
  {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
  {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
]
# Single shared model handle used by get_single_answer below.
model = genai.GenerativeModel(
    model_name=MODEL_NAME,
    generation_config=generation_config,
    safety_settings=safety_settings
)
# --- End Configuration ---

def get_single_answer(query, prompt_template):
    """Gets an answer from the Gemini model for a single query.

    Args:
        query: The question text; substituted into ``prompt_template`` as ``{query}``.
        prompt_template: A ``str.format`` template containing a ``{query}`` placeholder.

    Returns:
        The model's response text on success. On any failure a string starting
        with ``"Error:"`` is returned instead of raising, so callers always get a str.
    """
    prompt = prompt_template.format(query=query)
    try:
        print(f"\n--- Sending Prompt for Query: {query[:80]}... ---") # Print start of query
        response = model.generate_content(prompt)
        print("--- Received Response ---")

        if response.parts:
            return response.text

        # No text parts came back: report why, if the response says so.
        print("Warning: Received empty response parts.")
        try:
            finish_reason = response.candidates[0].finish_reason
        except (IndexError, AttributeError):
            finish_reason = None # Details aren't available

        if finish_reason is not None:
            print(f"Finish Reason: {finish_reason}")
            # finish_reason is presumably an SDK enum member rather than a str, so a
            # direct == 'SAFETY' comparison would never match; compare by name
            # (falls back to str() so a plain string value still works).
            reason_name = getattr(finish_reason, "name", str(finish_reason))
            if reason_name == 'SAFETY':
                return "Error: Response blocked due to safety settings."
        return "Error: Received empty response from API."

    except Exception as e:
        # Deliberately broad: one failed question must not abort the whole evaluation run.
        print(f"!!! API Call Error: {e}")
        # Check if the error is due to safety settings (common)
        if "response was blocked" in str(e).lower():
            return "Error: Response blocked due to safety settings."
        return f"Error: API call failed. {e}"


def extract_boxed_answer(text):
    r"""Extracts the numerical answer from the \boxed{...} format.

    Args:
        text: The model response. Anything that is not a str yields ``None``.

    Returns:
        An ``int`` (no decimal point) or ``float`` (decimal point present), or
        ``None`` when no number can be recovered.

    Behaviour:
        * A ``\boxed{number}`` match wins; thousands separators ("1,234") are accepted.
        * Otherwise the last number in the text is used (less reliable, a warning
          is printed) -- except for the ``"Error: ..."`` strings produced by the
          API helpers, whose digits (status codes etc.) are not answers.
    """
    if not isinstance(text, str):
        return None

    # Either a properly comma-grouped number (1,234,567) or a plain digit run,
    # optionally followed by a decimal part. Groups are non-capturing so that
    # re.findall returns whole matches.
    number_pattern = r"-?(?:\d{1,3}(?:,\d{3})+|\d+)\.?\d*"

    def to_number(raw):
        cleaned = raw.replace(",", "")
        return float(cleaned) if '.' in cleaned else int(cleaned)

    # Search for the pattern \boxed{number}
    match = re.search(r"\\boxed\{\s*(" + number_pattern + r")\s*\}", text)
    if match:
        try:
            return to_number(match.group(1))
        except ValueError:
            print(f"Warning: Could not convert extracted boxed value '{match.group(1)}' to number.")
            return None

    # Error strings from the API layer must never be mined for a "last number".
    if text.startswith("Error:"):
        return None

    # Fallback: Look for final number if \boxed{} is missing (less reliable)
    numbers = re.findall(number_pattern, text)
    if numbers:
        last_num_str = numbers[-1]
        print(f"Warning: \\boxed{{}} not found, using last number found: {last_num_str}")
        try:
            return to_number(last_num_str)
        except ValueError:
            print(f"Warning: Could not convert last number '{last_num_str}' to number.")
            return None
    return None


# --- Load GSM8K Dataset ---
print("Loading GSM8K dataset from Hugging Face...")
try:
    # Load the 'main' configuration which contains train and test splits.
    # trust_remote_code is intentionally NOT passed: it permits executing code shipped
    # with a dataset repo, and this dataset is served as plain parquet files.
    gsm8k_dataset = load_dataset("openai/gsm8k", "main")
    # Use the test split for evaluation
    gsm8k_test = gsm8k_dataset['test']
    print(f"Loaded {len(gsm8k_test)} examples from the test split.")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please ensure the 'datasets' library is installed (`pip install datasets`) and you have internet access.")
    raise # Stop execution if dataset fails to load

# --- Evaluation Setup ---
# Number of test questions to evaluate (taken from the start of the test split).
SAMPLE_SIZE = 100
# Clamp to the split size so an oversized SAMPLE_SIZE evaluates the whole split
# instead of failing on out-of-range indices.
evaluation_subset = gsm8k_test.select(range(min(SAMPLE_SIZE, len(gsm8k_test))))

# Prompt template based on the paper's Appendix (Figure 15).
# `{query}` is filled by str.format; the doubled braces render as literal `{answer}`.
gsm8k_prompt_template = (
    "Can you solve the following math problem? {query}\n\n"
    "Explain your reasoning step-by-step. Your final answer should be a single numerical number, "
    "in the form \\boxed{{answer}}, at the end of your response."
)


# --- Run Inference and Evaluation ---
results = []       # one dict per evaluated question (skipped questions are not recorded)
correct_count = 0

for item in evaluation_subset:
    query = item["question"]
    # Extract the true answer from the 'answer' field (it has reasoning + #### number).
    # Commas have to be part of the capture, otherwise the .replace(",", "") below
    # has nothing to strip and "1,000" would be read as 1.
    true_answer_match = re.search(r"####\s*(-?\d[\d,]*\.?\d*)", item["answer"])
    if not true_answer_match:
        print(f"Warning: Could not parse true answer from: {item['answer']}")
        continue # Skip this question if we can't get the ground truth

    try:
        expected_answer_str = true_answer_match.group(1).replace(",", "") # Remove commas
        if '.' in expected_answer_str:
            expected_answer = float(expected_answer_str)
        else:
            expected_answer = int(expected_answer_str)
    except ValueError:
        print(f"Warning: Could not convert expected answer '{true_answer_match.group(1)}' to number.")
        continue

    model_response_text = get_single_answer(query, gsm8k_prompt_template)
    extracted_answer = extract_boxed_answer(model_response_text)

    print(f"Query: {query}")
    print(f"Expected Answer: {expected_answer}")
    # Guard the slice so a non-str response cannot break the progress log.
    response_snippet = model_response_text[:300] if isinstance(model_response_text, str) else model_response_text
    print(f"Model Raw Response Snippet:\n{response_snippet}...") # Print snippet
    print(f"Extracted Answer: {extracted_answer}")

    # Use tolerance for potential floating point differences
    is_correct = extracted_answer is not None and abs(extracted_answer - expected_answer) < 1e-6
    if is_correct:
        correct_count += 1

    print(f"Correct?: {is_correct}")
    print("-" * 20)

    results.append({
        "query": query,
        "expected": expected_answer,
        "response": model_response_text,
        "extracted": extracted_answer,
        "correct": is_correct
    })

# --- Calculate Performance ---
# Accuracy is measured over the questions that were actually evaluated; questions
# skipped in the loop above never reach `results`.
total_questions = len(results)
if total_questions > 0:
    accuracy = (correct_count / total_questions) * 100
else:
    accuracy = 0

print(f"\n--- GSM8K Evaluation Summary (Sample Size: {total_questions}) ---")
print(f"Correct Answers: {correct_count}")
print(f"Accuracy: {accuracy:.2f}%")

# Inspect `results` in a separate cell for the per-question details.

Gemini API Key configured.
Loading GSM8K dataset from Hugging Face...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Loaded 1319 examples from the test split.

--- Sending Prompt for Query: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning an... ---
--- Received Response ---
Query: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Expected Answer: 18
Model Raw Response Snippet:
Here is the step-by-step solution:

1.  **Find the total number of eggs Janet uses each day:** Janet uses eggs for breakfast and for baking.
    *   Eggs for breakfast: 3
    *   Eggs for baking: 4
    *   Total eggs used = 3 + 4 = 7 eggs.

2.  **Find the number of eggs remaining to be sold:** Subtr...
Extracted Answer: 18
Correct?: True
--------------------

--- Sending Prompt for Query: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bol... -

In [None]:
import os
import re
import time
import subprocess # For running curl
import json       # For handling JSON payloads/responses
import random     # For potential backoff jitter
from collections import Counter # For majority voting
from datasets import load_dataset
from tqdm.notebook import tqdm # For progress bar in notebooks

# ----------------------------------------------------------------------------
#                        API KEY CONFIGURATION
# ----------------------------------------------------------------------------
print("--- Configuring API Keys ---")
# NOTE(review): this cell previously embedded live API keys in the notebook source.
# Those keys must be treated as compromised and revoked; keys are now only ever read
# from Colab Secrets or environment variables.
keys = {}
try:
    # Try loading from Colab Secrets first
    from google.colab import userdata
except ImportError:
    # Fallback to environment variables
    print("Warning: Not running in Colab. Attempting to use environment variables.")
    keys["gemini"] = os.environ.get('GEMINI_API_KEY')
    keys["openai"] = os.environ.get('OPENAI_API_KEY')
    if not keys["gemini"] or not keys["openai"]:
        raise ValueError("Required API keys (Gemini, OpenAI) not found in environment variables.")
    print("Successfully loaded required API keys from environment variables.")
else:
    try:
        keys["gemini"] = userdata.get('GEMINI_API_KEY')
        keys["openai"] = userdata.get('OPENAI_API_KEY')
    except userdata.SecretNotFoundError as e:
        print(f"ERROR: Secret not found: {e}. Please check secret names and Notebook Access toggle.")
        raise
    except Exception as e:
        print(f"Error loading API keys: {e}")
        raise ValueError("Failed to load API keys.") from e

    print("API Keys fetched from Colab Secrets.")
    # Validated outside the try block so this ValueError reaches the user unchanged.
    if not keys["gemini"] or not keys["openai"]:
        print("ERROR: Ensure both GEMINI_API_KEY and OPENAI_API_KEY secrets exist and have Notebook Access enabled.")
        raise ValueError("Required API keys not found/loaded from Colab Secrets.")
    print("Successfully loaded required API keys.")

# --- NO CLIENT INITIALIZATION NEEDED IF BOTH USE CURL ---
print("API Keys loaded. Clients will be called via curl/subprocess.")

# ----------------------------------------------------------------------------
#                            MODEL CONFIGURATION
# ----------------------------------------------------------------------------
# Model used for each provider's agent.
model_config = {
    "gemini": "gemini-2.5-pro-preview-03-25",
    "openai": "gpt-4-turbo",
}

# Provider-neutral generation parameters; translated into each provider's own
# field names below before being placed in the JSON request body.
generation_config = {"temperature": 0.1, "max_tokens": 2048}

# Gemini request fragments: safety thresholds and generation settings
# (Gemini calls the output limit 'maxOutputTokens').
gemini_safety_settings_payload = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
]
gemini_generation_config_payload = {
    "temperature": generation_config["temperature"],
    "maxOutputTokens": generation_config["max_tokens"],
}

# OpenAI request fragment: same values, OpenAI's field names ('max_tokens').
openai_generation_config_payload = {
    field: generation_config[field] for field in ("temperature", "max_tokens")
}


# ----------------------------------------------------------------------------
#               AGENT CLASS (Using curl for BOTH Agents)
# ----------------------------------------------------------------------------
class Agent:
    """Represents an LLM agent using curl/subprocess for API calls.

    Attributes:
        agent_id: Identifier used in prompts and log lines.
        provider: "gemini" or "openai"; selects endpoint, payload shape and parsing.
        model_name: Looked up in the module-level ``model_config``.
        api_key: Looked up in the module-level ``keys``.
        current_response: The agent's most recent reply (or "Error: ..." string).
        history: List of ``{"round": int, "response": str}`` dicts, one per call.
    """
    def __init__(self, agent_id, provider):
        """Binds the agent to a provider.

        Raises:
            ValueError: if no API key or no model name is configured for ``provider``.
        """
        self.agent_id = agent_id
        self.provider = provider
        self.model_name = model_config.get(provider)
        self.api_key = keys.get(provider) # Get the key for this agent's provider
        self.current_response = ""
        self.history = []

        if not self.api_key:
            raise ValueError(f"{provider.capitalize()} API Key missing for Agent {agent_id}")
        if not self.model_name:
            raise ValueError(f"Model name for provider '{provider}' is missing.")

    def _build_request(self, prompt):
        """Returns ``(curl_command, json_payload)`` for this provider, or None if unsupported."""
        if self.provider == "gemini":
            data_payload = {
                "contents": [{"parts": [{"text": prompt}]}],
                "generationConfig": gemini_generation_config_payload,
                "safetySettings": gemini_safety_settings_payload
            }
            # Use v1beta endpoint for API key auth
            url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_name}:generateContent?key={self.api_key}"
            headers = ['-H', 'Content-Type: application/json']
        elif self.provider == "openai":
            # Use v1/chat/completions endpoint for generation
            data_payload = {
                "model": self.model_name,
                "messages": [{"role": "user", "content": prompt}],
                **openai_generation_config_payload
            }
            url = "https://api.openai.com/v1/chat/completions"
            headers = [
                '-H', 'Content-Type: application/json',
                '-H', f"Authorization: Bearer {self.api_key}", # Use Bearer token auth
            ]
        else:
            return None

        # -sS: no progress meter, but still report errors on stderr.
        # --data-binary @-: the body is read from stdin, so long debate prompts are
        # not limited by the OS command-line length.
        curl_command = ['curl', '-sS', url, *headers, '-X', 'POST', '--data-binary', '@-']
        return curl_command, json.dumps(data_payload)

    def _interpret_response(self, response_data):
        """Maps a decoded API response to the reply text or an "Error: ..." string.

        Raises:
            TypeError, KeyError, IndexError, AttributeError: if the JSON does not
                have the expected overall shape; ``_call_api`` treats these as a
                retryable structure error.
        """
        if not isinstance(response_data, dict):
            raise TypeError(f"expected a JSON object, got {type(response_data).__name__}")

        # Check for API errors in response JSON (returned as-is, never retried)
        if "error" in response_data:
            error_info = response_data["error"]
            if isinstance(error_info, dict):
                err_msg = error_info.get('message', 'Unknown API error')
            else:
                err_msg = str(error_info)
            print(f"!!! Agent {self.agent_id} ({self.provider}) API Error JSON: {err_msg}")
            return f"Error: API returned error: {err_msg}".strip()

        if self.provider == "gemini":
            candidates = response_data.get("candidates") or [{}]
            first_candidate = candidates[0]
            parts = (first_candidate.get("content") or {}).get("parts") or []
            # A reply may be split across several parts; join every text part
            # instead of keeping only the first one. Parts flagged as "thought"
            # (reasoning summaries, if the API returns any) are not answer text.
            text = "".join(
                part["text"] for part in parts
                if isinstance(part, dict)
                and isinstance(part.get("text"), str)
                and not part.get("thought")
            )
            if text.strip():
                print(f"--- Agent {self.agent_id} ({self.provider}) received response (curl) ---")
                return text.strip() # SUCCESS

            # Handle missing content/safety blocks
            finish_reason = first_candidate.get("finishReason", "UNKNOWN")
            api_result = f"Error: No content (Gemini Finish Reason: {finish_reason})"
            if finish_reason == 'SAFETY':
                api_result = "Error: Response blocked by safety settings."
            print(f"Warning: Agent {self.agent_id} ({self.provider}) - {api_result}")
            return api_result

        # Only "openai" can reach this point: unsupported providers are rejected
        # before any request is sent.
        choices = response_data.get("choices") or [{}]
        message = choices[0].get("message") or {}
        content = message.get("content")
        if isinstance(content, str):
            print(f"--- Agent {self.agent_id} ({self.provider}) received response (curl) ---")
            return content.strip() # SUCCESS
        # Missing or null content is reported, not crashed on.
        print(f"Warning: Agent {self.agent_id} ({self.provider}) - Unexpected OpenAI response structure.")
        return "Error: Unexpected OpenAI response structure."

    def _call_api(self, prompt):
        """Calls the appropriate API via curl based on the agent's provider.

        Makes up to two attempts. Transport failures (non-zero curl exit, empty
        output, timeout after 180 s, undecodable or malformed JSON) are retried;
        a well-formed API answer -- including an API-level error -- is returned
        immediately.

        Returns:
            The reply text, or a string starting with "Error:"; never raises for
            request failures.
        """
        print(f"\n--- Agent {self.agent_id} ({self.provider}) sending prompt via curl ---")
        request = self._build_request(prompt)
        if request is None:
            return f"Error: Unsupported provider '{self.provider}'" # Should not happen if init checks pass
        curl_command, json_payload = request

        api_result = f"Error: Call failed for {self.provider} (Initial State)."
        max_retries = 2
        for attempt in range(max_retries):
            try:
                print(f"Executing curl for Agent {self.agent_id} ({self.provider}, Attempt {attempt+1})...")
                result = subprocess.run(
                    curl_command, input=json_payload, capture_output=True,
                    text=True, check=False, timeout=180,
                )

                if result.returncode == 0 and result.stdout:
                    try:
                        return self._interpret_response(json.loads(result.stdout))
                    except json.JSONDecodeError:
                        print(f"!!! Agent {self.agent_id} ({self.provider}) - Failed JSON decode (curl): {result.stdout[:500]}")
                        api_result = "Error: Failed API JSON decode."
                    except (KeyError, IndexError, TypeError, AttributeError) as e:
                        print(f"!!! Agent {self.agent_id} ({self.provider}) - Error parsing JSON structure: {e} | Response: {result.stdout[:500]}")
                        api_result = "Error: Unexpected API JSON structure."
                else: # curl command itself failed
                    print(f"!!! Agent {self.agent_id} ({self.provider}) - Curl command failed.")
                    print(f"Exit Code: {result.returncode}")
                    print(f"STDERR: {result.stderr[:500]}")
                    api_result = f"Error: Curl command failed (Code: {result.returncode})."

            except subprocess.TimeoutExpired:
                print(f"!!! Agent {self.agent_id} ({self.provider}) Curl command timed out (Attempt {attempt+1}).")
                api_result = f"Error: Curl command timed out for {self.provider}."
            except Exception as e:
                # Deliberately broad: a failed call must surface as an error string
                # so the debate loop can continue.
                print(f"!!! Agent {self.agent_id} ({self.provider}) Outer Error (Attempt {attempt+1}): {e}")
                api_result = f"Error: Exception during API call for {self.provider}. Details: {e}"

            if attempt < max_retries - 1:
                time.sleep(1) # brief pause before the retry

        return api_result.strip() # Return error after all retries fail

    def generate_initial_response(self, query, initial_prompt_template):
        """Produces the round-0 answer.

        ``initial_prompt_template`` is formatted with ``query``, ``agent_id`` and
        ``provider``; the reply is stored in ``current_response`` and appended to
        ``history`` under round 0.
        """
        prompt = initial_prompt_template.format(query=query, agent_id=self.agent_id, provider=self.provider)
        self.current_response = self._call_api(prompt)
        self.history.append({"round": 0, "response": self.current_response})
        print(f"Agent {self.agent_id} ({self.provider}) initial response generated.")
        return self.current_response

    def generate_debate_response(self, query, other_responses_dict, debate_prompt_template, round_num):
        """Produces this agent's answer for debate round ``round_num``.

        Args:
            query: The original problem text.
            other_responses_dict: ``{(agent_id, provider): response_text}`` for the
                other agents' previous-round replies; may be empty.
            debate_prompt_template: ``str.format`` template using ``agent_id``,
                ``provider``, ``round_num``, ``prev_round_num``, ``original_query``,
                ``my_previous_response`` and ``other_agent_responses``.
            round_num: The round being generated (previous round is ``round_num - 1``).

        Returns:
            The new reply, also stored in ``current_response`` and ``history``.
        """
        if other_responses_dict:
            formatted_other_responses = "\n\n".join(
                f"--- Response from Agent {other_id} (Provider: {other_provider}) ---\n{resp}"
                for (other_id, other_provider), resp in other_responses_dict.items()
            )
        else:
            formatted_other_responses = "No other agent responses were provided."

        # str.format cannot evaluate "round_num - 1" inside a placeholder, so the
        # previous round number is passed as its own field.
        prompt = debate_prompt_template.format(
            agent_id=self.agent_id,
            provider=self.provider,
            round_num=round_num,
            prev_round_num=round_num - 1,
            original_query=query,
            my_previous_response=self.current_response,
            other_agent_responses=formatted_other_responses
        )
        self.current_response = self._call_api(prompt)
        self.history.append({"round": round_num, "response": self.current_response})
        print(f"Agent {self.agent_id} ({self.provider}) debate response generated for round {round_num}.")
        return self.current_response

# --- Multi-Agent Debate Function (with Early Stopping) ---
def run_multi_agent_debate(query, agent_providers, num_rounds=2):
    """
    Manages the multi-agent debate process, stopping early if agents converge.

    Each provider gets one agent (ids start at 1). Round 0 collects independent
    answers; in every later round each agent sees its own previous answer plus
    the other agents' previous answers. After each debate round the boxed
    answers are compared and the debate stops once all agents agree.

    Args:
        query (str): The debate topic/question.
        agent_providers (list): List of provider strings, e.g. ["gemini", "openai"].
        num_rounds (int): Maximum number of debate rounds after the initial response.
    Returns:
        dict: Dictionary of final responses {(agent_id, provider): response_text}
              from the last completed round (could be before num_rounds if converged early).
              Returns None if initialization fails.
    """
    num_agents = len(agent_providers)
    if num_agents == 0:
        return None
    print("="*40 + f"\nStarting {num_agents}-Agent Debate (Providers: {agent_providers})\nQuery: {query[:100]}...\nMax Rounds: {num_rounds}\n" + "="*40)

    # --- Prompt Templates ---
    initial_prompt = (
        "You are Agent {agent_id}, an AI assistant ({provider}). Solve the math problem: {query}\n"
        "Explain step-by-step. Final answer in \\boxed{{number}} format."
    )
    # {prev_round_num} is supplied by Agent.generate_debate_response (round_num - 1).
    debate_prompt = (
        "You are Agent {agent_id} ({provider}) in Round {round_num} of a multi-agent debate about a math problem.\n"
        "Original Problem: {original_query}\n\n"
        "Your Previous Response (Round {prev_round_num}):\n"
        "--- START ---\n{my_previous_response}\n--- END ---\n\n"
        "Other Agent's Response (Round {prev_round_num}):\n"
        "--- START ---\n{other_agent_responses}\n--- END ---\n\n"
        "**Instructions for This Round ({round_num}):**\n"
        "1. Review both responses. Identify any errors in calculation or reasoning steps.\n"
        "2. Provide an updated step-by-step reasoning and calculation.\n"
        "3. State your final numerical answer clearly in the format \\boxed{{answer}} at the very end.\n\n"
        "Your Updated Reasoning and Final Answer for Round {round_num}:"
    )
    # --- End Prompt Templates ---

    # --- Initialize Agents ---
    agents = {}
    for agent_id, provider in enumerate(agent_providers, start=1):
        # Every provider is called through curl with an API key, so the key is the
        # only readiness requirement (there is no per-provider client object).
        if not keys.get(provider):
            print(f"ERROR: Config for Provider '{provider}' not ready. Stopping.")
            break

        try:
            agent = Agent(agent_id=agent_id, provider=provider)
        except ValueError as e:
            print(f"Error init Agent {agent_id}: {e}")
            break
        agents[agent_id] = agent
        print(f"Initialized Agent {agent_id} with provider: {provider} using model: {agent.model_name}")

    if len(agents) != len(agent_providers):
        print("ERROR: Could not initialize all required agents.")
        return None
    print(f"\nInitialized {len(agents)} agents successfully.")

    # --- Round 0: Initial Responses ---
    print("\n--- ROUND 0: Initial ---")
    agent_responses_round_0 = {}
    for agent_id, agent in agents.items():
        response = agent.generate_initial_response(query, initial_prompt)
        agent_responses_round_0[(agent_id, agent.provider)] = response
        print(f"Agent {agent_id} ({agent.provider}) Initial Snippet: {response[:100]}...")
    all_round_responses = {0: agent_responses_round_0}
    last_completed_round = 0

    # --- Debate Rounds ---
    for round_num in range(1, num_rounds + 1):
        print(f"\n--- ROUND {round_num}: Debate ---")
        previous_round_responses = all_round_responses[round_num - 1]
        current_round_responses = {} # Store responses for *this* round

        for agent_id, agent in agents.items():
            # Everyone's previous-round answer except this agent's own.
            other_responses_for_agent = {k: r for k, r in previous_round_responses.items() if k[0] != agent_id}
            response = agent.generate_debate_response(query, other_responses_for_agent, debate_prompt, round_num)
            current_round_responses[(agent_id, agent.provider)] = response
            print(f"Agent {agent_id} ({agent.provider}) Round {round_num} Snippet: {response[:100]}...")

        all_round_responses[round_num] = current_round_responses
        last_completed_round = round_num # Update last completed round

        # --- Early Convergence Check ---
        print(f"\nChecking for convergence after Round {round_num}...")
        extracted_answers_this_round = []
        all_agents_answered = True
        for (answering_id, answering_provider), resp_text in current_round_responses.items():
            answer = extract_boxed_answer(resp_text)
            if answer is None:
                all_agents_answered = False
                print(f"Warning: Agent {answering_id}({answering_provider}) no answer in R{round_num}.")
                break # Cannot converge if an agent failed to answer properly
            extracted_answers_this_round.append(answer)

        if all_agents_answered and len(extracted_answers_this_round) == len(agents):
            first_answer = extracted_answers_this_round[0]
            if all(abs(ans - first_answer) < 1e-6 for ans in extracted_answers_this_round):
                print(f"--- CONVERGENCE DETECTED in Round {round_num}! Stopping early. ---")
                break # Exit the rounds loop
        # --- End Convergence Check ---

    # --- Final Output ---
    print("\n--- DEBATE COMPLETE ---")
    # Use responses from the actual last completed round
    final_responses = all_round_responses[last_completed_round]
    print(f"Final Responses from Each Agent (After Round {last_completed_round})")
    # (Printing individual final responses happens outside this function in the eval loop)

    return final_responses # Return dict from the last completed round

# --- Answer Extraction Function ---
def extract_boxed_answer(text):
    r"""Extracts the numeric answer from a response, preferring \boxed{number}.

    Args:
        text: Response text; ``None`` or any non-str value yields ``None``.

    Returns:
        ``int`` when the number has no decimal point, ``float`` otherwise, or
        ``None`` if nothing usable is found.

    A ``\boxed{...}`` number is used when present (thousands separators such as
    "1,234" are accepted). Otherwise the last number in the text is used, except
    for the "Error: ..." strings returned by the API layer: digits in those
    (exit codes, HTTP statuses) are not answers.
    """
    if text is None or not isinstance(text, str):
        return None

    # Comma-grouped number (1,234,567) or a plain digit run, with an optional
    # decimal part. Requiring a leading digit prevents a bare "," from matching,
    # and requiring 3-digit groups keeps lists like "1,2,3" as separate numbers.
    number_pattern = r"-?(?:\d{1,3}(?:,\d{3})+|\d+)\.?\d*"

    def to_number(raw):
        cleaned = raw.replace(",", "")
        return float(cleaned) if '.' in cleaned else int(cleaned)

    match = re.search(r"\\boxed\{\s*(" + number_pattern + r")\s*\}", text)
    if match:
        try:
            return to_number(match.group(1))
        except ValueError:
            return None

    if text.startswith("Error:"):
        return None

    numbers = re.findall(number_pattern, text)
    if numbers:
        try:
            return to_number(numbers[-1])
        except ValueError:
            return None
    return None

# --- Aggregation and Convergence Check Function ---
def get_final_answer_from_debate(final_responses):
    """
    Aggregate the agents' final-round responses into a single answer.

    Resolution order:
      1. "Converged": every agent produced an extractable answer and all of
         them agree within 1e-6.
      2. "Majority": among the extractable answers there is only one distinct
         value, or one value occurs more often than the runner-up.
      3. "Defaulted to Agent 1": otherwise, the answer of the agent whose id
         is 1 (whatever its provider).

    Args:
        final_responses (dict): Dictionary {(agent_id, provider): response_text}

    Returns:
        tuple: (final_answer, status)
               final_answer can be a number or None.
               status is one of "Converged", "Majority", "Defaulted to Agent 1",
               "No Consensus: Agent 1 Failed",
               "No Consensus: No valid answers extracted",
               "Error: No final responses provided.".
    """
    if not final_responses:
        return None, "Error: No final responses provided."

    # Extracted answer (or None) for every agent key.
    extracted_answers_map = {}
    agent1_answer = None

    print("\n--- Aggregating Final Answers ---")
    for agent_key, response_text in final_responses.items():
        agent_id, provider = agent_key
        answer = extract_boxed_answer(response_text)
        extracted_answers_map[agent_key] = answer
        if answer is not None:
            print(f"Agent {agent_id} ({provider}) extracted answer: {answer}")
        else:
            print(f"Warning: Could not extract answer from Agent {agent_id} ({provider}) final response.")
        # Remember Agent 1's answer for the fallback. Keyed on the id only, so
        # the fallback still works when Agent 1 is not the Gemini agent.
        if agent_id == 1:
            agent1_answer = answer

    valid_extracted_answers = [ans for ans in extracted_answers_map.values() if ans is not None]

    if not valid_extracted_answers:
        print("Aggregation Result: No valid answers extracted from any agent.")
        return None, "No Consensus: No valid answers extracted"

    # 1. Convergence: only when *all* agents gave a valid, identical answer.
    if len(valid_extracted_answers) == len(final_responses):
        first_answer = valid_extracted_answers[0]
        if all(abs(ans - first_answer) < 1e-6 for ans in valid_extracted_answers):
            print("Aggregation Result: Converged on a single answer.")
            return first_answer, "Converged"

    print("Aggregation Result: Agents did not converge on a single answer.")

    # 2. Majority (more relevant for >2 agents, but also covers the case where
    #    only one distinct answer was extracted at all).
    most_common = Counter(valid_extracted_answers).most_common(2)
    if len(most_common) == 1:
        majority_answer = most_common[0][0]
        print(f"Majority Check: Only one unique answer found ({majority_answer}). Treating as majority.")
        return majority_answer, "Majority"
    if most_common[0][1] > most_common[1][1]:
        # The top answer beats the runner-up (a plurality, not necessarily >50%).
        majority_answer = most_common[0][0]
        print(f"Majority Check: Found clear majority answer: {majority_answer}")
        return majority_answer, "Majority"

    # 3. No convergence and no majority: fall back to Agent 1.
    print("Majority Check: No clear majority found.")
    print("Defaulting to Agent 1's answer...")
    if agent1_answer is not None:
        print(f"Using Agent 1's answer: {agent1_answer}")
        return agent1_answer, "Defaulted to Agent 1"
    print("Fallback Failed: Agent 1 did not provide a valid answer.")
    return None, "No Consensus: Agent 1 Failed"
# --- GSM8K Evaluation Loop ---
print("\n--- Starting GSM8K Evaluation ---")
try:
    gsm8k_dataset = load_dataset("openai/gsm8k", "main", trust_remote_code=True)
    gsm8k_test = gsm8k_dataset['test']
    print(f"Loaded {len(gsm8k_test)} GSM8K test examples.")
except Exception as e:
    # Report the failure, then let the original exception propagate.
    print(f"Error loading dataset: {e}")
    raise

# Evaluate on the first SAMPLE_SIZE test questions.
SAMPLE_SIZE = 100
evaluation_subset = gsm8k_test.select(range(SAMPLE_SIZE))

# Per-question records and running counters filled in by the evaluation loop.
results = []
correct_count = 0
convergence_count = 0

# Provider for each debating agent, in agent order.
providers_in_debate = ["gemini", "openai"]

# Run only when the Gemini key and the OpenAI client entry are both present
# and truthy (`keys` and `clients` are defined elsewhere in the notebook).
if "gemini" in keys and keys["gemini"] and "openai" in clients and clients["openai"]:
    print(f"\n--- Evaluating on {SAMPLE_SIZE} GSM8K Samples ---")
    for item in tqdm(evaluation_subset, desc="Evaluating GSM8K"):
        query = item["question"]

        # The ground-truth number follows "####" in the reference solution.
        true_answer_match = re.search(r"####\s*(-?[\d,]+\.?\d*)", item["answer"])
        if true_answer_match is None:
            print(f"Skipping Q (GT parse fail): {query[:50]}...")
            continue
        try:
            expected_answer_str = true_answer_match.group(1).replace(",", "")
            if '.' in expected_answer_str:
                expected_answer = float(expected_answer_str)
            else:
                expected_answer = int(expected_answer_str)
        except ValueError:
            print(f"Skipping Q (GT value fail): {true_answer_match.group(1)}")
            continue

        print(f"\nProcessing Q: {query[:80]}... [Expected: {expected_answer}]")
        final_responses = run_multi_agent_debate(
            query=query,
            agent_providers=providers_in_debate,
            num_rounds=2,
        )
        aggregated_answer, debate_status = get_final_answer_from_debate(final_responses)

        print(f"\n--- Evaluation Result for Query ---")
        print(f"Debate Status: {debate_status}")
        print(f"Final Aggregated Answer: {aggregated_answer}")
        if debate_status == "Converged":
            convergence_count += 1

        # Correct when an answer exists and matches the ground truth within 1e-6.
        is_correct = (
            aggregated_answer is not None
            and abs(aggregated_answer - expected_answer) < 1e-6
        )
        if is_correct:
            correct_count += 1
        print(f"Correct?: {is_correct}")
        print("-" * 30)

        results.append({
            "query": query,
            "expected": expected_answer,
            "status": debate_status,
            "aggregated_answer": aggregated_answer,
            "correct": is_correct,
            "all_final_responses": final_responses,
        })
        # time.sleep(1)

    # --- Final Performance Calculation ---
    # Skipped questions never reach `results`, so they are excluded from the totals.
    total_evaluated = len(results)
    if total_evaluated > 0:
        accuracy = (correct_count / total_evaluated) * 100
        convergence_percentage = (convergence_count / total_evaluated) * 100
    else:
        accuracy = 0
        convergence_percentage = 0

    print(f"\n--- GSM8K Multi-Agent Debate FINAL SUMMARY ---")
    print(f"Total Questions Evaluated: {total_evaluated}")
    print(f"Correct Aggregated Answers: {correct_count}")
    print(f"Accuracy (Aggregated Answer): {accuracy:.2f}%")
    print(f"Convergence Rate (Exact Match): {convergence_count} / {total_evaluated} = {convergence_percentage:.2f}%")
else:
    print("Skipping evaluation because one or more required API clients/keys failed to initialize.")

# Optional: print a per-question results table (long cells truncated to 60 chars).
import pandas as pd
if results:
    df_results = pd.DataFrame(results)
    print("\nDetailed Results:")
    with pd.option_context('display.max_colwidth', 60):
        print(
            df_results[
                ['query', 'expected', 'status', 'aggregated_answer', 'correct']
            ].head(SAMPLE_SIZE)
        )

--- Configuring API Keys ---
API Keys fetched from Colab Secrets.
Successfully loaded required API keys.
API Keys loaded. Clients will be called via curl/subprocess.

--- Starting GSM8K Evaluation ---
Loaded 1319 GSM8K test examples.

--- Evaluating on 100 GSM8K Samples ---


Evaluating GSM8K:   0%|          | 0/100 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

--- Evaluation Result for Query ---
Debate Status: Converged
Final Aggregated Answer: 6
Correct?: True
------------------------------

Processing Q: I have 10 liters of orange drink that are two-thirds water and I wish to add it ... [Expected: 15]
Starting 2-Agent Debate (Providers: ['gemini', 'openai'])
Query: I have 10 liters of orange drink that are two-thirds water and I wish to add it to 15 liters of pine...
Max Rounds: 2
Initialized Agent 1 with provider: gemini using model: gemini-2.5-pro-preview-03-25
Initialized Agent 2 with provider: openai using model: gpt-4-turbo

Initialized 2 agents successfully.

--- ROUND 0: Initial ---

--- Agent 1 (gemini) sending prompt via curl ---
Executing curl for Agent 1 (gemini, Attempt 1)...
--- Agent 1 (gemini) received response (curl) ---
Agent 1 (gemini) initial response generated.
Agent 1 (gemini) Initial Snippet: Okay, let's break this down step-by-step:

1.  **Calculate th

In [None]:
import os
import re
import time
import subprocess # For running curl
import json       # For handling JSON payloads/responses
import random     # For potential backoff jitter
from datasets import load_dataset
from tqdm.notebook import tqdm # For progress bar in notebooks

# ----------------------------------------------------------------------------
#                           API KEY CONFIGURATION
# ----------------------------------------------------------------------------
# Resolve the Gemini API key used by the curl calls below.
print("--- Configuring API Key ---")
try:
    from google.colab import userdata
    # Read the secret through the Colab userdata API and bind it to
    # `gemini_api_key`, the name the rest of this cell uses. Previously this
    # branch wrote into an undefined `keys` dict and then tested a variable it
    # never assigned.
    gemini_api_key = userdata.get('GEMINI_API_KEY')
    if not gemini_api_key:
        raise ValueError("Key not found in Secrets")
    print("Gemini API Key fetched from Colab Secrets.")
except ImportError:
    # Not running in Colab: fall back to the process environment.
    print("Using environment variable for Gemini key.")
    gemini_api_key = os.environ.get('GEMINI_API_KEY')
    if not gemini_api_key:
        raise ValueError("Key not found in environment variables")
except ValueError as e:
    print(f"ERROR: {e}")
    raise
except Exception as e:
    # Covers e.g. a missing secret or denied notebook access; keep the cause chained.
    print(f"Error loading API keys: {e}")
    raise ValueError("Failed to load API keys.") from e

# ----------------------------------------------------------------------------
#                           MODEL CONFIGURATION
# ----------------------------------------------------------------------------
# Gemini model name, generation parameters, and safety thresholds for the request payload.
# Model queried by the single-agent MMLU run.
gemini_model_name = "gemini-1.5-pro-latest"

# Sent as "generationConfig" in the request body.
gemini_generation_config_payload = dict(temperature=0.3, maxOutputTokens=1024)

# Sent as "safetySettings": the same threshold for each listed harm category.
gemini_safety_settings_payload = [
    dict(category=harm_category, threshold="BLOCK_MEDIUM_AND_ABOVE")
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]



# ----------------------------------------------------------------------------
#                       API Call Function (REFACTORED)
# ----------------------------------------------------------------------------
def call_gemini_via_curl(prompt, model_name, api_key):
    """Call the Gemini ``generateContent`` REST endpoint through ``curl``.

    The request body combines ``prompt`` with the module-level
    ``gemini_generation_config_payload`` and ``gemini_safety_settings_payload``.

    Retry policy (at most 2 attempts, with a short jittered backoff):
      * retried: curl failure or empty output, timeout, undecodable JSON,
        unexpected JSON structure, any other exception;
      * not retried: an explicit API ``error`` object, a response with no text
        (e.g. blocked by safety), or success.

    Args:
        prompt: Prompt text sent as the single content part.
        model_name: Gemini model identifier placed in the URL.
        api_key: API key appended to the URL as the ``key`` query parameter.

    Returns:
        str: The stripped text of the first part of the first candidate, or a
        message starting with ``"Error:"`` describing the last failure. This
        function does not raise for API or transport failures.
    """
    print(f"\n--- Sending prompt to {model_name} via curl ---")
    # Default error message if all retries fail
    api_result = f"Error: Call failed for {model_name} after multiple attempts."
    max_retries = 2

    for attempt in range(max_retries):
        try:
            # --- Construct Payload and Command ---
            data_payload = {
                "contents": [{"parts": [{"text": prompt}]}],
                "generationConfig": gemini_generation_config_payload,
                "safetySettings": gemini_safety_settings_payload
            }
            json_payload = json.dumps(data_payload)
            # NOTE(review): the key travels in the URL on the curl command line,
            # so it is part of the child process's argv.
            url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent?key={api_key}"
            curl_command = ['curl', '-s', '-H', 'Content-Type: application/json', '-X', 'POST', '-d', json_payload, url]

            print(f"Executing curl (Attempt {attempt+1}/{max_retries})...")
            result = subprocess.run(curl_command, capture_output=True, text=True, check=False, timeout=180)

            # --- Process Curl Result ---
            if result.returncode == 0 and result.stdout:
                try:
                    response_data = json.loads(result.stdout)

                    # 1. Explicit API error: definitive, do not retry.
                    if "error" in response_data:
                        err_msg = response_data['error'].get('message', 'Unknown API error')
                        print(f"!!! API Error JSON: {err_msg}")
                        api_result = f"Error: API error: {err_msg}"
                        break

                    # `candidates` may be absent or an empty list (for example
                    # when the prompt itself was blocked); treat both alike
                    # and never index into an empty list.
                    candidates = response_data.get("candidates") or []
                    first_candidate = candidates[0] if candidates else {}
                    parts = (first_candidate.get("content") or {}).get("parts") or []

                    # 2. Successful content structure.
                    if parts:
                        api_result = parts[0]['text']
                        print("--- Received response (curl) ---")
                        break

                    # 3. No text returned: report why, do not retry.
                    finish_reason = first_candidate.get("finishReason", "UNKNOWN")
                    block_reason = (response_data.get("promptFeedback") or {}).get("blockReason")
                    if finish_reason == 'SAFETY':
                        api_result = "Error: Response blocked by safety."
                    elif not candidates and block_reason:
                        api_result = f"Error: Prompt blocked (Block Reason: {block_reason})."
                    else:
                        api_result = f"Error: No content (Finish Reason: {finish_reason})"
                    print(f"Warning: No text part found. Finish Reason: {finish_reason}")
                    break

                except json.JSONDecodeError:
                    print(f"!!! Failed JSON decode (curl): {result.stdout[:500]}")
                    api_result = "Error: Failed API JSON decode."
                except (KeyError, IndexError, TypeError, AttributeError) as e:
                    print(f"!!! Error parsing JSON structure: {e} | Response: {result.stdout[:500]}")
                    api_result = "Error: Unexpected API JSON structure."

            else: # curl exited non-zero or produced no output
                print(f"!!! Curl command failed. Exit Code: {result.returncode}")
                print(f"STDERR: {result.stderr[:500]}")
                api_result = f"Error: Curl failed (Code: {result.returncode})."

        except subprocess.TimeoutExpired:
            print(f"!!! Curl command timed out (Attempt {attempt+1}).")
            api_result = f"Error: Curl command timed out."
        except Exception as e:
            print(f"!!! Outer Error during API call (Attempt {attempt+1}): {e}")
            api_result = f"Error: Exception during API call. Details: {e}"

        # --- Decide whether to retry ---
        # Success and definitive failures `break` above, so reaching this point
        # means the attempt failed in a retriable way.
        if attempt < max_retries - 1:
            wait_time = (1.5 ** attempt) + random.uniform(0.1, 0.5) # Slightly gentler backoff
            print(f"Attempt {attempt+1} failed. Retrying in {wait_time:.2f}s...")
            time.sleep(wait_time)

    # Return the final result (either success text or last error message)
    return api_result.strip()
# ----------------------------------------------------------------------------
#            <<< NEW/MODIFIED: Answer Extraction Function (Boxed Letter) >>>
# ----------------------------------------------------------------------------
def extract_boxed_letter_answer(text):
    r"""
    Extract the final multiple-choice answer (A, B, C, or D) from the
    \boxed{<LETTER>} format.

    The match is case-insensitive and tolerates whitespace inside the braces.
    When the response contains several boxed letters, the LAST one is used,
    because the prompt asks for the boxed answer at the very end.

    Args:
        text: Model response; anything that is not a ``str`` yields None.

    Returns:
        The letter in uppercase, or None (after printing a warning) when no
        boxed letter is found.
    """
    if not isinstance(text, str):
        return None

    # Capture only a single letter A-D inside \boxed{...}
    boxed_letters = re.findall(r"\\boxed\{\s*([A-D])\s*\}", text, re.IGNORECASE)
    if boxed_letters:
        return boxed_letters[-1].upper()

    # Strict on purpose: no fallback to other answer formats.
    print(f"Warning: Could not extract \\boxed{{<LETTER>}} answer from response: {text[:100]}...")
    return None

# ----------------------------------------------------------------------------
#                           MMLU DATASET LOADING
# ----------------------------------------------------------------------------
# Load one MMLU subject and pick the split to evaluate on.
print("\n--- Loading MMLU Dataset ---")
MMLU_SUBJECT = "high_school_computer_science"
try:
    mmlu_dataset = load_dataset("cais/mmlu", MMLU_SUBJECT, trust_remote_code=True)
    # Prefer the 'test' split; use 'validation' when 'test' is missing or falsy.
    mmlu_eval_split = mmlu_dataset.get('test') or mmlu_dataset.get('validation')
    if not mmlu_eval_split:
        raise ValueError(f"Split not found for {MMLU_SUBJECT}")
    print(f"Loaded {len(mmlu_eval_split)} examples for MMLU subject: {MMLU_SUBJECT}")
except Exception as e:
    # Report the failure, then let the original exception propagate.
    print(f"Error loading MMLU dataset ({MMLU_SUBJECT}): {e}")
    raise


# ----------------------------------------------------------------------------
#                   MMLU EVALUATION LOOP (Single Agent via curl)
# ----------------------------------------------------------------------------
# Upper bound on the number of questions to evaluate.
SAMPLE_SIZE = 100
# Clamp to the split length so a smaller split (another subject, or the
# validation fallback) does not request indices past the end of the dataset.
evaluation_subset = mmlu_eval_split.select(range(min(SAMPLE_SIZE, len(mmlu_eval_split))))
# Per-question records and running count of correct answers.
results = []
correct_count = 0

# <<< MODIFIED: MMLU Prompt Template (Requesting Boxed Letter) >>>
# str.format template; placeholders: subject, question, option_A..option_D.
# The doubled braces render as a literal \boxed{<LETTER>} after formatting.
mmlu_prompt_template = "\n".join([
    "The following is a multiple-choice question about {subject}.",
    "Question: {question}",
    "Options:",
    "A) {option_A}",
    "B) {option_B}",
    "C) {option_C}",
    "D) {option_D}",
    "",  # blank line between the options and the instructions
    "Think step-by-step to determine the correct answer. "
    "After your reasoning, state your final choice clearly by enclosing the single capital letter "
    "(A, B, C, or D) in a box like this: \\boxed{{<LETTER>}}. Make sure this is the very last part of your response.",
])

print(f"\n--- Starting Single-Agent MMLU Evaluation (Sample Size: {SAMPLE_SIZE}) ---")
print(f"--- Using Model: {gemini_model_name} via curl ---")

# One API call per question; tqdm renders the progress bar.
for item in tqdm(evaluation_subset, desc=f"Evaluating MMLU ({MMLU_SUBJECT})"):
    question_text = item["question"]
    choices = item["choices"]
    correct_index = item["answer"]
    # The dataset stores the answer as an index; map 0, 1, 2, ... to 'A', 'B', 'C', ...
    correct_letter = chr(ord('A') + correct_index)

    # Fill the template with the question and its four options.
    prompt_for_api = mmlu_prompt_template.format(
        subject=MMLU_SUBJECT.replace("_", " "),
        question=question_text,
        option_A=choices[0],
        option_B=choices[1],
        option_C=choices[2],
        option_D=choices[3],
    )

    print(f"\nProcessing MMLU Question: {question_text[:80]}... [Correct: {correct_letter}]")

    # Query the model, then pull the boxed letter out of its reply.
    model_response_text = call_gemini_via_curl(prompt_for_api, gemini_model_name, gemini_api_key)
    extracted_answer_letter = extract_boxed_letter_answer(model_response_text)

    print(f"\n--- Evaluation Result for Query ---")
    print(f"Model Raw Response Snippet:\n{model_response_text[:300]}...")
    print(f"Extracted Boxed Letter: {extracted_answer_letter}")

    # A missing extraction counts as incorrect.
    is_correct = (
        extracted_answer_letter is not None
        and extracted_answer_letter == correct_letter
    )
    if is_correct:
        correct_count += 1

    print(f"Correct?: {is_correct}")
    print("-" * 30)

    record = {
        "question": question_text,
        "choices": choices,
        "expected_letter": correct_letter,
        "response": model_response_text,
        "extracted_answer_letter": extracted_answer_letter,
        "correct": is_correct,
    }
    results.append(record)
    # time.sleep(1)

# --- Final Performance Calculation (Remains the same) ---
total_evaluated = len(results)
# Guard against division by zero when nothing was evaluated.
if total_evaluated > 0:
    accuracy = (correct_count / total_evaluated) * 100
else:
    accuracy = 0

print(f"\n--- Single-Agent MMLU Evaluation FINAL SUMMARY ({MMLU_SUBJECT}) ---")
print(f"Model Tested: {gemini_model_name} (via curl)")
print(f"Total Questions Evaluated: {total_evaluated}")
print(f"Correct Answers: {correct_count}")
print(f"Accuracy: {accuracy:.2f}%")

# Optional: print a per-question results table (long cells truncated to 40 chars).
import pandas as pd
if results:
    df_results = pd.DataFrame(results)
    print("\nDetailed Results:")
    with pd.option_context('display.max_colwidth', 40):
        print(
            df_results[
                ['question', 'expected_letter', 'extracted_answer_letter', 'correct']
            ].head(SAMPLE_SIZE)
        )

--- Configuring API Key ---
Gemini API Key fetched from Colab Secrets.

--- Loading MMLU Dataset ---
Loaded 100 examples for MMLU subject: high_school_computer_science

--- Starting Single-Agent MMLU Evaluation (Sample Size: 100) ---
--- Using Model: gemini-1.5-pro-latest via curl ---


Evaluating MMLU (high_school_computer_science):   0%|          | 0/100 [00:00<?, ?it/s]


Processing MMLU Question: Let x = 1. What is x << 3 in Python 3?... [Correct: C]

--- Sending prompt to gemini-1.5-pro-latest via curl ---
Executing curl (Attempt 1/2)...
--- Received response (curl) ---

--- Evaluation Result for Query ---
Model Raw Response Snippet:
The << operator is the left bit shift operator.  It shifts the bits of the left operand to the left by the number of positions specified by the right operand.

x = 1. In binary, this is 0001.

x << 3 means we shift the bits of x (0001) three places to the left.

0001 becomes 1000.

1000 in binary is...
Extracted Boxed Letter: C
Correct?: True
------------------------------

Processing MMLU Question: In Python 3, which of the following function convert a string to an int in pytho... [Correct: A]

--- Sending prompt to gemini-1.5-pro-latest via curl ---
Executing curl (Attempt 1/2)...
--- Received response (curl) ---

--- Evaluation Result for Query ---
Model Raw Response Snippet:
We are looking for a function that convert

In [None]:
# ----------------------------------------------------------------------------
#                              INSTALLATIONS
# ----------------------------------------------------------------------------
# !pip install --upgrade datasets tqdm # openai library not needed if using curl

# ----------------------------------------------------------------------------
#                                 IMPORTS
# ----------------------------------------------------------------------------
import os
import re
import time
import subprocess # For running curl
import json       # For handling JSON payloads/responses
import random     # For potential backoff jitter
from collections import Counter # For majority voting
from datasets import load_dataset
from tqdm.notebook import tqdm # For progress bar in notebooks
# import torch # Only needed if using transformers library

# ----------------------------------------------------------------------------
#                        API KEY CONFIGURATION
# ----------------------------------------------------------------------------
print("--- Configuring API Keys ---")
# Provider name -> API key; read by Agent.__init__ via keys.get(provider).
keys = {}
try:
    from google.colab import userdata
    # Read the secrets through the Colab userdata API, so the message below and
    # the SecretNotFoundError handler further down actually apply.
    keys["gemini"] = userdata.get('GEMINI_API_KEY')
    keys["openai"] = userdata.get('OPENAI_API_KEY')
    print("API Keys fetched from Colab Secrets.")
    if not keys["gemini"] or not keys["openai"]:
         print("ERROR: Ensure both GEMINI_API_KEY and OPENAI_API_KEY secrets exist and have Notebook Access enabled.")
         raise ValueError("Required API keys not found/loaded from Colab Secrets.")
    print("Successfully loaded required API keys.")

except ImportError:
    # Fallback to environment variables
    print("Warning: Not running in Colab. Attempting to use environment variables.")
    keys["gemini"] = os.environ.get('GEMINI_API_KEY')
    keys["openai"] = os.environ.get('OPENAI_API_KEY')
    if not keys["gemini"] or not keys["openai"]:
        raise ValueError("Required API keys (Gemini, OpenAI) not found in environment variables.")
    print("Successfully loaded required API keys from environment variables.")

except userdata.SecretNotFoundError as e:
     print(f"ERROR: Secret not found: {e}. Please check secret names and Notebook Access toggle.")
     raise
except Exception as e:
     print(f"Error loading API keys: {e}")
     # Chain the cause so the original failure stays visible in the traceback.
     raise ValueError("Failed to load API keys.") from e

# --- NO CLIENT INITIALIZATION NEEDED IF BOTH USE CURL ---
print("API Keys loaded. Clients will be called via curl/subprocess.")

# ----------------------------------------------------------------------------
#                            MODEL CONFIGURATION
# ----------------------------------------------------------------------------
# Provider name -> model identifier; looked up by Agent.__init__.
model_config = dict(
    gemini="gemini-1.5-pro-latest",
    openai="gpt-4o",
)

# Shared generation parameters; translated into provider-specific payloads below.
generation_config = dict(temperature=0.2, max_tokens=1536)

# Gemini payload configs: the same threshold for each listed harm category.
gemini_safety_settings_payload = [
    dict(category=harm_category, threshold="BLOCK_MEDIUM_AND_ABOVE")
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]
gemini_generation_config_payload = dict(
    temperature=generation_config["temperature"],
    maxOutputTokens=generation_config["max_tokens"],
)

# OpenAI payload configs
openai_generation_config_payload = dict(
    temperature=generation_config["temperature"],
    max_tokens=generation_config["max_tokens"],
)


# ----------------------------------------------------------------------------
#               AGENT CLASS (Using curl for BOTH Agents - Corrected Methods)
# ----------------------------------------------------------------------------
class Agent:
    """An LLM debate participant that reaches its provider's REST API through curl.

    Attributes:
        agent_id: Identifier used in prompts and log lines.
        provider: Provider name, used to look up `model_config` and `keys`.
            `_call_api` can build requests for "gemini" and "openai" only.
        model_name: Model identifier read from `model_config`.
        api_key: Credential read from `keys`.
        current_response: Text returned by the latest API call ("" before the first one).
        history: One {"round": ..., "response": ...} dict per completed API call.
    """

    def __init__(self, agent_id, provider):
        """Look up the model name and API key configured for `provider`.

        Raises:
            ValueError: if `keys` holds no usable key, or `model_config` holds no
                model name, for `provider`.
        """
        self.agent_id = agent_id
        self.provider = provider
        self.model_name = model_config.get(provider)
        self.api_key = keys.get(provider)
        self.current_response = ""
        self.history = []
        if not self.api_key:
            raise ValueError(f"{provider.capitalize()} API Key missing for Agent {agent_id}")
        if not self.model_name:
            raise ValueError(f"Model name for provider '{provider}' is missing.")

    def _call_api(self, prompt):
        """Send `prompt` to this agent's provider via curl and return the reply text.

        Up to two attempts are made. Curl failures, timeouts, undecodable JSON and
        unexpected response shapes are retried after a short randomized back-off.
        An API-reported error, an unsupported provider, or a reply that carries no
        text content ends the call immediately.

        Args:
            prompt: Full prompt text to send as a single user message.

        Returns:
            The stripped reply text on success; otherwise a string starting with
            "Error:" that describes the last failure. Failures are reported
            through the return value rather than raised.
        """
        print(f"\n--- Agent {self.agent_id} ({self.provider}) sending prompt via curl ---")
        # Placeholder only; every loop iteration overwrites it before doing any work.
        current_attempt_result = f"Error: Call failed for {self.provider} after multiple attempts."
        max_retries = 2

        for attempt in range(max_retries):
            print(f"Executing curl (Attempt {attempt+1}/{max_retries})...")
            # Overwritten below on success or on a more specific failure.
            current_attempt_result = f"Error: Unknown failure in attempt {attempt+1}"

            try:
                # --- Construct payload and command ---
                if self.provider == "gemini":
                    data_payload = {
                        "contents": [{"parts": [{"text": prompt}]}],
                        "generationConfig": gemini_generation_config_payload,
                        "safetySettings": gemini_safety_settings_payload,
                    }
                    json_payload = json.dumps(data_payload)
                    # NOTE: the key is part of the URL and therefore of curl's argument list.
                    url = (
                        "https://generativelanguage.googleapis.com/v1beta/models/"
                        f"{self.model_name}:generateContent?key={self.api_key}"
                    )
                    curl_command = ['curl', '-s', '-H', 'Content-Type: application/json', '-X', 'POST', '-d', json_payload, url]
                elif self.provider == "openai":
                    data_payload = {
                        "model": self.model_name,
                        "messages": [{"role": "user", "content": prompt}],
                        **openai_generation_config_payload,
                    }
                    json_payload = json.dumps(data_payload)
                    url = "https://api.openai.com/v1/chat/completions"
                    auth_header = f"Authorization: Bearer {self.api_key}"
                    curl_command = ['curl', url, '-s', '-H', 'Content-Type: application/json', '-H', auth_header, '-X', 'POST', '-d', json_payload]
                else:
                    current_attempt_result = f"Error: Unsupported provider '{self.provider}'"
                    break  # Retrying cannot help for an unsupported provider

                # --- Execute curl ---
                result = subprocess.run(curl_command, capture_output=True, text=True, check=False, timeout=180)

                # --- Process curl result ---
                if result.returncode == 0 and result.stdout:
                    try:
                        response_data = json.loads(result.stdout)
                        if "error" in response_data:
                            err_msg = response_data['error'].get('message', 'Unknown API error')
                            print(f"!!! Agent {self.agent_id} API Error: {err_msg}")
                            current_attempt_result = f"Error: API error: {err_msg}"
                            break  # API-reported errors are treated as non-retriable
                        elif self.provider == "gemini":
                            if (
                                "candidates" in response_data
                                and response_data["candidates"]
                                and "content" in response_data["candidates"][0]
                                and "parts" in response_data["candidates"][0]["content"]
                                and response_data["candidates"][0]["content"]["parts"]
                            ):
                                current_attempt_result = response_data['candidates'][0]['content']['parts'][0]['text']
                                print(f"--- Agent {self.agent_id} ({self.provider}) received response ---")
                                break  # SUCCESS
                            else:
                                finish_reason = response_data.get("candidates", [{}])[0].get("finishReason", "UNKNOWN")
                                current_attempt_result = f"Error: No content (Finish: {finish_reason})"
                                if finish_reason == 'SAFETY':
                                    current_attempt_result = "Error: Blocked(Safety)."
                                print(f"Warning: {current_attempt_result}")
                                break  # Non-retriable
                        elif self.provider == "openai":
                            if (
                                "choices" in response_data
                                and response_data["choices"]
                                and "message" in response_data["choices"][0]
                                and "content" in response_data["choices"][0]["message"]
                            ):
                                content = response_data['choices'][0]['message']['content']
                                if isinstance(content, str):
                                    current_attempt_result = content
                                    print(f"--- Agent {self.agent_id} ({self.provider}) received response ---")
                                    break  # SUCCESS
                                # The "content" key can be present but null; accepting it
                                # would make the final .strip() fail on None. Handle it
                                # like the Gemini no-content case instead.
                                finish_reason = response_data['choices'][0].get("finish_reason", "UNKNOWN")
                                current_attempt_result = f"Error: No content (Finish: {finish_reason})"
                                print(f"Warning: {current_attempt_result}")
                                break  # Non-retriable
                            else:
                                print(f"Warning: Agent {self.agent_id} ({self.provider}) - Unexpected OpenAI structure.")
                                current_attempt_result = "Error: Unexpected OpenAI structure."
                                # No break: this failure is allowed to retry.
                    except json.JSONDecodeError:
                        print(f"!!! Agent {self.agent_id} ({self.provider}) - Failed JSON decode: {result.stdout[:500]}")
                        current_attempt_result = "Error: Failed JSON decode."
                    except (KeyError, IndexError, TypeError) as e:
                        print(f"!!! Agent {self.agent_id} ({self.provider}) - Error parsing JSON: {e} | Resp: {result.stdout[:500]}")
                        current_attempt_result = "Error: Unexpected API JSON structure."
                else:
                    # Non-zero exit code, or curl produced no output at all.
                    print(f"!!! Agent {self.agent_id} ({self.provider}) - Curl failed. Code: {result.returncode}")
                    print(f"STDERR: {result.stderr[:500]}")
                    current_attempt_result = f"Error: Curl failed (Code: {result.returncode})."

            except subprocess.TimeoutExpired:
                print(f"!!! Agent {self.agent_id} ({self.provider}) Curl timeout (Attempt {attempt+1}).")
                current_attempt_result = f"Error: Curl command timed out."
            except Exception as e:
                print(f"!!! Agent {self.agent_id} ({self.provider}) Outer Error (Attempt {attempt+1}): {e}")
                current_attempt_result = f"Error: Exception during API call. Details: {e}"

            # Every success and every non-retriable failure left the loop via `break`,
            # so reaching this point means the attempt failed in a retriable way.
            if attempt < max_retries - 1:
                wait_time = (1.5 ** attempt) + random.uniform(0.1, 0.5)
                print(f"Attempt {attempt+1} failed with error: {current_attempt_result[:100]}... Retrying in {wait_time:.2f}s...")
                time.sleep(wait_time)

        # Either the successful reply or the last error message.
        return current_attempt_result.strip()

    def generate_initial_response(self, query_data, initial_prompt_template):
        """Produce the round-0 answer for a question.

        Args:
            query_data: Dict whose keys fill the template placeholders.
            initial_prompt_template: `str.format` template; it also receives
                `agent_id` and `provider`.

        Returns:
            The reply from `_call_api`, which is also stored in `current_response`
            and appended to `history` as round 0. If the template needs a key that
            is missing, an "Error: ..." string is returned instead, without calling
            the API or touching `current_response` / `history`.
        """
        try:
            prompt = initial_prompt_template.format(**query_data, agent_id=self.agent_id, provider=self.provider)
        except KeyError as e:
            print(f"ERROR formatting initial_prompt_template: Missing key {e}")
            print(f"Ensure query_data dict has keys: {list(query_data.keys())}")
            print(f"Template requires keys like: {{subject}}, {{question}}, {{option_A}}...")
            return f"Error: Prompt formatting failed due to missing key {e}."

        self.current_response = self._call_api(prompt)
        self.history.append({"round": 0, "response": self.current_response})
        print(f"Agent {self.agent_id} ({self.provider}) initial response generated.")
        return self.current_response

    def generate_debate_response(self, query_data, other_responses_dict, debate_prompt_template, round_num):
        """Produce an updated answer for `round_num` after seeing the other agents' replies.

        Args:
            query_data: Dict whose keys fill the template placeholders.
            other_responses_dict: Maps (agent_id, provider) to that agent's
                previous-round reply; may be empty.
            debate_prompt_template: `str.format` template; it also receives
                `agent_id`, `provider`, `round_num`, `prev_round_num`,
                `my_previous_response` and `other_agent_responses`.
            round_num: Number of the round being generated.

        Returns:
            The reply from `_call_api`, which is also stored in `current_response`
            and appended to `history`. If the template needs a key that is missing,
            an "Error: ..." string is returned instead, without calling the API or
            touching `current_response` / `history`.
        """
        if other_responses_dict:
            formatted_other_responses = "\n\n".join([
                f"--- Response from Agent {other_id} (Provider: {other_provider}) ---\n{resp}"
                for (other_id, other_provider), resp in other_responses_dict.items()
            ])
        else:
            formatted_other_responses = "No other agent responses were provided."

        prev_round_num = round_num - 1

        try:
            prompt = debate_prompt_template.format(
                **query_data,
                agent_id=self.agent_id,
                provider=self.provider,
                round_num=round_num,
                prev_round_num=prev_round_num,
                my_previous_response=self.current_response,
                other_agent_responses=formatted_other_responses
            )
        except KeyError as e:
            print(f"ERROR formatting debate_prompt_template: Missing key {e}")
            print(f"Ensure query_data dict has keys: {list(query_data.keys())}")
            print(f"Template requires keys like: {{subject}}, {{question}}, {{option_A}}...")
            return f"Error: Prompt formatting failed due to missing key {e}."

        self.current_response = self._call_api(prompt)
        self.history.append({"round": round_num, "response": self.current_response})
        print(f"Agent {self.agent_id} ({self.provider}) debate response generated for round {round_num}.")
        return self.current_response

# ----------------------------------------------------------------------------
#                      MULTI-AGENT DEBATE FUNCTION (MMLU Version)
# ----------------------------------------------------------------------------
def run_multi_agent_debate(query_data, agent_providers, num_rounds=2):
    """Run an MMLU debate with one agent per provider and return the last round's replies.

    Each agent first answers independently (round 0). In every later round each
    agent sees the other agents' previous replies and answers again. The debate
    stops early once every agent's reply yields the same extracted letter.

    Args:
        query_data: Dict supplying the prompt placeholders (subject, question,
            option_A .. option_D).
        agent_providers: Sequence of provider names; one agent is created per entry,
            with ids assigned from 1 in order.
        num_rounds: Maximum number of debate rounds after round 0.

    Returns:
        Dict mapping (agent_id, provider) to the reply text from the last completed
        round, or None when `agent_providers` is empty or an agent cannot be set up.
    """
    num_agents = len(agent_providers)
    if num_agents == 0:
        return None

    banner = "=" * 40
    print(
        banner
        + f"\nStarting {num_agents}-Agent MMLU Debate (Providers: {agent_providers})\nSubject: {query_data.get('subject','N/A')}\nMax Rounds: {num_rounds}\n"
        + banner
    )

    # --- Prompt templates (filled with str.format by the agents) ---
    initial_prompt = (
        "You are Agent {agent_id} ({provider}). The following is a multiple-choice question about {subject}.\n"
        "Question: {question}\n"
        "Options:\nA) {option_A}\nB) {option_B}\nC) {option_C}\nD) {option_D}\n\n"
        "Think step-by-step to determine the correct answer. After your reasoning, state your final choice clearly by enclosing the single capital letter (A, B, C, or D) in a box like this: \\boxed{{<LETTER>}}. Make sure this is the very last part of your response."
    )
    debate_prompt = (
        "You are Agent {agent_id} ({provider}) in Round {round_num} of a multi-agent debate on a multiple-choice question about {subject}.\n"
        "Original Question:\n{question}\nOptions:\nA) {option_A}\nB) {option_B}\nC) {option_C}\nD) {option_D}\n\n"
        "Your Previous Response (Round {prev_round_num}):\n--- START ---\n{my_previous_response}\n--- END ---\n\n"
        "Other Agent's Response (Round {prev_round_num}):\n--- START ---\n{other_agent_responses}\n--- END ---\n\n"
        "**Instructions for Round {round_num}:**\n"
        "1. Critique your previous reasoning/choice AND the other agent's. Identify errors or stronger arguments.\n"
        "2. Provide updated step-by-step reasoning.\n"
        "3. State your final choice *only* as the capital letter A, B, C, or D, enclosed in a box like \\boxed{{<LETTER>}} at the very end.\n\n"
        "Your Updated Reasoning and Final Answer for Round {round_num}:"
    )

    # --- Build the roster; abort on the first provider that cannot be set up ---
    roster = {}
    setup_ok = True
    for next_id, provider in enumerate(agent_providers, start=1):
        credentials_present = provider in ("gemini", "openai") and keys.get(provider)
        if not credentials_present:
            print(f"ERROR: Config for Provider '{provider}' not ready. Stopping.")
            setup_ok = False
            break
        try:
            roster[next_id] = Agent(agent_id=next_id, provider=provider)
        except ValueError as init_error:
            print(f"Error init Agent {next_id}: {init_error}")
            setup_ok = False
            break
    if not (setup_ok and len(roster) == len(agent_providers)):
        print("ERROR: Could not initialize all agents.")
        return None
    print(f"\nInitialized {len(roster)} agents successfully.")

    # --- Round 0: independent answers ---
    print(f"\n--- ROUND 0: Initial ---")
    opening_replies = {}
    for agent_id, agent in roster.items():
        reply = agent.generate_initial_response(query_data, initial_prompt)
        opening_replies[(agent_id, agent.provider)] = reply
        print(f"Agent {agent_id} ({agent.provider}) Initial Snippet: {reply[:100]}...")
    replies_by_round = {0: opening_replies}
    last_completed_round = 0

    # --- Debate rounds ---
    for round_num in range(1, num_rounds + 1):
        print(f"\n--- ROUND {round_num}: Debate ---")
        prior_replies = replies_by_round[round_num - 1]
        round_replies = {}
        for agent_id, agent in roster.items():
            # Everything said last round except this agent's own reply.
            peer_replies = {key: text for key, text in prior_replies.items() if key[0] != agent_id}
            reply = agent.generate_debate_response(query_data, peer_replies, debate_prompt, round_num)
            round_replies[(agent_id, agent.provider)] = reply
            print(f"Agent {agent_id} ({agent.provider}) Round {round_num} Snippet: {reply[:100]}...")
        replies_by_round[round_num] = round_replies
        last_completed_round = round_num

        # Early stop: every agent must yield a letter, and all letters must match.
        print(f"\nChecking for MMLU convergence after Round {round_num}...")
        letters = []
        everyone_answered = True
        for (member_id, member_provider), reply_text in round_replies.items():
            letter = extract_boxed_letter_answer(reply_text)
            if letter is None:
                everyone_answered = False
                print(f"Warning: Agent {member_id}({member_provider}) no boxed letter in R{round_num}.")
                break
            letters.append(letter)
        if everyone_answered and len(letters) == len(roster):
            agreed_letter = letters[0]
            if all(candidate == agreed_letter for candidate in letters):
                print(f"--- CONVERGENCE DETECTED in Round {round_num} on Answer '{agreed_letter}'! Stopping early. ---")
                break

    print("\n--- DEBATE COMPLETE ---")
    final_responses = replies_by_round[last_completed_round]
    # Show each agent's full final reply before the caller aggregates them.
    print(f"\nFinal Responses from Each Agent (After Round {last_completed_round})")
    for (agent_id, provider), reply in final_responses.items():
        print(f"\n--- Agent {agent_id} ({provider}) Final Response ---:\n{reply}\n")
    return final_responses


# ----------------------------------------------------------------------------
#                    ANSWER EXTRACTION FUNCTION (Boxed Letter)
# ----------------------------------------------------------------------------
def extract_boxed_letter_answer(text):
    r"""Extract the final multiple-choice letter (A, B, C or D) from a model reply.

    Strategies are tried in order, and within each one the LAST occurrence wins.
    The prompts ask for the answer at the very end, and debate replies often
    quote earlier ``\boxed{...}`` answers before stating their own.

      1. ``\boxed{<LETTER>}`` (case-insensitive).
      2. A line ending in ``Final Answer: <LETTER>`` (case-insensitive).
      3. Any standalone capital letter A-D. This is a loose heuristic: it also
         matches the English article "A".

    Args:
        text: Reply text; anything that is not a str yields None.

    Returns:
        The upper-case letter, or None when no strategy finds one.
    """
    if text is None or not isinstance(text, str):
        return None

    boxed_letters = re.findall(r"\\boxed\{([A-D])\}", text, re.IGNORECASE)
    if boxed_letters:
        return boxed_letters[-1].upper()

    # Fallback: "Final Answer: <LETTER>" at the end of a line
    final_answer_letters = re.findall(r"Final Answer:\s*([A-D])\s*$", text, re.IGNORECASE | re.MULTILINE)
    if final_answer_letters:
        print(f"Warning: Using 'Final Answer: <LETTER>' fallback.")
        return final_answer_letters[-1].upper()

    # Fallback: last standalone capital letter
    found_letters = re.findall(r'\b([A-D])\b', text)
    if found_letters:
        print(f"Warning: Using last standalone letter fallback.")
        return found_letters[-1].upper()

    print(f"Warning: Could not extract A,B,C,D answer from: {text[:100]}...")
    return None

# ----------------------------------------------------------------------------
#              AGGREGATION & CONVERGENCE CHECK FUNCTION (MMLU)
# ----------------------------------------------------------------------------
def get_final_answer_from_debate(final_responses):
    """Reduce the agents' final replies to a single answer letter.

    A letter is extracted from each reply. If every agent yields a letter and
    they all agree, that letter is the answer. Otherwise the answer falls back
    to Agent 1's letter, whichever provider Agent 1 uses.

    Args:
        final_responses: Dict mapping (agent_id, provider) to reply text.

    Returns:
        (letter, status). `letter` is None when no answer could be chosen.
        `status` is one of "Converged", "Defaulted to Agent 1",
        "No Consensus: No valid answers", "No Consensus: Agent 1 Failed" or
        "Error: No final responses.".
    """
    if not final_responses:
        return None, "Error: No final responses."

    extracted_answers_map = {}
    agent1_answer = None
    print("\n--- Aggregating Final MMLU Answers ---")
    for agent_key, response_text in final_responses.items():
        agent_id, provider = agent_key
        answer = extract_boxed_letter_answer(response_text)
        extracted_answers_map[agent_key] = answer
        if answer:
            print(f"Agent {agent_id} ({provider}) extracted: {answer}")
        else:
            print(f"Warning: Agent {agent_id} ({provider}) failed extraction.")
        # The fallback is keyed on the agent id only, so it keeps working when
        # the provider order changes.
        if agent_id == 1:
            agent1_answer = answer

    valid_extracted_letters = [ans for ans in extracted_answers_map.values() if ans is not None]
    if not valid_extracted_letters:
        print("Agg Result: No valid letters.")
        return None, "No Consensus: No valid answers"

    # Convergence requires a letter from every agent, and all letters equal.
    first_answer = valid_extracted_letters[0]
    is_converged = (
        len(valid_extracted_letters) == len(final_responses)
        and all(ans == first_answer for ans in valid_extracted_letters)
    )
    if is_converged:
        print(f"Agg Result: Converged on '{first_answer}'.")
        return first_answer, "Converged"

    print("Agg Result: No convergence. Defaulting to Agent 1...")
    if agent1_answer:
        print(f"Using Agent 1's answer: {agent1_answer}")
        return agent1_answer, "Defaulted to Agent 1"
    print("Fallback Failed: Agent 1 has no valid answer.")
    return None, "No Consensus: Agent 1 Failed"

# ----------------------------------------------------------------------------
#                           MMLU DATASET LOADING
# ----------------------------------------------------------------------------
MMLU_SUBJECT = "high_school_computer_science"  # MMLU subject (dataset config name) to evaluate
print("\n--- Loading MMLU Dataset ---")
try:
    mmlu_dataset = load_dataset("cais/mmlu", MMLU_SUBJECT, trust_remote_code=True)
    # Prefer the test split; use validation only when test is missing or falsy.
    mmlu_eval_split = mmlu_dataset.get('test')
    if not mmlu_eval_split:
        mmlu_eval_split = mmlu_dataset.get('validation')
    if not mmlu_eval_split:
        raise ValueError(f"Split not found for {MMLU_SUBJECT}")
    print(f"Loaded {len(mmlu_eval_split)} examples for MMLU subject: {MMLU_SUBJECT}")
except Exception as load_error:
    # Report the failure, then re-raise so the cell stops here.
    print(f"Error loading MMLU dataset ({MMLU_SUBJECT}): {load_error}")
    raise

# ----------------------------------------------------------------------------
#                   MMLU EVALUATION LOOP (Multi-Agent via curl)
# ----------------------------------------------------------------------------
SAMPLE_SIZE = 100  # Maximum number of questions to evaluate
# Clamp to the split size: asking select() for indices past the end of the split
# fails, which would happen whenever the loaded split (for example the validation
# fallback) holds fewer than SAMPLE_SIZE rows.
evaluation_subset = mmlu_eval_split.select(range(min(SAMPLE_SIZE, len(mmlu_eval_split))))
results = []
correct_count = 0
convergence_count = 0
providers_in_debate = ["gemini", "openai"]  # One agent per entry; agent ids follow this order

# Both providers need a key before any question is attempted.
if keys.get("gemini") and keys.get("openai"):
    print(f"\n--- Evaluating on {len(evaluation_subset)} MMLU Samples ({MMLU_SUBJECT}) ---")
    for item in tqdm(evaluation_subset, desc=f"Evaluating MMLU ({MMLU_SUBJECT})"):
        question_text = item["question"]
        choices = item["choices"]
        correct_index = item["answer"]
        # The ground truth is treated as a 0-based index into `choices` and mapped to A-D.
        correct_letter = chr(ord('A') + correct_index)

        # Placeholder values consumed by the debate prompt templates.
        query_data = {
            "subject": MMLU_SUBJECT.replace("_", " "),
            "question": question_text,
            "option_A": choices[0], "option_B": choices[1],
            "option_C": choices[2], "option_D": choices[3],
        }

        print(f"\nProcessing MMLU Q: {question_text[:80]}... [Correct: {correct_letter}]")

        final_responses = run_multi_agent_debate(
            query_data=query_data,
            agent_providers=providers_in_debate,
            num_rounds=2  # Upper bound; the debate stops earlier once the agents agree
        )

        # Aggregate the final replies; the defaults apply when the debate could not run.
        aggregated_answer_letter, debate_status = None, "Error: Debate Failed"
        if final_responses is not None:
            aggregated_answer_letter, debate_status = get_final_answer_from_debate(final_responses)
        else:
            print("ERROR: Debate failed to run for this question.")

        print(f"\n--- Evaluation Result for MMLU Question ---")
        print(f"Debate Status: {debate_status}")
        print(f"Final Aggregated Answer Letter: {aggregated_answer_letter}")

        # Track convergence
        if debate_status == "Converged":
            convergence_count += 1

        # Compare the aggregated letter with the ground-truth letter
        is_correct = False
        if aggregated_answer_letter is not None and aggregated_answer_letter == correct_letter:
            is_correct = True
            correct_count += 1

        print(f"Correct?: {is_correct}")
        print("-" * 30)
        results.append({
            "question": question_text,
            "choices": choices,
            "expected_letter": correct_letter,
            "status": debate_status,
            "aggregated_answer_letter": aggregated_answer_letter,
            "correct": is_correct,
            "all_final_responses": final_responses,
        })
        time.sleep(1)  # Small delay between questions

    # --- Final performance calculation ---
    total_evaluated = len(results)
    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    convergence_percentage = (convergence_count / total_evaluated) * 100 if total_evaluated > 0 else 0

    print(f"\n--- MMLU Multi-Agent Debate FINAL SUMMARY ({MMLU_SUBJECT}) ---")
    print(f"Models Tested: {model_config['gemini']} (curl) vs {model_config['openai']} (curl)")
    print(f"Total Questions Evaluated: {total_evaluated}")
    print(f"Correct Aggregated Answers: {correct_count}")
    print(f"Accuracy (Aggregated Answer): {accuracy:.2f}%")
    print(f"Convergence Rate (Same Letter Choice): {convergence_count} / {total_evaluated} = {convergence_percentage:.2f}%")

else:
    print("Skipping evaluation because required API keys failed to initialize.")

# Optional: Display results table
# import pandas as pd
# if results:
#     df_results = pd.DataFrame(results)
#     print("\nDetailed Results:")
#     with pd.option_context('display.max_colwidth', 40):
#          print(df_results[['question', 'expected_letter', 'status', 'aggregated_answer_letter', 'correct']].head(SAMPLE_SIZE))

--- Configuring API Keys ---
API Keys fetched from Colab Secrets.
Successfully loaded required API keys.
API Keys loaded. Clients will be called via curl/subprocess.

--- Loading MMLU Dataset ---
Loaded 100 examples for MMLU subject: high_school_computer_science

--- Evaluating on 100 MMLU Samples (high_school_computer_science) ---


Evaluating MMLU (high_school_computer_science):   0%|          | 0/100 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
--- Agent 2 (openai) received response ---
Agent 2 (openai) initial response generated.
Agent 2 (openai) Initial Snippet: To determine which option is LEAST likely to indicate a phishing attack, let's analyze each option:
...

--- ROUND 1: Debate ---

--- Agent 1 (gemini) sending prompt via curl ---
Executing curl (Attempt 1/2)...
--- Agent 1 (gemini) received response ---
Agent 1 (gemini) debate response generated for round 1.
Agent 1 (gemini) Round 1 Snippet: My previous reasoning and Agent 2's reasoning are both sound and reach the same conclusion.  We both...

--- Agent 2 (openai) sending prompt via curl ---
Executing curl (Attempt 1/2)...
--- Agent 2 (openai) received response ---
Agent 2 (openai) debate response generated for round 1.
Agent 2 (openai) Round 1 Snippet: **Critique of Previous Reasoning:**

1. **My Previous Response:**
   - I correctly identified that o...

Checking for MMLU convergence after Round 1..