In [1]:
import os
import json
import time
from openai import OpenAI
import together
import random
from dotenv import load_dotenv
import google.generativeai as genai
import anthropic
from concurrent.futures import ThreadPoolExecutor


In [2]:
# === Load Keys ===
# load_dotenv() must run first: every client below reads its key from the
# process environment via os.getenv, which returns None when a key is unset.
load_dotenv()
together.api_key = os.getenv("TOGETHER_API_KEY")
client = together.Together()  # Together client used by call_model_tog
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # used by call_model_openai
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))  # module-level config used by call_model_gemini
claude_client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))  # used by call_model_claude


# Model identifiers handed to the call_model_* helpers as their `model` argument.
# MODEL_GENERATOR = "gemini-2.5-flash"
MODEL_mx = "mistralai/Mixtral-8x7B-Instruct-v0.1"
MODEL_gpt = "gpt-4.1-2025-04-14"
MODEL_o3 = "o3"
MODEL_cld = "claude-3-5-sonnet-20241022"
MODEL_cld4 = "claude-sonnet-4-20250514"
MODEL_gemini = "gemini-2.5-pro"

In [3]:
# Upper bound on the number of refinement cycles a single pipeline_loop() run performs.
MAX_CYCLES = 4

# Pool of topic descriptions; generate_biology_seed_question samples 2-4 of
# these (without replacement) for each seed prompt.
BIOLOGY_BASE_TOPICS = [
    "a metabolic pathway involving feedback inhibition and allosteric regulation under nutrient stress",
    "a gene expression analysis under epigenetic modification and transcription factor knockdown",
    "a CRISPR-Cas9 mediated gene editing experiment in eukaryotic cells with off-target effect analysis",
    "enzyme kinetics in a multi-substrate reaction with competitive and non-competitive inhibition",
    "a signal transduction cascade with second messengers and phosphorylation events under varying ligand concentrations",
    "population dynamics in a predator-prey ecosystem with resource limitation and carrying capacity shifts",
    "a study of bacterial quorum sensing and autoinducer accumulation across biofilm formation stages",
    "a comparative genomics analysis identifying orthologs and paralogs in evolutionarily related species",
    "a thermodynamic analysis of DNA melting and reannealing in GC-rich vs AT-rich sequences",
    "a photosynthetic light response curve under variable CO₂ concentration and light intensity",
    "a mitochondrial electron transport chain experiment under hypoxic and oxidative stress conditions",
    "a comparative analysis of innate and adaptive immune responses to viral antigen presentation",
    "analysis of cell cycle checkpoints with cyclin/CDK interactions and mutational disruptions",
    "an electrophysiological study of action potential propagation under altered ion gradients",
    "a diffusion-limited nutrient uptake model in cells with varying surface-area-to-volume ratios",
    "a protein folding and misfolding study involving chaperone activity and prion aggregation",
    "an investigation of horizontal gene transfer mechanisms in bacteria exposed to antibiotics",
    "an osmoregulation study of freshwater vs marine fish using gill Na⁺/K⁺ pump expression levels",
    "analysis of plant tropisms with auxin distribution under differential light and gravity exposure",
    "a synthetic biology circuit design involving feedback loops and gene toggling behavior",
    "study of antigenic drift and shift in influenza virus across multiple host species",
    "a membrane transport study involving carrier-mediated diffusion and saturation kinetics",
    "a developmental biology experiment on morphogen gradients and segmentation gene expression",
    "analysis of telomere shortening and telomerase activity in cancerous vs healthy cells",
    "a transcriptomics study identifying differentially expressed genes in response to heat shock",
    "investigation of neural synaptic plasticity with long-term potentiation and memory encoding",
    "a biodiversity and species richness survey using Shannon and Simpson diversity indices",
    "mitotic spindle assembly and chromosome segregation defects under chemotherapeutic treatment",
    "analysis of circadian gene regulation with light/dark cycle entrainment in mammals",
    "study of meiosis and recombination frequency mapping using genetic linkage analysis",
    "nutrient cycling and microbial decomposition in a closed ecological system model",
    "protein-protein interaction networks analyzed using co-immunoprecipitation and mass spectrometry",
    "epistasis mapping in metabolic disorders using model organisms and gene knockout libraries",
    "bioluminescence quantification in marine organisms under variable pH and salinity conditions",
    "microbiome dynamics in gut flora in response to dietary fiber and antibiotic treatment",
    "analysis of apoptotic pathway activation under DNA damage stress and p53 regulation",
    "adaptive immunity modeling using clonal selection and memory B-cell dynamics post-vaccination",
    "genetic drift, bottleneck effects, and founder effects in isolated island populations",
    "RNA interference (RNAi) effects on post-transcriptional gene silencing across species",
    "comparative respiration rates in C₃ vs CAM plants under different humidity conditions",
    "chloroplast ultrastructure and efficiency changes under photoinhibition in high-light environments",
    "in vitro fertilization and preimplantation genetic screening of embryos for chromosomal abnormalities",
    "impact of endocrine disruptors on hormonal signaling in amphibian metamorphosis",
    "analysis of histone modification patterns across euchromatin and heterochromatin regions",
    "mitochondrial heteroplasmy effects in inheritance of rare metabolic syndromes",
    "study of hemoglobin allosteric transitions in response to CO₂ and pH (Bohr effect)",
    "axonal transport rates in neurons under ATP-depleted vs ATP-sufficient conditions",
    "study of programmed cell death (apoptosis vs necrosis) under nutrient deprivation and cytokine exposure",
    "gene-environment interaction effects in phenotype plasticity using twin studies"
]

# Pool of complexity instructions; generate_biology_seed_question samples 4-6
# of these (without replacement) for each seed prompt.
BIOLOGY_COMPLEXITY_TEMPLATES = [
    "Integrate multiple biological scales, such as molecular regulation with physiological outcomes or population-level dynamics with individual fitness.",
    "Design problems involving both qualitative pathway interpretation and quantitative expression analysis (e.g., fold change, kinetic rates).",
    "Include dependency on environmental or stress conditions like hypoxia, nutrient limitation, circadian phase, or immune challenge.",
    "Require a multi-step reasoning chain: gene mutation → pathway disruption → cellular phenotype → physiological effect.",
    "Incorporate experimental design principles like controls, replicates, and statistical interpretation of gene/protein expression data.",
    "Introduce non-linear feedback or redundancy in pathways (e.g., feedback inhibition, cross-talk between signaling cascades).",
    "Include graphical data such as Western blots, gene expression heatmaps, or growth curves for interpretation.",
    "Use real-world constraints like temporal delay in expression, energy cost of protein production, or stochastic gene expression noise.",
    "Explicitly define all assumptions (e.g., cell type, steady state, dilution factor) and force evaluation of those in calculation and reasoning.",
    "Require comparative evaluation across evolutionary adaptations, phenotypic plasticity, or experimental organism models."
]


In [4]:
# === Utility: Call LLM ===
# ---openai---
def call_model_openai(prompt: str, model: str) -> str:
    """Send `prompt` as a single user message to an OpenAI chat model.

    Args:
        prompt: Full text of the user message.
        model: Model identifier forwarded to the chat-completions endpoint.

    Returns:
        The text of the first returned choice, stripped of surrounding whitespace.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai_client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

#---gemini---

def call_model_gemini(prompt: str, model: str) -> str:
    """Generate a reply to `prompt` with a Gemini model.

    Args:
        prompt: Text handed to `generate_content`.
        model: Gemini model name used to build the `GenerativeModel`.

    Returns:
        The reply's `.text`, stripped of surrounding whitespace.
    """
    # A separate local name keeps the `model` argument (a string) distinct
    # from the client object built from it.
    generative_model = genai.GenerativeModel(model_name=model)
    reply = generative_model.generate_content(prompt)
    return reply.text.strip()

# ---together---

def call_model_tog(prompt: str, model: str) -> str:
    """Send `prompt` as a single user message to a Together-hosted chat model.

    The request is made with `temperature=1.0`.

    Args:
        prompt: Full text of the user message.
        model: Model identifier forwarded to the Together client.

    Returns:
        The text of the first returned choice, stripped of surrounding whitespace.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=1.0,
    )
    reply_text = completion.choices[0].message.content
    return reply_text.strip()

# ---claude---

def call_model_claude(prompt: str, model: str) -> str:
    """Send `prompt` as a single user message to a Claude model.

    The request uses `max_tokens=1024` and `temperature=0.7`. Unlike the other
    call_model_* helpers in this file, failures do not propagate: any exception
    is printed and the sentinel string "[Error]" is returned instead.

    Args:
        prompt: Full text of the user message.
        model: Model identifier forwarded to the Anthropic client.

    Returns:
        The stripped text of the first content block, or "[Error]" on failure.
    """
    request = {
        "model": model,
        "max_tokens": 1024,
        "temperature": 0.7,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        reply = claude_client.messages.create(**request)
        # Reading the reply stays inside the try so a malformed reply is
        # reported the same way as a failed request.
        first_block = reply.content[0]
        return first_block.text.strip()
    except Exception as err:
        print("[Claude Error]", err)
        return "[Error]"

In [5]:
# === Step 1: Generate Initial Biology Question ===
def generate_biology_seed_question() -> str:
    """Ask Gemini for an initial biology problem built from random topics.

    Samples 2-4 entries from BIOLOGY_BASE_TOPICS and 4-6 entries from
    BIOLOGY_COMPLEXITY_TEMPLATES (each range capped by the list length),
    embeds both lists in the generation prompt, and sends the prompt to
    MODEL_gemini.

    Returns:
        The model's reply as returned by call_model_gemini. The prompt asks
        for a JSON object with "question" and "topic_tags"; the reply is not
        validated here.
    """
    # The order of the draws (both counts first, then both samples) determines
    # the result for a given random seed, so it is kept fixed.
    n_topics = random.randint(2, min(4, len(BIOLOGY_BASE_TOPICS)))
    n_rules = random.randint(4, min(6, len(BIOLOGY_COMPLEXITY_TEMPLATES)))
    chosen_topics = random.sample(BIOLOGY_BASE_TOPICS, k=n_topics)
    chosen_rules = random.sample(BIOLOGY_COMPLEXITY_TEMPLATES, k=n_rules)

    # Both lists are interpolated as Python list reprs.
    seed_prompt = f"""
You are a domain-expert AI in biology problem generation. Your task is to generate a **rigorous, self-contained graduate-level biology problem** that:

🔹 Combines multiple areas of biology  
🔹 Involves real-world biological and physiological dependencies  
🔹 Requires multi-step, chained reasoning (quantitative, symbolic, or diagrammatic)  
🔹 Is **fully defined, reproducible**, and yields **one unique final answer** (quantitative or descriptive)  
🔹 **Must use LaTeX format** for biological terms, equations, and diagrams if needed.

---

### Problem Requirements:

1. **Interdisciplinary Concept Integration**  
   Integrate at least **two major biology domains**, such as molecular biology + physiology, genetics + ecology, cell biology + neuroscience, etc.

2. **Structured Reasoning and Analysis**  
   The problem must require **minimum four to five reasoning steps**, including pathway tracing, genetic logic, calculations, or system-level inferences.

3. **Complete and Unambiguous Specifications**  
   - Clearly define system setup, experimental or observational context, and all assumptions.  
   - Explicitly state all given data: gene expression levels, molecular concentrations, population sizes, signal intensities, etc.  
   - Use SI or biological standard units (e.g., ng/µL, cells/mL, mmol/L).

4. **Question Construction Constraints**  
   - Pose **one final, well-defined question** (either numeric, logical, or experimental conclusion).  
   - Do not request multiple sub-answers or vague discussion — focus on a **single boxed outcome**.  
   - Include any constants (e.g., Avogadro's number, dilution factor, generation time) directly in the problem.

5. **Strict Formatting Guidelines**  
   - Use **LaTeX** for biological structures, reactions, equations, and symbols.  
   - Avoid placeholders, “assume reasonable,” or ambiguous instructions.  
   - Present the final answer as a **boxed result**, with appropriate units or clear conclusion.

6. **Reproducibility and Uniqueness**  
   - Ensure that the problem leads to **one unique and reproducible conclusion**.  
   - Avoid speculative reasoning not grounded in provided data.

---

### Use These Concepts (Topics):  
{chosen_topics}

### Follow These Complexity Requirements:  
{chosen_rules}

---

### Output Format (Strict JSON Only):

Return **only** the following JSON object (no markdown, no comments — strictly this format):

{{
  "question": "<Complete and rigorous LaTeX-formatted biology problem, with all assumptions, pathways, equations, constants, given values, units, and one final boxed result requested>",
  "topic_tags": ["<tag1>", "<tag2>", ...]
}}

---

### Important Guidelines:
- No placeholders or generic references — all values and constants must be specified.
- The problem must simulate real-world biological complexity at the graduate level.
- Avoid trivia or qualitative-only questions. Focus on integrated reasoning, experimental design, and biological insight.
- Final problem must be solvable using the provided data only — **no external lookups or assumptions**.

Produce **only** the JSON format shown — no markdown, extra formatting, or narrative.
""".strip()

    return call_model_gemini(seed_prompt, MODEL_gemini)


In [6]:
# === Step 2: Extract Parameters, Assumptions, Ambiguities for Biology ===
def extract_parts_from_biology_question(seed_json: str) -> str:
    """Ask the o3 model to list a problem's parameters, assumptions and concepts.

    Args:
        seed_json: The biology problem text, embedded verbatim at the end of
            the prompt under "### Input JSON:".

    Returns:
        The model's reply as returned by call_model_openai. The prompt asks
        for a JSON object with "parameters", "assumptions" and "concepts";
        the reply is not parsed or validated here.
    """
    extraction_prompt = f"""
You are a biology reasoning assistant.

Your task is to extract **all** key biological components from the provided biology problem JSON. Be precise, comprehensive, and follow these guidelines:

1. **Parameters**  
   - List all biological variables, symbols, and constants involved in the question.  
   - For each, provide:
     - `symbol` (e.g., "Vmax", "Km", "ΔG", "[ATP]")
     - `meaning` (e.g., "maximum reaction velocity", "Michaelis constant", "change in Gibbs free energy", "ATP concentration")
     - `unit` (e.g., "µM", "nmol/min", "kJ/mol", or "dimensionless" if applicable)
   - Include any relevant numerical constants or physiological standard values (e.g., body temp = 37°C, normal pH = 7.4, Avogadro’s number, etc.).

2. **Assumptions**  
   - Extract explicitly stated assumptions (e.g., "steady-state", "organism is at homeostasis", "reaction is enzyme-catalyzed").
   - Also infer biologically meaningful implicit assumptions necessary for solving or modeling the problem (e.g., "constant temperature", "no feedback inhibition", "closed population").

3. **Concepts Used**  
   - Identify **biological principles, processes, models, or laws** involved in reasoning through the question.
   - For each concept, briefly explain its biological relevance in this context.
   - Include both basic (e.g., enzyme kinetics, osmosis, gene expression) and advanced ideas (e.g., signal transduction, feedback regulation, evolutionary selection pressure) as appropriate.

### Output JSON Format (strict):
{{
  "parameters": [
    {{"symbol": "...", "meaning": "...", "unit": "..."}},
    ...
  ],
  "assumptions": ["...", "..."],
  "concepts": ["...", "..."]
}}

### Input JSON:
{seed_json}
"""
    extraction_reply = call_model_openai(extraction_prompt, MODEL_o3)
    return extraction_reply


In [7]:
def rewrite_parts_to_make_tougher_biology(extracted_json: str) -> str:
    """Ask Gemini to turn extracted components into a much harder problem.

    Args:
        extracted_json: Text of the extracted components, embedded verbatim in
            the prompt under "### INPUT (Original Extracted JSON):".

    Returns:
        The model's reply as returned by call_model_gemini. The prompt asks
        for a JSON object with "question" and "topic_tags"; the reply is not
        parsed or validated here.
    """
    toughen_prompt = f"""
You are an expert biology reasoning engine.

Your task is to:
1. Take the extracted components of a biology problem.
2. Rewrite and enhance them to make the question **1000 times more challenging** — at an advanced **graduate-level**, requiring **multi-domain conceptual and quantitative reasoning**.
3. Ensure the final question is:
   - Fully presented using appropriate **scientific notation**, symbols, and terminology where applicable (e.g., molecular concentrations, rate constants, physiological metrics).
   - Requires **no intermediate biological process breakdowns** — ask only **one final, precise, quantitative result**.
   - Demands **deep understanding of multiple biology subfields** (e.g., molecular biology + physiology, genetics + bioinformatics, cell signaling + systems biology).
   - Includes **realistic, explicitly defined numerical parameters** — no vague assumptions.
   - Results in **one unique numeric final answer with correct biological units**, clearly expressed.
   - Involves **quantitative formulation**, **systems-level thinking**, and **mathematical or computational modeling** where relevant.
   - Entirely self-contained — all constants, assumptions, and conditions must be specified.

---

### OUTPUT FORMAT (JSON only):

Return a fully rewritten **toughened problem** in the following strict JSON format:

{{
  "question": "<Complete, rigorous biology problem with all quantitative and conceptual depth>",
  "topic_tags": ["<relevant biology domains>"]
}}

---

### INPUT (Original Extracted JSON):
{extracted_json}

---
Only return the final JSON — no explanation, no commentary.
"""
    toughened_reply = call_model_gemini(toughen_prompt, MODEL_gemini)
    return toughened_reply


In [8]:
# === Step 4: Get Feedback and Improve Biology Question ===
def _format_question_for_prompt(question_json) -> str:
    """Render a question payload as readable JSON text for a prompt.

    A string that already holds a JSON object/array is parsed and re-indented;
    any other string is returned unchanged. Non-string payloads are serialized
    directly. Non-ASCII characters (µ, ₂, →, ...) are kept as-is rather than
    being turned into \\uXXXX escapes.
    """
    if isinstance(question_json, str):
        try:
            parsed = json.loads(question_json)
        except ValueError:
            # Not valid JSON (e.g. wrapped in Markdown fences): show it verbatim.
            return question_json
        if not isinstance(parsed, (dict, list)):
            # A bare scalar such as "42" is not a question object; keep the text.
            return question_json
        question_json = parsed
    return json.dumps(question_json, indent=2, ensure_ascii=False)


def get_biology_feedback(question_json: str) -> str:
    """Ask the o3 model for a critical review of a biology question.

    The question used to be passed through `json.dumps` even though it arrives
    as a JSON *string*, so the reviewer received one quoted literal full of
    `\\"`, `\\n` and `\\uXXXX` escapes. It is now embedded as readable JSON.

    Args:
        question_json: The question to review, normally the JSON text produced
            by the previous pipeline step. A dict or list is also accepted.

    Returns:
        The model's feedback text as returned by call_model_openai.
    """
    question_block = _format_question_for_prompt(question_json)
    prompt = f"""
You are a critical biology evaluator AI trained to review complex biology problems and solutions with a rigorous expert lens.

Analyze the following JSON which contains a biology question.

### Your tasks:
1. **Identify and point out ambiguities** in the question:
   - Are biological entities such as species, cell types, tissues, molecules, or genes undefined or vague?
   - Are experimental conditions (e.g., temperature, pH, growth media, environmental factors) clearly specified?
   - Are phrases like “the sample,” “the expression,” or “the signal” ill-defined?

2. **Check for consistency**:
   - Are all symbols, biological terms, and abbreviations clearly defined (e.g., ATP, mRNA, ΔΨ, R₀)?
   - Are processes described using biologically valid language (e.g., transcription, inhibition, signal transduction)?
   - Are there contradictory or biologically implausible statements?

3. **Assess logical flow and biological reasoning**:
   - Does the question reflect correct biological mechanisms (e.g., enzyme kinetics, genetic regulation, population dynamics)?
   - Are reasoning steps biologically valid and not overly simplified?
   - Are the described processes feasible under the stated or implied conditions?

4. **Check assumptions and constraints**:
   - Are biological assumptions (e.g., homeostasis, equilibrium, normal physiology) explicitly stated?
   - Are there hidden or unrealistic assumptions (e.g., instantaneous gene expression, 100% enzyme efficiency)?
   - Are experimental constraints (e.g., species-specific differences, physiological limits) clearly acknowledged?

5. **Ensure No Intermediate Sub-Questions**:
   - The question should lead directly to a **single final biological output** (e.g., expression level, growth rate, energy yield) with appropriate units if applicable.
   - Avoid asking for stepwise or scaffolded answers like:
     - “First identify the gene, then...”
     - “Calculate the transcription rate, and then...”
     - “Determine the protein levels and then...”
   - All intermediate reasoning must be implicit. Only the **final biological conclusion or value** should be asked.

### Input JSON:
{question_block}

### Output:
Return a **well-structured, technical paragraph of feedback**, identifying flaws, gaps, or improvements as per the tasks above. Be extremely specific, avoid generic praise, and give examples when possible. Focus on clarity, biological plausibility, and strong question design.
""".strip()

    return call_model_openai(prompt, MODEL_o3)


In [9]:
def improve_biology_question_based_on_feedback(feedback: str, original_json: str) -> str:
    """Ask the GPT model to revise a question so it addresses reviewer feedback.

    Args:
        feedback: Reviewer feedback, embedded in the prompt between triple
            double-quotes.
        original_json: The question being revised, embedded verbatim under
            "### Original question:".

    Returns:
        The model's reply as returned by call_model_openai. The prompt asks
        for a JSON object with "question" and "topic_tags"; the reply is not
        parsed or validated here.
    """
    revision_prompt = f"""
You previously generated a high-level biology problem, but it has been reviewed and received the following detailed feedback:
### Original question:
{original_json}

### Feedback:
\"\"\"{feedback}\"\"\"

Your new task:
- Revise and improve the original biology problem so that it fully addresses **every point** in the feedback.
- The resulting question must:
  1. Be **fully self-contained**, clearly describing the biological context, organisms (if any), molecular players (e.g., genes, proteins), experimental setup, environmental conditions, and units.
  2. Be **biologically rigorous**, integrating at least two subdomains (e.g., molecular biology + physiology, or genetics + evolution, etc.).
  3. Be **challenging and solvable**, requiring **multi-step biological reasoning**, **data interpretation**, or **experimental design logic**.
  4. Include **scientific figures or data tables in LaTeX notation**, if relevant (e.g., DNA sequences, gel bands, rate curves, concentration tables).
  5. Require **only one** explicit **final outcome** (e.g., a biological conclusion, calculated value, classification, or prediction).
  6. Do **not** ask for intermediate interpretations — just one clear, final result.
  7. Be answerable solely based on the information provided, such that the answer is **unique**, **biologically consistent**, and **logically deterministic**.

### Important:
- Do NOT include any commentary or markdown formatting.
- Output must be strictly JSON and fully parsable.
- The regenerated problem must **fully reflect and satisfy the feedback** above.

Strictly return ONLY a valid, properly formatted JSON object with the following format:
{{
  "question": "<Improved and fully self-contained biology problem in strict LaTeX notation>",
  "topic_tags": ["<tag1>", "<tag2>", ...]
}}
Produce ONLY the JSON as specified.
""".strip()

    revised_reply = call_model_openai(revision_prompt, MODEL_gpt)
    return revised_reply


In [10]:
# === Step 5: Get Final Answer from 2 Models for Biology ===
def get_final_biology_answer(question_json: str) -> dict:
    """Solve a question with o3 and Gemini concurrently and return both answers.

    Both models receive the same answer-only prompt. The two requests run in
    a thread pool; an exception raised by either request propagates to the
    caller when its result is collected.

    Args:
        question_json: The question text, embedded verbatim in the prompt
            under "### BIOLOGY PROBLEM INPUT:".

    Returns:
        A dict with the o3 reply under "gpt_o3" and the Gemini reply under
        "gemini2.5pro".
    """
    solve_prompt = f"""
You are a high-accuracy biology expert.

Solve the following biology question and return **only the final boxed answer** with appropriate scientific units, terms, or identifiers.

---

### STRICT OUTPUT RULES:
- Do NOT include any reasoning, explanation, or steps.
- Do NOT return anything other than the final answer.
- Format: Return a single **boxed answer string**, e.g., `"Rate = 2.3 µmol/min"`, `"ATP Yield = 36"`, `"Kd = 1.2 × 10⁻⁶ M"` or `"Answer = Homo sapiens"`.
- Use precise biological terms and correct units (e.g., `µmol`, `min⁻¹`, `ng/mL`, `kDa`, `M`, `bp`, `cells/mL`).
- Round values to **at least 3 significant digits** unless biological context implies a specific format (e.g., gene names, species).
- No markdown, no LaTeX, no JSON, no reasoning. Just a clean final numeric or categorical biological result in a box.

---

### BIOLOGY PROBLEM INPUT:
{question_json}

---

### FINAL OUTPUT:
Return only the final answer in this format:
`"Boxed Final Answer"`
"""
    with ThreadPoolExecutor() as pool:
        # Submit both requests before waiting on either so they overlap.
        o3_future = pool.submit(call_model_openai, solve_prompt, MODEL_o3)
        gemini_future = pool.submit(call_model_gemini, solve_prompt, MODEL_gemini)

        o3_answer = o3_future.result()
        gemini_answer = gemini_future.result()

    answers = {}
    answers["gpt_o3"] = o3_answer
    answers["gemini2.5pro"] = gemini_answer
    return answers


In [11]:
def _parse_json_reply(raw: str):
    """Parse a model reply as JSON, tolerating Markdown fences or stray prose.

    The reply is first parsed as-is. If that fails, the span from the first
    "{" to the last "}" is parsed instead, which recovers replies such as
    a fenced ```json block. If no object can be recovered, the original
    json.JSONDecodeError is raised.
    """
    text = raw.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError as original_error:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        try:
            return json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            # Report the failure on the full reply, not on the trimmed slice.
            raise original_error


def compare_biology_answers(ans1: str, ans2: str) -> dict:
    """Ask the GPT model whether two final answers agree.

    The model is prompted with a weighted rubric and asked to reply with a
    JSON object. Replies wrapped in Markdown code fences or surrounded by
    extra text are accepted as long as they contain one JSON object.

    Args:
        ans1: First answer, embedded verbatim in the prompt.
        ans2: Second answer, embedded verbatim in the prompt.

    Returns:
        The parsed reply. The prompt requests the keys "similarity_score",
        "decision" ("similar" or "different") and "comment"; the keys are not
        validated here.

    Raises:
        json.JSONDecodeError: If no JSON value can be recovered from the reply.
    """
    prompt = f"""
You are a biology answer verification engine.

Your task is to compare two final answers from biology models and determine whether they represent the **same biological conclusion or outcome**. Use biological reasoning to evaluate terminology, quantitative agreement, structural/functional equivalence, and semantic consistency in biological processes.

---

### Evaluation Criteria and Scoring

Each of the following aspects contributes to a **similarity score** between 0.0 and 1.0. The total score is calculated by adding weighted partial scores:

| Criterion                          | Weight | Description |
|-----------------------------------|--------|-------------|
| 1. Terminology Equivalence        | 0.20   | Full score if terms used refer to the same biological entity/process (e.g., ATP synthase vs F₀F₁-ATPase). Zero if unrelated. |
| 2. Quantitative Agreement         | 0.25   | Compare numerical values or ratios. Full if within 1%, partial for 1–2%, zero if >2%. |
| 3. Functional Similarity          | 0.20   | Score full if the biological function or consequence described is the same (e.g., inhibition of transcription vs RNA polymerase blockade). |
| 4. Structural/Process Consistency | 0.20   | Score full if the described structures/processes support the same pathway or outcome. |
| 5. Format/Rounding Consistency    | 0.15   | Full score if values differ slightly in notation or rounding but convey the same biological meaning. |

---

### Final Decision Rule

- If total score **≥ 0.80**, return: `"decision": "similar"`
- If total score **< 0.80**, return: `"decision": "different"`

---

### Example:

**Answer 1:** `ATP is produced via oxidative phosphorylation in mitochondria`  
**Answer 2:** `The mitochondria generate ATP using the electron transport chain and chemiosmosis`

Evaluation:

- Terminology: oxidative phosphorylation ≈ ETC + chemiosmosis → 0.20  
- Quantitative: n/a → assume aligned → 0.25  
- Functional: ATP production described identically → 0.20  
- Structural/Process: mitochondria, ETC mentioned → 0.20  
- Format: natural language phrasing difference → 0.15

**Total score = 1.00 → "similar"**

---

### Your Task

Compare the following two answers and return a **valid JSON object** in this format:

{{
  "similarity_score": <float between 0.0 and 1.0>,
  "decision": "similar" or "different",
  "comment": "<brief technical explanation>"
}}

Only return the JSON object. Do NOT include Markdown, LaTeX formatting, or extra commentary.

### Input:
Answer 1: {ans1}  
Answer 2: {ans2}
"""
    response = call_model_openai(prompt, MODEL_gpt)
    return _parse_json_reply(response)


In [12]:
# === Main Pipeline Loop ===
def pipeline_loop():
    """Run up to MAX_CYCLES generate -> toughen -> review -> solve cycles.

    Each cycle extracts the components of the current seed question, rewrites
    them into a harder question, revises that question from reviewer feedback,
    and has o3 and Gemini answer it. The loop stops as soon as the two models
    disagree; otherwise the revised question becomes the next cycle's seed.

    Returns:
        A tuple `(question, cycle_logs)`. `question` is the revised question
        from the cycle where the models disagreed, or from the last cycle if
        they never did. `cycle_logs` has one dict per executed cycle holding
        every intermediate artefact.
    """
    seed_json = None  # Initialize seed_json
    improved_question = None  # Initialize improved_question
    cycle_logs = []

    for cycle in range(MAX_CYCLES):
        print(f"\n [Cycle {cycle + 1}]")

        # Step 1: a fresh seed is generated on the first cycle only; later
        # cycles reuse the previous cycle's improved question as their seed.
        if seed_json is None:
            seed_json = generate_biology_seed_question()

        # Step 2
        extracted = extract_parts_from_biology_question(seed_json)

        # Step 3
        tougher_question = rewrite_parts_to_make_tougher_biology(extracted)

        # Step 4
        feedback = get_biology_feedback(tougher_question)
        improved_question = improve_biology_question_based_on_feedback(feedback, tougher_question)

        # Step 5
        answer_dict = get_final_biology_answer(improved_question)
        ans_o3 = answer_dict["gpt_o3"]
        ans_gemini = answer_dict["gemini2.5pro"]

        # Step 6
        similarity_result = compare_biology_answers(ans_gemini, ans_o3)

        # The comparator normally returns a dict; a bare decision value is
        # also tolerated.
        if isinstance(similarity_result, dict):
            decision = similarity_result.get("decision", "different")
            score = similarity_result.get("similarity_score", 0.0)
        else:
            decision = similarity_result
            score = None

        # The decision comes from a model, so compare it case- and
        # whitespace-insensitively ("Different", " different " ...).
        models_disagree = str(decision).strip().lower() == "different"

        print(f"\n Gemini Answer: {ans_gemini}\n o3 Answer: {ans_o3}\n Similarity Decision: {decision}  (Score: {score})")

        # Save this cycle's outputs
        cycle_logs.append({
            "cycle": cycle + 1,
            "seed_json": seed_json,
            "extracted": extracted,
            "tougher_question": tougher_question,
            "feedback": feedback,
            "improved_question": improved_question,
            "ans_gemini": ans_gemini,
            # Legacy key: it has always held the Gemini answer despite its
            # name. Kept so existing log readers keep working.
            "ans_gpt": ans_gemini,
            "ans_o3": ans_o3,
            "similarity_result": similarity_result,
        })

        if models_disagree:
            print("\n Final Refined Question (as models disagree):")
            print(improved_question)
            return improved_question, cycle_logs

        print(" Answers similar — reusing same question as new seed...\n")
        # Reuse the improved question as next seed
        seed_json = improved_question
        time.sleep(2)

    print("\nReturning last question.")
    print(improved_question)
    return improved_question, cycle_logs

In [13]:
# === Run ===
# Runs the pipeline several times and writes the final questions and the
# per-cycle logs to two JSON files in the working directory.
if __name__ == "__main__":
    all_results = []
    # Named `num_iterations` rather than `iter` so the builtin `iter` is not
    # shadowed for the rest of the kernel session.
    num_iterations = 10
    all_cycle_logs = []

    for i in range(num_iterations):
        print(f"\n=== Running pipeline_loop #{i+1}/{num_iterations} ===")
        try:
            final_question, cycle_logs = pipeline_loop()
            all_results.append(final_question)
            all_cycle_logs.append({
                "iteration": i + 1,
                "logs": cycle_logs
            })
        except Exception as e:
            # Best effort: one failed iteration is recorded and the run
            # continues, so both output lists stay aligned by iteration.
            print(f"[ERROR] Iteration {i+1} failed: {e}")
            all_results.append({"error": str(e)})
            all_cycle_logs.append({
                "iteration": i + 1,
                "logs": [],
                "error": str(e)
            })

    all_results_path = "Bio_all_results0.json"
    with open(all_results_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2)

    # Save all cycle logs
    all_cycle_logs_path = "Bio_cycle_logs_0.json"
    with open(all_cycle_logs_path, "w", encoding="utf-8") as f:
        json.dump(all_cycle_logs, f, indent=2)

    # Report the files actually written (the old message named a file that
    # this cell never creates).
    print(f"\n Completed all iterations and saved results to {all_results_path} and {all_cycle_logs_path}")


=== Running pipeline_loop #1/10 ===

 [Cycle 1]

 Gemini Answer: `[G]_ss, avg = 1720 nM`
 o3 Answer: [EGFP]ₛₛ = 1.72 × 10³ nM
 Similarity Decision: similar  (Score: 1.0)
 Answers similar — reusing same question as new seed...


 [Cycle 2]

 Gemini Answer: ```json
"CV²_pop = 0.1112"
```
 o3 Answer: [CV²_pop = 0.1112]
 Similarity Decision: similar  (Score: 1.0)
 Answers similar — reusing same question as new seed...


 [Cycle 3]

 Gemini Answer: ```
Answer = 0.9818
```
 o3 Answer: [1.0000]
 Similarity Decision: different  (Score: 0.7)

 Final Refined Question (as models disagree):
{
  "question": "A team engineers an **isogenic co-culture of two human ES cell lines** to study the quantitative contribution of EP300 haploinsufficiency to extrinsic noise in a synthetic bistable gene circuit. The model integrates molecular genetics, epigenetic regulation, cell physiology, and stochastic gene expression. The entire system is maintained at $37^\\circ$C, 5% CO$_2$, in standard high-glucose DME