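# In [None]:  (Jupyter cell marker left by the notebook export; kept as a comment so the file parses as Python)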
import subprocess
import time
import random
import re
from openai import OpenAI
import csv
import os

# === Configuration ===

# Lake project directory; `lake build` is executed with this as its working directory.
LEAN_PROJECT_DIR = r"D:\MSc Research Project\MathlibProject"
# Lean source file inside the project that every generated snippet overwrites.
LEAN_FILE_PATH = LEAN_PROJECT_DIR + "\\" + "MathlibProject.lean"
# Directory receiving the per-trial CSV logs and the trial summary.
# NOTE(review): the folder name says "200 trials" while MAX_TRIALS is 100 - confirm which is intended.
LOG_ROOT = r"D:\MSc Research Project" + "\\" + "Problem-3 Logs 200 trials with 20 attempts (Correct Prompt)"
# Number of counted trials, and the attempt budget each strategy gets per trial.
MAX_TRIALS, MAX_ATTEMPTS = 100, 20

# DeepSeek API setup (the OpenAI client is pointed at DeepSeek's base URL).
# The key is taken from the DEEPSEEK_API_KEY environment variable so that a
# real secret never has to live in the source file; the placeholder below is
# only the fallback used when the variable is not set.
client = OpenAI(
    api_key=os.environ.get(
        "DEEPSEEK_API_KEY",
        "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
    ),
    base_url="https://api.deepseek.com",
)

# Problem statement sent to the model. It is only ever passed to str.format as
# a *value* (never used as a template itself), so doubled braces would reach
# the model literally as "{{5}}"; single braces are correct here. The raw
# string keeps the LaTeX backslashes literal without invalid escape sequences.
problem = (
    r"If $3+\sqrt{5}$ is a root of the equation \[x^2 - 3x + b = 0,\] compute $b.$"
)

# === Prompts ===

# All templates below are rendered with str.format: literal braces are doubled
# ("{{answer}}" renders as "{answer}"), and the backslash of \boxed is escaped
# because a bare "\b" inside a normal string literal is the backspace character.

# First-attempt prompt. Placeholder: {problem}.
INITIAL_PROMPT = (
    "Given the following math word problem, provide a **concise and brief** solution: include only the minimal necessary explanation and the final answer.\n"
    "Return the answer in the format: \\boxed{{answer}}.\n"
    "Problem: {problem}"
)

# Retry prompt showing the previous answer and the verifier's error.
# Placeholders: {problem}, {previous_answer}, {error}.
RETRY_SOLVE_ERROR_PROMPT = (
    "Given the following math problem:\n"
    "{problem}\n\n"
    "You previously solved it like this:\n"
    "{previous_answer}\n\n"
    "But it had the following error:\n"
    "{error}\n\n"
    "Retry from scratch\n"
    "Provide a **concise and brief** solution: include only the minimal necessary explanation and the final answer.\n"
    "Return the answer in the format: \\boxed{{answer}}.\n"
)

# Retry prompt that only states the previous answer was incorrect.
# Placeholders: {problem}, {previous_answer}.
BINARY_CORRECT_PROMPT = (
    "Given the following math problem:\n"
    "{problem}\n\n"
    "You previously solved it like this:\n"
    "{previous_answer}\n\n"
    "This answer was incorrect. Retry the problem from scratch.\n"
    "Provide a **concise and brief** solution: include only the minimal necessary explanation and the final answer.\n"
    "Return the answer in the format: \\boxed{{answer}}.\n"
)

# Retry prompt showing only the verifier's error, not the previous answer.
# Placeholders: {problem}, {error}.
RETRY_SOLVE_WITHOUT_PREVIOUS_ANSWER_PROMPT = (
    "Given the following math problem:\n"
    "{problem}\n\n"
    "Your previous answer had the following error:\n"
    "{error}\n\n"
    "Provide a **concise and brief** solution: include only the minimal necessary explanation and the final answer.\n"
    "Return the answer in the format: \\boxed{{answer}}.\n"
)

# Few-shot prompt that turns a natural-language solution into Lean verification
# code. Rendered with str.format; placeholder: {initial_solution}. Literal
# braces are doubled. The literal is a raw string so the many LaTeX backslashes
# (\(, \[, \sqrt, \cdot, \boxed) are taken verbatim instead of relying on
# Python passing invalid escape sequences through unchanged.
LEAN_CODE_GENERATION_PROMPT = r"""
You are given a natural language solution to a math problem. 
Extract the valid values of 'b' (final boxed answer), then output Lean code using the template exactly. 
Replace the value of 'b' with floating point number.

Template:

def givenRoot : Float := 3.0 + 2.2360679775  -- 3 + √5
def input_b : Float := <INPUT_B_VALUE_IN_FLOAT>  -- Float value of b

def evalPoly (x b : Float) : Float := x*x - 3.0*x + b
def approxZero (x : Float) (eps : Float := 0.01) : Bool := Float.abs x < eps

def isRoot : Bool := approxZero (evalPoly givenRoot input_b)

#eval ("Is 3+sqrt5 a root of polynomial?", isRoot)  

Use following problems as examples to generate the Lean code with given output format.

Example 1:

Input:

**Required steps/formula:**
- For a quadratic equation \(x^2 + px + q = 0\), if \(r\) is a root, then the conjugate \(r'\) (if irrational) is also a root.
- Sum of roots: \(r + r' = -p\)
- Product of roots: \(r \cdot r' = q\)
- Here, \(p = -3\), so sum of roots = \(3\).
- Given root: \(3 + \sqrt{{5}}\), so conjugate root: \(3 - \sqrt{{5}}\).
- Compute \(b = (3 + \sqrt{{5}})(3 - \sqrt{{5}})\).

**Explanation:**
Since the coefficients are rational and \(3+\sqrt{{5}}\) is irrational, its conjugate \(3-\sqrt{{5}}\) must also be a root. The product of the roots gives \(b\).

**Calculation:**
\[
(3 + \sqrt{{5}})(3 - \sqrt{{5}}) = 3^2 - (\sqrt{{5}})^2 = 9 - 5 = 4.
\]

**Final answer:**
\[
\boxed{{4}}
\]

Output:

def givenRoot : Float := 3.0 + 2.2360679775  -- 3 + √5
def input_b : Float := 4.0  -- Float value of b

def evalPoly (x b : Float) : Float := x*x - 3.0*x + b
def approxZero (x : Float) (eps : Float := 0.01) : Bool := Float.abs x < eps

def isRoot : Bool := approxZero (evalPoly givenRoot input_b)

#eval ("Is 3+sqrt5 a root of polynomial?", isRoot) 
Example 2:

Input:

**Step 1:** Since \(3 + \sqrt{{5}}\) is a root and the coefficients are real, the conjugate \(3 - \sqrt{{5}}\) must also be a root.

**Step 2:** For a quadratic equation \(x^2 - 3x + b = 0\), the sum of the roots is \(3\) and the product is \(b\).

**Step 3:** Sum of roots:
\[
(3 + \sqrt{{5}}) + (3 - \sqrt{{5}}) = 6.
\]
But the coefficient of \(x\) is \(-3\), so the sum should be \(3\). There is a discrepancy.

**Step 4:** Re-examine the equation: it is \(x^2 - 3x + b = 0\). The sum of the roots is indeed \(3\). However, the sum we computed is \(6\), which is not \(3\). This indicates that the conjugate root is not necessarily present because the coefficients are integers? Actually, \(b\) is to be computed, and it might not be integer. But the problem states "compute \(b\)", and the equation has integer coefficients for \(x^2\) and \(x\).

Wait: The equation is \(x^2 - 3x + b = 0\). The coefficient of \(x^2\) is 1, and of \(x\) is \(-3\), both integers. Therefore, if \(3 + \sqrt{{5}}\) is a root, then its conjugate \(3 - \sqrt{{5}}\) must also be a root, because the irrational part \(\sqrt{{5}}\) must cancel out.

But then the sum would be \(6\), but it should be \(3\). This is a contradiction.

**Step 5:** Perhaps the conjugate is not required. Alternatively, maybe the equation has only one root given, and we can substitute to find \(b\).

Substitute \(x = 3 + \sqrt{{5}}\) into the equation:
\[
(3 + \sqrt{{5}})^2 - 3(3 + \sqrt{{5}}) + b = 0.
\]
Compute:
\[
(9 + 6\sqrt{{5}} + 5) - 9 - 3\sqrt{{5}} + b = 0,
\]
\[
14 + 6\sqrt{{5}} - 9 - 3\sqrt{{5}} + b = 0,
\]
\[
5 + 3\sqrt{{5}} + b = 0.
\]
So,
\[
b = -5 - 3\sqrt{{5}}.
\]

This is a valid value for \(b\), and it is not an integer. The conjugate root is not necessarily present because \(b\) is not constrained to be an integer? The problem does not state that \(b\) is integer, only to compute it.

Therefore, we can proceed with this.

**Final Answer:**
\[
\boxed{{-5 - 3\sqrt{{5}}}}
\]

Output:
def givenRoot : Float := 3.0 + 2.2360679775 
def input_b : Float := -5.0 - 3*2.2360679775  -- Float value of b= -5 - 3√5

def evalPoly (x b : Float) : Float := x*x - 3.0*x + b
def approxZero (x : Float) (eps : Float := 0.01) : Bool := Float.abs x < eps

def isRoot : Bool := approxZero (evalPoly givenRoot input_b)

#eval ("Is 3+sqrt5 a root of polynomial?", isRoot)  

Now do this for the following input:
{initial_solution}
"""

# Prompt asking the model to summarise a Lean verification result in words.
# Rendered with str.format; placeholders: {lean_code}, {lean_output}.
# The worked example mirrors the code template used for Lean generation
# (same `givenRoot` name, and `isRoot` is defined before it is evaluated) so
# that the example code is consistent with the output shown for it.
LEAN_CODE_INTERPRETATION_PROMPT = """
    Given the given Lean code input, and its output, interpret the results and provide a concise explanation of the error.
    Do not include any additional information or context, just the interpretation of the Lean output.
    
    Use the following example as a guide for your response:

    Input:

    Lean code:
    def givenRoot : Float := 3.0 + 2.2360679775  --- 3 + √5
    def input_b : Float := 4.0 --- Float value of b=4

    def evalPoly (x b : Float) : Float := x*x - 3.0*x + b
    def approxZero (x : Float) (eps : Float := 0.01) : Bool := Float.abs x < eps

    def isRoot : Bool := approxZero (evalPoly givenRoot input_b)

    #eval ("Is 3+sqrt5 a root of polynomial?", isRoot) 

    Lean output:

    #eval result: ("Is 3+sqrt5 a root of polynomial?", false)

    Output:
    3+sqrt{{5}} is not a root of the polynomial x^2 - 3x + b = 0 for the given value of b.

    Now this for the following input:

    Lean code:
    {lean_code}

    Lean output:
    {lean_output}
    """

# === Helper functions ===

def call_deepseek(user_prompt: str, system_prompt: str) -> str:
    """Send one system + user message pair to the chat model and return its reply.

    Args:
        user_prompt: Content of the user message.
        system_prompt: Content of the system message.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the reply carries no text content.
    """
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,
        max_tokens=1024,
        stream=False,
    )
    # The message content is optional in the chat-completions schema; fall
    # back to "" so callers simply see a reply without a boxed answer instead
    # of crashing with AttributeError on None.
    content = response.choices[0].message.content
    return (content or "").strip()

def write_lean_code(code: str):
    """Overwrite the file at LEAN_FILE_PATH with ``code``, encoded as UTF-8."""
    with open(LEAN_FILE_PATH, mode="w", encoding="utf-8") as lean_file:
        lean_file.write(code)

def run_lean_code() -> tuple[bool, str]:
    """Run ``lake build`` inside LEAN_PROJECT_DIR and report the outcome.

    Returns:
        A ``(success, output)`` pair. ``success`` is True only when the build
        exits with status 0 and its stderr does not contain the word "error".
        ``output`` is the captured stdout and stderr under labelled headers.
    """
    result = subprocess.run(
        ["lake", "build"],
        cwd=LEAN_PROJECT_DIR,
        capture_output=True,
        text=True,
        # Decode explicitly instead of with the locale codec, and never raise
        # on undecodable bytes. NOTE(review): assumes lake emits UTF-8 - confirm.
        encoding="utf-8",
        errors="replace",
    )
    stdout, stderr = result.stdout, result.stderr

    # The exit status is the authoritative failure signal; the stderr scan is
    # kept so that everything previously reported as a failure still is.
    success = result.returncode == 0 and "error" not in stderr.lower()

    output = f"=== STDOUT ===\n{stdout}\n\n=== STDERR ===\n{stderr}"
    return success, output

def clean_lean_code(raw_code: str) -> str:
    """Extract Lean code from a model reply.

    If the reply contains a closed Markdown code fence, the contents of the
    first fenced block are returned, without the opening line's language tag
    (for example ``lean``) and without any text before or after the fence.
    Otherwise the reply itself is returned. Surrounding whitespace is removed
    in both cases.

    Args:
        raw_code: Raw text returned by the model.

    Returns:
        The extracted code, stripped of leading and trailing whitespace.
    """
    # Optional "<tag>\n" right after the opening fence is the info string and
    # must not end up in the Lean file; the lazy group stops at the first
    # closing fence.
    fenced = re.search(
        r"```(?:[\w+-]*[ \t]*\r?\n)?(.*?)```",
        raw_code,
        flags=re.DOTALL,
    )
    code = fenced.group(1) if fenced else raw_code
    return code.strip()



def run_strategy(trial_dir, strategy_name, prompt_template, problem, previous_answer=None, error=None):
    """Run one feedback strategy for up to MAX_ATTEMPTS attempts, logging each to CSV.

    Every attempt asks the model for a solution, converts it to Lean code,
    builds it, and appends a row to ``<trial_dir>/<strategy_name>.csv``.

    Args:
        trial_dir: Directory in which the strategy's CSV log is created.
        strategy_name: One of "no_feedback", "binary", "error_prev_sol",
            "error_no_prev_sol"; selects which placeholders the template gets.
        prompt_template: ``str.format`` template for the solving prompt.
        problem: Problem statement substituted into the template.
        previous_answer: Answer shown to the model on the first attempt of
            strategies that include the previous answer.
        error: Error description shown on the first attempt of strategies
            that include the verifier's error.

    Returns:
        ``(attempt, "Solved")`` for the first verified attempt, otherwise
        ``(MAX_ATTEMPTS, "Not Solved")``.

    Raises:
        ValueError: If ``strategy_name`` is not one of the known strategies.
    """
    known_strategies = ("no_feedback", "binary", "error_prev_sol", "error_no_prev_sol")
    if strategy_name not in known_strategies:
        # Previously an unknown name left `prompt` unbound and failed later
        # with a NameError inside the loop.
        raise ValueError(f"Unknown strategy: {strategy_name!r}")
    uses_previous_answer = strategy_name in ("binary", "error_prev_sol")
    uses_error = strategy_name in ("error_prev_sol", "error_no_prev_sol")

    log_path = os.path.join(trial_dir, f"{strategy_name}.csv")
    with open(log_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["attempt", "prompt", "response", "lean_code", "lean_output", "Correct/Incorrect"])

        current_previous_answer = previous_answer
        current_error = error

        for attempt in range(1, MAX_ATTEMPTS + 1):
            format_args = {"problem": problem}
            if uses_previous_answer:
                format_args["previous_answer"] = current_previous_answer
            if uses_error:
                format_args["error"] = current_error
            prompt = prompt_template.format(**format_args)

            print(f"Attempt {attempt} with strategy {strategy_name}")

            response = call_deepseek(prompt, system_prompt="You are a concise math problem solver.")
            if not re.search(r'\\boxed\{([^}]*)\}', response):
                # No boxed answer to verify: log the attempt (status 0, as
                # before) and retry with the feedback state unchanged.
                writer.writerow([attempt, prompt, response, "", "", 0])
                continue

            # Generate Lean code
            lean_code = call_deepseek(
                LEAN_CODE_GENERATION_PROMPT.format(initial_solution=response),
                system_prompt="You are a Lean code generator and verifier for math solutions."
            )
            lean_code_clean = clean_lean_code(lean_code)
            write_lean_code(lean_code_clean)

            success, lean_output = run_lean_code()
            verdict_text = lean_output.lower()

            if "false" not in verdict_text:
                # Absence of "false" alone is not proof of correctness: a
                # failed build or empty output has no "false" either. Require
                # a successful build that actually printed "true".
                if success and "true" in verdict_text:
                    writer.writerow([attempt, prompt, response, lean_code_clean, lean_output, "Correct"])
                    return (attempt, "Solved")
                # Verification was inconclusive, so there is no trustworthy
                # feedback to pass on; spend the attempt and keep the state.
                writer.writerow([attempt, prompt, response, lean_code_clean, lean_output, "Unverified"])
                continue

            # Log before any further API call so the row survives a failure there.
            writer.writerow([attempt, prompt, response, lean_code_clean, lean_output, "Incorrect"])
            current_previous_answer = response

            # Only strategies whose template has an {error} placeholder need
            # the interpretation, and only if another attempt will use it.
            if uses_error and attempt < MAX_ATTEMPTS:
                current_error = call_deepseek(
                    LEAN_CODE_INTERPRETATION_PROMPT.format(
                        lean_code=lean_code_clean,
                        lean_output=lean_output
                    ),
                    system_prompt="You are a Lean code interpreter and verifier for math solutions."
                )

    return (MAX_ATTEMPTS, "Not Solved")


# === Main Loop ===
def feedback_loop():
    """Run the experiment: MAX_TRIALS trials comparing four feedback strategies.

    A trial only counts when the model's first answer fails Lean verification.
    For each counted trial the four strategies are run via ``run_strategy``
    with the same initial answer and error description, and one row with the
    attempts each strategy needed (or "Did not solve") is appended to
    ``trial_summaries.csv`` under LOG_ROOT. Totals are printed at the end.
    """
    os.makedirs(LOG_ROOT, exist_ok=True)
    summary_path = os.path.join(LOG_ROOT, "trial_summaries.csv")

    trial = 0
    first_try_correct = 0
    incomplete_answer = 0
    unverified_answer = 0

    with open(summary_path, "w", newline="", encoding="utf-8") as f_sum:
        summary_writer = csv.writer(f_sum)
        summary_writer.writerow(["Trial No.", "No Feedback Attempts", "Binary Attempts", "Error With Solution Attempts", "Error Without Solution Attempts"])

        # NOTE(review): `trial` only advances when the first answer is judged
        # incorrect, so the number of API calls made by this loop is unbounded
        # if the model keeps answering correctly or without a boxed answer.
        while trial < MAX_TRIALS:

            print(f"\n=== Starting Trial {trial + 1} ===")
            # Step 1: Initial solution
            initial_prompt = INITIAL_PROMPT.format(problem=problem)
            initial_solution = call_deepseek(initial_prompt, system_prompt="You are a concise math problem solver.")

            if not re.search(r'\\boxed\{([^}]*)\}', initial_solution):
                incomplete_answer += 1
                print("Incomplete answer, retrying trial.")
                continue

            # Step 2: Generate Lean code and run the Lean build
            lean_code = call_deepseek(
                user_prompt=LEAN_CODE_GENERATION_PROMPT.format(initial_solution=initial_solution),
                system_prompt="You are a Lean code generator and verifier for math solutions."
            )
            clean_code = clean_lean_code(lean_code)
            write_lean_code(clean_code)

            success, lean_output = run_lean_code()
            verdict_text = lean_output.lower()

            # As before, any output without "false" skips the trial; it is only
            # *counted* as a correct first try when the build succeeded and
            # printed "true", so failed builds no longer inflate that counter.
            if "false" not in verdict_text:
                if success and "true" in verdict_text:
                    first_try_correct += 1
                    print("First try correct, skipping trial (not counted).")
                else:
                    unverified_answer += 1
                    print("Lean verification inconclusive, retrying trial.")
                continue

            print("First try incorrect, proceeding with feedback strategies.")

            trial += 1

            trial_dir = os.path.join(LOG_ROOT, f"trial_{trial}")
            os.makedirs(trial_dir, exist_ok=True)

            error_description = call_deepseek(
                user_prompt=LEAN_CODE_INTERPRETATION_PROMPT.format(
                    lean_code=clean_code,
                    lean_output=lean_output
                ),
                system_prompt="You are a Lean code interpreter and verifier for math solutions."
            )

            # Step 3: Run feedback arms, in summary-column order.
            # Each entry: (console label, strategy name, template, feedback kwargs).
            arms = (
                ("No Feedback", "no_feedback", INITIAL_PROMPT, {}),
                ("Binary Correctness", "binary", BINARY_CORRECT_PROMPT,
                 {"previous_answer": initial_solution}),
                ("Error Feedback with Previous Solution", "error_prev_sol", RETRY_SOLVE_ERROR_PROMPT,
                 {"previous_answer": initial_solution, "error": error_description}),
                ("Error Feedback without Previous Solution", "error_no_prev_sol",
                 RETRY_SOLVE_WITHOUT_PREVIOUS_ANSWER_PROMPT,
                 {"error": error_description}),
            )
            attempts_by_arm = []
            for label, strategy_name, template, feedback in arms:
                print(f"Running {label} strategy for Trial {trial}")
                attempts, status = run_strategy(trial_dir, strategy_name, template, problem=problem, **feedback)
                attempts_by_arm.append("Did not solve" if status == "Not Solved" else attempts)

            # Step 4: Log trial summary
            attempts_no_feedback, attempts_binary, attempts_error_prev_sol, attempts_error_no_prev_sol = attempts_by_arm
            print(f"Trial {trial} Summary: No Feedback Attempts: {attempts_no_feedback}, Binary Attempts: {attempts_binary}, Error with Prev Attempts: {attempts_error_prev_sol}, Error without Prev Attempts: {attempts_error_no_prev_sol}")
            summary_writer.writerow([trial, *attempts_by_arm])
            # Flush per trial so completed rows survive an interrupted run.
            f_sum.flush()

    print("\n=== Experiment Completed ===")
    print(f"Total Trials Conducted: {trial}")
    print(f"First Try Correct (not counted in trials): {first_try_correct}")
    print(f"Incomplete Answers (not counted in trials): {incomplete_answer}")
    print(f"Unverified Answers (not counted in trials): {unverified_answer}")


# Start the experiment only when this file is executed as a script.
if __name__ == "__main__":
    feedback_loop()