## Given Code

In [2]:
# %% Minimal setup
# If needed (uncomment in a notebook):
# !pip install requests python-dotenv

import os, json, textwrap, re, time
import requests
import pandas as pd

# Endpoint configuration: each value is read from the environment and falls
# back to the literal default when the variable is unset.
API_KEY = os.environ.get("OPENAI_API_KEY", "cse476")
API_BASE = os.environ.get("API_BASE", "http://10.4.58.53:41701/v1")
MODEL = os.environ.get("MODEL_NAME", "bens_model")

def call_model_chat_completions(prompt: str,
                                system: str = "You are a helpful assistant. Reply with only the final answer—no explanation.",
                                model: str = MODEL,
                                temperature: float = 0.0,
                                timeout: int = 60,
                                max_tokens: int = 128) -> dict:
    """
    Call an OpenAI-style /v1/chat/completions endpoint.

    Args:
        prompt: Content of the user message.
        system: Content of the system message.
        model: Model name placed in the request payload.
        temperature: Sampling temperature placed in the request payload.
        timeout: Timeout (seconds) passed to `requests.post`.
        max_tokens: Completion token limit placed in the request payload.

    Returns:
        A dict of the form
        { 'ok': bool, 'text': str or None, 'raw': dict or None, 'status': int,
          'error': str or None, 'headers': dict }.
        'status' is the HTTP status code, or -1 when the request itself failed.
        This function does not raise for HTTP or transport errors; they are
        reported through 'ok' / 'error'.
    """
    url = f"{API_BASE}/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type":  "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user",   "content": prompt}
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        # Transport-level failure (connection error, timeout, ...): no HTTP status available.
        return {"ok": False, "text": None, "raw": None, "status": -1, "error": str(e), "headers": {}}

    status = resp.status_code
    hdrs = dict(resp.headers)

    if status != 200:
        # Best effort to surface the server's error payload: JSON if possible, raw text otherwise.
        try:
            err_text = resp.json()
        except ValueError:
            err_text = resp.text
        return {"ok": False, "text": None, "raw": None, "status": status, "error": str(err_text), "headers": hdrs}

    try:
        data = resp.json()
    except ValueError as e:
        # A 200 with a non-JSON body is reported as a failure instead of escaping as an exception.
        return {"ok": False, "text": None, "raw": None, "status": status,
                "error": f"invalid JSON in response body: {e}", "headers": hdrs}

    # Defensive extraction: a missing/empty `choices` list or a null `message`
    # yields an empty text rather than an IndexError / AttributeError.
    text = ""
    if isinstance(data, dict):
        choices = data.get("choices") or [{}]
        first_choice = choices[0] if isinstance(choices, list) else {}
        message = first_choice.get("message") if isinstance(first_choice, dict) else None
        if isinstance(message, dict):
            text = message.get("content", "")

    return {"ok": True, "text": text, "raw": data, "status": status, "error": None, "headers": hdrs}


## Prompts

In [None]:
# System prompt for the chain-of-thought pass. The answer-line format requested
# here ("Final answer: <answer>", no space before the colon) matches the marker
# that get_final_answer searches for and the format run_cot asks for.
SYSTEM_COT = """ You are a helpful reasoning assistant. 
Think step by step to solve the problem, but ONLY output the final answer.
When you are finished, make sure that you only reveal the final answer in a line that is in the following format: Final answer: <answer>"""

# Self-consistency reuses the chain-of-thought system prompt.
SYSTEM_SELF = SYSTEM_COT

# System prompt for the RAP step: the model must pick exactly one of three
# directives (CALCULATOR / THINK / FINAL) and reply with it.
SYSTEM_RAP = """ You are a general reasoning agent with access to a calculator tool.
You may choose exactly one of the following actions:
1)  CALCULATOR: <expression>
            - Use this when you need arithmetic to verify or refine the answer
            - Use only numbers, + - * / **, parentheses, and round(x, ndigits)
    
2)  THINK: <new answer>
            - Use this when you can refine the answer without arithmetic
    
3)  FINAL: <answer>
            - Use this when you are ready to give the final answer
            - Return ONE line with the directive and the value, no other text
"""

## Helper Functions

In [None]:
# similar to the parse_action function from mini_lab 5
# look at that lab for reference

def get_final_answer(text: str):
    """
    Extract the answer that follows the last 'Final answer:' marker in `text`.

    The marker is matched case-insensitively and tolerates whitespace before
    the colon (e.g. 'Final answer : 42'), so both spellings of the format are
    recognised. When the marker occurs several times, the text after the LAST
    occurrence is used, so earlier mentions inside the reasoning are ignored.

    Args:
        text: Raw model output; may be empty or None.

    Returns:
        The stripped text after the last marker; the whole stripped text when
        no marker is present; "" when `text` is empty or None.
    """
    if not text:
        return ""

    # Match on the original string (not a lowercased copy) so that match
    # offsets always index into `text` correctly.
    markers = list(re.finditer(r"final\s+answer\s*:", text, flags=re.IGNORECASE))
    if not markers:
        return text.strip()

    return text[markers[-1].end():].strip()


def make_first_prompt(question: str) -> str:
    """
    Build the user prompt for the first (chain-of-thought) pass.

    NOTE(review): the intended signature and wording for this helper were not
    specified; this version asks for step-by-step reasoning followed by a
    'Final answer: <answer>' line. Confirm against the intended design.

    Args:
        question: The problem statement to embed in the prompt.

    Returns:
        The prompt text to send as the user message.
    """
    return (
        f"Problem: {question}\n"
        "Think step by step and show your reasoning. "
        "Then on the last line of your answer write it exactly in the given format: "
        "Final answer: <answer>"
    )


## Function Implementation 

### Actual Agent

In [None]:
def run_agent(question):
    """
    Answer `question` with the agent pipeline and return the final answer.

    Only the chain-of-thought step is wired in at the moment; the refinement
    and voting steps are still to be added (see the TODOs below).

    Args:
        question: The problem statement to solve.

    Returns:
        The answer string produced by `run_cot`.
    """
    # first step: do CoT
    answer = run_cot(question)

    # TODO second step: do RAP (this refines the CoT answer)
    # TODO third step: do self consistency (small k; maybe 5-7)

    # return final answer
    return answer

### Chain of thought (CoT)

In [14]:
def run_cot(question: str, temperature: float = 0.0) -> str:
    """
    Run one chain-of-thought query for `question` and return the parsed answer.

    Args:
        question: The problem statement to send to the model.
        temperature: Sampling temperature forwarded to the model call.

    Returns:
        The text extracted by `get_final_answer` from the model's reply.

    Raises:
        RuntimeError: If the model call reports failure (`response["ok"]` is false).
    """
    user_prompt = f""" Problem: {question} 
Think step by step and show your reasoning. Then on the last line of your answer write it exactly in the given format: Final answer: <answer> """

    response = call_model_chat_completions(prompt=user_prompt, system=SYSTEM_COT, temperature=temperature)

    # Fail loudly: an exception object must never be handed back as if it were an answer.
    if not response["ok"]:
        raise RuntimeError(f"error : {response['error']}")

    # `text` may be None in the response dict, so normalise before parsing.
    got = (response["text"] or "").strip()
    return get_final_answer(got)

### Self Consistency (CoT)

In [None]:
#def run_self_consistency():
    # just loop the def run_cot function

### RAP

In [None]:
 def run_rap():
     """
     Placeholder for the RAP refinement step.

     Raises:
         NotImplementedError: Always; the step has not been implemented yet.
     """
     # TODO: implement the RAP refinement loop.
     raise NotImplementedError("run_rap is not implemented yet")

## Data Loading and Testing

In [6]:
# Read the development set from its JSON file into a DataFrame.
df_dev = pd.read_json(path_or_buf="cse476_final_project_dev_data.json")

In [7]:
# Preview the first five rows of the dev set.
df_dev.head(n=5)

Unnamed: 0,input,output,domain
0,Let $ABCD$ be a convex quadrilateral with $AB ...,112,math
1,A tennis player computes her win ratio by divi...,164,math
2,What is the product of the real roots of the e...,20,math
3,"In $\triangle ABC$ , $AB= 425$ , $BC=450$ , an...",306,math
4,How many even integers between 4000 and 7000 h...,728,math


In [9]:
# Draw a fixed-seed sample of three dev rows so the smoke test below is repeatable.
sample_questions = df_dev.sample(n=3, random_state=1225074669)
sample_questions

Unnamed: 0,input,output,domain
567,Are both Tim McIlrath and Spike Slawson Americ...,yes,common_sense
446,Carolyn practices the piano for 20 minutes a d...,First find Carolyn's total violin practice tim...,math
896,What was John Glenn/'s first spacecraft called...,friendship 7,common_sense


In [15]:
# Smoke test: run the chain-of-thought pass on each sampled question and print
# the expected output next to the prediction for a manual comparison.
for i, test_question, test_expected_output in zip(
    sample_questions.index,
    sample_questions["input"],
    sample_questions["output"],
):
    test_prediction = run_cot(test_question)

    print(f"\nQuestion {i}:")
    print("Input: ", test_question)
    print("Expected : ", test_expected_output)
    print("Predicted: ", test_prediction)


Question 567:
Input:  Are both Tim McIlrath and Spike Slawson American punk rock musicians?
Expected :  yes
Predicted:  Yes

Question 446:
Input:  Carolyn practices the piano for 20 minutes a day and the violin for three times as long. If she practice six days a week, how many minutes does she spend practicing in a month with four weeks?
Expected :  First find Carolyn's total violin practice time by tripling her piano practice time: 20 minutes/day * 3 = <<20*3=60>>60 minutes/day
Then find the total amount of time she spends practicing each day: 60 minutes/day + 20 minutes/day = <<60+20=80>>80 minutes/day
Then find the total time she spends practicing each week: 80 minutes/day * 6 days/week = <<80*6=480>>480 minutes/week
Then find the total time she spends practicing each month: 480 minutes/week * 4 weeks/month = <<480*4=1920>>1920 minutes/month
#### 1920
Predicted:  1920

Question 896:
Input:  What was John Glenn/'s first spacecraft called? Answer the question using the context.

 Joh