In [3]:
import litellm
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from judge import Judge

# Load variables from a .env file into the process environment before any
# model client is constructed below.
load_dotenv()


# Optional: register short aliases in litellm's alias map (alias -> target model name).
litellm.model_alias_map["openai/gpt-4o"] = "gpt-4o"
litellm.model_alias_map["gemini/flash-1.5"] = "gemini/gemini-1.5-flash"

# Load the evaluation dataset; later cells read its "problem" and "answer" columns.
df_100 = pd.read_csv("../output_data/100_data_math500.csv")

# Prompt templates.
# COT_PROMPT has a single {question} placeholder filled via str.format().
# NOTE(review): despite the "COT" name, the template tells the model to give
# only the answer and nothing else, so no step-by-step reasoning is requested.
COT_PROMPT = """
You are a math tutor helping a student. Solve the following math problem. Only give the answer and nothing else.

Question:
{question}

"""

# Follow-up instruction asking the model to review its previous solution.
# NOTE(review): not referenced by the cells visible here — confirm it is used elsewhere.
REFLECTION_PROMPT = """
Now reflect on the solution above. Was the reasoning and final answer correct? If there was a mistake, explain it and provide a corrected solution.
"""
# Judge (from the local `judge` module) used later to compare a model answer
# against the ground-truth answer.
judge = Judge(model='gemini/gemini-2.0-flash')

# Preview the first two rows of the dataset.
df_100.head(2)


Unnamed: 0.1,Unnamed: 0,problem,solution,answer,subject,level,unique_id
0,0,"Convert the point $(0,3)$ in rectangular coord...",We have that $r = \sqrt{0^2 + 3^2} = 3.$ Also...,"\left( 3, \frac{\pi}{2} \right)",Precalculus,2,test/precalculus/807.json
1,1,Define\n\[p = \sum_{k = 1}^\infty \frac{1}{k^2...,We count the number of times $\frac{1}{n^3}$ a...,p - q,Intermediate Algebra,5,test/intermediate_algebra/1994.json


In [4]:
# LLM Call
def call_litellm(prompt, model="openai/gpt-4o", temperature=0):
    """Send a single-turn user prompt through litellm and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.
        model: litellm model identifier. Defaults to ``"openai/gpt-4o"``.
        temperature: Sampling temperature forwarded to ``litellm.completion``.
            Defaults to 0, the value that was previously hard-coded.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or ``""`` when the response carries no text content
        (``content`` is ``None``).
    """
    response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
    )
    content = response['choices'][0]['message']['content']
    # The message content may be None; calling .strip() on it would raise
    # AttributeError and abort a long evaluation run, so treat it as empty.
    return (content or "").strip()


# One record per evaluated question; turned into a DataFrame after the loop.
rows = []

# iterrows() is a generator with no length, so tqdm needs an explicit total
# to draw a progress bar and ETA instead of a bare iteration counter.
for _, item in tqdm(df_100.iterrows(), total=len(df_100)):
    q = item["problem"]
    truth = item["answer"]

    # Ask the model for its answer (COT_PROMPT requests the answer only).
    prompt = COT_PROMPT.format(question=q)
    response = call_litellm(prompt, model="openai/gpt-4o")

    # Let the judge compare the model's answer with the ground-truth answer
    # and keep its `correct` verdict.
    result = judge.prediction(query=q, answer1=response, answer2=truth).correct

    rows.append({
        "question": q,
        "ground_truth": truth,
        "answer": response,
        "result": result
    })

# Create a DataFrame
df = pd.DataFrame(rows)

# Save or display
# df.to_csv("../output_data/math500_cot_reflection_output_gpt_4o.csv", index=False)
# print(df.head())

100it [04:26,  2.66s/it]


In [6]:
# Count how often each judge verdict occurs in the `result` column.
df['result'].value_counts()

result
False    52
True     48
Name: count, dtype: int64

In [7]:
# Preview the first rows of the per-question results (question, ground truth, answer, verdict).
df.head()

Unnamed: 0,question,ground_truth,answer,result
0,"Convert the point $(0,3)$ in rectangular coord...","\left( 3, \frac{\pi}{2} \right)","\((3, \frac{\pi}{2})\)",True
1,Define\n\[p = \sum_{k = 1}^\infty \frac{1}{k^2...,p - q,The answer is \(\frac{1}{2}(p - q)\).,False
2,"If $f(x) = \frac{3x-2}{x-2}$, what is the valu...",\frac{14}{3},The value of \( f(-2) + f(-1) + f(0) \) is \(-...,False
3,How many positive whole-number divisors does 1...,9,9,True
4,The results of a cross-country team's training...,\text{Evelyn},Carla,False
