In [2]:
import litellm
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from judge import Judge

load_dotenv()


# Optional: model aliases
litellm.model_alias_map["openai/gpt-4o"] = "gpt-4o"
litellm.model_alias_map["gemini/flash-1.5"] = "gemini/gemini-1.5-flash"

#load the dataset
df_100 = pd.read_csv("../output_data/100_data_math500.csv")
# Prompts
COT_PROMPT = """
You are a math tutor helping a student. Solve the following math problem step-by-step.

Question:
{question}

Let's think step-by-step.
"""

REFLECTION_PROMPT = """
Now reflect on the solution above. Was the reasoning and final answer correct? If there was a mistake, explain it and provide a corrected solution.
"""
judge = Judge(model='gemini/gemini-2.0-flash')

df_100.head(2)


Unnamed: 0.1,Unnamed: 0,problem,solution,answer,subject,level,unique_id
0,0,"Convert the point $(0,3)$ in rectangular coord...",We have that $r = \sqrt{0^2 + 3^2} = 3.$ Also...,"\left( 3, \frac{\pi}{2} \right)",Precalculus,2,test/precalculus/807.json
1,1,Define\n\[p = \sum_{k = 1}^\infty \frac{1}{k^2...,We count the number of times $\frac{1}{n^3}$ a...,p - q,Intermediate Algebra,5,test/intermediate_algebra/1994.json


In [10]:
# LLM Call
def call_litellm(prompt, model="openai/gpt-4o"):
    response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response['choices'][0]['message']['content'].strip()


rows = []

for _,item in tqdm(df_100.iterrows()):
    q = item["problem"]
    truth = item["answer"]

    # Chain of Thought
    cot_prompt = COT_PROMPT.format(question=q)
    cot_response = call_litellm(cot_prompt,model="openai/gpt-4o")

    # Self Reflection
    reflection_prompt = cot_response + "\n\n" + REFLECTION_PROMPT
    reflection_response = call_litellm(reflection_prompt,model="openai/gpt-4o")

    result = judge.prediction(query=q,answer1=reflection_response,answer2=truth).correct

    rows.append({
        "question": q,
        "ground_truth": truth,
        "cot_answer": cot_response,
        "reflected_answer": reflection_response,
        "result": result
    })

# Create a DataFrame
df = pd.DataFrame(rows)

# Save or display
df.to_csv("../output_data/math500_cot_reflection_output_gpt_4o.csv", index=False)
print(df.head())

100it [27:17, 16.38s/it]

                                            question  \
0  Convert the point $(0,3)$ in rectangular coord...   
1  Define\n\[p = \sum_{k = 1}^\infty \frac{1}{k^2...   
2  If $f(x) = \frac{3x-2}{x-2}$, what is the valu...   
3  How many positive whole-number divisors does 1...   
4  The results of a cross-country team's training...   

                      ground_truth  \
0  \left( 3, \frac{\pi}{2} \right)   
1                            p - q   
2                     \frac{14}{3}   
3                                9   
4                    \text{Evelyn}   

                                          cot_answer  \
0  To convert the point \((0, 3)\) from rectangul...   
1  To solve this problem, we need to express the ...   
2  To solve the problem, we need to evaluate the ...   
3  To find the number of positive whole-number di...   
4  To determine which student has the greatest av...   

                                    reflected_answer  result  
0  The solution provided is correc




In [None]:
# LLM Call
def call_litellm(prompt, model="openai/gpt-4o"):
    response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response['choices'][0]['message']['content'].strip()


rows = []

for _,item in tqdm(df_100.iterrows()):
    q = item["problem"]
    truth = item["answer"]

    # Chain of Thought
    cot_prompt = COT_PROMPT.format(question=q)
    cot_response = call_litellm(cot_prompt,model="gemini/gemini-1.5-flash")

    # Self Reflection
    reflection_prompt = cot_response + "\n\n" + REFLECTION_PROMPT
    reflection_response = call_litellm(reflection_prompt,model="gemini/gemini-1.5-flash")

    result = judge.prediction(query=q,answer1=reflection_response,answer2=truth).correct

    rows.append({
        "question": q,
        "ground_truth": truth,
        "cot_answer": cot_response,
        "reflected_answer": reflection_response,
        "result": result
    })

# Create a DataFrame
df = pd.DataFrame(rows)

# Save or display
df.to_csv("../output_data/math500_cot_reflection_output_gemini_1_5_flash.csv", index=False)
print(df.head())

In [14]:
df['result'].value_counts()

result
True     81
False    19
Name: count, dtype: int64

In [15]:
df_gpt4o = pd.read_csv("../output_data/math500_cot_reflection_output_gpt_4o.csv")
df_gpt4o['result'].value_counts()

result
True     80
False    20
Name: count, dtype: int64