In [None]:
import pandas as pd 

# Load the preprocessed GSM data and shuffle every row.
# A fixed seed makes the shuffled order identical on every run, so re-running
# this cell no longer changes which questions are visited first.
gsm_data = pd.read_csv('../data/preprocessed/gsm_data.csv')
gsm_data = gsm_data.sample(frac=1, random_state=42)

**DON'T RE-RUN THE CELL ABOVE if you run the code below more than once.** \
Shuffling gives every type of question a fair chance of being answered by the LLM. \
GSM-Symbolic groups questions together by template, which would otherwise bias the collection toward the first few templates.  

In [None]:
from openai import OpenAI

def ask_gemma3(prompt):
    """Send one user message to ``google/gemma-3-27b-it`` via OpenRouter.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.

    Returns:
        The content of the first choice's message, or ``None`` when the
        completion has no choices or the message has no content.
    """
    import os  # local import so this cell does not rely on another cell's imports

    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        # Read the key from the environment instead of keeping it in the
        # notebook; the placeholder fallback preserves the previous behaviour
        # when OPENROUTER_API_KEY is not set.
        api_key=os.environ.get("OPENROUTER_API_KEY", "..."),
    )

    completion = client.chat.completions.create(
        model="google/gemma-3-27b-it",
        messages=[{"role": "user", "content": prompt}],
    )

    # A completion without choices would otherwise raise on the [0] lookup;
    # report it the same way as a missing message content.
    if not completion.choices:
        return None
    return completion.choices[0].message.content

In [3]:
import re

def extract_final_ans(response):
    """Extract the final integer answer from a model reply.

    Asterisks and commas are removed and the text is lower-cased. The first
    run of digits after the last ``answer: `` marker is returned; when the
    marker is absent the whole text is searched.

    Args:
        response: Raw reply text, or ``None`` when the model returned nothing.

    Returns:
        The digits as a string (e.g. ``"1234"``), or ``None`` when
        ``response`` is ``None`` or contains no digits in the searched part.
    """
    # ask_gemma3 may return None; treat that as "no answer" instead of
    # failing on None.replace.
    if response is None:
        return None

    cleaned = response.replace('*', '').replace(',', '').lower()
    tail = cleaned.split('answer: ')[-1]

    match = re.search(r"\d+", tail)
    return match.group(0) if match else None

### Collect 50 incorrect solutions first:

In [None]:
import time 

def collect_responses(df, mistakes, max_retries=10):
    """Query the model question by question until enough wrong answers are collected.

    Rows are visited in DataFrame order. Each question is asked up to
    ``max_retries`` times until a final answer can be extracted from the
    reply. The extracted answer is compared as a string with the last line of
    the reference answer (``#`` characters and surrounding whitespace
    removed). Every mismatch is recorded, and the records collected so far are
    written to ``../data/responses/temp.csv``.

    Args:
        df: DataFrame with ``question`` and ``answer`` columns.
        mistakes: Number of incorrect responses to collect before stopping.
        max_retries: Maximum number of attempts per question. A question for
            which no answer can be extracted within that many attempts is
            skipped, so a reply that never parses cannot loop forever.

    Returns:
        A list of dicts with keys ``question``, ``ref_answer``, ``llm_answer``
        and ``final_ans``. It holds fewer than ``mistakes`` entries when the
        rows run out first.
    """
    llm_response = []
    n = 0

    for _, row in df.iterrows():
        if n == mistakes:
            return llm_response

        question = row['question']
        answer = row['answer']

        prompt = f"""
        Answer the following question. Please use Chain of Thought and number each step:
        {question}
        
        Have your final answer strictly at the end of your response in this form:
        Answer: 
        """

        final = None
        response = None

        # Bounded retry: stop as soon as an answer can be extracted.
        for _attempt in range(max_retries):
            response = ask_gemma3(prompt)
            # The model call may yield None; only parse actual text.
            if response is not None:
                final = extract_final_ans(response)
            if final is not None:
                break

        if final is None:
            print("skipped question, no extractable answer after attempts: ", max_retries)
            continue

        # The reference's final answer is taken from the last line of the reference text.
        ref_final = answer.splitlines()[-1].replace("#", '').strip()

        if ref_final != final:
            n += 1
            llm_response.append({'question': question, 'ref_answer': answer, 'llm_answer': response, 'final_ans': final})
            # Checkpoint after every recorded mistake so progress survives an interruption.
            temp = pd.DataFrame(llm_response)
            temp.to_csv('../data/responses/temp.csv')

        print("final_ans: ", final)
        print("ref_ans: ", ref_final)
        print('iteration: ', n)

    return llm_response

In [None]:
# Gather more than 50 incorrect solutions so that at least 50 valid samples
# remain after filtering.
llm_gsm = collect_responses(gsm_data, 100)
# Preview only the first few records instead of dumping every full response.
llm_gsm[:3]

In [None]:
# Turn the collected records into a DataFrame and write them out as CSV.
save_directory = "../data/responses"
df_gsm = pd.DataFrame(llm_gsm)
df_gsm.to_csv(f"{save_directory}/gemma3_response.csv")

### Self correction process - w/ error location and error type

In [None]:
import pandas as pd

# Read the annotated responses file and keep its first 50 rows.
df_responses = pd.read_csv(
    '../data/responses/gemma3/gemma3-27b_response_annotated.csv'
).head(50)

In [None]:
import re

def self_correct(df):
    """Ask the model to revise each solution, given the faulty step number and the error type.

    For every row a prompt is built from the error-type definitions, the
    question, the model's earlier solution, the integer ``Error Step`` and the
    cleaned ``Error Reason``. The revised reply's final answer is compared as
    a string with the last line of the reference answer (``#`` characters and
    surrounding whitespace removed).

    Args:
        df: DataFrame with columns ``question``, ``llm_answer``,
            ``ref_answer``, ``final_ans``, ``Error Step``, ``Error Reason``
            and ``Total Steps``.

    Returns:
        A list with one dict per row holding the question, the reference
        answer, the initial and revised replies with their extracted final
        answers, the error annotations and a ``correct`` flag.
    """
    llm_response_corrected = []

    for _, row in df.iterrows():
        question = row['question']
        solution = row['llm_answer']
        error_step = row['Error Step']
        # Remove surrounding parentheses, then any leading digits/dots and the
        # whitespace that follows them, from the annotated reason.
        error_type = re.sub(r'^[\d.]+', '', row['Error Reason'].strip('()')).lstrip()
        answer = row['ref_answer']

        correction_prompt = f"""
        Here are definitions of error types:
        - Incorrect value of variable cited or used: If the numerical value of a known (input or intermediate) variable is cited or used incorrectly.
        - Value of a known variable calculated again: If the value of an input or intermediate (determined mid solution) variable is calculated when it's value is already provided/ known.
        - Irrelevant/incorrect variable cited or used: If a variable in a solution is found to lie outside of the ideal variable-set of the solutioin (thereby making the solution innacurate).
        - Relevant variable missing: If any of the required variables in a solutioin for a target variable is missing (thereby making the solution innacurate).
        - Unit Inconsistency: If during the execution of an expression, two or more unit incompatible quantities are operated between.
        - Calculation Error: If the resultant numerical value of an operation or set of operations between numerical values is incorrect. 
        - Incorrect Value: If the numerical value of the final answer is incorrect.
        
        The following solution of the given question is generated using the same model as you. Consider the question and solution that uses the Chain of Thought strategy with each step numbered.

        Question: {question}

        Solution: {solution}

        You made a mistake in Step {int(error_step)} due to {error_type}.  Can you revise your solution and carefully solve it again?
        """

        response = ask_gemma3(correction_prompt)
        # The model call may yield None; record that as "no answer" rather
        # than failing while parsing it.
        final = extract_final_ans(response) if response is not None else None

        # The reference's final answer is taken from the last line of the reference text.
        ref_final = answer.splitlines()[-1].replace("#", '').strip()

        print("Ref answer: ", ref_final)
        print("LLM initial answer: ", row['final_ans'])
        print("LLM self revised: ", final)

        llm_response_corrected.append({
            'question': question,
            'ref_answer': answer,
            'llm_initial_answer': row['llm_answer'],
            'initial_final_ans': row['final_ans'],
            'llm_revised_answer': response,
            'revised_final_ans': final,
            'error_type': row['Error Reason'],
            'error_step': row['Error Step'],
            'total_steps': row['Total Steps'],
            'correct': ref_final == final,
        })

    return llm_response_corrected
        

In [33]:
# Run the self-correction pass (error step + error type) over the loaded rows.
llm_self_corrected = self_correct(df=df_responses)

Ref answer:  5
LLM initial answer:  25
LLM self revised:  25
Ref answer:  1
LLM initial answer:  8
LLM self revised:  1
Ref answer:  4230
LLM initial answer:  29610
LLM self revised:  4230
Ref answer:  1225
LLM initial answer:  7350
LLM self revised:  1225
Ref answer:  63
LLM initial answer:  75
LLM self revised:  75
Ref answer:  90
LLM initial answer:  100
LLM self revised:  100
Ref answer:  40
LLM initial answer:  35
LLM self revised:  40
Ref answer:  544
LLM initial answer:  613
LLM self revised:  544
Ref answer:  33
LLM initial answer:  43
LLM self revised:  9
Ref answer:  3
LLM initial answer:  0
LLM self revised:  0
Ref answer:  770
LLM initial answer:  2310
LLM self revised:  2310
Ref answer:  10
LLM initial answer:  0
LLM self revised:  0
Ref answer:  1610
LLM initial answer:  14
LLM self revised:  14
Ref answer:  343
LLM initial answer:  377
LLM self revised:  528
Ref answer:  42
LLM initial answer:  0
LLM self revised:  42
Ref answer:  64
LLM initial answer:  24
LLM self revi

In [None]:
# Write the self-correction results (error-location variant) to CSV.
save_directory = "../data/responses"

df_corrected = pd.DataFrame(llm_self_corrected)
df_corrected.to_csv(f"{save_directory}/gemma3/gemma3-27b_selfcorrected_errorloc.csv")

### Self correction process - general error location w/ error type

In [None]:
import re

def self_correct(df):
    """Ask the model to revise each solution, given the error type and which half holds the error.

    The error is reported as being in the "upper half" when ``Error Step`` is
    at most half of ``Total Steps``, otherwise in the "latter half". The
    revised reply's final answer is compared as a string with the last line of
    the reference answer (``#`` characters and surrounding whitespace
    removed). After every row the results so far are written to ``temp.csv``.

    Args:
        df: DataFrame with columns ``question``, ``llm_answer``,
            ``ref_answer``, ``final_ans``, ``Error Step``, ``Error Reason``
            and ``Total Steps``.

    Returns:
        A list with one dict per row holding the question, the reference
        answer, the initial and revised replies with their extracted final
        answers, the error annotations and a ``correct`` flag.
    """
    llm_response_corrected = []

    for _, row in df.iterrows():
        question = row['question']
        solution = row['llm_answer']
        # Remove surrounding parentheses, then any leading digits/dots and the
        # whitespace that follows them, from the annotated reason.
        error_type = re.sub(r'^[\d.]+', '', row['Error Reason'].strip('()')).lstrip()
        answer = row['ref_answer']

        # The two prompts are kept verbatim and separate: besides the
        # "upper"/"latter" wording they differ in one whitespace-only line.
        if row['Error Step'] <= (row['Total Steps']/2):
            prompt = f"""
            Here are definitions of error types:
            - Incorrect value of variable cited or used: If the numerical value of a known (input or intermediate) variable is cited or used incorrectly.
            - Value of a known variable calculated again: If the value of an input or intermediate (determined mid solution) variable is calculated when it's value is already provided/ known.
            - Irrelevant/incorrect variable cited or used: If a variable in a solution is found to lie outside of the ideal variable-set of the solutioin (thereby making the solution innacurate).
            - Relevant variable missing: If any of the required variables in a solutioin for a target variable is missing (thereby making the solution innacurate).
            - Unit Inconsistency: If during the execution of an expression, two or more unit incompatible quantities are operated between.
            - Calculation Error: If the resultant numerical value of an operation or set of operations between numerical values is incorrect. 
            - Incorrect Value: If the numerical value of the final answer is incorrect.

            The following solution is generated using the same model as you. Consider the following question and solution that uses the Chain of Thought strategy with each step numbered.

            Question: {question}

            Solution: {solution}

            You made a mistake in the upper half of your solution due to {error_type}.  Can you revise your solution and carefully solve it again?
            """
        else:
            prompt = f"""
            Here are definitions of error types:
            - Incorrect value of variable cited or used: If the numerical value of a known (input or intermediate) variable is cited or used incorrectly.
            - Value of a known variable calculated again: If the value of an input or intermediate (determined mid solution) variable is calculated when it's value is already provided/ known.
            - Irrelevant/incorrect variable cited or used: If a variable in a solution is found to lie outside of the ideal variable-set of the solutioin (thereby making the solution innacurate).
            - Relevant variable missing: If any of the required variables in a solutioin for a target variable is missing (thereby making the solution innacurate).
            - Unit Inconsistency: If during the execution of an expression, two or more unit incompatible quantities are operated between.
            - Calculation Error: If the resultant numerical value of an operation or set of operations between numerical values is incorrect. 
            - Incorrect Value: If the numerical value of the final answer is incorrect.
            
            The following solution is generated using the same model as you. Consider the following question and solution that uses the Chain of Thought strategy with each step numbered.

            Question: {question}

            Solution: {solution}

            You made a mistake in the latter half of your solution due to {error_type}.  Can you revise your solution and carefully solve it again?
            """

        response = ask_gemma3(prompt)
        # The model call may yield None; record that as "no answer" rather
        # than failing while parsing it.
        final = extract_final_ans(response) if response is not None else None

        # The reference's final answer is taken from the last line of the reference text.
        ref_final = answer.splitlines()[-1].replace("#", '').strip()

        print("Ref answer: ", ref_final)
        print("LLM initial answer: ", row['final_ans'])
        print("LLM self revised: ", final)

        llm_response_corrected.append({
            'question': question,
            'ref_answer': answer,
            'llm_initial_answer': row['llm_answer'],
            'initial_final_ans': row['final_ans'],
            'llm_revised_answer': response,
            'revised_final_ans': final,
            'error_type': row['Error Reason'],
            'error_step': row['Error Step'],
            'total_steps': row['Total Steps'],
            'correct': ref_final == final,
        })
        # Checkpoint after every row so partial results survive an interruption.
        temp = pd.DataFrame(llm_response_corrected)
        temp.to_csv('temp.csv')

    return llm_response_corrected
        

In [9]:
# Run the self-correction pass (general error location + error type) over the loaded rows.
llm_self_corrected = self_correct(df=df_responses)

Ref answer:  5
LLM initial answer:  25
LLM self revised:  25
Ref answer:  1
LLM initial answer:  8
LLM self revised:  8
Ref answer:  4230
LLM initial answer:  29610
LLM self revised:  29610
Ref answer:  1225
LLM initial answer:  7350
LLM self revised:  1225
Ref answer:  63
LLM initial answer:  75
LLM self revised:  62
Ref answer:  90
LLM initial answer:  100
LLM self revised:  225
Ref answer:  40
LLM initial answer:  35
LLM self revised:  45
Ref answer:  544
LLM initial answer:  613
LLM self revised:  544
Ref answer:  33
LLM initial answer:  43
LLM self revised:  43
Ref answer:  3
LLM initial answer:  0
LLM self revised:  0
Ref answer:  770
LLM initial answer:  2310
LLM self revised:  770
Ref answer:  10
LLM initial answer:  0
LLM self revised:  0
Ref answer:  1610
LLM initial answer:  14
LLM self revised:  14
Ref answer:  343
LLM initial answer:  377
LLM self revised:  324
Ref answer:  42
LLM initial answer:  0
LLM self revised:  0
Ref answer:  64
LLM initial answer:  24
LLM self revi

In [None]:
# Write the self-correction results (general-location variant) to CSV.
save_directory = "../data/responses"

df_corrected = pd.DataFrame(llm_self_corrected)
df_corrected.to_csv(f"{save_directory}/gemma3/gemma3-27b_selfcorrected_genloc.csv")

### Self correction process - with no error location

In [None]:
import re

def self_correct(df):
    """Ask the model to revise each solution, given only the error type.

    For every row a prompt is built from the error-type definitions, the
    question, the model's earlier solution and the cleaned ``Error Reason``;
    no error location is given. The revised reply's final answer is compared
    as a string with the last line of the reference answer (``#`` characters
    and surrounding whitespace removed).

    Args:
        df: DataFrame with columns ``question``, ``llm_answer``,
            ``ref_answer``, ``final_ans``, ``Error Step``, ``Error Reason``
            and ``Total Steps``.

    Returns:
        A list with one dict per row holding the question, the reference
        answer, the initial and revised replies with their extracted final
        answers, the error annotations and a ``correct`` flag.
    """
    llm_response_corrected = []

    for _, row in df.iterrows():
        question = row['question']
        solution = row['llm_answer']
        # Remove surrounding parentheses, then any leading digits/dots and the
        # whitespace that follows them, from the annotated reason.
        error_type = re.sub(r'^[\d.]+', '', row['Error Reason'].strip('()')).lstrip()
        answer = row['ref_answer']

        prompt = f"""
        Here are definitions of error types:
        - Incorrect value of variable cited or used: If the numerical value of a known (input or intermediate) variable is cited or used incorrectly.
        - Value of a known variable calculated again: If the value of an input or intermediate (determined mid solution) variable is calculated when it's value is already provided/ known.
        - Irrelevant/incorrect variable cited or used: If a variable in a solution is found to lie outside of the ideal variable-set of the solutioin (thereby making the solution innacurate).
        - Relevant variable missing: If any of the required variables in a solutioin for a target variable is missing (thereby making the solution innacurate).
        - Unit Inconsistency: If during the execution of an expression, two or more unit incompatible quantities are operated between.
        - Calculation Error: If the resultant numerical value of an operation or set of operations between numerical values is incorrect. 
        - Incorrect Value: If the numerical value of the final answer is incorrect.
        
        The following solution is generated using the same model as you. Consider the following question and solution that uses the Chain of Thought strategy with each step numbered.

        Question: {question}

        Solution: {solution}

        You made a mistake in your solution due to {error_type}.  Can you revise your solution and carefully solve it again?
        """

        response = ask_gemma3(prompt)
        # The model call may yield None; record that as "no answer" rather
        # than failing while parsing it.
        final = extract_final_ans(response) if response is not None else None

        # The reference's final answer is taken from the last line of the reference text.
        ref_final = answer.splitlines()[-1].replace("#", '').strip()

        print("Ref answer: ", ref_final)
        print("LLM initial answer: ", row['final_ans'])
        print("LLM self revised: ", final)

        llm_response_corrected.append({
            'question': question,
            'ref_answer': answer,
            'llm_initial_answer': row['llm_answer'],
            'initial_final_ans': row['final_ans'],
            'llm_revised_answer': response,
            'revised_final_ans': final,
            'error_type': row['Error Reason'],
            'error_step': row['Error Step'],
            'total_steps': row['Total Steps'],
            'correct': ref_final == final,
        })

    return llm_response_corrected
        

In [12]:
# Run the self-correction pass (error type only, no location) over the loaded rows.
llm_self_corrected = self_correct(df=df_responses)

Ref answer:  5
LLM initial answer:  25
LLM self revised:  25
Ref answer:  1
LLM initial answer:  8
LLM self revised:  8
Ref answer:  4230
LLM initial answer:  29610
LLM self revised:  4230
Ref answer:  1225
LLM initial answer:  7350
LLM self revised:  1225
Ref answer:  63
LLM initial answer:  75
LLM self revised:  75
Ref answer:  90
LLM initial answer:  100
LLM self revised:  225
Ref answer:  40
LLM initial answer:  35
LLM self revised:  36
Ref answer:  544
LLM initial answer:  613
LLM self revised:  544
Ref answer:  33
LLM initial answer:  43
LLM self revised:  9
Ref answer:  3
LLM initial answer:  0
LLM self revised:  0
Ref answer:  770
LLM initial answer:  2310
LLM self revised:  770
Ref answer:  10
LLM initial answer:  0
LLM self revised:  0
Ref answer:  1610
LLM initial answer:  14
LLM self revised:  14
Ref answer:  343
LLM initial answer:  377
LLM self revised:  326
Ref answer:  42
LLM initial answer:  0
LLM self revised:  0
Ref answer:  64
LLM initial answer:  24
LLM self revise

In [None]:
# Write the self-correction results (no-location variant) to CSV.
save_directory = "../data/responses"

df_corrected = pd.DataFrame(llm_self_corrected)
df_corrected.to_csv(f"{save_directory}/gemma3/gemma3_selfcorrected_noloc.csv")

### Baseline: No error location or error type

In [19]:
import re

def self_correct(df):
    """Ask the model to revise each solution, told only that it contains a mistake.

    For every row a prompt is built from the question and the model's earlier
    solution; neither the error location nor the error type is given. The
    revised reply's final answer is compared as a string with the last line of
    the reference answer (``#`` characters and surrounding whitespace
    removed).

    Args:
        df: DataFrame with columns ``question``, ``llm_answer``,
            ``ref_answer``, ``final_ans``, ``Error Step``, ``Error Reason``
            and ``Total Steps``.

    Returns:
        A list with one dict per row holding the question, the reference
        answer, the initial and revised replies with their extracted final
        answers, the error annotations and a ``correct`` flag.
    """
    llm_response_corrected = []

    for _, row in df.iterrows():
        question = row['question']
        solution = row['llm_answer']
        answer = row['ref_answer']

        prompt = f"""
        The following solution is generated using the same model as you. Consider the following question and solution that uses the Chain of Thought strategy with each step numbered.

        Question: {question}

        Solution: {solution}

        You made a mistake in your solution.  Can you revise your solution and carefully solve it again?
        """

        response = ask_gemma3(prompt)
        # The model call may yield None; record that as "no answer" rather
        # than failing while parsing it.
        final = extract_final_ans(response) if response is not None else None

        # The reference's final answer is taken from the last line of the reference text.
        ref_final = answer.splitlines()[-1].replace("#", '').strip()

        print("Ref answer: ", ref_final)
        print("LLM initial answer: ", row['final_ans'])
        print("LLM self revised: ", final)

        llm_response_corrected.append({
            'question': question,
            'ref_answer': answer,
            'llm_initial_answer': row['llm_answer'],
            'initial_final_ans': row['final_ans'],
            'llm_revised_answer': response,
            'revised_final_ans': final,
            'error_type': row['Error Reason'],
            'error_step': row['Error Step'],
            'total_steps': row['Total Steps'],
            'correct': ref_final == final,
        })

    return llm_response_corrected
        

In [20]:
# Run the baseline self-correction pass (no error location or type) over the loaded rows.
llm_self_corrected_1 = self_correct(df=df_responses)

Ref answer:  5
LLM initial answer:  25
LLM self revised:  25
Ref answer:  1
LLM initial answer:  8
LLM self revised:  8
Ref answer:  4230
LLM initial answer:  29610
LLM self revised:  4230
Ref answer:  1225
LLM initial answer:  7350
LLM self revised:  1225
Ref answer:  63
LLM initial answer:  75
LLM self revised:  75
Ref answer:  90
LLM initial answer:  100
LLM self revised:  100
Ref answer:  40
LLM initial answer:  35
LLM self revised:  45
Ref answer:  544
LLM initial answer:  613
LLM self revised:  613
Ref answer:  33
LLM initial answer:  43
LLM self revised:  79
Ref answer:  3
LLM initial answer:  0
LLM self revised:  0
Ref answer:  770
LLM initial answer:  2310
LLM self revised:  770
Ref answer:  10
LLM initial answer:  0
LLM self revised:  0
Ref answer:  1610
LLM initial answer:  14
LLM self revised:  14
Ref answer:  343
LLM initial answer:  377
LLM self revised:  377
Ref answer:  42
LLM initial answer:  0
LLM self revised:  0
Ref answer:  64
LLM initial answer:  24
LLM self revis

In [None]:
# Write the baseline self-correction results to CSV.
save_directory = "../data/responses"

df_corrected = pd.DataFrame(llm_self_corrected_1)
df_corrected.to_csv(f"{save_directory}/gemma3/gemma3_selfcorrected_baseline.csv")