In [None]:
import pandas as pd 

gsm_data = pd.read_csv('../data/preprocessed/gsm_data.csv')
gsm_data = gsm_data.sample(frac = 1)

**DONT RUN THE ABOVE AFTER RUNNING ONCE if running the below code more than 1 time** \
It ensures all types of questions have a fair chance of being answered by the LLM \
GSM-symbolic groups questions together based on template. This causes bias for the first few templates.  

In [None]:
from openai import OpenAI

def ask_phi(prompt):
  client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key="..",
  )

  completion = client.chat.completions.create(
    model="microsoft/phi-4",
    messages=[
      {
        "role": "user",
        "content": prompt
      }
    ]
  )
  return completion.choices[0].message.content

In [13]:
import re

def extract_final_ans(response):
    answer = response.replace('*', '')
    answer = answer.replace("," ,'')
    answer = answer.lower().split('answer: ')

    try:
        final = re.findall(r"\d+", answer[-1])[0]
    except IndexError:
        final = None
    
    return final

### Collect 50 incorrect solutions first:

In [None]:
import time 

def collect_responses(df, mistakes):
    llm_response = []
    n = 0

    for idx, row in df.iterrows():
        if n==mistakes:
            return llm_response
        
        question = row['question']
        answer = row['answer']
        
        prompt = f"""
        Answer the following question. Please use Chain of Thought and number each step:
        {question}
        
        Have your final answer strictly at the end of your response in this form:
        Answer: 
        """
        
        final = None
    
        while final == None:
            response = ask_phi(prompt)
            final = extract_final_ans(response)
        
        ref_final = answer.splitlines()
        ref_final = ref_final[-1].replace("#",'')
        ref_final = ref_final.strip()

        if ref_final != final:
            n += 1
            llm_response.append({'question': question, 'ref_answer': answer, 'llm_answer': response, 'final_ans': final})
            temp = pd.DataFrame(llm_response)
            temp.to_csv('../data/responses/temp3.csv')
            
        print("final_ans: ", final)
        print("ref_ans: ", ref_final)
        print('iteration: ', n)

    return llm_response

In [None]:
llm_gsm = collect_responses(gsm_data, 100) # gather more than 50 incorrct solutions to be able to ensure you have 50 valid samples after filtering
llm_gsm

final_ans:  736
ref_ans:  736
iteration:  0
final_ans:  1714
ref_ans:  1714
iteration:  0
final_ans:  378
ref_ans:  378
iteration:  0
final_ans:  3
ref_ans:  3
iteration:  0
final_ans:  17
ref_ans:  17
iteration:  0
final_ans:  190
ref_ans:  1
iteration:  1
final_ans:  25
ref_ans:  25
iteration:  1
final_ans:  63
ref_ans:  90
iteration:  2
final_ans:  310
ref_ans:  310
iteration:  2
final_ans:  92
ref_ans:  92
iteration:  2
final_ans:  246
ref_ans:  246
iteration:  2
final_ans:  27
ref_ans:  27
iteration:  2
final_ans:  11
ref_ans:  11
iteration:  2
final_ans:  7255
ref_ans:  5804
iteration:  3
final_ans:  330
ref_ans:  330
iteration:  3
final_ans:  9
ref_ans:  9
iteration:  3
final_ans:  64
ref_ans:  64
iteration:  3
final_ans:  1
ref_ans:  1
iteration:  3
final_ans:  144
ref_ans:  144
iteration:  3
final_ans:  39
ref_ans:  39
iteration:  3
final_ans:  331
ref_ans:  331
iteration:  3
final_ans:  81
ref_ans:  81
iteration:  3
final_ans:  77
ref_ans:  77
iteration:  3
final_ans:  130
re

[{'question': "Tara's aunt said that she had ₣190 budgeted for her birthday party. She wants to make sure she and her friends all get to play one round of bowling, have ₣3 in game room tokens, and get to ride the bumper cars eight times. A round of bowling is ₣4. The bumper cars cost ₣11 a ride. How many friends can she invite?",
  'ref_answer': 'The bumper cars will cost ₣88 per person because 11 x 8 = 88\nEach person costs ₣95 because 3 + 4 + 88 = 95\n2 total people can attend because 190 / 95 = 2\nShe can invite 1 friends because 2 - 1 =1\n#### 1',
  'llm_answer': "To determine how many friends Tara can invite to her birthday party within the budget of ₣190, we need to calculate the total cost per person and then see how many people can be covered by the budget.\n\n1. **Cost per Person Calculation**:\n   - **Bowling**: Each person participates in one round of bowling, which costs ₣4.\n   - **Game Room Tokens**: Each person receives ₣3 in game room tokens.\n   - **Bumper Cars**: Each

In [None]:
save_directory = "../data/responses"
df_gsm= pd.DataFrame(llm_gsm)
df_gsm.to_csv(save_directory + '/phi4_response2.csv')

### Self correction process - w/ error location and error type

In [None]:
import pandas as pd

df_responses = pd.read_csv('../data/responses/phi4/phi4_response_annotated.csv')
df_responses = df_responses.head(50)

In [29]:
import re

def self_correct(df):
    llm_response_corrected = []

    for idx, row in df.iterrows():
        question = row['question']
        solution = row['llm_answer']
        error_step = row['Error Step']
        error_type = re.sub(r'^[\d.]+', '', row['Error Reason'].strip('()')).lstrip()
        answer = row['ref_answer']

        correction_prompt = f"""
        Here are definitions of error types:
        - Incorrect value of variable cited or used: If the numerical value of a known (input or intermediate) variable is cited or used incorrectly.
        - Value of a known variable calculated again: If the value of an input or intermediate (determined mid solution) variable is calculated when it's value is already provided/ known.
        - Irrelevant/incorrect variable cited or used: If a variable in a solution is found to lie outside of the ideal variable-set of the solutioin (thereby making the solution innacurate).
        - Relevant variable missing: If any of the required variables in a solutioin for a target variable is missing (thereby making the solution innacurate).
        - Unit Inconsistency: If during the execution of an expression, two or more unit incompatible quantities are operated between.
        - Calculation Error: If the resultant numerical value of an operation or set of operations between numerical values is incorrect. 
        
        The following solution of the given question is generated using the same model as you. Consider the question and solution that uses the Chain of Thought strategy with each step numbered.

        Question: {question}

        Solution: {solution}

        You made a mistake in Step {int(error_step)} due to {error_type}.  Can you revise your solution and carefully solve it again?
        """

        response = ask_phi(correction_prompt)
        final = extract_final_ans(response)

        ref_final = answer.splitlines()
        ref_final = ref_final[-1].replace("#",'')
        ref_final = ref_final.strip()

        print("Ref answer: ", ref_final)
        print("LLM initial answer: ", row['final_ans'])
        print("LLM self revised: ", final)
        
        llm_response_corrected.append({'question': question, 'ref_answer': answer, 'llm_initial_answer': row['llm_answer'],
                                        'initial_final_ans': row['final_ans'], 'llm_revised_answer': response,
                                        'revised_final_ans': final, 'error_type': row['Error Reason'], 
                                        'error_step': row['Error Step'], 'total_steps': row['Total Steps'],
                                        'correct': ref_final==final})
    
    return llm_response_corrected
        

In [30]:
llm_self_corrected = self_correct(df_responses)

Ref answer:  360
LLM initial answer:  300
LLM self revised:  300
Ref answer:  120
LLM initial answer:  168
LLM self revised:  168
Ref answer:  1
LLM initial answer:  3
LLM self revised:  2
Ref answer:  44
LLM initial answer:  43
LLM self revised:  43
Ref answer:  16
LLM initial answer:  0
LLM self revised:  0
Ref answer:  39
LLM initial answer:  0
LLM self revised:  1
Ref answer:  16
LLM initial answer:  15
LLM self revised:  16
Ref answer:  13
LLM initial answer:  26
LLM self revised:  13
Ref answer:  175
LLM initial answer:  325
LLM self revised:  1
Ref answer:  413
LLM initial answer:  1
LLM self revised:  1
Ref answer:  39
LLM initial answer:  0
LLM self revised:  0
Ref answer:  100
LLM initial answer:  68
LLM self revised:  100
Ref answer:  17
LLM initial answer:  18
LLM self revised:  18
Ref answer:  12
LLM initial answer:  0
LLM self revised:  0
Ref answer:  826
LLM initial answer:  878
LLM self revised:  2
Ref answer:  400
LLM initial answer:  200
LLM self revised:  200
Ref ans

In [None]:
save_directory = "../data/responses"

df_corrected = pd.DataFrame(llm_self_corrected)
df_corrected.to_csv(save_directory + '/phi4/phi_selfcorrected_errorloc.csv')

### Self correction process - general error location w/ error type

In [34]:
import re

def self_correct(df):
    llm_response_corrected = []

    for idx, row in df.iterrows():
        question = row['question']
        solution = row['llm_answer']
        error_type = re.sub(r'^[\d.]+', '', row['Error Reason'].strip('()')).lstrip()
        answer = row['ref_answer']


        if row['Error Step'] <= (row['Total Steps']/2):
            prompt = f"""
            Here are definitions of error types:
            - Incorrect value of variable cited or used: If the numerical value of a known (input or intermediate) variable is cited or used incorrectly.
            - Value of a known variable calculated again: If the value of an input or intermediate (determined mid solution) variable is calculated when it's value is already provided/ known.
            - Irrelevant/incorrect variable cited or used: If a variable in a solution is found to lie outside of the ideal variable-set of the solutioin (thereby making the solution innacurate).
            - Relevant variable missing: If any of the required variables in a solutioin for a target variable is missing (thereby making the solution innacurate).
            - Unit Inconsistency: If during the execution of an expression, two or more unit incompatible quantities are operated between.
            - Calculation Error: If the resultant numerical value of an operation or set of operations between numerical values is incorrect. 
            
            The following solution is generated using the same model as you. Consider the following question and solution that uses the Chain of Thought strategy with each step numbered.

            Question: {question}

            Solution: {solution}

            You made a mistake in the upper half of your solution due to {error_type}.  Can you revise your solution and carefully solve it again?
            """
        else:
            prompt = f"""
            Here are definitions of error types:
            - Incorrect value of variable cited or used: If the numerical value of a known (input or intermediate) variable is cited or used incorrectly.
            - Value of a known variable calculated again: If the value of an input or intermediate (determined mid solution) variable is calculated when it's value is already provided/ known.
            - Irrelevant/incorrect variable cited or used: If a variable in a solution is found to lie outside of the ideal variable-set of the solutioin (thereby making the solution innacurate).
            - Relevant variable missing: If any of the required variables in a solutioin for a target variable is missing (thereby making the solution innacurate).
            - Unit Inconsistency: If during the execution of an expression, two or more unit incompatible quantities are operated between.
            - Calculation Error: If the resultant numerical value of an operation or set of operations between numerical values is incorrect. 
            
            The following solution is generated using the same model as you. Consider the following question and solution that uses the Chain of Thought strategy with each step numbered.

            Question: {question}

            Solution: {solution}

            You made a mistake in the latter half of your solution due to {error_type}.  Can you revise your solution and carefully solve it again?
            """

        response = ask_phi(prompt)
        final = extract_final_ans(response)

        ref_final = answer.splitlines()
        ref_final = ref_final[-1].replace("#",'')
        ref_final = ref_final.strip()

        print("Ref answer: ", ref_final)
        print("LLM initial answer: ", row['final_ans'])
        print("LLM self revised: ", final)
        
        llm_response_corrected.append({'question': question, 'ref_answer': answer, 'llm_initial_answer': row['llm_answer'],
                                        'initial_final_ans': row['final_ans'], 'llm_revised_answer': response,
                                        'revised_final_ans': final, 'error_type': row['Error Reason'], 
                                        'error_step': row['Error Step'], 'total_steps': row['Total Steps'],
                                        'correct': ref_final==final})
        temp = pd.DataFrame(llm_response_corrected)
        temp.to_csv('temp.csv')
    
    return llm_response_corrected
        

In [35]:
llm_self_corrected = self_correct(df_responses)

Ref answer:  360
LLM initial answer:  300
LLM self revised:  300
Ref answer:  120
LLM initial answer:  168
LLM self revised:  1
Ref answer:  1
LLM initial answer:  3
LLM self revised:  3
Ref answer:  44
LLM initial answer:  43
LLM self revised:  1
Ref answer:  16
LLM initial answer:  0
LLM self revised:  0
Ref answer:  39
LLM initial answer:  0
LLM self revised:  1
Ref answer:  16
LLM initial answer:  15
LLM self revised:  16
Ref answer:  13
LLM initial answer:  26
LLM self revised:  13
Ref answer:  175
LLM initial answer:  325
LLM self revised:  1
Ref answer:  413
LLM initial answer:  1
LLM self revised:  1
Ref answer:  39
LLM initial answer:  0
LLM self revised:  0
Ref answer:  100
LLM initial answer:  68
LLM self revised:  2250
Ref answer:  17
LLM initial answer:  18
LLM self revised:  18
Ref answer:  12
LLM initial answer:  0
LLM self revised:  0
Ref answer:  826
LLM initial answer:  878
LLM self revised:  878
Ref answer:  400
LLM initial answer:  200
LLM self revised:  200
Ref ans

In [None]:
save_directory = "../data/responses"

df_corrected = pd.DataFrame(llm_self_corrected)
df_corrected.to_csv(save_directory + '/phi4/phi4_selfcorrected_genloc.csv')

## Self correction process - with no error location

In [39]:
import re

def self_correct(df):
    llm_response_corrected = []

    for idx, row in df.iterrows():
        question = row['question']
        solution = row['llm_answer']
        error_type = re.sub(r'^[\d.]+', '', row['Error Reason'].strip('()')).lstrip()
        answer = row['ref_answer']

        prompt = f"""
        Here are definitions of error types:
        - Incorrect value of variable cited or used: If the numerical value of a known (input or intermediate) variable is cited or used incorrectly.
        - Value of a known variable calculated again: If the value of an input or intermediate (determined mid solution) variable is calculated when it's value is already provided/ known.
        - Irrelevant/incorrect variable cited or used: If a variable in a solution is found to lie outside of the ideal variable-set of the solutioin (thereby making the solution innacurate).
        - Relevant variable missing: If any of the required variables in a solutioin for a target variable is missing (thereby making the solution innacurate).
        - Unit Inconsistency: If during the execution of an expression, two or more unit incompatible quantities are operated between.
        - Calculation Error: If the resultant numerical value of an operation or set of operations between numerical values is incorrect. 
        
        The following solution is generated using the same model as you. Consider the following question and solution that uses the Chain of Thought strategy with each step numbered.

        Question: {question}

        Solution: {solution}

        You made a mistake in your solution due to {error_type}.  Can you revise your solution and carefully solve it again?
        """

        response = ask_phi(prompt)
        final = extract_final_ans(response)

        ref_final = answer.splitlines()
        ref_final = ref_final[-1].replace("#",'')
        ref_final = ref_final.strip()

        print("Ref answer: ", ref_final)
        print("LLM initial answer: ", row['final_ans'])
        print("LLM self revised: ", final)
        
        llm_response_corrected.append({'question': question, 'ref_answer': answer, 'llm_initial_answer': row['llm_answer'],
                                        'initial_final_ans': row['final_ans'], 'llm_revised_answer': response,
                                        'revised_final_ans': final, 'error_type': row['Error Reason'], 
                                        'error_step': row['Error Step'], 'total_steps': row['Total Steps'],
                                        'correct': ref_final==final})
    
    return llm_response_corrected
        

In [40]:
llm_self_corrected = self_correct(df_responses)

Ref answer:  360
LLM initial answer:  300
LLM self revised:  300
Ref answer:  120
LLM initial answer:  168
LLM self revised:  1
Ref answer:  1
LLM initial answer:  3
LLM self revised:  3
Ref answer:  44
LLM initial answer:  43
LLM self revised:  43
Ref answer:  16
LLM initial answer:  0
LLM self revised:  0
Ref answer:  39
LLM initial answer:  0
LLM self revised:  1
Ref answer:  16
LLM initial answer:  15
LLM self revised:  16
Ref answer:  13
LLM initial answer:  26
LLM self revised:  13
Ref answer:  175
LLM initial answer:  325
LLM self revised:  1
Ref answer:  413
LLM initial answer:  1
LLM self revised:  1
Ref answer:  39
LLM initial answer:  0
LLM self revised:  0
Ref answer:  100
LLM initial answer:  68
LLM self revised:  50
Ref answer:  17
LLM initial answer:  18
LLM self revised:  18
Ref answer:  12
LLM initial answer:  0
LLM self revised:  0
Ref answer:  826
LLM initial answer:  878
LLM self revised:  1100
Ref answer:  400
LLM initial answer:  200
LLM self revised:  200
Ref ans

In [None]:
save_directory = "../data/responses"

df_corrected = pd.DataFrame(llm_self_corrected)
df_corrected.to_csv(save_directory + '/phi4/phi4_selfcorrected_noloc.csv')

### Baseline: No error location or error type

In [42]:
import re

def self_correct(df):
    llm_response_corrected = []

    for idx, row in df.iterrows():
        question = row['question']
        solution = row['llm_answer']
        answer = row['ref_answer']

        prompt = f"""
        The following solution is generated using the same model as you. Consider the following question and solution that uses the Chain of Thought strategy with each step numbered.

        Question: {question}

        Solution: {solution}

        You made a mistake in your solution.  Can you revise your solution and carefully solve it again?
        """

        response = ask_phi(prompt)
        final = extract_final_ans(response)

        ref_final = answer.splitlines()
        ref_final = ref_final[-1].replace("#",'')
        ref_final = ref_final.strip()

        print("Ref answer: ", ref_final)
        print("LLM initial answer: ", row['final_ans'])
        print("LLM self revised: ", final)
        
        llm_response_corrected.append({'question': question, 'ref_answer': answer, 'llm_initial_answer': row['llm_answer'],
                                        'initial_final_ans': row['final_ans'], 'llm_revised_answer': response,
                                        'revised_final_ans': final, 'error_type': row['Error Reason'], 
                                        'error_step': row['Error Step'], 'total_steps': row['Total Steps'],
                                        'correct': ref_final==final})
    
    return llm_response_corrected
        

In [43]:
llm_self_corrected_1 = self_correct(df_responses)

Ref answer:  360
LLM initial answer:  300
LLM self revised:  300
Ref answer:  120
LLM initial answer:  168
LLM self revised:  168
Ref answer:  1
LLM initial answer:  3
LLM self revised:  3
Ref answer:  44
LLM initial answer:  43
LLM self revised:  43
Ref answer:  16
LLM initial answer:  0
LLM self revised:  0
Ref answer:  39
LLM initial answer:  0
LLM self revised:  0
Ref answer:  16
LLM initial answer:  15
LLM self revised:  16
Ref answer:  13
LLM initial answer:  26
LLM self revised:  13
Ref answer:  175
LLM initial answer:  325
LLM self revised:  1
Ref answer:  413
LLM initial answer:  1
LLM self revised:  1
Ref answer:  39
LLM initial answer:  0
LLM self revised:  0
Ref answer:  100
LLM initial answer:  68
LLM self revised:  68
Ref answer:  17
LLM initial answer:  18
LLM self revised:  18
Ref answer:  12
LLM initial answer:  0
LLM self revised:  0
Ref answer:  826
LLM initial answer:  878
LLM self revised:  879
Ref answer:  400
LLM initial answer:  200
LLM self revised:  200
Ref an

In [None]:
save_directory = "../data/responses"

df_corrected = pd.DataFrame(llm_self_corrected_1)
df_corrected.to_csv(save_directory + '/phi4/phi4_selfcorrected_baseline.csv')