In [None]:
import pandas as pd

# Number of leading rows kept from each model's annotated responses.
N_SAMPLES = 50

# Load every model's annotated responses and truncate to the first N_SAMPLES rows.
df_gpt = pd.read_csv('../data/responses/gpt4o/gpt4o_responses_annotated.csv').head(N_SAMPLES)
# NOTE(review): only the R1 frame drops rows containing any missing value
# before truncation -- confirm this asymmetry with the other models is intended.
df_r1 = pd.read_csv('../data/responses/r1/r1_response_annotated.csv').dropna().head(N_SAMPLES)
df_phi = pd.read_csv('../data/responses/phi4/phi4_response_annotated.csv').head(N_SAMPLES)
df_gemma = pd.read_csv('../data/responses/gemma3/gemma3-27b_response_annotated.csv').head(N_SAMPLES)

In [2]:
# Distinct values of the 'Error Reason' column for each model's sample,
# in order of first appearance.
error_reasons_gpt, error_reasons_r1, error_reasons_phi, error_reasons_gemma = (
    frame['Error Reason'].unique()
    for frame in (df_gpt, df_r1, df_phi, df_gemma)
)

# One labelled listing per model.
print('Error reasons present in GPT 4o: \n', error_reasons_gpt, '\n')
print('Error Reasons present in DeepSeek R1: \n', error_reasons_r1)
print('Error Reasons present in Phi 4: \n', error_reasons_phi)
print('Error Reasons present in Gemma 3 27b: \n', error_reasons_gemma)

Error reasons present in GPT 4o: 
 ['(1.3.2 Incorrect value of variable cited or used)'
 '(1.1.2 Value of a known variable calculated again)'
 '(1.3.1.1 Irrelevant/incorrect variable cited or used)'
 '(1.3.1.2 Relevant variable missing)' '2.1 Unit Inconsistency)'] 

Error Reasons present in DeepSeek R1: 
 ['(1.1.2 Value of a known variable calculated again)'
 '(1.3.1.1 Irrelevant/incorrect variable cited or used)'
 '(1.3.2 Incorrect value of variable cited or used)'
 '2.3 Calculation Error)' '2.1 Unit Inconsistency)']
Error Reasons present in Phi 4: 
 ['(1.3.1.1 Irrelevant/incorrect variable cited or used)'
 '(1.3.2 Incorrect value of variable cited or used)'
 '(1.1.2 Value of a known variable calculated again)'
 '(1.3.1.2 Relevant variable missing)' '(3.3.2 Incorrect value)'
 '2.3 Calculation Error)' '2.1 Unit Inconsistency)']
Error Reasons present in Gemma 3 27b: 
 ['(1.3.1.2 Relevant variable missing)'
 '(1.3.1.1 Irrelevant/incorrect variable cited or used)'
 '(1.3.2 Incorrect value

### GPT 4o findings

In [26]:
# Frequency of each 'Error Reason' label in the GPT 4o sample (missing values are not counted).
df_gpt.loc[:, 'Error Reason'].value_counts()

Error Reason
(1.3.1.1 Irrelevant/incorrect variable cited or used)    19
(1.1.2 Value of a known variable calculated again)       11
(1.3.2 Incorrect value of variable cited or used)        10
(1.3.1.2 Relevant variable missing)                       8
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

In [2]:
# Frequency of each value in the 'Error Step' column of the GPT 4o sample.
df_gpt.loc[:, 'Error Step'].value_counts()

Error Step
1.0    15
3.0    13
4.0    11
5.0     5
2.0     4
6.0     2
Name: count, dtype: int64

In [29]:
# Frequency of each value in the 'Total Steps' column of the GPT 4o sample.
df_gpt.loc[:, 'Total Steps'].value_counts()

Total Steps
6.0    17
5.0    13
7.0     9
4.0     8
8.0     2
3.0     1
Name: count, dtype: int64

Average of 5.62 steps taken to solve

### DeepSeek R1 findings

In [None]:
# Frequency of each 'Error Reason' label in the DeepSeek R1 sample (missing values are not counted).
df_r1.loc[:, 'Error Reason'].value_counts()

Error Reason
(1.3.1.1 Irrelevant/incorrect variable cited or used)    24
(1.1.2 Value of a known variable calculated again)       16
(1.3.2 Incorrect value of variable cited or used)         7
2.1 Unit Inconsistency)                                   2
2.3 Calculation Error)                                    1
Name: count, dtype: int64

In [28]:
# Frequency of each value in the 'Total Steps' column of the DeepSeek R1 sample.
df_r1.loc[:, 'Total Steps'].value_counts()

Total Steps
5    21
4    17
3     7
6     5
Name: count, dtype: int64

Average of 4.48 steps taken to solve

### Phi 4

In [4]:
# Frequency of each 'Error Reason' label in the Phi 4 sample (missing values are not counted).
df_phi.loc[:, 'Error Reason'].value_counts()

Error Reason
(1.3.2 Incorrect value of variable cited or used)        18
(1.3.1.1 Irrelevant/incorrect variable cited or used)    11
(1.1.2 Value of a known variable calculated again)       11
(1.3.1.2 Relevant variable missing)                       3
2.3 Calculation Error)                                    3
(3.3.2 Incorrect value)                                   2
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

In [5]:
# Frequency of each value in the 'Total Steps' column of the Phi 4 sample.
df_phi.loc[:, 'Total Steps'].value_counts()

Total Steps
4     14
5     12
6      8
7      7
8      5
3      1
10     1
2      1
Name: count, dtype: int64

### Gemma 3 27B

In [8]:
# Frequency of each 'Error Reason' label in the Gemma 3 27B sample (missing values are not counted).
df_gemma.loc[:, 'Error Reason'].value_counts()

Error Reason
(1.3.1.1 Irrelevant/incorrect variable cited or used)    23
(1.3.1.2 Relevant variable missing)                      10
(1.3.2 Incorrect value of variable cited or used)         9
(1.1.2 Value of a known variable calculated again)        5
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

In [4]:
# Frequency of each value in the 'Total Steps' column of the Gemma 3 27B sample.
df_gemma.loc[:, 'Total Steps'].value_counts()

Total Steps
6     11
4      9
5      9
8      7
7      5
11     3
9      3
10     1
21     1
Name: count, dtype: int64

## Self correction w/ error steps and error type

### GPT 4o

In [None]:
# GPT 4o self-correction results for the "error steps and error type" setting.
gpt_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/gpt4o/gpt4o_selfcorrected.csv'
)

In [21]:
# How many rows carry each value of the `correct` flag.
gpt_corrected.loc[:, 'correct'].value_counts()

correct
False    43
True      7
Name: count, dtype: int64

In [22]:
# Frequency of each `error_type` label over all loaded GPT 4o rows.
gpt_corrected.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    19
(1.1.2 Value of a known variable calculated again)       11
(1.3.2 Incorrect value of variable cited or used)        10
(1.3.1.2 Relevant variable missing)                       8
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

In [23]:
# Error-type breakdown of the GPT 4o rows whose `correct` flag is True.
corrected_samples = gpt_corrected.loc[gpt_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.2 Relevant variable missing)                      3
(1.3.1.1 Irrelevant/incorrect variable cited or used)    2
(1.3.2 Incorrect value of variable cited or used)        2
Name: count, dtype: int64

In [24]:
# Error-type breakdown of the GPT 4o rows whose `correct` flag is False.
incorrect_samples = gpt_corrected.loc[gpt_corrected['correct'].eq(False)]
incorrect_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    17
(1.1.2 Value of a known variable calculated again)       11
(1.3.2 Incorrect value of variable cited or used)         8
(1.3.1.2 Relevant variable missing)                       5
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

### DeepSeek R1

In [None]:
# DeepSeek R1 self-correction results for the "error steps and error type" setting,
# followed by the tally of the `correct` flag.
# NOTE(review): this path has no per-model subdirectory, unlike
# '../data/responses/r1/r1_response_annotated.csv' -- confirm the file location.
r1_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/r1_self_corrected.csv'
)
r1_corrected.loc[:, 'correct'].value_counts()

correct
False    27
True     23
Name: count, dtype: int64

In [44]:
# Frequency of each `error_type` label over all loaded DeepSeek R1 rows.
r1_corrected.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    24
(1.1.2 Value of a known variable calculated again)       16
(1.3.2 Incorrect value of variable cited or used)         7
2.1 Unit Inconsistency)                                   2
2.3 Calculation Error)                                    1
Name: count, dtype: int64

In [39]:
# Error-type breakdown of the DeepSeek R1 rows whose `correct` flag is True.
corrected_samples = r1_corrected.loc[r1_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    13
(1.3.2 Incorrect value of variable cited or used)         7
2.3 Calculation Error)                                    1
(1.1.2 Value of a known variable calculated again)        1
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

In [41]:
# Error-type breakdown of the DeepSeek R1 rows whose `correct` flag is False.
incorrect_samples = r1_corrected.loc[r1_corrected['correct'].eq(False)]
incorrect_samples.loc[:, 'error_type'].value_counts()

error_type
(1.1.2 Value of a known variable calculated again)       15
(1.3.1.1 Irrelevant/incorrect variable cited or used)    11
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

### Phi 4

In [None]:
# Phi 4 self-correction results for the "error steps and error type" setting,
# followed by the tally of the `correct` flag.
phi_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/phi4/phi_selfcorrected_errorloc.csv'
)
phi_corrected.loc[:, 'correct'].value_counts()

correct
False    37
True     12
Name: count, dtype: int64

In [7]:
# Frequency of each `error_type` label over all loaded Phi 4 rows.
phi_corrected.loc[:, 'error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        18
(1.3.1.1 Irrelevant/incorrect variable cited or used)    11
(1.1.2 Value of a known variable calculated again)       11
(1.3.1.2 Relevant variable missing)                       3
2.3 Calculation Error)                                    3
(3.3.2 Incorrect value)                                   2
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

In [8]:
# Error-type breakdown of the Phi 4 rows whose `correct` flag is True.
corrected_samples = phi_corrected.loc[phi_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        5
2.3 Calculation Error)                                   3
(1.3.1.1 Irrelevant/incorrect variable cited or used)    2
(1.3.1.2 Relevant variable missing)                      1
(1.1.2 Value of a known variable calculated again)       1
Name: count, dtype: int64

In [9]:
# Error-type breakdown of the Phi 4 rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = phi_corrected[phi_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        13
(1.1.2 Value of a known variable calculated again)       10
(1.3.1.1 Irrelevant/incorrect variable cited or used)     9
(3.3.2 Incorrect value)                                   2
(1.3.1.2 Relevant variable missing)                       2
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

### Gemma 3 27B

In [None]:
# Gemma 3 27B self-correction results for the "error steps and error type" setting,
# followed by the tally of the `correct` flag.
gemma_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/gemma3/gemma3-27b_selfcorrected_errorloc.csv'
)
gemma_corrected.loc[:, 'correct'].value_counts()

correct
False    32
True     17
Name: count, dtype: int64

In [6]:
# Error-type breakdown of the Gemma 3 27B rows whose `correct` flag is True.
corrected_samples = gemma_corrected.loc[gemma_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    9
(1.3.1.2 Relevant variable missing)                      4
(1.3.2 Incorrect value of variable cited or used)        3
(1.1.2 Value of a known variable calculated again)       1
Name: count, dtype: int64

In [7]:
# Error-type breakdown of the Gemma 3 27B rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = gemma_corrected[gemma_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    14
(1.3.1.2 Relevant variable missing)                       6
(1.3.2 Incorrect value of variable cited or used)         6
(1.1.2 Value of a known variable calculated again)        4
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

## Self correction w/ general error location and error type

### GPT 4o

In [None]:
# GPT 4o self-correction results for the "general error location and error type" setting.
# NOTE(review): this path has no per-model subdirectory, unlike
# '../data/responses/gpt4o/gpt4o_selfcorrected.csv' -- confirm the file location.
gpt_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/gpt4o_selfcorrected_genloc.csv'
)

In [None]:
# How many rows carry each value of the `correct` flag.
gpt_corrected.loc[:, 'correct'].value_counts()

correct
False    45
True      5
Name: count, dtype: int64

In [7]:
# Error-type breakdown of the GPT 4o rows whose `correct` flag is True.
corrected_samples = gpt_corrected.loc[gpt_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    3
(1.3.2 Incorrect value of variable cited or used)        2
Name: count, dtype: int64

In [8]:
# Error-type breakdown of the GPT 4o rows whose `correct` flag is False.
incorrect_samples = gpt_corrected.loc[gpt_corrected['correct'].eq(False)]
incorrect_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    16
(1.1.2 Value of a known variable calculated again)       11
(1.3.2 Incorrect value of variable cited or used)         8
(1.3.1.2 Relevant variable missing)                       8
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

### DeepSeek R1

In [None]:
# DeepSeek R1 self-correction results for the "general error location and error type"
# setting, followed by the tally of the `correct` flag.
# NOTE(review): this path has no per-model subdirectory, unlike
# '../data/responses/r1/r1_response_annotated.csv' -- confirm the file location.
r1_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/r1_selfcorrected_genloc.csv'
)
r1_corrected.loc[:, 'correct'].value_counts()

correct
False    31
True     19
Name: count, dtype: int64

In [18]:
# Error-type breakdown of the DeepSeek R1 rows whose `correct` flag is True.
corrected_samples = r1_corrected.loc[r1_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    11
(1.3.2 Incorrect value of variable cited or used)         6
(1.1.2 Value of a known variable calculated again)        1
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

In [None]:
# Error-type breakdown of the DeepSeek R1 rows whose `correct` flag is False.
incorrect_samples = r1_corrected.loc[r1_corrected['correct'].eq(False)]
incorrect_samples.loc[:, 'error_type'].value_counts()

error_type
(1.1.2 Value of a known variable calculated again)       15
(1.3.1.1 Irrelevant/incorrect variable cited or used)    13
(1.3.2 Incorrect value of variable cited or used)         1
2.3 Calculation Error)                                    1
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

### Phi 4

In [None]:
# Phi 4 self-correction results for the "general error location and error type"
# setting, followed by the tally of the `correct` flag.
phi_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/phi4/phi4_selfcorrected_genloc.csv'
)
phi_corrected.loc[:, 'correct'].value_counts()

correct
False    40
True      9
Name: count, dtype: int64

In [12]:
# Error-type breakdown of the Phi 4 rows whose `correct` flag is True.
corrected_samples = phi_corrected.loc[phi_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        4
(1.3.1.1 Irrelevant/incorrect variable cited or used)    2
2.3 Calculation Error)                                   2
(1.1.2 Value of a known variable calculated again)       1
Name: count, dtype: int64

In [13]:
# Error-type breakdown of the Phi 4 rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = phi_corrected[phi_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        14
(1.1.2 Value of a known variable calculated again)       10
(1.3.1.1 Irrelevant/incorrect variable cited or used)     9
(1.3.1.2 Relevant variable missing)                       3
(3.3.2 Incorrect value)                                   2
2.3 Calculation Error)                                    1
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

### Gemma 3 27B

In [None]:
# Gemma 3 27B self-correction results for the "general error location and error type"
# setting, followed by the tally of the `correct` flag.
gemma_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/gemma3/gemma3-27b_selfcorrected_genloc.csv'
)
gemma_corrected.loc[:, 'correct'].value_counts()

correct
False    37
True     12
Name: count, dtype: int64

In [12]:
# Error-type breakdown of the Gemma 3 27B rows whose `correct` flag is True.
corrected_samples = gemma_corrected.loc[gemma_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    10
(1.3.1.2 Relevant variable missing)                       1
(1.3.2 Incorrect value of variable cited or used)         1
Name: count, dtype: int64

In [13]:
# Error-type breakdown of the Gemma 3 27B rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = gemma_corrected[gemma_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    13
(1.3.1.2 Relevant variable missing)                       9
(1.3.2 Incorrect value of variable cited or used)         8
(1.1.2 Value of a known variable calculated again)        5
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

## Self correction w/ error type but no error location

### GPT 4o

In [None]:
# GPT 4o self-correction results for the "no error location, with error type" setting.
# NOTE(review): this path has no per-model subdirectory, unlike
# '../data/responses/gpt4o/gpt4o_selfcorrected.csv' -- confirm the file location.
gpt_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/gpt4o_selfcorrected_noloc.csv'
)

In [10]:
# How many rows carry each value of the `correct` flag.
gpt_corrected.loc[:, 'correct'].value_counts()

correct
False    43
True      7
Name: count, dtype: int64

In [11]:
# Error-type breakdown of the GPT 4o rows whose `correct` flag is True.
corrected_samples = gpt_corrected.loc[gpt_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    3
(1.3.1.2 Relevant variable missing)                      3
(1.3.2 Incorrect value of variable cited or used)        1
Name: count, dtype: int64

In [12]:
# Error-type breakdown of the GPT 4o rows whose `correct` flag is False.
incorrect_samples = gpt_corrected.loc[gpt_corrected['correct'].eq(False)]
incorrect_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    16
(1.1.2 Value of a known variable calculated again)       11
(1.3.2 Incorrect value of variable cited or used)         9
(1.3.1.2 Relevant variable missing)                       5
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

### DeepSeek R1

In [None]:
# DeepSeek R1 self-correction results for the "no error location, with error type"
# setting, followed by the tally of the `correct` flag.
# NOTE(review): this path has no per-model subdirectory, unlike
# '../data/responses/r1/r1_response_annotated.csv' -- confirm the file location.
r1_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/r1_selfcorrected_noloc.csv'
)
r1_corrected.loc[:, 'correct'].value_counts()

correct
False    29
True     21
Name: count, dtype: int64

In [29]:
# Error-type breakdown of the DeepSeek R1 rows whose `correct` flag is True.
corrected_samples = r1_corrected.loc[r1_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    10
(1.3.2 Incorrect value of variable cited or used)         7
2.1 Unit Inconsistency)                                   2
2.3 Calculation Error)                                    1
(1.1.2 Value of a known variable calculated again)        1
Name: count, dtype: int64

In [30]:
# Error-type breakdown of the DeepSeek R1 rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = r1_corrected[r1_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.1.2 Value of a known variable calculated again)       15
(1.3.1.1 Irrelevant/incorrect variable cited or used)    14
Name: count, dtype: int64

### Phi 4

In [None]:
# Phi 4 self-correction results for the "no error location, with error type"
# setting, followed by the tally of the `correct` flag.
phi_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/phi4/phi4_selfcorrected_noloc.csv'
)
phi_corrected.loc[:, 'correct'].value_counts()

correct
False    40
True      9
Name: count, dtype: int64

In [15]:
# Error-type breakdown of the Phi 4 rows whose `correct` flag is True.
corrected_samples = phi_corrected.loc[phi_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        3
(1.3.1.1 Irrelevant/incorrect variable cited or used)    3
2.3 Calculation Error)                                   2
(1.1.2 Value of a known variable calculated again)       1
Name: count, dtype: int64

In [16]:
# Error-type breakdown of the Phi 4 rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = phi_corrected[phi_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        15
(1.1.2 Value of a known variable calculated again)       10
(1.3.1.1 Irrelevant/incorrect variable cited or used)     8
(1.3.1.2 Relevant variable missing)                       3
(3.3.2 Incorrect value)                                   2
2.3 Calculation Error)                                    1
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

### Gemma 3 27B

In [None]:
# Gemma 3 27B self-correction results for the "no error location, with error type"
# setting, followed by the tally of the `correct` flag.
gemma_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/gemma3/gemma3_selfcorrected_noloc.csv'
)
gemma_corrected.loc[:, 'correct'].value_counts()

correct
False    33
True     16
Name: count, dtype: int64

In [15]:
# Error-type breakdown of the Gemma 3 27B rows whose `correct` flag is True.
corrected_samples = gemma_corrected.loc[gemma_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    13
(1.3.1.2 Relevant variable missing)                       2
(1.3.2 Incorrect value of variable cited or used)         1
Name: count, dtype: int64

In [16]:
# Error-type breakdown of the Gemma 3 27B rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = gemma_corrected[gemma_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    10
(1.3.1.2 Relevant variable missing)                       8
(1.3.2 Incorrect value of variable cited or used)         8
(1.1.2 Value of a known variable calculated again)        5
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

## Baseline - neither error location nor error type

### GPT 4o

In [None]:
# GPT 4o self-correction results for the baseline setting.
# NOTE(review): this path has no per-model subdirectory, unlike
# '../data/responses/gpt4o/gpt4o_selfcorrected.csv' -- confirm the file location.
gpt_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/gpt4o_selfcorrected_baseline.csv'
)

In [14]:
# How many rows carry each value of the `correct` flag.
gpt_corrected.loc[:, 'correct'].value_counts()

correct
False    45
True      5
Name: count, dtype: int64

In [15]:
# Error-type breakdown of the GPT 4o rows whose `correct` flag is True.
corrected_samples = gpt_corrected.loc[gpt_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    3
(1.3.1.2 Relevant variable missing)                      1
(1.3.2 Incorrect value of variable cited or used)        1
Name: count, dtype: int64

In [16]:
# Error-type breakdown of the GPT 4o rows whose `correct` flag is False.
incorrect_samples = gpt_corrected.loc[gpt_corrected['correct'].eq(False)]
incorrect_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    16
(1.1.2 Value of a known variable calculated again)       11
(1.3.2 Incorrect value of variable cited or used)         9
(1.3.1.2 Relevant variable missing)                       7
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

### DeepSeek R1

In [None]:
# DeepSeek R1 self-correction results for the baseline setting,
# followed by the tally of the `correct` flag.
# NOTE(review): this path has no per-model subdirectory, unlike
# '../data/responses/r1/r1_response_annotated.csv' -- confirm the file location.
r1_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/r1_selfcorrected_baseline.csv'
)
r1_corrected.loc[:, 'correct'].value_counts()

correct
False    31
True     19
Name: count, dtype: int64

In [32]:
# Error-type breakdown of the DeepSeek R1 rows whose `correct` flag is True.
corrected_samples = r1_corrected.loc[r1_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    11
(1.3.2 Incorrect value of variable cited or used)         7
2.3 Calculation Error)                                    1
Name: count, dtype: int64

In [33]:
# Error-type breakdown of the DeepSeek R1 rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = r1_corrected[r1_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.1.2 Value of a known variable calculated again)       16
(1.3.1.1 Irrelevant/incorrect variable cited or used)    13
2.1 Unit Inconsistency)                                   2
Name: count, dtype: int64

### Phi 4

In [None]:
# Phi 4 self-correction results for the baseline setting,
# followed by the tally of the `correct` flag.
phi_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/phi4/phi4_selfcorrected_baseline.csv'
)
phi_corrected.loc[:, 'correct'].value_counts()

correct
False    40
True      9
Name: count, dtype: int64

In [18]:
# Error-type breakdown of the Phi 4 rows whose `correct` flag is True.
corrected_samples = phi_corrected.loc[phi_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        4
2.3 Calculation Error)                                   3
(1.1.2 Value of a known variable calculated again)       1
(1.3.1.1 Irrelevant/incorrect variable cited or used)    1
Name: count, dtype: int64

In [19]:
# Error-type breakdown of the Phi 4 rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = phi_corrected[phi_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.3.2 Incorrect value of variable cited or used)        14
(1.3.1.1 Irrelevant/incorrect variable cited or used)    10
(1.1.2 Value of a known variable calculated again)       10
(1.3.1.2 Relevant variable missing)                       3
(3.3.2 Incorrect value)                                   2
2.1 Unit Inconsistency)                                   1
Name: count, dtype: int64

### Gemma 3 27B

In [None]:
# Gemma 3 27B self-correction results for the baseline setting,
# followed by the tally of the `correct` flag.
gemma_corrected = pd.read_csv(
    filepath_or_buffer='../data/responses/gemma3/gemma3_selfcorrected_baseline.csv'
)
gemma_corrected.loc[:, 'correct'].value_counts()

correct
False    32
True     17
Name: count, dtype: int64

In [19]:
# Error-type breakdown of the Gemma 3 27B rows whose `correct` flag is True.
corrected_samples = gemma_corrected.loc[gemma_corrected['correct'].eq(True)]
corrected_samples.loc[:, 'error_type'].value_counts()

error_type
(1.3.1.1 Irrelevant/incorrect variable cited or used)    15
(1.3.1.2 Relevant variable missing)                       1
(1.3.2 Incorrect value of variable cited or used)         1
Name: count, dtype: int64

In [20]:
# Error-type breakdown of the Gemma 3 27B rows whose `correct` flag is False.
# Bound to `incorrect_samples` (the name the GPT 4o cells use for this subset)
# so the `correct == True` subset held in `corrected_samples` is not overwritten
# by rows of the opposite outcome.
incorrect_samples = gemma_corrected[gemma_corrected['correct'] == False]
incorrect_samples['error_type'].value_counts()

error_type
(1.3.1.2 Relevant variable missing)                      9
(1.3.2 Incorrect value of variable cited or used)        8
(1.3.1.1 Irrelevant/incorrect variable cited or used)    8
(1.1.2 Value of a known variable calculated again)       5
2.1 Unit Inconsistency)                                  2
Name: count, dtype: int64