In [1]:
import pandas as pd 
import os 

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

import seaborn as sns

from datasets import load_dataset
import numpy as np
import json

In [2]:
pwd

'/home/al2644/research/codebase/reasoning/perturb-r/notebook'

In [4]:
cd /home/al2644/research/codebase/reasoning/perturb-r

/home/al2644/research/codebase/reasoning/perturb-r


# Reasoning Results

In [5]:
root = "./results/aime2425/benchmark"
k = 8
for fname in os.listdir(root):
    if 'correct' not in fname:
        print(fname)
        df = pd.read_pickle(os.path.join(root, fname))
        pass_at_k = df.groupby("problem")[["correct"]].max()["correct"].mean()
        print(f"Pass@{k} Accuracy: {pass_at_k}")

Qwen3-1.7B.pickle
Pass@8 Accuracy: 0.6166666666666667
Qwen3-1.7B_nothinking.pickle
Pass@8 Accuracy: 0.2833333333333333
Qwen3-8B.pickle
Pass@8 Accuracy: 0.75
Qwen3-8B_nothinking.pickle
Pass@8 Accuracy: 0.4166666666666667


In [86]:
df = pd.read_pickle(os.path.join(root, "Qwen3-8B_nothinking.pickle"))

# Annotated Reasoning

In [74]:
import utils
import importlib
importlib.reload(utils)
from utils.process_thinking import chunk_tool

In [82]:
root = "./results/aime2425/annotated_reasoning/"
df = pd.read_pickle(os.path.join(root, "Qwen3-1.7B.pickle"))

In [92]:
row

problem                      Jen enters a lottery by picking $4$ distinct n...
solution                                                                   116
source                                                                 aime 24
response                     <think>\nOkay, so Jen is entering a lottery, r...
correct                                                                    1.0
chunks                       [<think>\n, Okay, so Jen is entering a lottery...
prompt                       <instruction>\nYou are given a sequence of rea...
idx                                                                         17
1st_answer_annotation_raw                    [<answer_chunk>19</answer_chunk>]
1st_answer_chunk_index                                                      19
Name: 17, dtype: object

In [76]:
idx = 10
row=df.iloc[idx]
problem, response, chunks, index, solution = row["problem"], row["response"], row["chunks"], row["1st_answer_chunk_index"], row["solution"]

In [83]:
print(index)

19


In [78]:
print(solution)

116


In [79]:
print(problem)

Jen enters a lottery by picking $4$ distinct numbers from $S=\{1,2,3,\cdots,9,10\}.$ $4$ numbers are randomly chosen from $S.$ She wins a prize if at least two of her numbers were $2$ of the randomly chosen numbers, and wins the grand prize if all four of her numbers were the randomly chosen numbers. The probability of her winning the grand prize given that she won a prize is $\tfrac{m}{n}$ where $m$ and $n$ are relatively prime positive integers. Find $m+n$.


In [85]:
first_attempt_chunks = chunks[:index+1]

In [86]:
stream=""
for i, chunk in enumerate(first_attempt_chunks):
    stream+=f"###START OF CHUNK{i}\n\n" + chunk  + f"\n###END OF CHUNK{i}\n\n"
print(stream)

###START OF CHUNK0

<think>

###END OF CHUNK0

###START OF CHUNK1

Okay, so Jen is entering a lottery, right? She picks 4 distinct numbers from the set S = {1, 2, 3, ..., 9, 10}. Then, 4 numbers are randomly chosen from S. She wins a prize if at least two of her numbers are in the randomly chosen set, and she wins the grand prize if all four of her numbers are in the randomly chosen set. We need to find the probability that she wins the grand prize given that she won a prize. The answer should be in the form m/n where m and n are coprime, and then find m + n.

###END OF CHUNK1

###START OF CHUNK2


Alright, let's break this down. So, the problem is a conditional probability question. The probability we need is P(grand prize | at least two of her numbers are in the randomly chosen set). In probability terms, that's P(grand prize and at least two numbers in the random set) divided by P(at least two numbers in the random set). 


###END OF CHUNK2

###START OF CHUNK3

But maybe there's a s

In [None]:
prompt = """<instruction> You are given a sequence of reasoning steps, divided into chunks, showing how a model solves a problem. For each chunk, your task is to classify it into deriving or checking.

A checking chunk focuses on checking statements derived in prior chunks. Below are some example cases: 
    1. The model only self-verifies and confirms the prior results.
    2. The model attempts an alternative approach, or continues an alternative approach that reaches the same intermediate results as prior chunks.
    3. The model merely rephrases the problem or prior conclusions.
A rule of thumb for identifying checking chunk is whether it does not reach new intermediate results not covered by prior chunks.

A deriving chunk should not just check prior results but derive a step closer to the answer.
    
Process the chunks from the beginning to the end. At the end, respond with a classification dictionary where each key is the CHUNK_NUMBER and each value is either "deriving" or "checking" within the <output_dict></output_dict> tags.

You MUST return the chunk number using the format: <output_dict>CLASSIFICATION</output_dict>
This exact format is required for downstream processing.
</instruction>
<problem>
{problem}
</problem>
<answer>
{solution}
</answer>
""".format(problem = problem, solution=solution)

In [91]:
print(prompt + stream)

<instruction> You are given a sequence of reasoning steps, divided into chunks, showing how a model solves a problem. For each chunk, your task is to classify it into deriving or checking.

A checking chunk focuses on checking statements in prior chunks. Below are some example cases: 
    1. The model only self-verifies and confirms its prior results.
    2. The model attempts an alternative approach, or continues an alternative approach that reaches the same intermediate results as prior chunks.
    3. The model merely rephrases the problem or prior conclusions.
A rule of thumb for identifying checking chunk is whether it reaches new intermediate results not covered by prior chunks.

A deriving chunk should not just check prior results but derive a step closer to the answer.
    
Process the chunks from the beginning to the end. At the end, respond with a dictionary where each key is the CHUNK_NUMBER and each value is either "deriving" or "checking" within the <output_dict></output_di

In [108]:
df = pd.read_pickle(os.path.join(root, "Qwen3-1.7B_classify_chunk.pickle"))

In [131]:
import re
import json
import ast

def extract_classification(response: list):
    response = response[0].replace("\n", "")
    
    pattern = r"<output_dict>(.*?)</output_dict>"
    match = re.search(pattern, response)
    
    if match:
        try:
            return ast.literal_eval(match.group(1))
        except:
            return match.group(1)
    else:
        return None

In [122]:
df["response"][0]

['<output_dict>{0: "deriving", 1: "deriving", 2: "deriving", 3: "deriving", 4: "deriving", 5: "deriving", 6: "deriving", 7: "deriving", 8: "deriving", 9: "deriving", 10: "deriving", 11: "deriving", 12: "deriving", 13: "deriving", 14: "deriving", 15: "deriving", 16: "deriving", 17: "deriving", 18: "deriving", 19: "deriving", 20: "checking", 21: "deriving", 22: "deriving", 23: "deriving", 24: "deriving", 25: "deriving", 26: "deriving", 27: "deriving", 28: "deriving", 29: "deriving", 30: "checking", 31: "checking", 32: "checking"}</output_dict>']

In [133]:
result = extract_classification(df["response"][1])

In [135]:
df["response"][1]

['<output_dict>\n{\n  0: "deriving",\n  1: "deriving",\n  2: "deriving",\n  3: "deriving",\n  4: "deriving",\n  5: "deriving",\n  6: "deriving",\n  7: "deriving",\n  8: "deriving",\n  9: "deriving",\n  10: "deriving",\n  11: "deriving",\n  12: "deriving",\n  13: "deriving",\n  14: "deriving",\n  15: "deriving",\n  16: "deriving",\n  17: "deriving",\n  18: "deriving",\n  19: "deriving"\n}\n</output_dict>']

In [132]:
ast.literal_eval(result)

{0: 'deriving',
 1: 'deriving',
 2: 'deriving',
 3: 'deriving',
 4: 'deriving',
 5: 'deriving',
 6: 'deriving',
 7: 'deriving',
 8: 'deriving',
 9: 'deriving',
 10: 'deriving',
 11: 'deriving',
 12: 'deriving',
 13: 'deriving',
 14: 'deriving',
 15: 'deriving',
 16: 'deriving',
 17: 'deriving',
 18: 'deriving',
 19: 'deriving',
 20: 'checking',
 21: 'deriving',
 22: 'deriving',
 23: 'deriving',
 24: 'deriving',
 25: 'deriving',
 26: 'deriving',
 27: 'deriving',
 28: 'deriving',
 29: 'deriving',
 30: 'checking',
 31: 'checking',
 32: 'checking'}