# Overview

Let's build a dataset with questions, SoT(rasoning path) with different temperatures and answers from different reasoning paths. There are many benefits:
* We generate reasoning path by using GPT3.5-turbo LLM, it isn't the state of art LLM.
* We have to build a dataset for fine-tuning our encoder, this is the part of our experiment.
* We will update all the answers of all the questions. The first version of dataset, we only record CoT(reasoning path) for fine-tuning model. This time we also want to track the answers.

In [1]:
from datasets import load_dataset

df=load_dataset("openai/gsm8k", "main", split="test").to_pandas().head(1250)
df.shape

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

(1250, 2)

In [2]:
df['ground_truth'] = df['answer'].str.extract(r'####\s*(.*)', expand=False)

In [3]:
import re
from typing import Tuple, Union, Optional

from pydantic import BaseModel

from sketch_of_thought import SoT
from doraemon import Doraemon

class Entity(BaseModel):
    question: str
    answer: str
    ground_truth: str

class ReasoningPath(BaseModel):
    question: str
    reason: str
    answer: Optional[str]=None
    ground_truth: str
    temperature: float
    tokens: int
    score: float


logger=Doraemon.get_logger(name=__name__, logfile="gsm8k_sot_dataset_builder.log")

entities=[Entity.model_validate(entity) for entity in df.to_dict(orient='records')]


def get_answer(raw_answer: str)-> str:
    answer=re.search(r"\\boxed\{(.*?)\}", raw_answer)
    if answer:
        return answer.group(1)
    return None


def process_entity(args)-> Optional[ReasoningPath]:
    
    et,paradigm,temperature=args

    try:
        prompt=SoT.get_initialized_prompt(
            paradigm=paradigm, 
            question=f"Question:{et.question}\n"
        )
        
        r_s, tokens=Doraemon.inference(
            logger=logger, 
            messages=prompt, 
            temperature=temperature
        )
        
        result=ReasoningPath(
            question=str(et.question), 
            reason=str(r_s), 
            answer=get_answer(r_s), 
            ground_truth=str(et.ground_truth), 
            temperature=float(temperature), 
            tokens=int(tokens), 
            score=0.0
        )
        return result
    except Exception as e:
        logger.error(f"Error processing quetion {et.question} at temperature {temperature} with exception {e}")
        return None

paradigm = SoT.classify_question(entities[0].question)
logger.info(paradigm)

2025-06-19 12:36:51.178448: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750336611.404034      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750336611.466312      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/769 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

2025-06-19 12:37:08,236 INFO chunked_symbolism


In [4]:
temperatures = [i * 0.25 for i in range(9)]  # [0.0, 0.25, 0.5, ... ,2.0]

tasks=[]
for et in entities:
    for tp in temperatures:
        tasks.append((et, paradigm, tp))
tasks[0]

(Entity(question="Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?", answer='Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18', ground_truth='18'),
 'chunked_symbolism',
 0.0)

In [5]:
from tqdm import tqdm
import concurrent.futures

with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
    results=list(tqdm(executor.map(process_entity, tasks), total=len(tasks)))

 22%|██▏       | 2455/11250 [36:21<3:00:17,  1.23s/it]2025-06-19 13:13:33,141 ERROR Error processing quetion A mother goes shopping. She buys cocoa at $4.20, laundry at $9.45 and a package of pasta at $1.35. She pays $20. How much change does the cashier give back? at temperature 1.75 with exception Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-05-01-preview have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 1 second. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit. For Free Account customers, upgrade to Pay as you Go here: https://aka.ms/429TrialUpgrade.'}}
 22%|██▏       | 2459/11250 [36:32<5:47:47,  2.37s/it]2025-06-19 13:13:43,546 ERROR Error processing quetion Terri is knitting a sweater with two sleeves, a collar, and a decorative rosette. The body of the sweater takes 900 stitches to co

In [6]:
import pandas as pd
import pickle


pd.DataFrame([rp.model_dump() for rp in results if rp is not None]).to_pickle('sots.pkl')