In [1]:
!pip install -qq trl

In [2]:
import torch
import gc
import pandas as pd
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset as HFDataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import DPOTrainer,DPOConfig

# =============================================================================
# 1. Configuration
# =============================================================================
# Every tunable of the experiment lives here; the cells below only read these names.
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"   # policy checkpoint handed to from_pretrained
DATASET_NAME = "gsm8k"                    # loaded with the 'main' config, train and test splits
NUM_ITERATIONS = 16     # Total number of I-DPO cycles (generate pairs -> train -> evaluate)
SAMPLES_PER_ITERATION = 1024 # Training questions sampled per iteration (adjust as needed)
EVAL_DATASET_SIZE = 128  # Number of test questions in the fixed evaluation subset
DPO_EPOCHS = 1             # Train for 1 epoch over the generated data
BATCH_SIZE = 4             # DPO per-device train batch size
GENERATION_BATCH_SIZE = 32 # Number of prompts per model.generate call
# With BATCH_SIZE = 4 this gives 4 * 32 = 128 preference pairs per optimizer step (per device).
GRADIENT_ACCUMULATION_STEPS = 32
LEARNING_RATE = 1e-5
DPO_BETA = 0.2             # The temperature parameter for DPO
# NOTE(review): MAX_LENGTH is not referenced by any code visible in this notebook
# (the DPOTrainer length arguments are commented out) - confirm whether it is still needed.
MAX_LENGTH = 1024
MAX_PROMPT_LENGTH = 512    # tokenizer truncation length for generation prompts
MAX_TARGET_LENGTH = 2048   # max_new_tokens for every generate call

# NOTE(review): DEVICE is not referenced by the visible code, which reads model.device instead.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# GSM8K 'main' config: the train split feeds preference-pair generation,
# the test split is the source of the fixed evaluation subset.
dataset = load_dataset(DATASET_NAME,'main',split='train')
test_dataset = load_dataset(DATASET_NAME,'main',split='test')


# Policy model. NOTE(review): no dtype is requested here, so the library default applies - confirm intended.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map = 'auto'
    )
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Batched tokenization pads, so fall back to EOS when the tokenizer ships without a pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# batch_generate strips the prompt by slicing off the padded input width,
# which only lines up when the padding sits on the left.
tokenizer.padding_side = 'left'

# LoRA adapters on the attention and MLP projection modules listed in target_modules.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Rebinds `model` to the PEFT-wrapped model; the recorded output reports ~1.18% trainable parameters.
model = get_peft_model(model,lora_config)
model.print_trainable_parameters()

trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


In [4]:
from google.colab import userdata
from openai import OpenAI
from pydantic import BaseModel
from typing import List

# The key is read from Colab's `userdata` store instead of being hard-coded in the notebook.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
# Module-level client shared by match_answers and get_hints_from_oracle.
client = OpenAI(api_key = OPENAI_API_KEY)

# Structured-output schema passed as `text_format` by match_answers.
class answer_verifier(BaseModel):
    # Judge verdict: whether the assistant's final answer matches the reference answer.
    do_answers_match: bool

# Structured-output schema passed as `text_format` by get_hints_from_oracle.
class make_hints(BaseModel):
    # Hints meant to steer the policy towards the reference answer.
    hints: List[str]
    # Oracle's judgement of the assistant's original (un-hinted) answer.
    was_assistant_answer_correct:bool

def match_answers(answer,gold_answer):
    """Ask the OpenAI judge whether the assistant's final answer matches the reference.

    Args:
        answer: Text produced by the virtual assistant.
        gold_answer: Reference answer text.

    Returns:
        bool: True when the judge reports a match. False when it reports a
        mismatch, or when the response carries no parsed structured output.

    Errors raised by the OpenAI client (rate limits, quota, network) propagate
    to the caller unchanged.
    """
    instructions = '''You will recieve the answer to a problem from a virtual assistant and the real answer. You will output True or False depending on whether the virtual assistant final answer is correct. (disregard the reasoning, all we care about is if the final answers match)'''
    prompt = f'''Virtual assistant answer: {answer}
    Real answer: {gold_answer}

    Do the final answers match? Return True or False'''
    response = client.responses.parse(
        model = 'gpt-4.1',
        instructions = instructions,
        input = prompt,
        text_format = answer_verifier
    )
    verdict = response.output_parsed
    # output_parsed may be missing (presumably on a refusal or an unparseable reply -
    # confirm against the SDK docs). Count that as "no match" rather than crashing
    # a long generation/evaluation run with an AttributeError.
    if verdict is None:
        return False
    return bool(verdict.do_answers_match)

def get_hints_from_oracle(question,answer,gold_answer):
    """Ask the OpenAI oracle for hints and for a verdict on the assistant's answer.

    Args:
        question: The problem statement.
        answer: The virtual assistant's (un-hinted) answer to the question.
        gold_answer: Reference answer text.

    Returns:
        tuple: ``(hints, was_assistant_answer_correct)`` where ``hints`` is a list
        of strings and the second element is a bool. When the response carries no
        parsed structured output, ``([], False)`` is returned.

    Errors raised by the OpenAI client (rate limits, quota, network) propagate
    to the caller unchanged.
    """
    # The instructions name all three inputs and explicitly request the correctness
    # verdict, so both fields of the `make_hints` schema are actually asked for.
    instructions = '''You will receive a question, a virtual assistant's answer to it, and the real (reference) answer. Your job is to provide a list of hints that strongly guide the assistant towards the correct answer while preventing the errors it has made. Also report whether the virtual assistant's final answer was already correct.'''
    prompt = f'''Question: {question}
    Virtual assistant answer: {answer}
    Real answer: {gold_answer}

    Please provide a list of hints to strongly guide the model towards the correct answer, while preventing any errors it has made'''

    response = client.responses.parse(
        model = 'gpt-4.1',
        instructions = instructions,
        input = prompt,
        text_format = make_hints
    )
    output = response.output_parsed
    # output_parsed may be missing (presumably on a refusal or an unparseable reply -
    # confirm against the SDK docs). Fall back to "no hints, not correct" so a single
    # bad sample does not abort the whole pair-generation pass.
    if output is None:
        return [], False
    return list(output.hints), bool(output.was_assistant_answer_correct)


In [5]:
def format_prompt(question,hints = None):
    """Build the chat-formatted generation prompt for one question.

    Args:
        question: The problem statement.
        hints: Optional hints. A string is inserted verbatim after ``Hints: ``.
            Any other iterable is rendered as one ``- hint`` bullet per item, so
            the model sees the hint text itself rather than a Python list repr
            (brackets, quotes and escaped backslashes). ``None``, an empty string
            or an empty iterable produce the plain, un-hinted prompt.

    Returns:
        str: The prompt produced by ``tokenizer.apply_chat_template`` with the
        generation prompt appended (not tokenized).
    """
    if hints is None:
        hint_section = ''
    elif isinstance(hints, str):
        hint_section = f"Hints: {hints}" if hints else ''
    else:
        bullets = "\n".join(f"- {hint}" for hint in hints)
        hint_section = f"Hints:\n{bullets}" if bullets else ''

    if hint_section:
        question =  f"Question: {question}\n{hint_section}\nAnswer:"
    else:
        question =  f"Question: {question}\nAnswer:"

    messages = [
        {'role':'system','content':'Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer'},
        {'role':'user','content':question}
    ]

    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


def batch_generate(model,device,prompts,temp = 0.7):
    """Sample one completion per prompt and return only the newly generated text.

    Args:
        model: Causal LM exposing ``generate``.
        device: Device the tokenized batch is moved to.
        prompts: Batch of already chat-formatted prompt strings.
        temp: Sampling temperature (nucleus sampling with ``top_p = 0.95``).

    Returns:
        list[str]: Decoded completions with special tokens removed, in the same
        order as ``prompts``. An empty batch returns ``[]``.
    """
    if len(prompts) == 0:
        return []

    # The prompt is stripped below by slicing off the padded input width, which is
    # only correct with left padding. Enforce it here instead of relying on the
    # caller having set the global tokenizer state, then restore the previous side.
    previous_side = tokenizer.padding_side
    tokenizer.padding_side = 'left'
    try:
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation = True,max_length=MAX_PROMPT_LENGTH).to(device)
    finally:
        tokenizer.padding_side = previous_side

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_TARGET_LENGTH,
            do_sample = True,
            top_p = 0.95,
            temperature = temp,
            # Pass the pad id explicitly so generate pads finished sequences with
            # the same token the tokenizer used for the inputs.
            pad_token_id = tokenizer.pad_token_id,
        )

    # Everything past the (left-padded) prompt width is newly generated text.
    prompt_width = inputs['input_ids'].shape[1]
    generated_tokens = outputs[:, prompt_width:]
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

def create_dynamic_dpo_dataset(model,dataset_subset,verbose = False):
    """Build DPO preference pairs from the model's own un-hinted and hinted answers.

    For every question the model first answers without help (the "rejected"
    candidate). The oracle then judges that answer and writes hints, and the model
    answers again with the hints in its prompt (the "chosen" candidate). A pair is
    kept only when the oracle judged the first answer wrong and ``match_answers``
    confirms the hinted answer. The stored prompt is the un-hinted one.

    Args:
        model: Policy model; switched to eval mode and read for its ``device``.
        dataset_subset: Mapping-like object with 'question' and 'answer' columns.
        verbose: When True and at least one pair was created, print a sample pair
            and summary accuracies.

    Returns:
        list[dict]: Items with keys 'prompt', 'chosen' and 'rejected'. May be empty.
    """
    model.eval()
    device = model.device

    questions = dataset_subset['question']
    answers = dataset_subset['answer']
    num_questions = len(questions)

    prompts = [format_prompt(q) for q in questions]
    print('Generating initial (rejected) answers)...')
    rejected_answers = []
    for start in tqdm(range(0,len(prompts),GENERATION_BATCH_SIZE)):
        batch = prompts[start:start+GENERATION_BATCH_SIZE]
        rejected_answers += batch_generate(model,device,batch)

    print('Generating augmented prompts...')
    augmented_prompts = []
    rejected_correctness = []
    # zip() has no length, so give tqdm the total explicitly to get a real progress bar.
    for q,r,a in tqdm(zip(questions,rejected_answers,answers), total=num_questions):
        hints,correct = get_hints_from_oracle(q,r,a)
        rejected_correctness.append(correct)
        augmented_prompts.append(format_prompt(q,hints))
    total_rejected_correct = sum(1 for flag in rejected_correctness if flag)

    print('Generating augmented (chosen) answers...')
    chosen_answers = []
    for start in tqdm(range(0,len(augmented_prompts),GENERATION_BATCH_SIZE)):
        batch = augmented_prompts[start:start+GENERATION_BATCH_SIZE]
        chosen_answers += batch_generate(model,device,batch,temp = 0.2)

    print('Verifying answers...')
    total_false_rejected = 0
    preference_data = []
    samples = zip(prompts,chosen_answers,rejected_answers,rejected_correctness,answers)
    for p,c,r,rejected_was_correct,a in tqdm(samples, total=num_questions):
        # Only questions the model initially got wrong can yield a pair, and only
        # those hinted answers are sent to the (paid) verifier.
        if rejected_was_correct:
            continue
        total_false_rejected += 1
        if match_answers(c,a):
            preference_data.append({'prompt':p,'chosen':c,'rejected':r})
    total_false_rejected_but_true_chosen = len(preference_data)

    if len(preference_data) == 0:
        print("NO PAIRS WERE CREATED")
    elif verbose:
        print(f'CREATED {len(preference_data)} pairs')
        sample = preference_data[0]
        print('+++++++SAMPLE++++++++')
        print(f"Prompt: {sample['prompt']}")
        print('=====================')
        print(f"Chosen: {sample['chosen']}")
        print('=====================')
        print(f"Rejected: {sample['rejected']}")
        print('++++++++++++++++++++++')
        # Both denominators are non-zero here: at least one pair exists, so there is
        # at least one question and at least one initially-wrong answer.
        print(f'Training_accuracy = {total_rejected_correct/len(questions)}')
        print(f'Accepted_accuracy = {total_false_rejected_but_true_chosen/total_false_rejected}')

    return preference_data







In [6]:
# Fixed evaluation subset: a seeded shuffle of the test split, truncated to
# EVAL_DATASET_SIZE examples, so every iteration is scored on the same questions.
eval_dataset = test_dataset.shuffle(seed = 42).select(range(EVAL_DATASET_SIZE))
# Prompts are pre-rendered once (no hints); reference answers are kept alongside.
eval_questions = list(map(format_prompt, eval_dataset['question']))
eval_answers = list(eval_dataset['answer'])

In [7]:
def evaluate_model(model,verbose = False):
    """Score the model on the fixed evaluation prompts using the OpenAI judge.

    Generates an answer for every entry of ``eval_questions`` (temperature 0.01)
    and asks ``match_answers`` whether it agrees with the matching entry of
    ``eval_answers``.

    Args:
        model: Policy model; switched to eval mode and read for its ``device``.
        verbose: When True, print the first two question/answer/verdict records.

    Returns:
        float: Fraction of evaluation questions judged correct, or 0.0 when the
        evaluation set is empty.
    """
    model.eval()
    device = model.device
    print(f'Evaluating model on {len(eval_questions)} questions')

    model_answers = []
    for start in tqdm(range(0,len(eval_questions),GENERATION_BATCH_SIZE)):
        batch = eval_questions[start:start+GENERATION_BATCH_SIZE]
        model_answers += batch_generate(model,device,batch, temp = 0.01)

    correct = 0
    total = 0
    shown = 0
    for q,ma,a in zip(eval_questions,model_answers,eval_answers):
        is_correct = match_answers(ma,a)
        if verbose and shown < 2:
            shown += 1
            # '\n' (not '/n') so the header is followed by a blank line instead of a literal "/n".
            print(f'++++++++QUESTION {total+1}+++++++++\n')
            print(f'Question: {q}')
            print('=========')
            print(f'Generated answer: {ma}')
            print('=========')
            print(f'Correct answer: {a}')
            print('=========')
            print(f'Is Correct?: {is_correct}')
            print('=========')
        if is_correct:
            correct += 1
        total += 1

    # Nothing to score: report 0.0 rather than dividing by zero.
    if total == 0:
        return 0.0
    return correct/total





In [8]:
print('\nStarting iterative DPO training...')

# Baseline score before any DPO update.
eval_accuracy = evaluate_model(model,verbose = True)
print('BASELINE ACCURACY')
print(f'Evaluation accuracy: {eval_accuracy}')
for iteration in range(NUM_ITERATIONS):
    print(f'\nStarting iteration {iteration+1}/{NUM_ITERATIONS}')

    # A different seed per iteration draws a different subset of training questions.
    dataset_subset = dataset.shuffle(seed = 42 + iteration).select(range(min(SAMPLES_PER_ITERATION,len(dataset))))

    # Generation needs left padding; DPO training below switches to right padding.
    tokenizer.padding_side = 'left'
    preference_dataset_list = create_dynamic_dpo_dataset(model,dataset_subset,verbose = True)

    print(f"Generated {len(preference_dataset_list)} preference pairs")

    # An empty list yields a dataset without prompt/chosen/rejected columns, which
    # the trainer cannot consume. The model would be unchanged anyway, so skip the
    # update (and the redundant, paid re-evaluation) for this iteration.
    if len(preference_dataset_list) == 0:
        print('Skipping DPO update: no preference pairs to train on')
        del preference_dataset_list
        continue

    preference_df = pd.DataFrame(preference_dataset_list)
    train_dataset = HFDataset.from_pandas(preference_df)

    tokenizer.padding_side = 'right'
    training_args = DPOConfig(
        output_dir = f'./idpo_output_iteration{iteration+1}',
        learning_rate = LEARNING_RATE,
        per_device_train_batch_size = BATCH_SIZE,
        gradient_accumulation_steps = GRADIENT_ACCUMULATION_STEPS,
        num_train_epochs = DPO_EPOCHS,
        # One optimizer step consumes BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS pairs,
        # so a few hundred pairs give only a handful of steps per iteration; logging
        # every step keeps the loss table from coming out empty.
        logging_steps = 1,
        remove_unused_columns = False, #important for dpo
        optim = 'adamw_torch',
        save_strategy = 'no',
        lr_scheduler_type = 'cosine', #lr_scheduler_type instead of lr_scheduler
        warmup_ratio = 0.1,
        beta = DPO_BETA,
    )

    # NOTE(review): with ref_model = None and a PEFT model, TRL presumably derives the
    # reference policy by disabling the adapters - confirm against the TRL docs.
    # NOTE(review): MAX_LENGTH / MAX_PROMPT_LENGTH are not forwarded, so the trainer's
    # own length defaults apply - confirm that is intended.
    dpo_trainer = DPOTrainer(
        model = model,
        ref_model = None,
        args = training_args,
        train_dataset = train_dataset,
        processing_class = tokenizer,
    )

    dpo_trainer.train()

    model = dpo_trainer.model

    tokenizer.padding_side = 'left'
    eval_accuracy = evaluate_model(model,verbose = True)
    print(f'Evaluation accuracy: {eval_accuracy}')

    # Drop the references first, collect garbage, and only then ask the CUDA
    # allocator to release its cache, so the freed blocks can actually be returned.
    del dpo_trainer, train_dataset, preference_df, preference_dataset_list
    gc.collect()
    torch.cuda.empty_cache()



Starting iterative DPO training...
Evaluating model on 128 questions


  0%|          | 0/4 [00:00<?, ?it/s]

++++++++QUESTION 1+++++++++/n
Question: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Darrell and Allen's ages are in the ratio of 7:11. If their total age now is 162, calculate Allen's age 10 years from now.
Answer:<|im_end|>
<|im_start|>assistant

Generated answer: To solve this problem, let's first find out how old each person is based on the given ratio.

Given that the ratio of Darrell's age to Allen's age is \(7:11\), we can represent their ages as follows:

- Let \(D\) be Darrell's age,
- Let \(A\) be Allen's age.

According to the ratio:
\[ \frac{D}{A} = \frac{7}{11} \]

We also know that together they have a total age of 162 years:
\[ D + A = 162 \]

Now, let's express one variable in terms of the other using the ratio equation:
\[ D = \frac{7}{11}A \]

Substituting \(D\) into the total age equation gives us:
\[ \frac{7}{11}A + A = 162 \]
\[ \frac{18}{11}A = 162 \]
\

  0%|          | 0/32 [00:00<?, ?it/s]

Generating augmented prompts...


0it [00:00, ?it/s]

Generating augmented (chosen) answers...


  0%|          | 0/32 [00:00<?, ?it/s]

Verifying answers...


0it [00:00, ?it/s]

CREATED 240 pairs
+++++++SAMPLE++++++++
Prompt: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Frankie's parents let him have many pets. He has six more snakes than he has cats. He has one less parrot than cats. Six of his pets have four legs. He has 2 dogs. How many pets does he have in total?
Answer:<|im_end|>
<|im_start|>assistant

Chosen: To find out how many pets Frankie has in total, we can follow these steps:

1. **Identify the four-legged pets**: Since only dogs and cats have four legs, we know there are 2 dogs and 4 cats.

2. **Calculate the number of snakes**: We're told that Frankie has six more snakes than he has cats. If he has 4 cats, then he has \(4 + 6 = 10\) snakes.

3. **Determine the number of parrots**: The hint states that he has one less parrot than cats. Given that he has 4 cats, this means he has \(4 - 1 = 3\) parrots.

4. **Sum up all the pets**: Add u

Extracting prompt in train dataset:   0%|          | 0/240 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/240 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/240 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mkokorikos125[0m ([33mkokorikos125-ucl[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss


Evaluating model on 128 questions


  0%|          | 0/4 [00:00<?, ?it/s]

++++++++QUESTION 1+++++++++/n
Question: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Darrell and Allen's ages are in the ratio of 7:11. If their total age now is 162, calculate Allen's age 10 years from now.
Answer:<|im_end|>
<|im_start|>assistant

Generated answer: To solve this problem, let's first find out how old each person is based on the given ratio.

Given that the ratio of Darrell's age to Allen's age is \(7:11\), we can represent their ages as follows:

- Let \(D\) be Darrell's age,
- Let \(A\) be Allen's age.

According to the ratio:
\[ \frac{D}{A} = \frac{7}{11} \]

We also know that together they have a total age of 162 years:
\[ D + A = 162 \]

Now, let's express one variable in terms of the other using the ratio equation:
\[ D = \frac{7}{11}A \]

Substituting \(D\) into the total age equation gives us:
\[ \frac{7}{11}A + A = 162 \]
\[ \frac{18}{11}A = 162 \]
\

  0%|          | 0/32 [00:00<?, ?it/s]

Generating augmented prompts...


0it [00:00, ?it/s]

Generating augmented (chosen) answers...


  0%|          | 0/32 [00:00<?, ?it/s]

Verifying answers...


0it [00:00, ?it/s]

CREATED 289 pairs
+++++++SAMPLE++++++++
Prompt: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Scott has 7 pairs of shoes.  Anthony has 3 times as many pairs of shoes as Scott, and Jim has 2 less pairs than Anthony.  How many more pairs of shoes does Anthony have compared to Jim?
Answer:<|im_end|>
<|im_start|>assistant

Chosen: Anthony has 21 pairs of shoes. Jim has 19 pairs of shoes. So, Anthony has 21 - 19 = 2 more pairs of shoes than Jim.
Rejected: To solve this problem, let's break it down into steps:

1. **Calculate how many pairs of shoes Anthony has**: 
   - Scott has 7 pairs of shoes.
   - Anthony has \(3\) times as many pairs of shoes as Scott, which is \(3 \times 7 = 21\).

2. **Calculate how many pairs of shoes Jim has**: 
   - Anthony has \(2\) less pairs than Jim. Since we know that Anthony has \(21\) pairs of shoes, then Jim has \(21 + 2 = 23\) pairs.

3. **Deter

Extracting prompt in train dataset:   0%|          | 0/289 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/289 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/289 [00:00<?, ? examples/s]



Step,Training Loss


Evaluating model on 128 questions


  0%|          | 0/4 [00:00<?, ?it/s]

++++++++QUESTION 1+++++++++/n
Question: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Darrell and Allen's ages are in the ratio of 7:11. If their total age now is 162, calculate Allen's age 10 years from now.
Answer:<|im_end|>
<|im_start|>assistant

Generated answer: To solve this problem, let's first find out how old each person is based on the given ratio.

Given that the ratio of Darrell's age to Allen's age is \(7:11\), we can represent their ages as follows:

- Let \(D\) be Darrell's age,
- Let \(A\) be Allen's age.

According to the ratio:
\[ \frac{D}{A} = \frac{7}{11} \]

We also know that together they have a total age of 162 years:
\[ D + A = 162 \]

Now, let's express one variable in terms of the other using the ratio equation:
\[ D = \frac{7}{11}A \]

Substituting \(D\) into the total age equation gives us:
\[ \frac{7}{11}A + A = 162 \]
\[ \frac{18}{11}A = 162 \]
\

  0%|          | 0/32 [00:00<?, ?it/s]

Generating augmented prompts...


0it [00:00, ?it/s]

Generating augmented (chosen) answers...


  0%|          | 0/32 [00:00<?, ?it/s]

Verifying answers...


0it [00:00, ?it/s]

CREATED 298 pairs
+++++++SAMPLE++++++++
Prompt: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Thomas wants to throw a party for his best friend Casey. He needs to order enough chairs for all the party guests to sit in, one for each guest. First he orders 3 dozen chairs for 3 dozen guests he invites. Then he finds out that 1/3 of the guests want to bring a guest of their own, so Thomas needs to order more chairs. Eventually, Thomas finds out that 5 of the guests he originally invited can't make it. Thomas would also like to order 12 extra chairs just in case they need them. How many chairs is Thomas going to order for Casey's party?
Answer:<|im_end|>
<|im_start|>assistant

Chosen: Thomas initially invited 3 dozen guests, which is equal to 36 guests. 

Then, 1/3 of these guests decided to bring an additional guest, making the total number of guests 12 more than the initial 36.


Extracting prompt in train dataset:   0%|          | 0/298 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/298 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/298 [00:00<?, ? examples/s]



Step,Training Loss


Evaluating model on 128 questions


  0%|          | 0/4 [00:00<?, ?it/s]

++++++++QUESTION 1+++++++++/n
Question: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Darrell and Allen's ages are in the ratio of 7:11. If their total age now is 162, calculate Allen's age 10 years from now.
Answer:<|im_end|>
<|im_start|>assistant

Generated answer: To solve this problem, we first need to understand that the ratio of Darrell's age to Allen's age is 7:11. This means for every 7 units of Darrell's age, there are 11 units of Allen's age.

Given that their combined current age is 162, let's denote Darrell's current age as \(D\) and Allen's current age as \(A\). According to the information provided:

\[ \frac{D}{A} = \frac{7}{11} \]

We also know that their combined current age is 162, so:

\[ D + A = 162 \]

From the equation above, we can express one variable in terms of the other:

\[ A = 162 - D \]

Substituting \(A\) in terms of \(D\) into the ratio equatio

  0%|          | 0/32 [00:00<?, ?it/s]

Generating augmented prompts...


0it [00:00, ?it/s]

Generating augmented (chosen) answers...


  0%|          | 0/32 [00:00<?, ?it/s]

Verifying answers...


0it [00:00, ?it/s]

CREATED 294 pairs
+++++++SAMPLE++++++++
Prompt: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: A boat is traveling across a square lake at 10 MPH. It takes 2 hours to go across the length of the lake. It then travels the whole width of the lake at the same speed, which takes 30 minutes. How many square miles is the lake?
Answer:<|im_end|>
<|im_start|>assistant

Chosen: To solve this problem, we need to first determine the length and width of the lake based on the information provided.

Given:
- Boat's speed across the lake = 10 mph
- Time taken to travel across the length of the lake = 2 hours
- Time taken to travel across the width of the lake = 30 minutes

First, let's convert the time into hours since the speed is given in miles per hour.
- Time to travel across the length = 2 hours
- Time to travel across the width = 30 minutes = 0.5 hours

Now, let's calculate the length 

Extracting prompt in train dataset:   0%|          | 0/294 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/294 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/294 [00:00<?, ? examples/s]



Step,Training Loss


Evaluating model on 128 questions


  0%|          | 0/4 [00:00<?, ?it/s]

++++++++QUESTION 1+++++++++/n
Question: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Darrell and Allen's ages are in the ratio of 7:11. If their total age now is 162, calculate Allen's age 10 years from now.
Answer:<|im_end|>
<|im_start|>assistant

Generated answer: To solve this problem, we first need to find out how old each person is based on the given ratio.

Given that the ratio of Darrell's age to Allen's age is 7:11, let's denote Darrell's current age as \(7x\) and Allen's current age as \(11x\), where \(x\) is a common multiplier for both ages.

According to the information provided, their combined age is currently 162:

\[7x + 11x = 162\]

Combining like terms gives us:

\[18x = 162\]

Dividing both sides by 18 yields:

\[x = \frac{162}{18} = 9\]

So, Darrell is currently \(7x = 7 \times 9 = 63\) years old, and Allen is \(11x = 11 \times 9 = 99\) years old.

Now, to

  0%|          | 0/32 [00:00<?, ?it/s]

Generating augmented prompts...


0it [00:00, ?it/s]

Generating augmented (chosen) answers...


  0%|          | 0/32 [00:00<?, ?it/s]

Verifying answers...


0it [00:00, ?it/s]

CREATED 326 pairs
+++++++SAMPLE++++++++
Prompt: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: A building has 20 floors. Each floor is 3 meters high, except for the last two floors. The last two floors are each 0.5 meters higher. How tall is the building?
Answer:<|im_end|>
<|im_start|>assistant

Chosen: To solve this problem, we need to follow the steps provided in the hint:

1. Count the number of floors that are not the last two.
2. Multiply the number of non-last-two floors by their height.
3. Add the height of the last two floors.

Let's calculate it step-by-step:

- There are 20 floors in total.
- The last two floors are each 0.5 meters higher than the standard height of 3 meters. So, each of them is 3 + 0.5 = 3.5 meters high.
- Since there are 18 non-last-two floors, and each is 3 meters high, the total height of these floors is 18 * 3 = 54 meters.
- Adding the height of

Extracting prompt in train dataset:   0%|          | 0/326 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/326 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/326 [00:00<?, ? examples/s]



Step,Training Loss


Evaluating model on 128 questions


  0%|          | 0/4 [00:00<?, ?it/s]

++++++++QUESTION 1+++++++++/n
Question: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Darrell and Allen's ages are in the ratio of 7:11. If their total age now is 162, calculate Allen's age 10 years from now.
Answer:<|im_end|>
<|im_start|>assistant

Generated answer: To solve this problem, we first need to find out how old each person is based on the given ratio.

Given that the ratio of Darrell's age to Allen's age is 7:11, let's denote Darrell's current age as \(7x\) and Allen's current age as \(11x\), where \(x\) is a common multiplier for both ages.

According to the information provided, their combined age is currently 162:

\[7x + 11x = 162\]

Combining like terms gives us:

\[18x = 162\]

Dividing both sides by 18 yields:

\[x = \frac{162}{18} = 9\]

So, Darrell is currently \(7x = 7 \times 9 = 63\) years old, and Allen is \(11x = 11 \times 9 = 99\) years old.

Now, to

  0%|          | 0/32 [00:00<?, ?it/s]

Generating augmented prompts...


0it [00:00, ?it/s]

Generating augmented (chosen) answers...


  0%|          | 0/32 [00:00<?, ?it/s]

Verifying answers...


0it [00:00, ?it/s]

CREATED 330 pairs
+++++++SAMPLE++++++++
Prompt: <|im_start|>system
Solve the problem step by step. Effectively utilize any hints given to you. Clearly state your answer<|im_end|>
<|im_start|>user
Question: Alfonso earns $6 each day walking his aunt’s dog. He is saving to buy a mountain bike helmet for $340. Currently, he already has $40. If he walks his aunt's dog 5 days a week, in how many weeks should Alfonso work to buy his mountain bike?
Answer:<|im_end|>
<|im_start|>assistant

Chosen: To solve this problem, we need to calculate how much more money Alfonso needs to save and then divide that amount by the number of weeks he will work.

1. Calculate how much more money Alfonso needs to save:
   $340 - $40 = $300

2. Calculate how much Alfonso earns per week:
   $6/day * 5 days/week = $30/week

3. Divide the total amount Alfonso still needs to earn by his weekly earnings:
   $300 / $30/week = 10 weeks

Therefore, Alfonso should work for 10 weeks to save enough money to buy his mountai

Extracting prompt in train dataset:   0%|          | 0/330 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/330 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/330 [00:00<?, ? examples/s]



Step,Training Loss


Evaluating model on 128 questions


  0%|          | 0/4 [00:00<?, ?it/s]

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}