In [1]:
from datasets import load_dataset

# 1) Load the HF dataset
dataset = load_dataset("aqua_rat", "raw")  # already splits into train/validation/test

def make_prompt_and_target(example):
    question = example["question"]
    opts = example["options"]
    # Format options on one line, separated by two spaces
    opt_line = "  ".join(opts)

    # Join the rationale list into newline‐separated text
    rationale_text = (example["rationale"])

    # Final target .= rationale + “The answer is <letter>.”
    target = rationale_text + "\n####The answer is " + example["correct"] + "."

    # Build the prompt
    prompt = f"Question: {question}\nOptions: {opt_line}\nAnswer:\n"

    return {"prompt": prompt, "completion": target}

# 2) Map over the split
train = dataset["train"].map(make_prompt_and_target, remove_columns=dataset["train"].column_names)
validation = dataset["validation"].map(make_prompt_and_target, remove_columns=dataset["validation"].column_names)


Using the latest cached version of the dataset since aqua_rat couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'raw' at /home/ivlabs/.cache/huggingface/datasets/aqua_rat/raw/0.0.0/33301c6a050c96af81f63cad5562cb5363e88971 (last modified on Wed Jun  4 16:40:34 2025).


In [2]:
train

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 97467
})

In [3]:
train[4]

{'prompt': 'Question: The speed at which a man can row a boat in still water is 25 kmph. If he rows downstream, where the speed of current is 11 kmph, what time will he take to cover 80 metres?\nOptions: A)18 seconds  B)27 seconds  C)26 seconds  D)12 seconds  E)8 seconds\nAnswer:\n',
 'completion': 'Speed of the boat downstream = 25 +11\n= 36 kmph\n= 36 * 5/18 = 10 m/s\nHence time taken to cover 80 m = 80/10\n= 8 seconds.\nAnswer:E\n####The answer is E.'}

In [4]:
validation[7]

{'prompt': 'Question: Let A, B and C denote the vertices of a triangle with area 10. Let point D be on side AB,\npoint E be on side BC and point F be on side CA with AD = 2 and DB = 3. The area of\n△ABE and the area of quadrilateral DBEF are the same. What is the value of this area?\nOptions: A)5.5  B)6  C)7  D)8  E)8.25\nAnswer:\n',
 'completion': 'First, note that 4AFD and 4AFE have the same area. Therefore, ED is parallel to AF.\nSince 4DEB and 4ACB are similar, BE=EC = 3=2. Thus, the area of 4AEB is 3=5ths of the\narea of 4ABC.\ncorrect answer B\n####The answer is B.'}

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")

In [6]:
tokenizer.pad_token = tokenizer.eos_token

In [7]:
from trl import SFTConfig, SFTTrainer
from transformers import TrainingArguments, Trainer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device for training: {device}")


training_args = TrainingArguments(
    output_dir="./smolmath-sft2-cot", # Output directory
    num_train_epochs=1, # Number of training epochs
    per_device_train_batch_size=4, # Batch size per device during training
    save_steps=10_000, # Save checkpoint every X updates steps
    save_total_limit=2, # Limit the total amount of checkpoints
    logging_dir="./logs", # Directory for storing logs
    logging_steps=50,
    learning_rate=3e-5,
    weight_decay=0.01,
    eval_strategy="steps", # Evaluate every X steps
    eval_steps=1500,
    # use_cpu = True if device.type == 'cpu' else False # Explicitly set use_cpu
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    eval_dataset=validation,
    processing_class=tokenizer,
    args=training_args,
    #dataset_text_field="text", # The column in the dataset containing the text
    #max_seq_length=512, # Maximum sequence length for training
)

trainer.train()

trainer.save_model("./SmolMath-v2-SFT-CoT-AQuA")

Using device for training: cuda


Applying chat template to train dataset:   0%|          | 0/97467 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/97467 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/97467 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/254 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/254 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/254 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33mbt22ece049[0m ([33mbt21ece003-nit-nagpur[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
1500,1.3302,1.549816
3000,1.2648,1.529393
4500,1.274,1.515157
6000,1.182,1.51577
7500,1.1706,1.514409
9000,1.1885,1.512891
10500,1.0626,1.523832
12000,1.1077,1.513205
13500,1.0626,1.518937
15000,1.0615,1.527391


In [8]:
validation[100]

{'prompt': 'Question: A canteen requires 62 kgs of wheat for 6 days. How many kgs of wheat will it require for 60 days?\nOptions: A)620 kgs  B)1,401kgs  C)1,104kgs  D)1,014kgs  E)None\nAnswer:\n',
 'completion': 'Quantity of wheat for 6 days = 62kg\nquantity of wheat for one day = 62/6 kg\nquantity of wheat for 60 days= 62/6* 60 = 620 kg\nAnswer A\n####The answer is A.'}

In [9]:
input_text = "Question: a student got twice as many sums wrong as he got right . if he attempted 54 sums in all , how many did he solve correctly ? \nOptions: a ) 12 , b ) 16 , c ) 18 , d ) 24 , e ) 26\nAnswer:\n"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate text
output = model.generate(input_ids, max_new_tokens=100, num_return_sequences=1, do_sample=True,top_k=50, temperature=0.6, pad_token_id=tokenizer.eos_token_id,)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [10]:
print(generated_text)

Question: a student got twice as many sums wrong as he got right . if he attempted 54 sums in all , how many did he solve correctly ? 
Options: a ) 12 , b ) 16 , c ) 18 , d ) 24 , e ) 26
Answer:
Explanation :
Let x = total sums. Then,
2x = 54
=> x = 15
Correct Option: E
####The answer is E.
####The answer is E.
####The answer is E.
####The answer is E.
####The answer is E.
####The answer is E.
####The answer is E.
####The answer is E.
####The answer is E.
####The answer is E


In [13]:
import re
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

# Load dataset
dataset = load_dataset("aqua_rat", "raw")

def make_prompt_and_target(example):
    question = example["question"]
    opts = example["options"]
    opt_line = "  ".join(opts)
    rationale_text = (example["rationale"])
    target = rationale_text + "\n####The answer is " + example["correct"] + "."
    prompt = f"Question: {question}\nOptions: {opt_line}\nAnswer:\n"
    return {"prompt": prompt, "completion": target, "gold": example["correct"]}

# Apply transformation
test = dataset["test"].map(make_prompt_and_target, remove_columns=dataset["test"].column_names)


# Evaluation parameters
BATCH_SIZE = 8
MAX_NEW_TOKENS = 100  # adjust as needed

# Function to extract answer from generation
def extract_answer(text):
    match = re.search(r"####The answer is ([A-Z])", text)
    return match.group(1) if match else None

correct = 0
total = 0
no_ans = 0

for i in tqdm(range(0, len(test), BATCH_SIZE)):
    batch = test[i:i+BATCH_SIZE]
    prompts = batch["prompt"]

    # Tokenize
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side='left', truncation=True).to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    for j, generated in enumerate(generated_texts):
        pred = extract_answer(generated)
        gold = batch["gold"][j]
        if pred is None:
            no_ans += 1
            continue

        if pred == gold:
            correct += 1
        total += 1

accuracy = correct / total
print(f"\nAccuracy: {accuracy:.4f} ({correct}/{total})")


Using the latest cached version of the dataset since aqua_rat couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'raw' at /home/ivlabs/.cache/huggingface/datasets/aqua_rat/raw/0.0.0/33301c6a050c96af81f63cad5562cb5363e88971 (last modified on Wed Jun  4 18:54:27 2025).
100%|██████████| 32/32 [00:57<00:00,  1.81s/it]


Accuracy: 0.1667 (24/144)





In [15]:
from huggingface_hub import login

login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
model.push_to_hub("Ashed00/SmolMath-SFT2-CoT_AQuA")
tokenizer.push_to_hub("Ashed00/SmolMath-SFT2-CoT_AQuA")


model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Ashed00/SmolMath-SFT2-CoT_AQuA/commit/0ae9493d8af345f368fb12d8e16847281a55145a', commit_message='Upload tokenizer', commit_description='', oid='0ae9493d8af345f368fb12d8e16847281a55145a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Ashed00/SmolMath-SFT2-CoT_AQuA', endpoint='https://huggingface.co', repo_type='model', repo_id='Ashed00/SmolMath-SFT2-CoT_AQuA'), pr_revision=None, pr_num=None)