In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

#m_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
m_name = "Ashed00/SmolMath-135M"

tokenizer = AutoTokenizer.from_pretrained(m_name)
model = AutoModelForCausalLM.from_pretrained(m_name)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Evaluation function
from tqdm import tqdm
import re
import torch

data = load_dataset("Ashed00/combined_math_problems", split="train")
addsub_data = data.filter(lambda x: x['source'] == 'AddSub')
data = data.filter(lambda x: x['source'] != 'AddSub')


#select 200 random rows from addsub

addsub_data = addsub_data.shuffle().select(range(200))

print(addsub_data)

#concat addsub with data
#data = concatenate_datasets([data, addsub_data])
data = data.to_pandas()

# Drop all columns except 'question' and 'answer'
columns_to_keep = ['question', 'answer']
columns_to_drop = [col for col in data.columns if col not in columns_to_keep]
data = data.drop(columns=columns_to_drop)

# Drop rows with null values in 'question' or 'answer'
data = data.dropna(subset=['question', 'answer'])

# Drop rows where 'question' or 'answer' are not strings
data = data[data['question'].apply(lambda x: isinstance(x, str))]
data = data[data['answer'].apply(lambda x: isinstance(x, str))]

from datasets import Dataset
data = Dataset.from_pandas(data)

data = data.shuffle()

# prompt: Update question to the form "Q: question /n A:"

def format_qa(example):
  example['question'] = "Question: "+f"{example['question']}".strip()+"\n Answer:"
  example['answer'] = f"{example['answer']}".strip() +"\n#End of Answer." + str(tokenizer.eos_token)
  return example

data = data.map(format_qa)

data = data.rename_column("question", "prompt")
data = data.rename_column("answer", "completion")



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer.pad_token = tokenizer.eos_token

# Helper: Extract last number (int or decimal) from string
def extract_last_number(text):
    numbers = re.findall(r'-?\d+(?:\.\d+)?', text)
    return numbers[-1] if numbers else None

# Evaluation function
def evaluate_accuracy(dataset, max_samples=10000):
    correct = 0
    total = 0

    for example in tqdm(dataset.select(range(min(len(dataset), max_samples)))):
        question = example['prompt']
        expected_answer = str(example['completion'])

        # Encode and generate
        inputs = tokenizer(question, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract answers
        generated_answer = generated[len(question):].strip()
        gen_number = extract_last_number(generated_answer)
        exp_number = extract_last_number(expected_answer)

        if gen_number is not None and exp_number is not None and gen_number == exp_number:
            correct += 1
        total += 1

    accuracy = correct / total if total > 0 else 0.0
    print(f"Accuracy (last number match): {accuracy:.2%}")

# Run evaluation
evaluate_accuracy(data)

Dataset({
    features: ['ID', 'Body', 'Question', 'Equation', 'Type', 'question', 'answer', 'source', 'input'],
    num_rows: 200
})


Map: 100%|██████████| 1120/1120 [00:00<00:00, 18400.18 examples/s]
100%|██████████| 1120/1120 [16:12<00:00,  1.15it/s]

Accuracy (last number match): 12.05%





In [5]:
from datasets import load_dataset

ds = load_dataset("MU-NLPC/Calc-mawps", "default")

def format_qa_m(example):
  example['question'] = "Question: "+f"{example['question']}".strip()+"\n Answer:"
  example['answer'] = example["result"]
  return example

ds = ds.map(format_qa_m)

ds = ds.rename_column("question", "prompt")
ds = ds.rename_column("answer", "completion")

from datasets import concatenate_datasets

mds = concatenate_datasets(
  [ds["train"],
  ds["validation"],
  ds["test"],]
)

evaluate_accuracy(mds)




100%|██████████| 2649/2649 [35:26<00:00,  1.25it/s]

Accuracy (last number match): 8.31%



