In [2]:
import os 
import utils.visulaiser as visulaiser
from datasets import load_dataset, load_from_disk

from torch import nn
from tqdm import tqdm
import numpy as np
import torch
import copy
import matplotlib.pyplot as plt

from transformers import AutoModelForCausalLM, AutoTokenizer


In [7]:
from datasets import load_dataset
import re

# Load GSM8K dataset
dataset = load_dataset("gsm8k", "main")
# dataset = load_from_disk("./gsm8k/main")

def extract_final_answer(answer):
    """
    Extracts only the numerical value after '####' in the answer field.
    """
    match = re.search(r"####\s*([\d\.]+)", answer)  # Match number after ####
    return match.group(1) if match else 0  # Return extracted number

# Process training and test sets
for split in ["train", "test"]:
    dataset[split] = dataset[split].map(lambda example: {
        "question": example["question"],
        "answer": float(extract_final_answer(example["answer"]))
    })

# Save processed dataset
dataset.save_to_disk("./gsm8k_cleaned")

# Print an example to verify
print(dataset["train"][0])

# Split into train and test sets
train_data = dataset["train"]
test_data = dataset["test"]


Saving the dataset (0/1 shards):   0%|          | 0/7473 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1319 [00:00<?, ? examples/s]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 72.0}


In [4]:
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [5]:
def print_model_predictions(model, tokenizer, dataset, device="cpu", num_samples=5):
    model.to(device)
    model.eval()
    dataset = load_from_disk(dataset)
    dataset = dataset['test']
    
    for i in range(min(num_samples, len(dataset))):
        example = dataset[i]
        input_text = "YOU ARE A EXPERT AT MATH. NOW ANSWER THIS QUESTION - " + example["question"] + " REPLY JUST THE FINAL ANSWER AS A NUMBER. DO NOT RETURN ANY TEXT."
        target_output = example["answer"]
        
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=1)
        generated_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)[len(input_text):]

        print(f"Example {i+1}:\n")
        print(f"Input: {input_text}\n")
        print(f"Generated Answer: {generated_answer}\n")
        print(f"Target Output: {target_output}\n")
        print("-" * 50)
        if i == 5:
            break

In [43]:
def format_example(example):
    # print(example)
    return f"Question: {example['question']}\nAnswer: {example['answer']}"

# Tokenize data
def preprocess_function(examples):
    texts = format_example(examples)
    return tokenizer(texts, 
                     # padding="max_length", 
                     # truncation=True, 
                     # max_length=512, 
                     return_tensors="pt")

# Apply preprocessing
tokenized_train = train_data.map(preprocess_function, batched=False)
tokenized_test = test_data.map(preprocess_function, batched=False)


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [44]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./qwen_gsm8k_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=1,  # Reduce if memory is an issue
    per_device_eval_batch_size=1,
    num_train_epochs=1,  # Adjust based on need
    # learning_rate=5e-5,  # Adjust LR based on performance
    # logging_steps=10,
    # save_total_limit=2,
    # generation_max_length=1,
    # report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test
)

trainer.train()


ValueError: too many values to unpack (expected 3)

In [None]:
model.save_pretrained("./qwen_gsm8k_finetuned")
tokenizer.save_pretrained("./qwen_gsm8k_finetuned")
