In [None]:
import os 
import utils.visulaiser as visulaiser
from datasets import load_dataset, load_from_disk

from torch import nn
from tqdm import tqdm
import numpy as np
import torch
import copy
import matplotlib.pyplot as plt
from datasets import load_dataset
import re
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch.optim import AdamW
import torch.nn as nn

from transformers import AutoModelForCausalLM, AutoTokenizer
# Run on the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model

In [174]:
# Checkpoint identifier shared by the tokenizer and the causal-LM weights.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# padding_side="left" asks the tokenizer to insert padding before the text,
# so padded prompts end with their last real token.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
# Full causal language model; ModifiedModel below reuses its `.model` attribute.
base_model = AutoModelForCausalLM.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [221]:
class ModifiedModel(nn.Module):
    """Scalar-regression wrapper around the decoder backbone of a causal LM.

    The backbone (``base_model.model``) produces one hidden vector per token;
    a single linear layer maps the vector of the *last real token* of each
    sequence to one number.

    Args:
        base_model: object exposing the decoder as ``.model``. When it also
            exposes ``.config.hidden_size`` that value sizes the head;
            otherwise the width falls back to 896.
    """

    def __init__(self, base_model):
        super().__init__()
        self.qwen = base_model.model
        # Read the hidden width from the checkpoint config instead of
        # hard-coding it, so other checkpoint sizes work unchanged.
        config = getattr(base_model, "config", None)
        hidden_size = getattr(config, "hidden_size", 896)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, inputs):
        """Return one prediction per sequence, shape ``(batch, 1)``.

        Args:
            inputs: mapping of keyword arguments for the backbone
                (e.g. ``input_ids`` and, optionally, ``attention_mask``).
        """
        outputs = self.qwen(**inputs)
        hidden = outputs.last_hidden_state

        # In a causal decoder the first position cannot see the rest of the
        # prompt (and is a pad token under left padding), so pool the last
        # non-padded position instead of position 0.
        mask = inputs.get("attention_mask") if hasattr(inputs, "get") else None
        if mask is None:
            pooled = hidden[:, -1, :]
        else:
            positions = torch.arange(hidden.size(1), device=hidden.device)
            # Largest position index whose mask entry is non-zero; this is
            # correct for both left- and right-padded batches.
            last_real = (mask.to(hidden.device).long() * positions).argmax(dim=1)
            rows = torch.arange(hidden.size(0), device=hidden.device)
            pooled = hidden[rows, last_real]
        return self.linear(pooled)

# Data

In [180]:
# Load GSM8K dataset from a local on-disk copy (presumably written earlier
# with save_to_disk -- confirm). The commented line is the alternative that
# loads it by name instead.
# dataset = load_dataset("gsm8k", "main")
dataset = load_from_disk("./gsm8k_saved")

def extract_final_answer(answer):
    """
    Extract the numeric final answer that follows '####' in the answer field.

    Handles thousands separators ("1,250"), a leading minus sign and a
    trailing sentence period ("18.").

    Args:
        answer: full solution text containing a '#### <number>' marker.

    Returns:
        The number after '####' as a float, or 0.0 when no marker followed
        by a number is present.
    """
    # Optional sign, digits with optional commas, optional decimal part.
    # The pattern must end on a digit so a trailing '.' is never captured.
    match = re.search(r"####\s*(-?[\d,]*\.?\d+)", answer)
    if match is None:
        return 0.0
    # float() does not accept thousands separators, so strip them first.
    return float(match.group(1).replace(",", ""))

# Process training and test sets: keep the question text and replace the
# worked solution with the number parsed by extract_final_answer.
def _question_with_numeric_answer(example):
    """Return the example's question together with its parsed final answer."""
    numeric_answer = extract_final_answer(example["answer"])
    return {"question": example["question"], "answer": numeric_answer}


for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].map(_question_with_numeric_answer)

# Save processed dataset
dataset.save_to_disk("./gsm8k_cleaned")

# Print an example to verify
print(dataset["train"][0])

# Split into train and test sets
train_data, test_data = dataset["train"], dataset["test"]

Saving the dataset (0/1 shards):   0%|          | 0/7473 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1319 [00:00<?, ? examples/s]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 72.0}


In [181]:
def format_example(example):
    """Build the prompt for one example.

    Args:
        example: mapping with a 'question' entry.

    Returns:
        The prompt string: fixed instruction text wrapped around the question,
        ending with "Answer: ".
    """
    question = example['question']
    return (
        "Question: YOU ARE A EXPERT AT MATH. NOW ANSWER THIS QUESTION - "
        f"{question}"
        ". REPLY JUST THE FINAL ANSWER AS A NUMBER. Answer: "
    )

# Tokenize data
def preprocess_function(examples):
    """Tokenize the prompt built from a single example.

    Called once per example (non-batched ``Dataset.map``). The prompt comes
    from ``format_example`` and is padded/truncated to exactly 128 tokens.

    The encoding is deliberately left on the CPU: ``Dataset.map`` stores the
    returned values in the dataset rather than keeping live tensors, so moving
    them to the GPU here would only add a device round-trip per example and
    make preprocessing depend on a GPU being present. Batches are moved to
    the device in the training loop instead.

    Args:
        examples: one dataset row; must contain a 'question' entry.

    Returns:
        The tokenizer output for the prompt (PyTorch tensors; presumably with
        a leading dimension of size 1 because a single string is encoded --
        confirm).
    """
    texts = format_example(examples)
    tokens = tokenizer(texts,
                       padding="max_length",
                       truncation=True,
                       max_length=128,
                       return_tensors="pt")
    return tokens

# Apply preprocessing to each split, then drop the raw question text.
# The 'answer' column is kept under its original name because the training
# loop reads sample['answer'].
tokenized_train = train_data.map(preprocess_function, batched=False).remove_columns('question')
tokenized_test = test_data.map(preprocess_function, batched=False).remove_columns('question')

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [234]:
# Shuffle with a fixed seed, then keep the first 1000 shuffled training examples.
small_train_dataset = tokenized_train.shuffle(seed=42).select(range(1000))
# The evaluation split is shuffled with the same seed but not subsampled.
small_eval_dataset = tokenized_test.shuffle(seed=42)

In [None]:


# One example per batch for both loaders; only the training loader shuffles.
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=1)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=1)

# Training loop (bare-bones PyTorch)

In [None]:


num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
progress_bar = tqdm(range(num_training_steps))

# Wrap the pretrained backbone with the regression head, move it to the
# selected device and switch to training mode.
model = ModifiedModel(base_model).to(device)
model.train()

optimizer = AdamW(model.parameters(), lr=5e-5)
loss_metric = nn.MSELoss()

# Mean training loss of each epoch.
loss_arr = []

for epoch in range(num_epochs):
    running_loss = 0
    for step, raw_batch in enumerate(train_dataloader):
        # Every field except the label is forwarded to the model.
        model_inputs = {
            name: torch.tensor(value).to(device)
            for name, value in raw_batch.items()
            if name != 'answer'
        }

        prediction = model(model_inputs)
        if isinstance(prediction, tuple):  # Ensure proper indexing
            prediction = prediction[0]

        # Bring prediction and target to matching (N, 1) float tensors.
        target = raw_batch['answer'].view(-1, 1).to(device).float()
        loss = loss_metric(prediction.view(-1, 1).float(), target)

        # Backpropagate, apply the update, then clear gradients for the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        running_loss += loss.item()
        if step % 20 == 0:
            print(f"Step {step}: Loss = {loss.item()}")

    loss_arr.append(running_loss / len(train_dataloader))

print("Training complete!")


  0%|          | 0/1000 [00:00<?, ?it/s]

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# `model` is a ModifiedModel, i.e. a plain nn.Module, which has no
# save_pretrained() method. Its backbone (model.qwen) is base_model.model,
# so the trained backbone is saved through `base_model`, and the regression
# head is stored separately as a state dict in the same directory.
base_model.save_pretrained("./qwen_gsm8k_finetuned")
tokenizer.save_pretrained("./qwen_gsm8k_finetuned")
torch.save(model.linear.state_dict(), "./qwen_gsm8k_finetuned/regression_head.pt")