# Installs

In [None]:
print('Installing packages...')
!pip install transformers accelerate sentencepiece tokenizers datasets tqdm zstandard rouge_score
!pip install datasets --upgrade
!pip install --upgrade transformers

# Imports

In [1]:
import os 
# import utils.visulaiser as visulaiser
from datasets import load_dataset, load_from_disk

from torch import nn
from tqdm import tqdm
import tqdm
import numpy as np
import torch
import copy
import matplotlib.pyplot as plt
import re
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch.optim import AdamW
import torch.nn as nn
from torchvision.transforms import v2
from rouge_score import rouge_scorer
# Logging
from datetime import datetime

from download_datasets_models import get_dataset, get_model
from evaluate_llm import measure_test_accuracy

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
print("Detected Device:", device)

Detected Device: cuda


# Get Model and Tokenizer

In [3]:
# Identifier of the model under study; passed to get_model when the model is first loaded.
model_name = "Qwen/Qwen2-Math-1.5B-Instruct"

In [4]:
# Load the model and its tokenizer through the project helper (download_datasets_models.get_model).
# save_model=False is passed explicitly here.
model, tokenizer = get_model(model_name, save_model=False)

Using Pre-Downloaded Model and Tokenizer


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer Loaded


# Process Dataset

In [5]:
# Dataset name; also used to build the directory name of the tokenized cache.
dataset_name = "gsm8k"

# Load the dataset through the project helper (download_datasets_models.get_dataset).
dataset = get_dataset(dataset_name)

Using Pre-Downloaded Dataset
Dataset Loaded


In [6]:
# Directory holding the cached, tokenized copy of the dataset. The same path is
# used for the existence check and for saving, so the cache written below is the
# one found on the next run regardless of `dataset_name`.
# NOTE: a cache created before a change to this cell is loaded as-is; delete the
# directory to regenerate it.
tokenized_path = f"./{dataset_name}_tokenized"

if os.path.isdir(tokenized_path):
    tokenized_data = load_from_disk(tokenized_path)
else:
    def extract_final_answer(answer):
        """
        Extract the numeric value that follows '####' in the answer field.

        Accepts an optional leading minus sign and thousands separators
        (e.g. '#### 1,250' -> 1250.0). Returns 0 when no number follows '####'.
        """
        match = re.search(r"####\s*(-?(?:\d[\d,]*\.?\d*|\.\d+))", answer)
        if match is None:
            return 0
        # Commas are grouping separators only; drop them before converting.
        return float(match.group(1).replace(",", ""))

    # Process training and test sets: keep the full worked answer under
    # 'original_answer' and replace 'answer' with just the final number.
    for split in ["train", "test"]:
        dataset[split] = dataset[split].map(lambda example: {
            "original_answer": example['answer'],
            "question": example["question"],
            "answer": extract_final_answer(example["answer"]),
        })

    def format_example(example):
        """Build the prompt for one example by wrapping its question in fixed instruction text."""
        return f"You are a math expert. Now answer this question - " + example["question"] + " Your answer should only contain the final answer as a number. Print final answer here: "

    # Tokenize data
    def preprocess_function(examples):
        """Tokenize the prompt of a single example, padded/truncated to 128 tokens."""
        texts = format_example(examples)
        tokens = tokenizer(texts,
                        padding="max_length",
                        truncation=True,
                        max_length=128,
                        return_tensors="pt")
        return tokens

    # batched=False: preprocess_function receives one example at a time.
    tokenized_data = dataset.map(preprocess_function, batched=False)
    # Save processed dataset
    tokenized_data.save_to_disk(tokenized_path)

In [7]:
# Evaluation-only setup: the train split is intentionally left unused for now,
# so only the test split is pulled out of the tokenized dataset.
test_data = tokenized_data["test"]

# Shuffle with a fixed seed, then keep the first 200 examples for quick runs.
small_eval_dataset = test_data.shuffle(seed=42)
small_eval_dataset = small_eval_dataset.select(range(200))

# One example per batch, in the (already shuffled) dataset order.
eval_dataloader = DataLoader(dataset=small_eval_dataset, batch_size=1)

# Checking Model Outputs

In [9]:
def _parse_first_number(text):
    """Return the first number found in ``text`` as a float, or NaN if none is present.

    Thousands separators are accepted ("1,250" -> 1250.0). A minus sign is only
    treated as a sign when it is not glued to a preceding word character
    ("step-2" -> 2.0, " -5" -> -5.0). NaN is used for "no number" because it
    never compares equal to a target answer.
    """
    match = re.search(r"(?:(?<!\w)-)?(?:\d[\d,]*(?:\.\d+)?|\.\d+)", text)
    if match is None:
        return float("nan")
    return float(match.group(0).replace(",", ""))


def print_model_predictions(model, dataloader, device, num_samples=3, display=False):
    """Generate answers for the first ``num_samples`` batches and print exact-match accuracy.

    Decoding uses the notebook-level ``tokenizer`` (it is not a parameter), so that
    name must be defined when this function is called.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``.
        dataloader: Yields dict batches; every key other than 'question',
            'answer' and 'original_answer' is forwarded to ``model.generate``.
            Only the first sequence of each batch is decoded and scored.
        device: Device the model and the inputs are moved to.
        num_samples: Maximum number of batches to evaluate; ``None`` evaluates
            the whole dataloader.
        display: When true, print the question, raw generation, target and
            parsed answer for every evaluated example.

    Returns:
        None. Accuracy is printed.
    """
    model = model.to(device)
    model.eval()

    # Size the progress bar by the work actually done, not the full dataloader.
    total = len(dataloader)
    if num_samples is not None:
        total = min(total, max(num_samples, 0))

    # Columns that carry text/labels rather than model inputs.
    non_input_keys = {"question", "answer", "original_answer"}
    accuracy_log = []

    print(f"Running only for {num_samples=}")

    with torch.no_grad(), tqdm(total=total) as progress_bar:
        for i, sample in enumerate(dataloader):
            # Stop before processing, so exactly `total` batches are evaluated.
            if i >= total:
                break

            # NOTE(review): torch.tensor(v) assumes the collated values are nested
            # lists / tensors convertible to a single tensor - confirm against the
            # dataset format.
            batch = {
                k: torch.tensor(v).to(device)
                for k, v in sample.items()
                if k not in non_input_keys
            }

            # Greedy decoding of at most 16 new tokens.
            generated = model.generate(**batch, max_new_tokens=16, do_sample=False)

            # Drop the prompt tokens so only the newly generated text is decoded.
            prompt_length = len(batch['input_ids'][0])
            output = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

            generated_answer = _parse_first_number(output)
            target_answer = sample['answer'].item()

            if display:
                print(f"Example {i+1}:\n")
                print(f"Input: {sample['question']}\n")
                print(f"Generated Answer: {output}\n")
                print(f"Target Output: {target_answer}\n")
                print(f"Output Answer: {generated_answer}")
                print("-" * 50)

            accuracy_log.append(generated_answer == target_answer)
            progress_bar.update(1)

    if accuracy_log:
        print(f"Accuracy: {np.sum(accuracy_log)/len(accuracy_log)}")
    else:
        # Nothing was evaluated (empty dataloader or num_samples <= 0).
        print("Accuracy: n/a (no samples evaluated)")
    print("Complete!")

In [10]:
# print_model_predictions(model, eval_dataloader, device)

# Get the Metrics

In [12]:
from awq_quantizer import pseudo_quantize_model_weight_scaleup, get_calib_feat
from util_functions import get_model_size, evaluate_perplexity
import gc

In [14]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters yielded by ``model.parameters()`` whose ``requires_grad``
    flag is set contribute to the total.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the count for the currently loaded model (parameters with requires_grad set only).
print(f"Number of Parameters: {count_parameters(model)}")

Number of Parameters: 1543714304


In [15]:
# Base Model
# Release any previously loaded model before reloading. The delete is guarded
# because this cell removes `model` again at its end, so an unguarded `del`
# would raise NameError when the cell is run a second time.
try:
    del model
except NameError:
    pass
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): get_model() is called without arguments, so the model loaded
# here is whatever the helper defaults to - confirm it matches `model_name`.
model, tokenizer = get_model()

# Evaluate the model
print("=" * 50)
print("Base Model")
model = model.to(device)
model_perplexity = evaluate_perplexity(model, tokenizer)
# data_width=32: size is computed for 32-bit weights.
model_size = get_model_size(model, data_width=32, group_size=128)
measure_test_accuracy(model, tokenizer, eval_dataloader, device)
print(f"\nmodel perplexity: {model_perplexity:.2f}")
print(f"model size: {model_size:.2f} MiB")
print("=" * 50)

# Remove from GPU Memory
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

Using Pre-Downloaded Model and Tokenizer


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer Loaded
Base Model
Using Pre-Downloaded Dataset
Dataset Loaded


Token indices sequence length is longer than the specified maximum sequence length for this model (80865 > 32768). Running this sequence through the model will result in indexing errors
evaluating...: 100%|██████████| 10/10 [00:25<00:00,  2.53s/it]


  0%|          | 0/200 [00:00<?, ?it/s]



0	0.00	0.1389	0.0857	0.1389

100	0.00	0.1714	0.0000	0.1714

Model Accuracy on GSM8K: 11.00%
Average ROUGE-1: 0.1531
Average ROUGE-2: 0.0603
Average ROUGE-L: 0.1260

model perplexity: 8.87
model size: 5917.56 MiB


In [16]:
# FineTuned Model
# Load the GSM8K fine-tuned checkpoint through the same project helper.
fmodel, ftokenizer = get_model("wzzju/Qwen2.5-1.5B-GRPO-GSM8K")

# Evaluate the model: perplexity, size (32-bit weights), then GSM8K accuracy.
print("=" * 50, "FineTuned Model", sep="\n")
fmodel = fmodel.to(device)
model_perplexity = evaluate_perplexity(fmodel, ftokenizer)
model_size = get_model_size(fmodel, data_width=32, group_size=128)
measure_test_accuracy(fmodel, ftokenizer, eval_dataloader, device)
print(
    f"\nmodel perplexity: {model_perplexity:.2f}",
    f"model size: {model_size:.2f} MiB",
    "=" * 50,
    sep="\n",
)

# Remove from GPU Memory
del fmodel, ftokenizer
gc.collect()
torch.cuda.empty_cache()

Downloading Model and Tokenizer


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Saving Model to ./Qwen2.5-1.5B-GRPO-GSM8K
Model and Tokenizer Loaded
FineTuned Model
Using Pre-Downloaded Dataset
Dataset Loaded


evaluating...: 100%|██████████| 10/10 [00:26<00:00,  2.62s/it]


  0%|          | 0/200 [00:00<?, ?it/s]



0	0.00	0.0000	0.0000	0.0000

100	1.00	0.0606	0.0000	0.0606

Model Accuracy on GSM8K: 4.00%
Average ROUGE-1: 0.0429
Average ROUGE-2: 0.0087
Average ROUGE-L: 0.0379

model perplexity: 9.34
model size: 5917.56 MiB


In [17]:
# Gather the calibration features once from a freshly loaded model; the same
# `input_feat` is reused for every configuration in the sweep below.
model, tokenizer = get_model()
model = model.to(device)
input_feat = get_calib_feat(model, tokenizer)

# Sweep every (scale_factor, bit) pair, scale factor outermost.
for scale_factor, bit in [(sf, b) for sf in [4] for b in [2, 4, 8]]:
    # Drop the previous model before loading a fresh copy for this configuration.
    del model
    gc.collect()
    torch.cuda.empty_cache()
    model, tokenizer = get_model()

    # The return value is not used; presumably the helper modifies `model`
    # in place - confirm in awq_quantizer.
    pseudo_quantize_model_weight_scaleup(
        model,
        w_bit=bit,
        q_group_size=128,
        input_feat=input_feat,
        scale_factor=scale_factor,
    )

    # Evaluate the model
    model = model.to(device)
    print("=" * 50, f"{scale_factor=}, {bit=}", sep="\n")
    model_perplexity = evaluate_perplexity(model, tokenizer)
    # Size is computed for the quantized bit width, not 32 bits.
    model_size = get_model_size(model, data_width=bit, group_size=128)
    measure_test_accuracy(model, tokenizer, eval_dataloader, device)
    print(
        f"\nmodel perplexity: {model_perplexity:.2f}",
        f"model size: {model_size:.2f} MiB",
        sep="\n",
    )

    # Saving the quantized checkpoint is currently disabled:
    # model.save_pretrained(f"./{model_name.split('/')[-1]}_{bit}bit")
    # tokenizer.save_pretrained(f"./{model_name.split('/')[-1]}_{bit}bit")
    print("=" * 50)

Using Pre-Downloaded Model and Tokenizer


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer Loaded
Collecting activation scales...
Using Pre-Downloaded Dataset
Dataset Loaded
 * Split into 30 blocks


100%|██████████| 30/30 [00:17<00:00,  1.68it/s]


Using Pre-Downloaded Model and Tokenizer


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer Loaded
scale_factor=4, bit=2
Using Pre-Downloaded Dataset
Dataset Loaded


Token indices sequence length is longer than the specified maximum sequence length for this model (80865 > 32768). Running this sequence through the model will result in indexing errors
evaluating...: 100%|██████████| 10/10 [00:27<00:00,  2.74s/it]


  0%|          | 0/200 [00:00<?, ?it/s]

0	0.00	0.0290	0.0000	0.0290

100	0.00	0.0000	0.0000	0.0000

Model Accuracy on GSM8K: 0.00%
Average ROUGE-1: 0.0128
Average ROUGE-2: 0.0003
Average ROUGE-L: 0.0123

model perplexity: 96943.91
model size: 396.80 MiB
Using Pre-Downloaded Model and Tokenizer


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer Loaded
scale_factor=4, bit=4
Using Pre-Downloaded Dataset
Dataset Loaded


Token indices sequence length is longer than the specified maximum sequence length for this model (80865 > 32768). Running this sequence through the model will result in indexing errors
evaluating...: 100%|██████████| 10/10 [00:12<00:00,  1.21s/it]


  0%|          | 0/200 [00:00<?, ?it/s]

0	0.00	0.1081	0.0556	0.1081

100	0.00	0.2286	0.0000	0.2286

Model Accuracy on GSM8K: 4.50%
Average ROUGE-1: 0.1491
Average ROUGE-2: 0.0577
Average ROUGE-L: 0.1235

model perplexity: 9.93
model size: 764.85 MiB
Using Pre-Downloaded Model and Tokenizer


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model and Tokenizer Loaded
scale_factor=4, bit=8
Using Pre-Downloaded Dataset
Dataset Loaded


Token indices sequence length is longer than the specified maximum sequence length for this model (80865 > 32768). Running this sequence through the model will result in indexing errors
evaluating...: 100%|██████████| 10/10 [00:12<00:00,  1.23s/it]


  0%|          | 0/200 [00:00<?, ?it/s]

0	0.00	0.1389	0.0857	0.1389

100	0.00	0.1714	0.0000	0.1714

Model Accuracy on GSM8K: 10.50%
Average ROUGE-1: 0.1546
Average ROUGE-2: 0.0604
Average ROUGE-L: 0.1267

model perplexity: 8.87
model size: 1500.95 MiB
