# Installs

In [1]:
# Environment bootstrap: install the third-party packages this notebook imports.
# NOTE(review): no versions are pinned, so a re-run may pull different releases
# than the ones that produced the outputs saved below.
print('Installing packages...')
!pip install transformers accelerate sentencepiece tokenizers datasets tqdm zstandard rouge_score
# Second pass with --upgrade so an already-installed datasets/transformers is
# brought up to the latest release.
!pip install datasets --upgrade
!pip install --upgrade transformers

Installing packages...
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=

# Imports

In [2]:
import os
# import utils.visulaiser as visulaiser
from datasets import load_dataset, load_from_disk

from torch import nn
from tqdm import tqdm
import tqdm
import numpy as np
import torch
import copy
import matplotlib.pyplot as plt
import re
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from torch.optim import AdamW
import torch.nn as nn
from torchvision.transforms import v2
from rouge_score import rouge_scorer
# Logging
from datetime import datetime

from download_datasets_models import get_dataset, get_model
from evaluate_llm import measure_test_accuracy

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# Run on the GPU when torch reports CUDA is available; otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
print("Detected Device:", device)

Detected Device: cuda


# Get Model and Tokenizer

In [4]:
model_name = "Qwen/Qwen2-Math-1.5B-Instruct"

In [5]:
# Load the model/tokenizer pair for `model_name` via the project helper in
# download_datasets_models. NOTE(review): save_model=False presumably skips
# writing a local copy of the checkpoint -- confirm against get_model.
model, tokenizer = get_model(model_name, save_model=False)

Downloading Model and Tokenizer


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/199 [00:00<?, ?B/s]

Saving Model to ./Qwen2-Math-1.5B-Instruct
Model and Tokenizer Loaded


# Process Dataset

In [5]:
# Name of the benchmark; also used below to build the tokenized-cache directory name.
dataset_name = "gsm8k"

# Project helper from download_datasets_models. The result is indexed by
# "train"/"test" further down, so it is presumably a DatasetDict -- TODO confirm.
dataset = get_dataset(dataset_name)

Using Pre-Downloaded Dataset
Dataset Loaded


In [6]:
# Build the tokenized dataset once and cache it on disk under a directory derived
# from `dataset_name`. The same path is used for loading AND saving, so the cache
# is actually found on the next run whatever the dataset name is.
tokenized_dir = f"./{dataset_name}_tokenized"

if os.path.isdir(tokenized_dir):
    tokenized_data = load_from_disk(tokenized_dir)
else:
    def extract_final_answer(answer):
        """
        Return the number that follows '####' in a GSM8K answer string as a float.

        Thousands separators are stripped and a leading minus sign is kept, so
        '#### 1,250' yields 1250.0 and '#### -3' yields -3.0. Returns 0.0 when no
        '####' marker followed by a number is present.
        """
        # The pattern always ends on a digit, so float() cannot be handed a
        # dangling '.' or ',' once the commas are removed.
        match = re.search(r"####\s*(-?[\d,]*\.?\d+)", answer)
        if match is None:
            return 0.0
        return float(match.group(1).replace(",", ""))

    # Keep the full worked solution under "original_answer" and replace "answer"
    # with just the numeric target.
    for split in ["train", "test"]:
        dataset[split] = dataset[split].map(lambda example: {
            "original_answer": example['answer'],
            "question": example["question"],
            "answer": extract_final_answer(example["answer"]),
        })

    def format_example(example):
        """Wrap the example's question in the fixed instruction prompt."""
        return "You are a math expert. Now answer this question - " + example["question"] + " Your answer should only contain the final answer as a number. Print final answer here: "

    def preprocess_function(examples):
        """Tokenize one example's prompt, padded/truncated to 128 tokens."""
        texts = format_example(examples)
        # return_tensors="pt" is kept on purpose: print_model_predictions indexes
        # input_ids[0], i.e. it expects each stored example to carry a leading
        # axis. NOTE(review): confirm measure_test_accuracy expects the same.
        tokens = tokenizer(texts,
                           padding="max_length",
                           truncation=True,
                           max_length=128,
                           return_tensors="pt")
        return tokens

    tokenized_data = dataset.map(preprocess_function, batched=False)
    # Save processed dataset where the branch above will look for it.
    tokenized_data.save_to_disk(tokenized_dir)

In [7]:
# Evaluation data only: the train split is not used in this notebook for now.
test_data = tokenized_data["test"]

# Fixed-seed shuffle keeps the evaluation order reproducible. For a quick run,
# chain .select(range(200)) onto the shuffled dataset.
small_eval_dataset = test_data.shuffle(seed=42)

# One example per batch.
eval_dataloader = DataLoader(dataset=small_eval_dataset, batch_size=1)

# Checking Model Outputs

In [14]:
def print_model_predictions(model, dataloader, device, num_samples=3, display=False):
    """Greedy-decode up to ``num_samples`` batches and print exact-match accuracy.

    Decoding uses the notebook-level ``tokenizer`` (it is not a parameter).

    Args:
        model: object exposing ``to``, ``eval`` and ``generate``; it is moved to
            ``device`` and switched to eval mode.
        dataloader: sized iterable of dict batches. Every key except
            "question", "answer" and "original_answer" is converted with
            ``torch.tensor`` and forwarded to ``model.generate``.
        device: device the model and the inputs are moved to.
        num_samples: maximum number of batches to evaluate.
        display: when True, print the question, raw generation, target and
            parsed answer for every evaluated batch.

    Returns:
        None. The accuracy is printed.
    """
    model = model.to(device)
    model.eval()

    # Evaluate exactly num_samples batches (never more than the loader holds)
    # and size the progress bar to match.
    total = max(0, min(num_samples, len(dataloader)))
    progress_bar = tqdm(total=total)

    # First number in the generated text: optional leading minus (only when it
    # is not glued to a word, e.g. "step-1"), digits with optional thousands
    # separators, optional decimals. The match always converts cleanly to float
    # once commas are removed.
    number_pattern = re.compile(r"(?:(?<!\w)-)?(?:\d[\d,]*(?:\.\d+)?|\.\d+)")
    non_tensor_keys = ("question", "answer", "original_answer")

    accuracy_log = []

    print(f"Running only for {num_samples=}")

    with torch.no_grad():
        # zip stops as soon as range(total) is exhausted, so no extra batch is
        # fetched or evaluated.
        for i, sample in zip(range(total), dataloader):
            batch = {
                key: torch.tensor(value).to(device)
                for key, value in sample.items()
                if key not in non_tensor_keys
            }

            # NOTE(review): prompts are padded to a fixed length upstream; if the
            # tokenizer pads on the right, generation continues after pad tokens.
            # Confirm the tokenizer's padding_side.
            generated = model.generate(**batch, max_new_tokens=16, do_sample=False)

            # Drop the prompt tokens so only the newly generated text is decoded.
            # Assumes input_ids has a leading batch axis of size 1 -- TODO confirm.
            prompt_length = len(batch['input_ids'][0])
            output = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

            match = number_pattern.search(output)
            # NaN never compares equal, so "no number produced" is always scored
            # as wrong instead of silently matching a target of 0.
            generated_answer = float(match.group(0).replace(",", "")) if match else float("nan")

            target = sample['answer'].item()

            if display:
                print(f"Example {i+1}:\n")
                print(f"Input: {sample['question']}\n")
                print(f"Generated Answer: {output}\n")
                print(f"Target Output: {target}\n")
                print(f"Output Answer: {generated_answer}")
                print("-" * 50)

            accuracy_log.append(generated_answer == target)
            progress_bar.update(1)

    progress_bar.close()

    if accuracy_log:
        print(f"Accuracy: {np.sum(accuracy_log)/len(accuracy_log)}")
    else:
        print("Accuracy: n/a (no samples evaluated)")
    print("Complete!")

In [15]:
# print_model_predictions(model, eval_dataloader, device)

# Get the Metrics

In [8]:
from awq_quantizer import pseudo_quantize_model_weight_scaleup, get_calib_feat
from util_functions import get_model_size, evaluate_perplexity
import gc

In [11]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f"Number of Parameters: {count_parameters(model)}")

Number of Parameters: 1543714304


In [18]:
# Base model: reload a fresh copy of `model_name` and record the reference metrics.
#
# Drop any model left over from earlier cells before loading a new one. pop() is
# used instead of `del` because this cell deletes `model` at its end, so a plain
# `del model` raises NameError when the cell is run a second time.
globals().pop("model", None)
gc.collect()
torch.cuda.empty_cache()
# Pass model_name explicitly so this cell evaluates the model selected at the top
# of the notebook rather than whatever get_model defaults to.
model, tokenizer = get_model(model_name)

# Evaluate the model
print("=" * 50)
print("Base Model")
model = model.to(device)
model_perplexity = evaluate_perplexity(model, tokenizer)
# data_width=32 is passed as the bits-per-weight for the unquantized baseline.
model_size = get_model_size(model, data_width=32, group_size=128)
measure_test_accuracy(model, tokenizer, eval_dataloader, device)
print(f"\nmodel perplexity: {model_perplexity:.2f}")
print(f"model size: {model_size:.2f} MiB")
print("=" * 50)

# Remove from GPU Memory
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

Downloading Model and Tokenizer
Saving Model to ./Qwen2-Math-1.5B-Instruct
Model and Tokenizer Loaded
Base Model
Using Pre-Downloaded Dataset
Dataset Loaded


Token indices sequence length is longer than the specified maximum sequence length for this model (80865 > 32768). Running this sequence through the model will result in indexing errors
evaluating...: 100%|██████████| 10/10 [00:15<00:00,  1.51s/it]


  0%|          | 0/1319 [00:00<?, ?it/s]



0	0.00	0.1389	0.0857	0.1389

100	0.00	0.1714	0.0000	0.1714

200	0.00	0.1860	0.0952	0.1860

300	0.00	0.2192	0.0845	0.1644

400	0.00	0.1890	0.0960	0.1417

500	0.00	0.1481	0.0769	0.1111

600	1.00	0.2623	0.1356	0.1967

700	0.00	0.1333	0.0000	0.0800

800	0.00	0.2353	0.0909	0.1765

900	0.00	0.1923	0.0980	0.1538

1000	1.00	0.2128	0.0889	0.1277

1100	0.00	0.3077	0.1081	0.1538

1200	0.00	0.1622	0.0556	0.1351

1300	0.00	0.1017	0.0351	0.1017

Model Accuracy on GSM8K: 7.73%
Average ROUGE-1: 0.1584
Average ROUGE-2: 0.0611
Average ROUGE-L: 0.1305

model perplexity: 8.87
model size: 5917.56 MiB


In [19]:
# FineTuned Model
# Load a GSM8K fine-tuned checkpoint under separate names (fmodel/ftokenizer) so
# the notebook-level `model`/`tokenizer` are left untouched.
# NOTE(review): eval_dataloader was built with the tokenizer that was loaded for
# `model_name`; confirm its token ids are compatible with this checkpoint if
# measure_test_accuracy consumes the pre-tokenized input_ids.
fmodel, ftokenizer = get_model("wzzju/Qwen2.5-1.5B-GRPO-GSM8K")

# Evaluate the model
print("=" * 50)
print("FineTuned Model")
fmodel = fmodel.to(device)
# model_perplexity / model_size reuse the names from the base-model cell and
# overwrite those values.
model_perplexity = evaluate_perplexity(fmodel, ftokenizer)
model_size = get_model_size(fmodel, data_width=32, group_size=128)
measure_test_accuracy(fmodel, ftokenizer, eval_dataloader, device)
print(f"\nmodel perplexity: {model_perplexity:.2f}")
print(f"model size: {model_size:.2f} MiB")
print("=" * 50)

# Remove from GPU Memory
# Drop the references first, then collect garbage and release cached CUDA blocks.
del fmodel
del ftokenizer
gc.collect()
torch.cuda.empty_cache()

Downloading Model and Tokenizer


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Saving Model to ./Qwen2.5-1.5B-GRPO-GSM8K
Model and Tokenizer Loaded
FineTuned Model
Using Pre-Downloaded Dataset
Dataset Loaded


evaluating...: 100%|██████████| 10/10 [00:15<00:00,  1.55s/it]


  0%|          | 0/1319 [00:00<?, ?it/s]



0	0.00	0.0000	0.0000	0.0000

100	1.00	0.0606	0.0000	0.0606

200	0.00	0.1250	0.0513	0.1250

300	0.00	0.0000	0.0000	0.0000

400	0.00	0.1111	0.0484	0.0794

500	1.00	0.0400	0.0000	0.0400

600	0.00	0.2414	0.0714	0.1724

700	0.00	0.0811	0.0278	0.0811

800	0.00	0.0588	0.0000	0.0588

900	0.00	0.0194	0.0000	0.0194

1000	0.00	0.2128	0.0444	0.1277

1100	0.00	0.0000	0.0000	0.0000

1200	0.00	0.0274	0.0000	0.0274

1300	0.00	0.0333	0.0000	0.0333

Model Accuracy on GSM8K: 4.32%
Average ROUGE-1: 0.0421
Average ROUGE-2: 0.0072
Average ROUGE-L: 0.0360

model perplexity: 9.34
model size: 5917.56 MiB


In [10]:
# Quantization sweep: for every (scale_factor, bit) pair, reload the model
# selected by `model_name`, pseudo-quantize its weights and report perplexity,
# estimated size and GSM8K accuracy.

# One group size shared by the quantizer and the size estimate so the two
# cannot drift apart.
q_group_size = 128

# Calibration features are collected once and reused for every configuration.
model, tokenizer = get_model(model_name)
model = model.to(device)
input_feat = get_calib_feat(model, tokenizer)

for scale_factor in [4]:
    for bit in [2, 4, 8]:
        # Free the previous model before loading the next copy so two models are
        # never held at the same time.
        del model
        gc.collect()
        torch.cuda.empty_cache()
        model, tokenizer = get_model(model_name)
        # The return value is unused and the same `model` object is evaluated
        # below, so the helper presumably modifies the weights in place.
        pseudo_quantize_model_weight_scaleup(model, w_bit=bit, q_group_size=q_group_size, input_feat=input_feat, scale_factor=scale_factor)

        # Evaluate the model
        model = model.to(device)
        print("=" * 50)
        print(f"{scale_factor=}, {bit=}")
        model_perplexity = evaluate_perplexity(model, tokenizer)
        model_size = get_model_size(model, data_width=bit, group_size=q_group_size)
        measure_test_accuracy(model, tokenizer, eval_dataloader, device)
        print(f"\nmodel perplexity: {model_perplexity:.2f}")
        print(f"model size: {model_size:.2f} MiB")

        # model.save_pretrained(f"./{model_name.split('/')[-1]}_{bit}bit")
        # tokenizer.save_pretrained(f"./{model_name.split('/')[-1]}_{bit}bit")
        print("=" * 50)

Downloading Model and Tokenizer


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Saving Model to ./Qwen2-Math-1.5B-Instruct
Model and Tokenizer Loaded
Collecting activation scales...
Using Pre-Downloaded Dataset
Dataset Loaded
 * Split into 30 blocks


100%|██████████| 30/30 [00:13<00:00,  2.25it/s]


Downloading Model and Tokenizer
Saving Model to ./Qwen2-Math-1.5B-Instruct
Model and Tokenizer Loaded
scale_factor=4, bit=2
Using Pre-Downloaded Dataset
Dataset Loaded


Token indices sequence length is longer than the specified maximum sequence length for this model (80865 > 32768). Running this sequence through the model will result in indexing errors
evaluating...: 100%|██████████| 10/10 [00:15<00:00,  1.53s/it]


  0%|          | 0/1319 [00:00<?, ?it/s]



0	0.00	0.0290	0.0000	0.0290

100	0.00	0.0000	0.0000	0.0000

200	0.00	0.0000	0.0000	0.0000

300	0.00	0.0000	0.0000	0.0000

400	0.00	0.0000	0.0000	0.0000

500	0.00	0.0000	0.0000	0.0000

600	0.00	0.0333	0.0000	0.0333

700	0.00	0.0263	0.0000	0.0263

800	0.00	0.0615	0.0000	0.0615

900	0.00	0.0000	0.0000	0.0000

1000	0.00	0.0000	0.0000	0.0000

1100	0.00	0.0000	0.0000	0.0000

1200	0.00	0.0000	0.0000	0.0000

1300	0.00	0.0000	0.0000	0.0000

Model Accuracy on GSM8K: 0.15%
Average ROUGE-1: 0.0122
Average ROUGE-2: 0.0001
Average ROUGE-L: 0.0116

model perplexity: 96948.53
model size: 396.80 MiB
Downloading Model and Tokenizer
Saving Model to ./Qwen2-Math-1.5B-Instruct
Model and Tokenizer Loaded
scale_factor=4, bit=4
Using Pre-Downloaded Dataset
Dataset Loaded


Token indices sequence length is longer than the specified maximum sequence length for this model (80865 > 32768). Running this sequence through the model will result in indexing errors
evaluating...: 100%|██████████| 10/10 [00:15<00:00,  1.60s/it]


  0%|          | 0/1319 [00:00<?, ?it/s]

0	0.00	0.1081	0.0556	0.1081

100	0.00	0.2286	0.0000	0.2286

200	0.00	0.1628	0.0476	0.1628

300	0.00	0.2254	0.1159	0.2254

400	0.00	0.1111	0.0645	0.0952

500	0.00	0.1509	0.0784	0.1132

600	1.00	0.2295	0.1356	0.1967

700	0.00	0.1081	0.0000	0.0811

800	0.00	0.2353	0.0909	0.1765

900	1.00	0.1923	0.0980	0.1538

1000	0.00	0.2979	0.2222	0.2553

1100	0.00	0.3077	0.1081	0.1538

1200	0.00	0.1622	0.0556	0.1351

1300	0.00	0.1034	0.0357	0.1034

Model Accuracy on GSM8K: 5.53%
Average ROUGE-1: 0.1573
Average ROUGE-2: 0.0607
Average ROUGE-L: 0.1297

model perplexity: 9.93
model size: 764.85 MiB
Downloading Model and Tokenizer
Saving Model to ./Qwen2-Math-1.5B-Instruct
Model and Tokenizer Loaded
scale_factor=4, bit=8
Using Pre-Downloaded Dataset
Dataset Loaded


Token indices sequence length is longer than the specified maximum sequence length for this model (80865 > 32768). Running this sequence through the model will result in indexing errors
evaluating...: 100%|██████████| 10/10 [00:16<00:00,  1.63s/it]


  0%|          | 0/1319 [00:00<?, ?it/s]

0	0.00	0.1389	0.0857	0.1389

100	0.00	0.1714	0.0000	0.1714

200	0.00	0.2326	0.1190	0.1860

300	0.00	0.2192	0.0845	0.1644

400	0.00	0.1890	0.0960	0.1417

500	0.00	0.1481	0.0769	0.1111

600	1.00	0.2623	0.1356	0.1967

700	0.00	0.1333	0.0000	0.0800

800	0.00	0.2353	0.0909	0.1765

900	0.00	0.1923	0.0980	0.1538

1000	1.00	0.2128	0.0889	0.1277

1100	0.00	0.3077	0.1081	0.1538

1200	0.00	0.1622	0.0556	0.1351

1300	0.00	0.1017	0.0351	0.1017

Model Accuracy on GSM8K: 7.58%
Average ROUGE-1: 0.1585
Average ROUGE-2: 0.0612
Average ROUGE-L: 0.1305

model perplexity: 8.87
model size: 1500.95 MiB
