This notebook handles part 1.2 of the project, which takes an LLM & quantizes it to 2, 4, & 8 bits. This notebook was run on Google Collab using a T4 GPU

In [1]:
!pip install -q -U transformers peft accelerate optimum
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.9/399.9 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [2]:
!pip install accelerate



The quantization is achieved by using the guide find here:
https://huggingface.co/docs/transformers/main_classes/quantization
Using the method "AutoGPTQ Integration" on the model "facebook/opt-125m"

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

test_bits = [2, 4, 8]

In [4]:
# create dataset for quantizing
from datasets import load_dataset

dataset_list = load_dataset("lambada")['validation']['text']

Downloading builder script:   0%|          | 0.00/4.96k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.11k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2662 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5153 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4869 [00:00<?, ? examples/s]

In [5]:
model_id = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# create list to hold different models
models = [None] * (max(test_bits)+1)

# set 0 as the full model
models[0] = AutoModelForCausalLM.from_pretrained(model_id)

# save original model for size comparison
models[0].save_pretrained('modelfull')

for bit in test_bits:
	quant_model = AutoModelForCausalLM.from_pretrained(
		model_id,
		quantization_config = GPTQConfig(
			bits = bit,
			group_size = 128,
			dataset = dataset_list,
			desc_act = False,
			use_exllama = False,
		),
		device_map='auto'
	)
	
	# save model locally to show result size & add it to the model list
	quant_model.save_pretrained('model' + str(bit))
	models[bit] = quant_model

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]



Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
models[8].model.decoder.layers[0].self_attn.q_proj.__dict__['bits']

8

In [7]:
models

[OPTForCausalLM(
   (model): OPTModel(
     (decoder): OPTDecoder(
       (embed_tokens): Embedding(50272, 768, padding_idx=1)
       (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
       (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
       (layers): ModuleList(
         (0-11): 12 x OPTDecoderLayer(
           (self_attn): OPTAttention(
             (k_proj): Linear(in_features=768, out_features=768, bias=True)
             (v_proj): Linear(in_features=768, out_features=768, bias=True)
             (q_proj): Linear(in_features=768, out_features=768, bias=True)
             (out_proj): Linear(in_features=768, out_features=768, bias=True)
           )
           (activation_fn): ReLU()
           (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
           (fc1): Linear(in_features=768, out_features=3072, bias=True)
           (fc2): Linear(in_features=3072, out_features=768, bias=True)
           (final_layer_norm): L

Test the full model in the pipeline setting

In [10]:
from transformers import pipeline

pipe = pipeline("text-generation", model=models[0], tokenizer=tokenizer)

In [24]:
prompt = 'Finish the sentence with one word: Help i am stuck in the'

pipe.predict(prompt)[0]['generated_text']

'Finish the sentence with one word: Help i am stuck in the middle of a sentence.\nI'

In [54]:
torch.set_default_tensor_type('torch.cuda.FloatTensor')

# prepare dataset for getting accuracy of models
import pandas as pd
dataset = pd.DataFrame(load_dataset("lambada")['test']['text'], columns=['text'])

dataset[['input', 'target']] = dataset['text'].str.rsplit(pat=' ', n=1, expand=True)


Now get the results (top k=5 accuracy, tokens per second

In [58]:
import torch
import time
from tqdm import tqdm

for bit in [2, 4, 8, 0]:
	model = models[bit]

	start = time.time()
	score = 0

	for i in tqdm(range(len(dataset))):
		prompt = dataset.iloc[i]['input']

		inputs = tokenizer(prompt, return_tensors="pt")

		# move to the same device as the model
		device = next(model.parameters()).device
		inputs = {k: v.to(device) for k, v in inputs.items()}

		# get logits
		with torch.no_grad():
			outputs = model(**inputs)
			logits = outputs.logits

		# select the logits for the last token and get top 5 predictions
		last_token_logits = logits[0, -1, :]
		top_5_tokens = torch.topk(last_token_logits, 5).indices.tolist()

		# decode the top 5 tokens to words
		top_5_words = [tokenizer.decode([token]) for token in top_5_tokens]

		# the model passes if the target word is one of the top 5 predictions
		if dataset.iloc[i]['target'] in top_5_words: score += 1
		#print("Top 5 predictions:", top_5_words)

	print('model ' + str(bit) + ' stats:')
	print('\ttime: ' + str(time.time() - start))
	print('\tscore: ' + str(score / len(dataset)))

100%|██████████| 5153/5153 [02:48<00:00, 30.55it/s]


model 2 stats:
	time: 168.65925550460815
	score: 0.0


100%|██████████| 5153/5153 [02:50<00:00, 30.30it/s]


model 4 stats:
	time: 170.09441661834717
	score: 0.0009703085581214826


100%|██████████| 5153/5153 [02:48<00:00, 30.65it/s]


model 8 stats:
	time: 168.11854338645935
	score: 0.0011643702697457792


100%|██████████| 5153/5153 [34:24<00:00,  2.50it/s]

model 0 stats:
	time: 2064.612468481064
	score: 0.0009703085581214826



