In [None]:
!pip install auto-gptq --no-build-isolation

In [1]:
import random

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
from transformers import AutoTokenizer

In [2]:
device = "cuda"
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
out_dir = model_id + "-GPTQ"

In [4]:
quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    damp_percent=0.01,
    desc_act=False
)

In [6]:
model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config,torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.to(device)

LlamaGPTQForCausalLM(
  (model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(32000, 2048)
      (layers): ModuleList(
        (0-21): 22 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (k_proj): Linear(in_features=2048, out_features=256, bias=False)
            (v_proj): Linear(in_features=2048, out_features=256, bias=False)
            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
            (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
            (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm()
          (post_attention_layernorm): LlamaR

In [7]:
n_samples = 100
data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")
tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')

# Format tokenized examples
examples_ids = []
for _ in range(n_samples):
    i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)
    j = i + tokenizer.model_max_length
    input_ids = tokenized_data.input_ids[:, i:j]
    attention_mask = torch.ones_like(input_ids)
    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})

Token indices sequence length is longer than the specified maximum sequence length for this model (252167 > 2048). Running this sequence through the model will result in indexing errors


In [8]:
%%time

# Quantize with GPTQ
model.quantize(
    examples_ids,
    batch_size=1,
    use_triton=True,
)

INFO - Start quantizing layer 1/22
INFO - Quantizing self_attn.k_proj in layer 1/22...
INFO - Quantizing self_attn.v_proj in layer 1/22...
INFO - Quantizing self_attn.q_proj in layer 1/22...
INFO - Quantizing self_attn.o_proj in layer 1/22...
INFO - Quantizing mlp.up_proj in layer 1/22...
INFO - Quantizing mlp.gate_proj in layer 1/22...
INFO - Quantizing mlp.down_proj in layer 1/22...
INFO - Start quantizing layer 2/22
INFO - Quantizing self_attn.k_proj in layer 2/22...
INFO - Quantizing self_attn.v_proj in layer 2/22...
INFO - Quantizing self_attn.q_proj in layer 2/22...
INFO - Quantizing self_attn.o_proj in layer 2/22...
INFO - Quantizing mlp.up_proj in layer 2/22...
INFO - Quantizing mlp.gate_proj in layer 2/22...
INFO - Quantizing mlp.down_proj in layer 2/22...
INFO - Start quantizing layer 3/22
INFO - Quantizing self_attn.k_proj in layer 3/22...
INFO - Quantizing self_attn.v_proj in layer 3/22...
INFO - Quantizing self_attn.q_proj in layer 3/22...
INFO - Quantizing self_attn.o_pro

CPU times: user 7min 16s, sys: 1min 19s, total: 8min 35s
Wall time: 7min 56s


In [9]:
# Save model and tokenizer
model.save_quantized(out_dir, use_safetensors=True)
tokenizer.save_pretrained(out_dir)


('TinyLlama/TinyLlama-1.1B-Chat-v1.0-GPTQ/tokenizer_config.json',
 'TinyLlama/TinyLlama-1.1B-Chat-v1.0-GPTQ/special_tokens_map.json',
 'TinyLlama/TinyLlama-1.1B-Chat-v1.0-GPTQ/tokenizer.model',
 'TinyLlama/TinyLlama-1.1B-Chat-v1.0-GPTQ/added_tokens.json',
 'TinyLlama/TinyLlama-1.1B-Chat-v1.0-GPTQ/tokenizer.json')

In [10]:
del model

## Inference

In [15]:
import time

def time_it(start,end):
    nano = end-start
    return nano/1e9

max_token = 100

In [16]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Reload model and tokenizer
model = AutoGPTQForCausalLM.from_quantized(
    out_dir,
    device=device,
    use_triton=True,
    use_safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(out_dir)

INFO - The layer lm_head is not quantized.


In [17]:
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Hello my name is John Smith and I am a student at the University of California, Berkeley. I am a junior majoring in Computer Science. I am currently working on a project called "The Internet of Things" which is a project that I have been working on for the past two years. The project is focused on creating a platform that allows people to connect their devices to the internet. The platform will allow people to control their devices remotely, and it will also allow people to monitor their devices in real
Seconds: 18.003240955
Token/s 5.832283212919982


In [18]:
print(f"Model size: {model.get_memory_footprint():,} bytes")

Model size: 779,596,544 bytes
