### Installing dependencies

In [1]:
!pip install -q -U transformers peft accelerate optimum

In [2]:
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/autogptq-index/whl/cu117/
Collecting auto-gptq
  Downloading https://huggingface.github.io/autogptq-index/whl/cu117/auto-gptq/auto_gptq-0.4.2%2Bcu117-cp310-cp310-linux_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge, auto-gptq
Successfully installed auto-gptq-0.4.2+cu117 rouge-1.0.1


### Quantization using Auto-gptq

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig
import torch

To quantize a model with auto-gptq, we need to pass a calibration dataset to the quantizer.

This can be done either by passing the name of one of the supported default datasets (`'wikitext2'`, `'c4'`, `'c4-new'`, `'ptb'`, `'ptb-new'`) or by passing a list of strings that will be used as the dataset.

In [6]:
# Quantize OPT-125M to 4 bits while loading it, using the built-in "c4"
# dataset for the quantizer.
model_id = "facebook/opt-125m"

quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset="c4",
    desc_act=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Handing the GPTQConfig to `from_pretrained` quantizes the model at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)



Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading and preparing dataset json/allenai--c4 to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/allenai--c4-6fbe877195f42de5/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

* To make sure the model has been correctly quantized, run the following line of code.
* `qweight` and `qzeros` should both have dtype `torch.int32`.

In [8]:
# Sanity-check the first quantized attention projection: its `qweight` and
# `qzeros` buffers must be int32. Assert it explicitly rather than relying on
# reading the dump by eye.
first_q_proj = model.model.decoder.layers[0].self_attn.q_proj
for buffer_name in ("qweight", "qzeros"):
    buffer_dtype = getattr(first_q_proj, buffer_name).dtype
    assert buffer_dtype == torch.int32, f"{buffer_name} is {buffer_dtype}, expected torch.int32"

# Keep the full attribute dump as the cell output for manual inspection.
first_q_proj.__dict__

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict([('qweight',
               tensor([[ 1711760090, -1248295259, -2025411892,  ..., -1486452502,
                         2019142072, -1735820810],
                       [-2000132747,  -578262345,  1484081337,  ..., -1230600537,
                        -2019252040, -2023311003],
                       [ -710293851, -1153090188,  1431922298,  ..., -1768449094,
                         2042194587, -2004125258],
                       ...,
                       [-1183500136, -1494510422, -1772782904,  ..., -1518753378,
                         -411710600,  -392845654],
                       [-1990626701,  1469278281,  1469864108,  ...,  1740208533,
                        -1732560507, -1738077576],
                       [ 2015914598,  2040232821,  2005572185,  ..., -1463179655,
                        -1450400136, -2024523156]], device='cuda:0', dtype=torch.int32)),
              ('qzeros',
               tensor(

### Inference

In [11]:
# Generate a short continuation with the freshly quantized model.
text = "Hello my name is"
# Send the inputs to the device the model lives on instead of hard-coding
# GPU index 0.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
output = model.generate(**inputs)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Hello my name is Kari and I am a student at the University of California, San Diego


### Quantization by passing custom dataset

In [14]:
# Quantize OPT-125M again, this time passing our own list of strings as the
# dataset instead of the name of a built-in one.
model_id = "facebook/opt-125m"
calibration_texts = [
    "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
]
quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    desc_act=False,
    dataset=calibration_texts,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s]

### Inference

In [15]:
# Try the model that was quantized with the custom dataset.
text = "My name is"
# Follow the model's device rather than assuming GPU index 0.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

out = model.generate(**inputs)
print(tokenizer.decode(out[0], skip_special_tokens=True))



My name is a bit of a bit of a bit of a bit of a bit of a


In [16]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub (renders a login widget in the notebook).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
# Upload the quantized model and its tokenizer to the same Hub repository.
hub_repo_name = "opt-125m-gptq-4bit"
model.push_to_hub(hub_repo_name)
tokenizer.push_to_hub(hub_repo_name)

pytorch_model.bin:   0%|          | 0.00/125M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AhmedElDokmak/opt-125m-gptq-4bit/commit/820bc23722693a355dac7e95c3a454424445c76b', commit_message='Upload tokenizer', commit_description='', oid='820bc23722693a355dac7e95c3a454424445c76b', pr_url=None, pr_revision=None, pr_num=None)

### Loading quantized model from the hub

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load a checkpoint that is already GPTQ-quantized; no GPTQConfig is passed.
model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

Downloading (…)lve/main/config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [21]:
# Print the module tree; the attention and MLP projections appear as QuantLinear.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (rotary_emb): LlamaRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): LlamaMLP(
          (act_fn): SiLUActivation()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)


In [22]:
# Show the quantization settings attached to the loaded model's config.
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.GPTQ: 'gptq'>,
 'bits': 4,
 'tokenizer': None,
 'dataset': None,
 'group_size': 128,
 'damp_percent': 0.01,
 'desc_act': False,
 'sym': True,
 'true_sequential': True,
 'use_cuda_fp16': False,
 'model_seqlen': None,
 'block_name_to_quantize': None,
 'module_name_preceding_first_block': None,
 'batch_size': 1,
 'pad_token_id': None,
 'disable_exllama': False}

In [23]:
# Generate up to 50 new tokens with the quantized Llama-2 chat model.
text = "Hello my name is"
# Use the model's own device instead of a hard-coded GPU index 0.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

out = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))



Hello my name is Sarah and I am a 3rd year PhD student in the Department of Computer Science at the University of Cambridge. My research focuses on developing machine learning algorithms for medical image analysis, with a particular interest in brain imaging. I am super


### Train quantized model using peft

In [24]:
from peft import prepare_model_for_kbit_training

# Reload the quantized checkpoint for training with exllama disabled.
# The checkpoint already carries its own quantization config, so this config
# is only used to override loading attributes such as `disable_exllama`.
model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"
quantization_config = GPTQConfig(bits=4, disable_exllama=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


In [25]:
# Check that the override took effect: `disable_exllama` should now be True.
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.GPTQ: 'gptq'>,
 'bits': 4,
 'tokenizer': None,
 'dataset': None,
 'group_size': 128,
 'damp_percent': 0.01,
 'desc_act': False,
 'sym': True,
 'true_sequential': True,
 'use_cuda_fp16': False,
 'model_seqlen': None,
 'block_name_to_quantize': None,
 'module_name_preceding_first_block': None,
 'batch_size': 1,
 'pad_token_id': None,
 'disable_exllama': True}

In [26]:
# Turn on gradient checkpointing, then pass the quantized model through peft's
# k-bit training preparation. `model` is rebound to the returned object.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [27]:
from peft import LoraConfig, get_peft_model

# LoRA settings: adapters are attached to the four attention projections.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["k_proj", "o_proj", "q_proj", "v_proj"],
)

# Wrap the model with the adapters and report the trainable parameter count.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 270,798,848 || trainable%: 3.097726619575575


In [28]:
from datasets import load_dataset

# Load the quotes dataset and tokenize its "quote" column in batches.
data = load_dataset("Abirate/english_quotes")
data = data.map(
    lambda batch: tokenizer(batch["quote"]),
    batched=True,
)

Downloading and preparing dataset json/Abirate--english_quotes to /root/.cache/huggingface/datasets/json/Abirate--english_quotes-7ef692ccb59fbf2a/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/Abirate--english_quotes-7ef692ccb59fbf2a/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [31]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# A deliberately tiny run: 10 steps, batch size 1 with 4 accumulation steps.
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=10,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    optim="adamw_hf",
)

# mlm=False: the collator is configured without masked-language-modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=data_collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [32]:
# Run the short fine-tuning loop configured in the previous cell (max_steps=10).
trainer.train()



Step,Training Loss
1,2.0774
2,2.3596
3,2.6602
4,2.6391
5,2.0581
6,1.7212
7,2.1492
8,2.1239
9,1.8989
10,2.0492


TrainOutput(global_step=10, training_loss=2.1736684441566467, metrics={'train_runtime': 68.3464, 'train_samples_per_second': 0.585, 'train_steps_per_second': 0.146, 'total_flos': 1505696514048.0, 'train_loss': 2.1736684441566467, 'epoch': 0.02})