In [1]:
!pip install -q -U transformers peft accelerate optimum

In [2]:
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/autogptq-index/whl/cu117/


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, set_seed
import torch

In [4]:
set_seed(42)

In [5]:
from peft import prepare_model_for_kbit_training

model_id = "TheBloke/Llama-2-7b-Chat-GPTQ"
quantization_config_loading = GPTQConfig(bits=4, disable_exllama=True, block_name_to_quantize=[f"model.decoder.layers.{i}" for i in range(32) if i % 2 == 0])
model = AutoModelForCausalLM.from_pretrained(model_id,quantization_config=quantization_config_loading, device_map="auto")

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


In [6]:
model.config.quantization_config.to_dict()

{'quant_method': <QuantizationMethod.GPTQ: 'gptq'>,
 'bits': 4,
 'tokenizer': None,
 'dataset': None,
 'group_size': 128,
 'damp_percent': 0.01,
 'desc_act': False,
 'sym': True,
 'true_sequential': True,
 'use_cuda_fp16': False,
 'model_seqlen': None,
 'block_name_to_quantize': None,
 'module_name_preceding_first_block': None,
 'batch_size': 1,
 'pad_token_id': None,
 'disable_exllama': True}

In [7]:
model = prepare_model_for_kbit_training(model)

In [8]:

from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["k_proj","o_proj","q_proj","v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 270,798,848 || trainable%: 3.097726619575575


In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [10]:
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

In [11]:
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

In [12]:
data = data["train"].train_test_split(seed=42, shuffle=True, test_size=0.2)
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 2006
    })
    test: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 502
    })
})

In [13]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# needed for llama 2 tokenizer
tokenizer.pad_token = tokenizer.eos_token

trainer = Trainer(
    model=model,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    args=TrainingArguments(
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        evaluation_strategy="steps",
        eval_steps=10,
        max_steps=50,
        learning_rate=2e-4,
        logging_steps=10,
        output_dir="outputs",
        optim="adamw_hf"
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [14]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
10,2.141,2.310566
20,2.2766,2.110166
30,1.4698,1.830014
40,2.0134,3.248757
50,2.5985,2.446843


TrainOutput(global_step=50, training_loss=2.099853630065918, metrics={'train_runtime': 389.2078, 'train_samples_per_second': 0.128, 'train_steps_per_second': 0.128, 'total_flos': 3388655517696.0, 'train_loss': 2.099853630065918, 'epoch': 0.02})

In [15]:
from huggingface_hub import HfApi
api = HfApi()

In [16]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
api.create_repo("DrishtiSharma/llama-2-chat-gptq-block-quantization-even-layers-attempt1")

HfHubHTTPError: ignored

In [18]:
api.upload_folder(
    folder_path = "/content/outputs",
    path_in_repo = ".",
    repo_id = "DrishtiSharma/llama-2-chat-gptq-block-quantization-even-layers",
    repo_type = "model"
                  )

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1694377822.f43f960a1ece.19467.0:   0%|          | 0.00/6.58k [00:00<?, ?B/s]

events.out.tfevents.1694378394.f43f960a1ece.22019.0:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

'https://huggingface.co/DrishtiSharma/llama-2-chat-gptq-block-quantization-even-layers/tree/main/.'