In [1]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch

# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
# (A plain string is enough here: the original f-string had no placeholders.)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



In [3]:
# Hub ID of the checkpoint to quantize, and the local directory that will
# receive the quantized result.
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"
quant_path = "Farshad-Llama-3-8B-Instruct-AWQ"

# Settings handed to AutoAWQ's quantize(): 4-bit weights, group size 128,
# zero-point enabled, "GEMM" version.
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Load the full-precision model onto the GPU so it can be quantized.
model = AutoAWQForCausalLM.from_pretrained(model_path, device_map='cuda')
# Tokenizers are not placed on a device, so no device_map is passed here.
# trust_remote_code is also left off: this checkpoint uses a stock tokenizer
# class, so there is no reason to allow code from the Hub to run.
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Quantize the loaded model with the settings in quant_config.
# This is the slow step (roughly 17 minutes in the recorded run).
model.quantize(
    tokenizer,
    quant_config=quant_config,
)

Repo card metadata block was not found. Setting CardData to empty.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
AWQ: 100%|█████████████████████████████████████████████████████████████████████████████| 32/32 [16:42<00:00, 31.34s/it]


In [5]:
# Echo the quantization settings so they are recorded next to the saved model.
quant_config

{'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}

In [6]:
from transformers import AwqConfig, AutoConfig
from huggingface_hub import HfApi

# Re-express the AutoAWQ settings in the form the transformers AWQ integration
# reads from the model config, so the saved checkpoint loads with plain
# AutoModelForCausalLM.
awq_config = AwqConfig(
    bits=quant_config["w_bit"],
    group_size=quant_config["q_group_size"],
    zero_point=quant_config["zero_point"],
    version=quant_config["version"].lower(),
)
# The config attribute must hold a plain dict, not the AwqConfig object.
quantization_config = awq_config.to_dict()

# AutoAWQ keeps the underlying transformers model in `.model`; attach the
# quantization settings to that model's config.
# (Alternative: build the config with AutoConfig and push it to the hub,
# which is the approach used at llm-awq.)
model.model.config.quantization_config = quantization_config

# Write the quantized weights and the tokenizer files to quant_path.
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using split_torch_state_dict_into_shards from huggingface_hub library


('Farshad-Llama-3-8B-Instruct-AWQ\\tokenizer_config.json',
 'Farshad-Llama-3-8B-Instruct-AWQ\\special_tokens_map.json',
 'Farshad-Llama-3-8B-Instruct-AWQ\\tokenizer.json')

In [9]:
# Optional: authenticate with the Hugging Face Hub before pushing the
# quantized weights.
# login() asks for the access token from inside the notebook. The interactive
# `huggingface-cli login` shell command was run from a `!` cell here and had to
# be interrupted.
from huggingface_hub import login

login()

^C


In [None]:
# Publish the locally saved quantized checkpoint to the Hugging Face Hub.
hub_repo_id = "FarshadAmiri/Farshad-Llama-3-8B-Instruct-AWQ"

api = HfApi()
# upload_folder expects the target repo to exist; exist_ok=True makes this a
# no-op when the repo has already been created.
api.create_repo(repo_id=hub_repo_id, repo_type="model", exist_ok=True)
api.upload_folder(
    # Reuse the directory the model was saved to instead of repeating the literal.
    folder_path=quant_path,
    repo_id=hub_repo_id,
    repo_type="model",
)

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Reload the quantized checkpoint through plain transformers and run a short
# generation as a smoke test.
# model_name is the local directory of the quantized model; Hub IDs such as
# "FarshadAmiri/Farshad-Llama-3-8B-Instruct-AWQ" or
# "TechxGenus/Meta-Llama-3-8B-Instruct-AWQ" can be substituted.
model_name = "Farshad-Llama-3-8B-Instruct-AWQ"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Place the weights on the GPU while loading rather than loading and then moving.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0")

text = "درباره هوش مصنوعی توضیح بده"
# Send the inputs to whichever device the model is on.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Passing pad_token_id explicitly avoids the
# "Setting `pad_token_id` to `eos_token_id`" warning from generate().
out = model.generate(**inputs, max_new_tokens=5, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(out[0], skip_special_tokens=True))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


درباره هوش مصنوعی توضیح بدهید
Artificial Intelligence


In [16]:
# Second smoke test: an English prompt with a slightly longer completion.
text = "explain AI approaches in solving complex problems"
# Send the inputs to whichever device the model is on rather than assuming GPU 0.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Passing pad_token_id explicitly avoids the
# "Setting `pad_token_id` to `eos_token_id`" warning from generate().
out = model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(out[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


explain AI approaches in solving complex problems
AI approaches in solving complex problems
Artificial


In [None]:
# Load a different, already-quantized AWQ checkpoint from the Hub for comparison.
model_id = "TheBloke/Llama-2-13B-chat-AWQ"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
)

In [None]:
# Plain-text chat prompt: one user turn followed by an open assistant turn.
text = (
    "User:\n"
    "Hello can you provide me with top-3 cool places to visit in Paris?\n\n"
    "Assistant:\n"
)
encoded = tokenizer(text, return_tensors="pt")
inputs = encoded.to(0)

# Generate up to 300 new tokens, then print the decoded sequence.
out = model.generate(**inputs, max_new_tokens=300)
reply = tokenizer.decode(out[0], skip_special_tokens=True)
print(reply)