In [None]:
!pip install autoawq
!pip install nvidia-ml-py3

In [None]:
%%capture
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
import torch
from huggingface_hub import notebook_login as hfl

In [None]:
# Interactive Hugging Face login (`notebook_login`, imported as `hfl`).
hfl()

In [None]:
# Source checkpoint on the Hugging Face Hub and the local folder name for the
# quantized output.
model_id = r"0xVolt/Meta-Llama-3-8B-Instruct-Hermes-2-Pro-SLERP"
quantModelPath = r"Meta-Llama-3-8B-Instruct-Hermes-2-Pro-SLERP-AWQ-4-bit"

# AWQ settings, spelled out in full rather than relying on library defaults so
# the config is self-describing and also works with AutoAWQ releases that read
# these keys directly from the dict.
quantConfig = {
    "zero_point": True,    # asymmetric (zero-point) quantization
    "q_group_size": 128,   # number of weights sharing one scale/zero-point
    "w_bit": 4,            # 4-bit weights
    "version": "GEMM",     # kernel/packing variant
}

**Note:** load the model with `AutoAWQForCausalLM` (from `awq`), not `AutoModelForCausalLM` (from `transformers`). I accidentally used the latter at first out of muscle memory :)

In [None]:
# Load the unquantized checkpoint through the AWQ wrapper, in half precision.
loadOptions = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoAWQForCausalLM.from_pretrained(model_id, **loadOptions)

# Matching tokenizer for the same checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

In [None]:
%%time
model.quantize(tokenizer, quant_config=quantConfig)

In [None]:
%%time
model.save_quantized("./" + quantModelPath, safetensors=True)
tokenizer.save_pretrained("./" + quantModelPath, use_fast=True)

In [None]:
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face write token from Kaggle's user secrets rather than
# hardcoding it in the notebook. The secret must be stored under the label
# "HF_WRITE_TOKEN".
userSecrets = UserSecretsClient()
HF_WRITE_TOKEN = userSecrets.get_secret("HF_WRITE_TOKEN")

In [None]:
from huggingface_hub import HfApi

username = "0xVolt"
modelName = "Meta-Llama-3-8B-Instruct-Hermes-2-Pro-SLERP-AWQ-4-bit"
# Must be the folder the quantized model and tokenizer were saved to.
# NOTE(review): the save step uses a path relative to the working directory;
# this absolute path assumes that directory is /kaggle/working — confirm.
outputDir = r"/kaggle/working/Meta-Llama-3-8B-Instruct-Hermes-2-Pro-SLERP-AWQ-4-bit"
repoId = f"{username}/{modelName}"

# HF_WRITE_TOKEN is defined in the secrets tab in Kaggle Secrets
api = HfApi(token=HF_WRITE_TOKEN)

# exist_ok=True makes this cell safe to re-run: without it, create_repo raises
# once the repository already exists on the Hub.
api.create_repo(
    repo_id=repoId,
    repo_type="model",
    exist_ok=True,
)

# Push the whole quantized model folder to the hub
api.upload_folder(
    repo_id=repoId,
    folder_path=outputDir,
    repo_type="model",
)