In [None]:
# @title # ⚡ AutoQuant

# @markdown > 🗣️ [Large Language Model Course](https://github.com/mlabonne/llm-course)

# @markdown ❤️ Created by [@maximelabonne](https://twitter.com/maximelabonne).

# @markdown **Usage:** Download the model by **running this cell** and then run the cells corresponding to your quantization methods of interest.

# @markdown To quantize a 7B or 8B model, GGUF only needs a T4 GPU, while the other methods require an L4 or A100 GPU.

# @markdown ---

# @markdown ## 🤗 Download model (required)
# @markdown `HF_TOKEN` corresponds to the name of the secret that stores your [Hugging Face access token](https://huggingface.co/settings/tokens) in Colab.

# Form inputs: MODEL_ID is the Hub repo that gets cloned below, USERNAME is the
# namespace the quantized repos are created under, and HF_TOKEN is the *name*
# of the Colab secret to read (not the token value itself).
MODEL_ID = "mlabonne/Daredevil-8B" # @param {type:"string"}
USERNAME = "jcorenday" # @param {type:"string"}
HF_TOKEN = "HF_TOKEN" # @param {type:"string"}

# Last path component of the repo id; `git clone` below creates a directory
# with this name in the current working directory.
MODEL_NAME = MODEL_ID.split('/')[-1]

# Download model
!git lfs install
!git clone https://huggingface.co/{MODEL_ID}
!pip install -q huggingface_hub

from huggingface_hub import create_repo, HfApi, ModelCard
from google.colab import userdata, runtime

# Defined in the secrets tab in Google Colab
# hf_token and api are globals reused by the upload steps in later cells.
hf_token = userdata.get(HF_TOKEN)
api = HfApi()

Git LFS initialized.
Cloning into 'Daredevil-8B'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (89/89), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 92 (delta 41), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (92/92), 2.26 MiB | 1.42 MiB/s, done.
Filtering content: 100% (4/4), 2.95 GiB | 27.53 MiB/s, done.
Encountered 3 file(s) that may not have been copied correctly on Windows:
	model-00003-of-00004.safetensors
	model-00001-of-00004.safetensors
	model-00002-of-00004.safetensors

See: `git lfs help smudge` for more details.


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reuse the repo id chosen in the download cell so the unquantized baseline is
# always the same model that gets quantized (previously this was a second,
# hard-coded copy of the id that could drift from MODEL_ID).
model_id = MODEL_ID

# Load the full-precision tokenizer and model; these globals are used by the
# baseline latency measurement.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Once upon a time, in the land of the rising sun, there was a small village nestled in the mountains. The villagers lived simple lives, relying on the land for their sustenance and survival. They were a tight-knit community, bound


In [None]:
import time

# Baseline latency of the unquantized model, split into tokenize / generate /
# decode phases. time.perf_counter() is used instead of time.time() because it
# is monotonic and high-resolution, so the sub-millisecond tokenize/decode
# phases are not distorted by wall-clock adjustments.

# Example input text
input_text = "I am a Filipino and"

# Capture start time for tokenization
tokenize_start_time = time.perf_counter()
# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt")
tokenize_end_time = time.perf_counter()

# Capture start time for generation
generate_start_time = time.perf_counter()
# Generate predictions (max_length counts prompt tokens plus generated tokens)
outputs = model.generate(**inputs, max_length=50)
generate_end_time = time.perf_counter()

# Capture start time for decoding
decode_start_time = time.perf_counter()
# Decode the output
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
decode_end_time = time.perf_counter()

# Print the output
print("Output:", output_text)

# Calculate and print the metrics in milliseconds
tokenize_time_ms = (tokenize_end_time - tokenize_start_time) * 1000
generate_time_ms = (generate_end_time - generate_start_time) * 1000
decode_time_ms = (decode_end_time - decode_start_time) * 1000
total_time_ms = tokenize_time_ms + generate_time_ms + decode_time_ms

print(f"Tokenize time: {tokenize_time_ms:.2f} ms")
print(f"Generate time: {generate_time_ms:.2f} ms")
print(f"Decode time: {decode_time_ms:.2f} ms")
print(f"Total time: {total_time_ms:.2f} ms")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Output: I am a Filipino and I am proud of my heritage. I believe that the Philippines is a beautiful country with a rich culture and history. I am proud of our national heroes, our traditions, and our people.
I am also proud of the
Tokenize time: 1.07 ms
Generate time: 23601.08 ms
Decode time: 0.31 ms
Total time: 23602.46 ms


In [None]:
# @title ## 🧩 GGUF

# @markdown Recommended methods: `q2_k`, `q3_k_m`, `q4_k_m`, `q5_k_m`, `q6_k`, `q8_0`

# @markdown Learn more about GGUF and quantization methods in [this article](https://mlabonne.github.io/blog/posts/Quantize_Llama_2_models_using_ggml.html).

QUANTIZATION_FORMAT = "q4_k_m" # @param {type:"string"}
QUANTIZATION_METHODS = QUANTIZATION_FORMAT.replace(" ", "").split(",")

# Install llama.cpp
!git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && make
!pip install -r llama.cpp/requirements.txt

# Convert to fp16
fp16 = f"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin"
!python llama.cpp/convert_hf_to_gguf.py {MODEL_NAME} --outtype f16 --outfile {fp16}

# Quantize the model for each method in the QUANTIZATION_METHODS list
for method in QUANTIZATION_METHODS:
    qtype = f"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf"
    !./llama.cpp/llama-quantize {fp16} {qtype} {method}

# Create model card
card = ModelCard.load(MODEL_ID)
card.data.tags.append("autoquant")
card.data.tags.append("gguf")
card.save(f'{MODEL_NAME}/README.md')

# Upload model
create_repo(
    repo_id = f"{USERNAME}/{MODEL_NAME}-GGUF",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)
api.upload_folder(
    folder_path=MODEL_NAME,
    repo_id=f"{USERNAME}/{MODEL_NAME}-GGUF",
    allow_patterns=["*.gguf","$.md"],
    token=hf_token
)

Cloning into 'llama.cpp'...
remote: Enumerating objects: 29390, done.[K
remote: Counting objects: 100% (8595/8595), done.[K
remote: Compressing objects: 100% (667/667), done.[K
remote: Total 29390 (delta 8288), reused 7978 (delta 7928), pack-reused 20795[K
Receiving objects: 100% (29390/29390), 50.98 MiB | 23.39 MiB/s, done.
Resolving deltas: 100% (21127/21127), done.
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedanti

README.md:   0%|          | 0.00/8.94k [00:00<?, ?B/s]

HfHubHTTPError:  (Request ID: Root=1-668e0306-6241bcf4358347d0783f8d68;8b504961-9b2d-47d8-93d7-92a86930c55c)

403 Forbidden: You don't have the rights to create a model under the namespace "jcorenday".
Cannot access content at: https://huggingface.co/api/repos/create.
If you are trying to create or update content,make sure you have a token with the `write` role.

In [None]:
# Re-read the Hugging Face token from the Colab secret named by HF_TOKEN and
# create a fresh API client.
# NOTE(review): presumably re-run after the secret was replaced with a token
# that has the `write` role (the preceding upload failed with 403) — confirm.
hf_token = userdata.get(HF_TOKEN)
api = HfApi()

In [None]:
# Upload model
# Creates the target repo if needed (exist_ok=True makes this safe to re-run),
# then pushes the local model folder to it.
create_repo(
    repo_id = f"{USERNAME}/{MODEL_NAME}-GGUF",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)
# Only the quantized GGUF files and markdown files (the model card) are
# uploaded. The markdown pattern was previously "$.md", which matches nothing,
# so the README was silently skipped.
api.upload_folder(
    folder_path=MODEL_NAME,
    repo_id=f"{USERNAME}/{MODEL_NAME}-GGUF",
    allow_patterns=["*.gguf","*.md"],
    token=hf_token
)

daredevil-8b.Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jcorenday/Daredevil-8B-GGUF/commit/47dc2330bada4fd6a9e6eb533285a246f20aed42', commit_message='Upload folder using huggingface_hub', commit_description='', oid='47dc2330bada4fd6a9e6eb533285a246f20aed42', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import subprocess

# Quantized model produced by the GGUF cell and the prompt to complete.
model_path = "/content/Daredevil-8B/daredevil-8b.Q4_K_M.gguf"
input_text = "I am a Filipino and"

# Run the llama.cpp example binary, capturing both output streams as text.
# The argument list form avoids any shell quoting issues with the prompt.
result = subprocess.run(
    ["/content/llama.cpp/llama-simple", "-m", model_path, "-p", input_text,
     "--n_predict", "50"],
    capture_output=True,
    text=True
)

# Surface failures instead of silently printing nothing.
if result.returncode != 0:
    print(f"llama-simple exited with status {result.returncode}")

# The recorded run exited with status 0 but left stdout empty, with all of the
# tool's text on stderr; fall back to stderr so the cell never shows a blank
# result.
print(result.stdout if result.stdout.strip() else result.stderr)




In [None]:
# Display the whole CompletedProcess (args, returncode, stdout, stderr) to
# inspect what the llama-simple run actually produced.
result

CompletedProcess(args=['/content/llama.cpp/llama-simple', '-m', '/content/Daredevil-8B/daredevil-8b.Q4_K_M.gguf', '-p', 'I am a Filipino and', '--n_predict', '50'], returncode=0, stdout='', stderr='llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /content/Daredevil-8B/daredevil-8b.Q4_K_M.gguf (version GGUF V3 (latest))\nllama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\nllama_model_loader: - kv   0:                       general.architecture str              = llama\nllama_model_loader: - kv   1:                               general.name str              = Daredevil-8B\nllama_model_loader: - kv   2:                          llama.block_count u32              = 32\nllama_model_loader: - kv   3:                       llama.context_length u32              = 8192\nllama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096\nllama_model_loader: - kv   5:                  ll

In [None]:
import torch

# Report which CUDA device (if any) this runtime exposes before quantizing.
gpu_available = torch.cuda.is_available()

if not gpu_available:
    print("No GPU available, please enable GPU in the runtime settings.")
else:
    # Device index 0 is the first visible CUDA device.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

GPU: NVIDIA A100-SXM4-40GB


In [None]:
# Install accelerate into the runtime (unpinned).
# NOTE(review): no visible cell installs auto_gptq or datasets, which the GPTQ
# cell imports — confirm they are installed elsewhere.
!pip install accelerate



In [None]:
# @title ## 🧠 GPTQ

# @markdown Learn more about the GPTQ algorithm in [this article](https://mlabonne.github.io/blog/posts/4_bit_Quantization_with_GPTQ.html).

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
import torch
import random
from transformers import AutoTokenizer

# GPTQ hyper-parameters exposed as Colab form fields; they are passed straight
# into BaseQuantizeConfig below.
BITS = 4 # @param {type:"integer"}
GROUP_SIZE = 128 # @param {type:"integer"}
DAMP_PERCENT = 0.01 # @param {type:"number"}

# Load quantize config, model and tokenizer
quantize_config = BaseQuantizeConfig(
    bits=BITS,
    group_size=GROUP_SIZE,
    damp_percent=DAMP_PERCENT,
    desc_act=False,
)
# Rebinds the global `model`/`tokenizer` names to the GPTQ-wrapped model and a
# fresh tokenizer for MODEL_ID.
model = AutoGPTQForCausalLM.from_pretrained(MODEL_ID, quantize_config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load data and tokenize examples
# Takes the first n_samples*5 rows of one C4 training shard, joins their text
# with blank lines and tokenizes the result as one long sequence; calibration
# windows are cut from it in a later cell.
n_samples = 1024
data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")
tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Size of dimension 1 of the tokenized calibration corpus
# (presumably the total token count, i.e. input_ids is (1, n_tokens) — confirm).
tokenized_data.input_ids.shape[1]

2341884

In [None]:
# Maximum sequence length reported by the tokenizer.
# NOTE(review): the recorded value (~1e30) looks like an "unset" sentinel
# rather than a usable window size — confirm before using it for slicing.
tokenizer.model_max_length

1000000000000000019884624838656

In [None]:
# Corpus length minus the tokenizer's reported maximum length. The recorded
# result is hugely negative, i.e. the reported maximum far exceeds the corpus.
tokenized_data.input_ids.shape[1] - tokenizer.model_max_length

-1000000000000000019884622496772

In [None]:
# Format tokenized examples
# Cut n_samples random fixed-length windows out of the tokenized corpus to use
# as GPTQ calibration examples.
#
# tokenizer.model_max_length cannot be used directly as the window length: for
# this tokenizer it is a ~1e30 "unset" sentinel. The previous code also had the
# randint operands reversed, so start offsets were drawn from [0, ~1e30) —
# far past the end of the corpus — and the resulting slices carried no real
# calibration text. Cap the window at a realistic length instead.
MAX_CALIBRATION_SEQ_LEN = 2048  # tokens per calibration example; tune as needed

total_tokens = tokenized_data.input_ids.shape[1]
seq_len = min(tokenizer.model_max_length, MAX_CALIBRATION_SEQ_LEN, total_tokens)

examples_ids = []
for _ in range(n_samples):
    # randint is inclusive on both ends, so the last valid start offset is
    # total_tokens - seq_len (window ends exactly at the end of the corpus).
    i = random.randint(0, total_tokens - seq_len)
    j = i + seq_len
    input_ids = tokenized_data.input_ids[:, i:j]
    # Every position is a real token (no padding), so the mask is all ones.
    attention_mask = torch.ones_like(input_ids)
    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})


In [None]:
%%time

# The %%time cell magic above must stay on the first line of the cell; it
# reports CPU and wall time for the whole quantize + save + upload run.

# Quantize with GPTQ
# Uses the calibration examples built in the previous cell.
model.quantize(
    examples_ids,
    batch_size=128,
    use_triton=True,
)

# Save model and tokenizer
# MODEL_ID contains a "/", so save_folder is a nested relative path
# (<namespace>/<model>-GPTQ) under the current working directory.
save_folder = MODEL_ID + "-GPTQ"
model.save_quantized(save_folder, use_safetensors=True)
tokenizer.save_pretrained(save_folder)

# Create model card
# Start from the source model's card, add tags, and write it into the folder
# that is uploaded below.
card = ModelCard.load(MODEL_ID)
card.data.tags.append("autoquant")
card.data.tags.append("gptq")
card.save(f'{save_folder}/README.md')

# Upload model
# exist_ok=True makes repo creation safe to re-run; the entire save_folder is
# uploaded (no allow_patterns filter).
create_repo(
    repo_id = f"{USERNAME}/{MODEL_NAME}-GPTQ",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)
api.upload_folder(
    folder_path=save_folder,
    repo_id=f"{USERNAME}/{MODEL_NAME}-GPTQ",
    token=hf_token
)

INFO - Start quantizing layer 1/32
INFO:auto_gptq.modeling._base:Start quantizing layer 1/32
INFO - Quantizing self_attn.k_proj in layer 1/32...
INFO:auto_gptq.modeling._base:Quantizing self_attn.k_proj in layer 1/32...
INFO - Quantizing self_attn.v_proj in layer 1/32...
INFO:auto_gptq.modeling._base:Quantizing self_attn.v_proj in layer 1/32...
INFO - Quantizing self_attn.q_proj in layer 1/32...
INFO:auto_gptq.modeling._base:Quantizing self_attn.q_proj in layer 1/32...
INFO - Quantizing self_attn.o_proj in layer 1/32...
INFO:auto_gptq.modeling._base:Quantizing self_attn.o_proj in layer 1/32...
INFO - Quantizing mlp.up_proj in layer 1/32...
INFO:auto_gptq.modeling._base:Quantizing mlp.up_proj in layer 1/32...
INFO - Quantizing mlp.gate_proj in layer 1/32...
INFO:auto_gptq.modeling._base:Quantizing mlp.gate_proj in layer 1/32...
INFO - Quantizing mlp.down_proj in layer 1/32...
INFO:auto_gptq.modeling._base:Quantizing mlp.down_proj in layer 1/32...
INFO - Start quantizing layer 2/32
INFO:

gptq_model-4bit-128g.safetensors:   0%|          | 0.00/5.74G [00:00<?, ?B/s]

CPU times: user 32min 35s, sys: 1min 11s, total: 33min 46s
Wall time: 19min 24s


CommitInfo(commit_url='https://huggingface.co/jcorenday/Daredevil-8B-GPTQ/commit/ae12ad49df545f4bc0b85757c6f4914d10d2f370', commit_message='Upload folder using huggingface_hub', commit_description='', oid='ae12ad49df545f4bc0b85757c6f4914d10d2f370', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Reload model and tokenizer
# Rebinds the global `model`/`tokenizer` to the quantized artifacts saved in
# save_folder, so the following generation cell exercises the GPTQ model.
model = AutoGPTQForCausalLM.from_quantized(
    save_folder,
    device=device,
    use_triton=True,
    use_safetensors=True,
)
tokenizer = AutoTokenizer.from_pretrained(save_folder)

1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.
INFO:auto_gptq.modeling._base:The layer lm_head is not quantized.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import time
from transformers import pipeline

# Initialize the generator pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Start the timer
# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for measuring an elapsed interval.
start_time = time.perf_counter()

# Perform the text generation
result = generator("I am a Filipino and", do_sample=True, max_length=50)[0]['generated_text']

# Stop the timer
end_time = time.perf_counter()

# Calculate the elapsed time
# Deliberately not named `runtime`: that name is the google.colab module
# imported in the setup cell, and rebinding it to a float would break any
# later use of it.
elapsed_seconds = end_time - start_time

# Print the result and the elapsed time
print(f"Generated Text: {result}")
print(f"Runtime: {elapsed_seconds:.4f} seconds")


The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalL

Generated Text: I am a Filipino and/or ulus FiorEqualityCompareruttlehevoadycastle herself螺essaomer Gioholdiram族自治lite Franklingow �reyavouritesjonktopináлуги93ymebrig ourselves cinsFormatExceptionoenardy#adonomyeping Bryثیر_TScasecmpwickATRIXレス Trib
Runtime: 5.7416 seconds


In [None]:
# Print the current working directory; the relative paths used by the
# ExLlamaV2 cells (base_model, quant, exllamav2/) are resolved against it.
!pwd

/content


In [None]:
# @title # 🦙 ExLlamaV2

# @markdown Learn more about ExLlamaV2 in [this article](https://mlabonne.github.io/blog/posts/ExLlamaV2_The_Fastest_Library_to_Run%C2%A0LLMs.html).

MODEL_NAME = "Daredevil-8B"
BPW = 4.0 # @param {type:"number"}

# Install ExLlamaV2
!git lfs install
!git clone https://huggingface.co/mlabonne/{MODEL_NAME}
!mv {MODEL_NAME} base_model
!rm base_mode/*.bin

# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet


Git LFS initialized.
Cloning into 'Daredevil-8B'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (89/89), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 92 (delta 41), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (92/92), 2.26 MiB | 6.17 MiB/s, done.
Filtering content: 100% (4/4), 2.95 GiB | 13.56 MiB/s, done.
Encountered 3 file(s) that may not have been copied correctly on Windows:
	model-00002-of-00004.safetensors
	model-00003-of-00004.safetensors
	model-00001-of-00004.safetensors

See: `git lfs help smudge` for more details.
rm: cannot remove 'base_mode/*.bin': No such file or directory
--2024-07-10 18:49:37--  https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet
Resolving huggingface.co (huggingface.co)... 18.164.174.55, 18.164.174.17, 18.164.174.118, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.55|:443... connec

In [None]:
# Quantize model
# Runs ExLlamaV2's convert.py with base_model as input (-i), quant/ as the
# output directory (-o), the wikitext parquet file as the -c dataset and BPW
# as the -b value.
# NOTE(review): `mkdir quant` reports an error if the directory already
# exists from a previous run.
!mkdir quant
!python exllamav2/convert.py \
    -i base_model \
    -o quant \
    -c wikitext-test.parquet \
    -b {BPW}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 -- model.layers.2.self_attn.v_proj                    1:4b_64g s4                                        4.09 bpw
 -- model.layers.2.self_attn.v_proj                    1:4b_32g s4                                        4.16 bpw
 -- model.layers.2.self_attn.v_proj                    0.1:5b_64g/0.9:4b_64g s4                           4.20 bpw
 -- model.layers.2.self_attn.v_proj                    0.1:5b_32g/0.9:4b_32g s4                           4.26 bpw
 -- model.layers.2.self_attn.v_proj                    1:5b_64g s4                                        5.09 bpw
 -- model.layers.2.self_attn.v_proj                    1:5b_32g s4                                        5.16 bpw
 -- model.layers.2.self_attn.v_proj                    1:6b_128g s4                                       6.06 bpw
 -- model.layers.2.self_attn.v_proj                    1:6b_32g s4                                        6.16 bpw
 -- model.layer

In [None]:
# Copy files
# Remove the out_tensor directory left in quant/ (presumably intermediate
# conversion data — confirm), then copy every file from base_model that is
# neither a *.safetensors weight file nor a hidden dotfile into quant/, so the
# quantized folder also carries the config and tokenizer files.
!rm -rf quant/out_tensor
!rsync -av --exclude='*.safetensors' --exclude='.*' ./base_model/ ./quant/

sending incremental file list
./
README.md
config.json
generation_config.json
model.safetensors.index.json
special_tokens_map.json
tokenizer.json
tokenizer_config.json

sent 9,173,563 bytes  received 152 bytes  18,347,430.00 bytes/sec
total size is 9,170,766  speedup is 1.00


In [None]:
# Run model
# Smoke-test the quantized model in quant/ with the same prompt used for the
# other backends; the script reports load time and tokens/second itself.
!python exllamav2/test_inference.py -m quant/ -p "I am a Filipino and"

 -- Model: quant/
 -- Options: []
 -- Loading model...
 -- Loaded model in 2.1017 seconds
 -- Loading tokenizer...
 -- Warmup...
 -- Generating...

I am a Filipino and my native language is Tagalog.
I love to write and share my thoughts and experiences with others. I also enjoy reading books, watching movies, and learning new things.
I am a devout Catholic and I strive to live my faith every day.
I am a member of the Filipino community here in the United States and I am proud to be part of it.
I hope that through this blog, I can share my culture, my faith, and my experiences with others and learn from them as well.
I am excited to start this journey and I hope you will join me along the way. Mabuhay! (Long live!)

 -- Response generated in 2.53 seconds, 128 tokens, 50.59 tokens/second (includes prompt eval.)


In [None]:

# Download dataset
!wget https://huggingface.co/datasets/wikitext/resolve/9a9e482b5987f9d25b3a9b2883fc6cc9fd8071b3/wikitext-103-v1/wikitext-test.parquet

# Quantize model
save_folder = "/content/Daredevil-8B" + "-EXL2"
!mkdir {save_folder}
!python exllamav2/convert.py \
    -i base_model \
    -o {save_folder} \
    -c wikitext-test.parquet \
    -b {BPW}

# Copy files
!rm -rf quant/out_tensor
!rsync -av --exclude='*.safetensors' --exclude='.*' ./base_model/ ./{save_folder}/

# Create model card
card = ModelCard.load(MODEL_ID)
card.data.tags.append("autoquant")
card.data.tags.append("exl2")
card.save(f'{save_folder}/README.md')

# Upload model
create_repo(
    repo_id = f"{USERNAME}/{MODEL_NAME}-{BPW:.1f}bpw-exl2",
    repo_type="model",
    exist_ok=True,
    token=hf_token
)
api.upload_folder(
    folder_path=save_folder,
    repo_id=f"{USERNAME}/{MODEL_NAME}-{BPW:.1f}bpw-exl2",
    token=hf_token
)

Daredevil-8B  exllamav2  mlabonne  sample_data	wikitext-test.parquet
