In [None]:
pip install torch transformers auto_gptq datasets

Collecting auto_gptq
  Downloading auto_gptq-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12=

In [None]:
pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [None]:
import os
from getpass import getpass

# Never hard-code an access token in a notebook: it ends up in the saved file,
# in the cell output, and in version control. Any token that was committed here
# previously must be treated as leaked and revoked on the Hugging Face side.
# Use the token already present in the environment, or prompt for it without echo.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

env: HF_TOKEN=<redacted>


In [None]:
import torch
import os
import shutil
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset

# Seed torch's RNG so repeated runs start from the same state
torch.manual_seed(0)

# Run on the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Source checkpoint on the Hugging Face Hub and the folder that receives the quantized copy
model_id = "unsloth/Llama-3.2-1B"  # Replace with your LLaMA model ID
quantized_model_dir = "llama-GPTQ"

# Example texts for quantization calibration
def get_calibration_examples(num_examples=128, max_chars=512):
    """Load example texts from the C4 English dataset for quantization calibration.

    The "en" configuration of ``allenai/c4`` is opened on its train split with
    ``streaming=True`` and read from the start.

    Args:
        num_examples: Number of texts to collect. Zero or a negative value
            yields an empty list.
        max_chars: Maximum number of characters kept from each text. This is a
            character cut applied before tokenization, not a token limit.

    Returns:
        list[str]: At most ``num_examples`` texts, each truncated to ``max_chars``.
    """
    from itertools import islice  # local import keeps the block self-contained

    dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
    wanted = max(num_examples, 0)  # islice rejects negative stops
    # islice stops after exactly `wanted` records, so no extra record is pulled
    # from the stream just to detect that the limit was reached.
    return [record['text'][:max_chars] for record in islice(dataset, wanted)]

# Define prompts to test the model.
# Each prompt below is sent to both the original and the quantized model so
# that their responses and inference times can be compared side by side.
prompts = [
    "What is the capital of France, and what is its largest city?",
    "Write a short story about a robot exploring an abandoned city.",
    "If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?",
    "Explain why the sky appears blue.",
    "What’s your favorite book, and why?"
]

def verify_model_directory(model_dir):
    """Return True only when ``model_dir`` holds every required file.

    The required entries are ``model.safetensors``, ``config.json`` and
    ``tokenizer.json``; the first one that is absent makes the check fail.
    """
    for filename in ('model.safetensors', 'config.json', 'tokenizer.json'):
        candidate = os.path.join(model_dir, filename)
        if not os.path.exists(candidate):
            return False
    return True

def query_model(model, tokenizer, prompt, max_new_tokens=100, num_beams=5, temperature=0.5, do_sample=False):
    """Generate text for ``prompt`` and measure how long ``generate`` takes.

    The decoding mode is passed explicitly so that every model queried through
    this helper decodes the same way instead of inheriting ``do_sample`` from
    its own generation defaults, which would make a side-by-side comparison
    unfair.

    Args:
        model: Object exposing ``generate(**kwargs)``.
        tokenizer: Callable tokenizer that also provides ``pad_token_id`` and
            ``decode``.
        prompt: Text fed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        num_beams: Beam width forwarded to ``generate``.
        temperature: Sampling temperature. It only has meaning for sampling, so
            it is forwarded only when ``do_sample`` is True.
        do_sample: When False (default) decoding is plain beam search; set to
            True to sample, which also activates ``temperature``.

    Returns:
        tuple[str, float]: The decoded first sequence returned by ``generate``
        with surrounding whitespace stripped (in the recorded runs this text
        starts with the prompt itself), and the elapsed seconds of the
        ``generate`` call.
    """
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(device)

    generation_kwargs = dict(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
    if do_sample:
        generation_kwargs['temperature'] = temperature

    # CUDA kernels run asynchronously: drain pending work before starting the
    # clock and again before stopping it, so the timer covers exactly this call.
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(**generation_kwargs)
    if cuda_ready:
        torch.cuda.synchronize()
    inference_time = time.perf_counter() - start_time

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.strip(), inference_time

import gc  # needed by the cleanup helper below


def _gpu_memory_mb():
    """Return the MB torch currently has allocated on the GPU (0.0 when CUDA is unavailable)."""
    if not torch.cuda.is_available():
        return 0.0
    torch.cuda.synchronize()  # let pending kernels finish before reading the counter
    return torch.cuda.memory_allocated() / (1024 ** 2)


def _release_gpu_memory():
    """Collect unreachable Python objects first, then release torch's cached CUDA blocks."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# End-to-end run: quantize the model, save and verify it, then load the quantized
# and the original model, report how much GPU memory each adds, and compare their
# answers on `prompts`.
try:
    # Start from a clean output directory so stale files cannot pass verification
    if os.path.exists(quantized_model_dir):
        print(f"\nDirectory '{quantized_model_dir}' already exists. Deleting to create a fresh quantized model...")
        shutil.rmtree(quantized_model_dir)

    # Token comes from the environment; None leaves the hub client on its defaults
    hf_token = os.environ.get("HF_TOKEN")

    # Prepare quantization configuration
    quantize_config = BaseQuantizeConfig(
        bits=4,          # 4-bit quantization
        group_size=128,  # Group size for quantization
        damp_percent=0.01,  # Damping factor
        desc_act=False   # Disable act-order for stability
    )

    # Load the tokenizer and the model that will be quantized. No separate
    # full-precision copy is loaded here: it would sit unused on the GPU and
    # inflate every memory reading taken later.
    print("\nLoading LLaMA Model and Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
    tokenizer.pad_token = tokenizer.eos_token  # Avoid pad token warnings
    # NOTE(review): auto_gptq 0.7.1 appears to read the legacy `use_auth_token`
    # keyword rather than `token` — confirm before renaming it here.
    quantized_model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, device_map='cuda', use_auth_token=hf_token)

    # Get calibration examples
    print("\nLoading calibration examples...")
    examples = get_calibration_examples()

    # Tokenize examples
    tokenized_examples = [tokenizer(ex, return_tensors='pt', padding=True, truncation=True).to(device) for ex in examples]

    # Quantize the model
    print("\nQuantizing LLaMA Model...")
    quantized_model.quantize(tokenized_examples, use_triton=False)  # Triton may require CUDA kernels

    # Save quantized model
    print(f"\nSaving quantized model to '{quantized_model_dir}'...")
    quantized_model.save_quantized(quantized_model_dir, use_safetensors=True, safetensors_metadata={'format': 'pt'})

    # Save tokenizer files
    print(f"\nSaving tokenizer to '{quantized_model_dir}'...")
    tokenizer.save_pretrained(quantized_model_dir)

    # Rename safetensors file to match expected name; the source name is derived
    # from the quantization settings so it follows changes to bits/group_size.
    gptq_basename = f"gptq_model-{quantize_config.bits}bit-{quantize_config.group_size}g.safetensors"
    old_safetensors = os.path.join(quantized_model_dir, gptq_basename)
    new_safetensors = os.path.join(quantized_model_dir, 'model.safetensors')
    if os.path.exists(old_safetensors):
        os.rename(old_safetensors, new_safetensors)
        print(f"Renamed '{old_safetensors}' to '{new_safetensors}' for compatibility.")

    # Verify saved files: list what is present or missing first, then fail, so
    # the report is still printed when something is absent.
    print("\nVerifying saved files...")
    saved_files = os.listdir(quantized_model_dir)
    for f in ['model.safetensors', 'config.json', 'tokenizer.json']:
        if f in saved_files:
            print(f"Found: {f}")
        else:
            print(f"Missing: {f}")
    if not verify_model_directory(quantized_model_dir):
        raise FileNotFoundError(f"Failed to save required files in '{quantized_model_dir}'")

    # Free everything the quantization step left on the GPU so the readings
    # below describe only the models loaded for the comparison.
    del quantized_model, tokenized_examples
    _release_gpu_memory()

    # Measure baseline memory
    baseline_memory = _gpu_memory_mb()
    if torch.cuda.is_available():
        print(f"\nBaseline GPU memory usage: {baseline_memory:.2f} MB")

    # Load quantized model for testing; report only what this load added
    print("\nLoading Quantized LLaMA Model...")
    model_gptq = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        use_safetensors=True,
        device_map='auto'
    )
    quantized_memory = _gpu_memory_mb() - baseline_memory
    print(f"Quantized model loaded. Memory usage: {quantized_memory:.2f} MB")

    # Load original model for comparison; again report only this model's share
    print("\nLoading Original LLaMA Model...")
    memory_before_original = _gpu_memory_mb()
    model_original = AutoModelForCausalLM.from_pretrained(model_id, token=hf_token).to(device)
    original_memory = _gpu_memory_mb() - memory_before_original
    print(f"Original model loaded. Memory usage: {original_memory:.2f} MB")

    # Query both models
    for i, prompt in enumerate(prompts, 1):
        print(f"\nPrompt {i}: {prompt}")

        # Query original model
        print("Original Response:")
        response_original, time_original = query_model(model_original, tokenizer, prompt)
        print(f"Response: {response_original}")
        print(f"Inference Time: {time_original:.4f} seconds")

        # Query quantized model
        print("Quantized Response:")
        response_gptq, time_gptq = query_model(model_gptq, tokenizer, prompt)
        print(f"Response: {response_gptq}")
        print(f"Inference Time: {time_gptq:.4f} seconds")

    print("\nQuantization and testing complete.")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure all dependencies are installed, the model ID is correct, and you have a valid Hugging Face token.")
    raise  # keep the full traceback visible instead of hiding it behind the hint

finally:
    # Clean up: drop every model reference this cell may have created (including
    # a `model` left behind by an earlier run), then release GPU memory.
    if 'model' in locals():
        del model
    if 'quantized_model' in locals():
        del quantized_model
    if 'model_gptq' in locals():
        del model_gptq
    if 'model_original' in locals():
        del model_original
    _release_gpu_memory()

Using device: cuda

Loading LLaMA Model and Tokenizer...


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]


Loading calibration examples...


README.md:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]


Quantizing LLaMA Model...


INFO - Start quantizing layer 1/16
INFO:auto_gptq.modeling._base:Start quantizing layer 1/16
INFO - Quantizing self_attn.k_proj in layer 1/16...
INFO:auto_gptq.modeling._base:Quantizing self_attn.k_proj in layer 1/16...
INFO - Quantizing self_attn.v_proj in layer 1/16...
INFO:auto_gptq.modeling._base:Quantizing self_attn.v_proj in layer 1/16...
INFO - Quantizing self_attn.q_proj in layer 1/16...
INFO:auto_gptq.modeling._base:Quantizing self_attn.q_proj in layer 1/16...
INFO - Quantizing self_attn.o_proj in layer 1/16...
INFO:auto_gptq.modeling._base:Quantizing self_attn.o_proj in layer 1/16...
INFO - Quantizing mlp.up_proj in layer 1/16...
INFO:auto_gptq.modeling._base:Quantizing mlp.up_proj in layer 1/16...
INFO - Quantizing mlp.gate_proj in layer 1/16...
INFO:auto_gptq.modeling._base:Quantizing mlp.gate_proj in layer 1/16...
INFO - Quantizing mlp.down_proj in layer 1/16...
INFO:auto_gptq.modeling._base:Quantizing mlp.down_proj in layer 1/16...
INFO - Start quantizing layer 2/16
INFO:


Saving quantized model to 'llama-GPTQ'...

Saving tokenizer to 'llama-GPTQ'...


1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.
INFO:auto_gptq.modeling._base:The layer lm_head is not quantized.


Renamed 'llama-GPTQ/gptq_model-4bit-128g.safetensors' to 'llama-GPTQ/model.safetensors' for compatibility.

Verifying saved files...
Found: model.safetensors
Found: config.json
Found: tokenizer.json

Baseline GPU memory usage: 4776.94 MB

Loading Quantized LLaMA Model...
Quantized model loaded. Memory usage: 5708.85 MB

Loading Original LLaMA Model...
Original model loaded. Memory usage: 10423.11 MB

Prompt 1: What is the capital of France, and what is its largest city?
Original Response:
Response: What is the capital of France, and what is its largest city? If these are questions that have been on your mind, then you have come to the right place. In this article, we will answer these questions and provide you with all the information you need to know about the French capital and its biggest city. We will also discuss some interesting facts about both of these cities. So, if you are interested in learning more about them, keep reading!
What Is The Capital Of France?
The capital is Pari



Response: What is the capital of France, and what is its largest city? Find the answers to these questions and more with a little help from your friends in France.
A. Paris
B. Bordeaux
C. Lyon
D. Marseille
Answer: A
Explanation: People often travel to different places and learn about their cultures. Geography Project coaches students as they choose a place to research. Then, using the map and the list of cities, place markers on the world map to show where each place is located.
Inference Time: 10.9780 seconds

Prompt 2: Write a short story about a robot exploring an abandoned city.
Original Response:
Response: Write a short story about a robot exploring an abandoned city. The story should have a beginning, middle, and end. You can use any setting you like, but the story must take place in a city that has been abandoned for at least 10 years. Your story can be as long or as short as you want. It is up to you to decide how long it should be.
Inference Time: 2.5769 seconds
Quantized Resp