In [1]:
!pip install torch transformers  datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [2]:
!pip install autoawq

Collecting autoawq
  Downloading autoawq-0.2.9.tar.gz (74 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/74.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autoawq
  Building wheel for autoawq (setup.py) ... [?25l[?25hdone
  Created wheel for autoawq: filename=autoawq-0.2.9-py3-none-any.whl size=115106 sha256=33ed8a8fa3cef6f39b3435e27850cdb3c8a173ab2a75c1d55c7f3c4c4d95526a
  Stored in directory: /root/.cache/pip/wheels/fa/31/e6/260073853a2419a05b7cd592d82db1e34abce58404854ef14d
Successfully built autoawq
Installing collected packages: autoawq
Successfully installed autoawq-0.2.9


In [6]:
!git clone https://github.com/casper-hansen/AutoAWQ
%cd AutoAWQ
!pip install .

Cloning into 'AutoAWQ'...
remote: Enumerating objects: 3627, done.[K
remote: Counting objects: 100% (877/877), done.[K
remote: Compressing objects: 100% (281/281), done.[K
remote: Total 3627 (delta 752), reused 617 (delta 592), pack-reused 2750 (from 3)[K
Receiving objects: 100% (3627/3627), 7.73 MiB | 18.76 MiB/s, done.
Resolving deltas: 100% (2247/2247), done.
/content/AutoAWQ
Processing /content/AutoAWQ
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autoawq
  Building wheel for autoawq (setup.py) ... [?25l[?25hdone
  Created wheel for autoawq: filename=autoawq-0.2.9-py3-none-any.whl size=115106 sha256=60f97a418cb89245cbaa58f036a2eeb639eb69b34033cdc6ce9f322eef5e0841
  Stored in directory: /tmp/pip-ephem-wheel-cache-2zs0nnmd/wheels/c3/0f/9b/5555f912616cad6e0005ca1b9874c446401d17099c31d3a590
Successfully built autoawq
Installing collected packages: autoawq
  Attempting uninstall: autoawq
    Found existing installation: autoawq 0.2.9


In [None]:
%env HF_TOKEN=

env: HF_TOKEN=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


In [5]:
import torch
import os
import shutil
import time
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM
from datasets import load_dataset

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model and output paths
model_id = "unsloth/Llama-3.2-1B"  # Replace with your model ID
quantized_model_dir = "llama-AWQ"

# Example texts for quantization calibration
def get_calibration_examples(num_examples=128):
    """Load example texts from C4 English dataset for quantization."""
    dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
    examples = []
    for i, example in enumerate(dataset):
        if i >= num_examples:
            break
        text = example['text'][:512]  # Limit to 512 characters
        examples.append(text)
    return examples

# Define prompts to test the model
prompts = [
    "What is the capital of France, and what is its largest city?",
    "Write a short story about a robot exploring an abandoned city.",
    "If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?",
    "Explain why the sky appears blue.",
    "What’s your favorite book, and why?"
]

def verify_model_directory(model_dir):
    """Verify that the model directory contains required files."""
    required_files = ['model.safetensors', 'config.json', 'tokenizer.json']
    return all(os.path.exists(os.path.join(model_dir, f)) for f in required_files)

def query_model(model, tokenizer, prompt, max_new_tokens=100, num_beams=5, temperature=0.5):
    """Query the model with a prompt and return the generated response with inference time."""
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
    inference_time = time.perf_counter() - start_time

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.strip(), inference_time

try:
    # Check if quantized_model_dir exists and remove it
    if os.path.exists(quantized_model_dir):
        print(f"\nDirectory '{quantized_model_dir}' already exists. Deleting to create a fresh quantized model...")
        shutil.rmtree(quantized_model_dir)

    # Load tokenizer
    print("\nLoading Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
    tokenizer.pad_token = tokenizer.eos_token

    # Get calibration examples
    print("\nLoading calibration examples...")
    examples = get_calibration_examples()

    # Quantize the model using AWQ
    print("\nQuantizing LLaMA Model with AWQ...")
    quant_config = {
        "zero_point": True,
        "q_group_size": 128,
        "w_bit": 4,
        "version": "GEMM"
    }
    quantized_model = AutoAWQForCausalLM.from_pretrained(
        model_id,
        device_map='cuda' if torch.cuda.is_available() else 'cpu',
        use_auth_token=True
    )
    quantized_model.quantize(tokenizer, quant_config=quant_config, calib_data=examples)

    # Save quantized model
    print(f"\nSaving quantized model to '{quantized_model_dir}'...")
    quantized_model.save_quantized(quantized_model_dir, safetensors=True)

    # Save tokenizer files
    print(f"\nSaving tokenizer to '{quantized_model_dir}'...")
    tokenizer.save_pretrained(quantized_model_dir)

    # Verify saved files
    print("\nVerifying saved files...")
    if not verify_model_directory(quantized_model_dir):
        raise FileNotFoundError(f"Failed to save required files in '{quantized_model_dir}'")
    saved_files = os.listdir(quantized_model_dir)
    for f in ['model.safetensors', 'config.json', 'tokenizer.json']:
        if f in saved_files:
            print(f"Found: {f}")
        else:
            print(f"Missing: {f}")

    # Measure baseline memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        baseline_memory = torch.cuda.memory_allocated() / (1024 ** 2)
        print(f"\nBaseline GPU memory usage: {baseline_memory:.2f} MB")

    # Load quantized model for testing
    print("\nLoading Quantized LLaMA Model...")
    model_awq = AutoAWQForCausalLM.from_quantized(
        quantized_model_dir,
        safetensors=True,
        device_map='auto'
    )
    torch.cuda.synchronize()
    quantized_memory = torch.cuda.memory_allocated() / (1024 ** 2)
    print(f"Quantized model loaded. Memory usage: {quantized_memory:.2f} MB")

    # Load original model for comparison
    print("\nLoading Original LLaMA Model...")
    model_original = AutoAWQForCausalLM.from_pretrained(model_id, use_auth_token=True, device_map='auto')
    torch.cuda.synchronize()
    original_memory = torch.cuda.memory_allocated() / (1024 ** 2)
    print(f"Original model loaded. Memory usage: {original_memory:.2f} MB")

    # Query both models
    for i, prompt in enumerate(prompts, 1):
        print(f"\nPrompt {i}: {prompt}")

        # Query original model
        print("Original Response:")
        response_original, time_original = query_model(model_original, tokenizer, prompt)
        print(f"Response: {response_original}")
        print(f"Inference Time: {time_original:.4f} seconds")

        # Query quantized model
        print("Quantized Response:")
        response_awq, time_awq = query_model(model_awq, tokenizer, prompt)
        print(f"Response: {response_awq}")
        print(f"Inference Time: {time_awq:.4f} seconds")

    print("\nQuantization and testing complete.")

except Exception as e:
    import traceback
    print(f"An error occurred: {str(e)}")
    traceback.print_exc()
    print("Please ensure all dependencies are installed, the model ID is correct, and you have a valid Hugging Face token.")

finally:
    # Clean up
    if 'quantized_model' in locals():
        del quantized_model
    if 'model_awq' in locals():
        del model_awq
    if 'model_original' in locals():
        del model_original
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        import gc
        gc.collect()

Using device: cuda

Loading Tokenizer...

Loading calibration examples...


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]


Quantizing LLaMA Model with AWQ...


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

AWQ: 100%|██████████| 16/16 [09:50<00:00, 36.92s/it]



Saving quantized model to 'llama-AWQ'...

Saving tokenizer to 'llama-AWQ'...

Verifying saved files...
Found: model.safetensors
Found: config.json
Found: tokenizer.json

Baseline GPU memory usage: 559.26 MB

Loading Quantized LLaMA Model...


Replacing layers...: 100%|██████████| 16/16 [00:05<00:00,  3.00it/s]


Quantized model loaded. Memory usage: 1543.51 MB

Loading Original LLaMA Model...


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]



Original model loaded. Memory usage: 3900.64 MB

Prompt 1: What is the capital of France, and what is its largest city?
Original Response:
Response: What is the capital of France, and what is its largest city? If these are questions that have been on your mind, then you have come to the right place. In this article, we will explore the answers to these questions and much more. We will also provide you with a list of other interesting facts about France and its capital, Paris. So, without further ado, let's get started!
What city is France's capital?
The answer to this question is Paris, which is also known as the "City of Light" and "The City of Love."
Inference Time: 8.7636 seconds
Quantized Response:




Response: What is the capital of France, and what is its largest city? In what state was the Battle of Gettysburg fought? How many states and the District of Columbia are there?
Inference Time: 18.6737 seconds

Prompt 2: Write a short story about a robot exploring an abandoned city.
Original Response:
Response: Write a short story about a robot exploring an abandoned city. The story should be no more than 1,000 words and should include the following elements: A robot is exploring a city that has been abandoned for many years. As the robot explores the city, it comes across a series of puzzles that must be solved in order to progress. Each puzzle has a unique solution that requires the use of a specific tool or piece of equipment.
The robot must use these tools and equipment to solve the puzzles and progress to the next stage of the game. Along the way
Inference Time: 8.2509 seconds
Quantized Response:




Response: Write a short story about a robot exploring an abandoned city. The story should be at least 1,000 words long and should include the following elements:
1. A robot is exploring a city that has been abandoned for a long period of time.
2. As the robot explores the city, it encounters various obstacles and challenges that it must overcome in order to complete its mission.
3. Throughout the story, the reader will be introduced to a variety of characters, each with their own unique personalities and motivations.
4. Each character will have a role to play
Inference Time: 13.3547 seconds

Prompt 3: If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?
Original Response:
Response: If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours? (1 mile = 5,280 feet)
A. 12,000 feet
B.   15,600 feet.
Answer: B
Explanation: The distance traveled is the product of the speed and the time.
Inference Time: 3.9012 seconds
Quantized Response:
Response: If a car tr

In [2]:
import torch
import os
import shutil
import time
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM
from datasets import load_dataset

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model and output paths
model_id = "unsloth/Llama-3.2-1B"
quantized_model_dir = "llama-AWQ"

# Example texts for quantization calibration
def get_calibration_examples(num_examples=256):  # Increased to 256
    """Load example texts from C4 English dataset for quantization."""
    dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
    examples = []
    for i, example in enumerate(dataset):
        if i >= num_examples:
            break
        text = example['text'][:512]
        examples.append(text)
    return examples

# Define prompts to test the model
prompts = [
    "What is the capital of France, and what is its largest city?",
    "Write a short story about a robot exploring an abandoned city.",
    "If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?",
    "Explain why the sky appears blue.",
    "What’s your favorite book, and why?"
]

def verify_model_directory(model_dir):
    """Verify that the model directory contains required files."""
    required_files = ['model.safetensors', 'config.json', 'tokenizer.json']
    return all(os.path.exists(os.path.join(model_dir, f)) for f in required_files)

def query_model(model, tokenizer, prompt, max_new_tokens=200, num_beams=10):  # Increased for better quality
    """Query the model with a prompt and return the generated response with inference time."""
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            pad_token_id=tokenizer.pad_token_id,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
    inference_time = time.perf_counter() - start_time

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.strip(), inference_time

try:
    # Check if quantized_model_dir exists and remove it
    if os.path.exists(quantized_model_dir):
        print(f"\nDirectory '{quantized_model_dir}' already exists. Deleting to create a fresh quantized model...")
        shutil.rmtree(quantized_model_dir)

    # Load tokenizer
    print("\nLoading Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=True)
    tokenizer.pad_token = tokenizer.eos_token

    # Get calibration examples
    print("\nLoading calibration examples...")
    examples = get_calibration_examples()

    # Quantize the model using AWQ
    print("\nQuantizing LLaMA Model with AWQ...")
    quant_config = {
        "zero_point": True,
        "q_group_size": 128,
        "w_bit": 4,
        "version": "GEMM"
    }
    quantized_model = AutoAWQForCausalLM.from_pretrained(
        model_id,
        device_map='cuda' if torch.cuda.is_available() else 'cpu',
        token=True
    )
    quantized_model.quantize(tokenizer, quant_config=quant_config, calib_data=examples)

    # Save quantized model
    print(f"\nSaving quantized model to '{quantized_model_dir}'...")
    quantized_model.save_quantized(quantized_model_dir, safetensors=True)

    # Save tokenizer files
    print(f"\nSaving tokenizer to '{quantized_model_dir}'...")
    tokenizer.save_pretrained(quantized_model_dir)

    # Verify saved files
    print("\nVerifying saved files...")
    if not verify_model_directory(quantized_model_dir):
        raise FileNotFoundError(f"Failed to save required files in '{quantized_model_dir}'")
    saved_files = os.listdir(quantized_model_dir)
    for f in ['model.safetensors', 'config.json', 'tokenizer.json']:
        if f in saved_files:
            print(f"Found: {f}")
        else:
            print(f"Missing: {f}")

    # Measure baseline memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        baseline_memory = torch.cuda.memory_allocated() / (1024 ** 2)
        print(f"\nBaseline GPU memory usage: {baseline_memory:.2f} MB")

    # Load quantized model for testing
    print("\nLoading Quantized LLaMA Model...")
    model_awq = AutoAWQForCausalLM.from_quantized(
        quantized_model_dir,
        safetensors=True,
        device_map='auto'
    )
    torch.cuda.synchronize()
    quantized_memory = torch.cuda.memory_allocated() / (1024 ** 2)
    print(f"Quantized model loaded. Memory usage: {quantized_memory:.2f} MB")

    # Load original model for comparison
    print("\nLoading Original LLaMA Model...")
    model_original = AutoAWQForCausalLM.from_pretrained(model_id, token=True, device_map='auto')
    torch.cuda.synchronize()
    original_memory = torch.cuda.memory_allocated() / (1024 ** 2)
    print(f"Original model loaded. Memory usage: {original_memory:.2f} MB")

    # Query both models
    for i, prompt in enumerate(prompts, 1):
        print(f"\nPrompt {i}: {prompt}")

        # Query original model
        print("Original Response:")
        response_original, time_original = query_model(model_original, tokenizer, prompt)
        print(f"Response: {response_original}")
        print(f"Inference Time: {time_original:.4f} seconds")

        # Query quantized model
        print("Quantized Response:")
        response_awq, time_awq = query_model(model_awq, tokenizer, prompt)
        print(f"Response: {response_awq}")
        print(f"Inference Time: {time_awq:.4f} seconds")

    print("\nQuantization and testing complete.")

except Exception as e:
    import traceback
    print(f"An error occurred: {str(e)}")
    traceback.print_exc()
    print("Please ensure all dependencies are installed, the model ID is correct, and you have a valid Hugging Face token.")

finally:
    # Clean up
    if 'quantized_model' in locals():
        del quantized_model
    if 'model_awq' in locals():
        del model_awq
    if 'model_original' in locals():
        del model_original
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        import gc
        gc.collect()

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



Using device: cuda

Directory 'llama-AWQ' already exists. Deleting to create a fresh quantized model...

Loading Tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Loading calibration examples...


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]


Quantizing LLaMA Model with AWQ...


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

AWQ: 100%|██████████| 16/16 [09:53<00:00, 37.12s/it]



Saving quantized model to 'llama-AWQ'...

Saving tokenizer to 'llama-AWQ'...

Verifying saved files...
Found: model.safetensors
Found: config.json
Found: tokenizer.json

Baseline GPU memory usage: 559.26 MB

Loading Quantized LLaMA Model...


Replacing layers...: 100%|██████████| 16/16 [00:05<00:00,  2.84it/s]


Quantized model loaded. Memory usage: 1542.51 MB

Loading Original LLaMA Model...


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

Original model loaded. Memory usage: 3900.52 MB

Prompt 1: What is the capital of France, and what is its largest city?
Original Response:
Response: What is the capital of France, and what is its largest city? If these are questions that have been on your mind, then you have come to the right place. In this article, we will provide you with all the information you need to know about France and its capital, Paris.
France is a country located in Western Europe. It has a population of over 67 million people and a total area of 643,801 square kilometers. The country is bordered by Belgium, Germany, Luxembourg, Switzerland, Italy, Spain and the Mediterranean Sea. France is also a member of the United Nations, the Council of Europe, NATO, G7 and G20. Its capital city is Paris, which is located on the Seine River. Other major cities in France include Lyon, Marseille, Toulouse, Lille, Bordeaux and Strasbourg. Paris is known as the “City of Light” because it is home to many famous landmarks, in