<a href="https://colab.research.google.com/github/Avinashhmavi/hack/blob/main/Qunatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
## LLM.int8()

In [2]:
import torch

def llm_int8_quantization(input_tensor: torch.Tensor,
                          weights: torch.Tensor,
                          alpha: float) -> torch.Tensor:
    """
    Quantizes the input and weight matrices using a mixed-precision scheme:
      - Small magnitude features (columns where the max absolute value is <= alpha)
        are quantized using vector-wise (row-wise for input and column-wise for weights)
        quantization.
      - Large magnitude features (columns with max abs > alpha) are computed in float16.
    The results are then summed to obtain the final output.

    Args:
        input_tensor (torch.Tensor): Input tensor of shape (sequence_length, feature_dim)
        weights (torch.Tensor): Weight matrix of shape (feature_dim, output_dim)
        alpha (float): Threshold to distinguish between small and large magnitude columns.

    Returns:
        torch.Tensor: Output tensor of shape (sequence_length, output_dim)
    """
    # Determine which columns (features) are "small" vs. "large"
    max_abs_per_col = torch.max(torch.abs(input_tensor), dim=0).values
    small_mask = max_abs_per_col <= alpha
    large_mask = max_abs_per_col > alpha
    print(small_mask)
    print(large_mask)

    # Split the input and weight matrices accordingly
    input_small = input_tensor[:, small_mask]
    weights_small = weights[small_mask, :]

    input_large = input_tensor[:, large_mask]
    weights_large = weights[large_mask, :]

    # For the input: perform row-wise quantization.
    # Compute the row-wise (per sample) scale factor for the small features.
    scale_input = torch.max(torch.abs(input_small), dim=1, keepdim=True).values  # Shape: (sequence_length, 1)
    print(f"Scale input: {scale_input.shape}")
    scale_input[scale_input == 0] = 1.0
    input_small_quant = torch.round((input_small / scale_input) * 127).to(torch.int8)
    print(f"Input Small: {input_small_quant.shape}")

    # For the weights: perform column-wise quantization.
    scale_weights = torch.max(torch.abs(weights_small), dim=0, keepdim=True).values  # Shape: (1, output_dim)
    scale_weights[scale_weights == 0] = 1.0
    weights_small_quant = torch.round((weights_small / scale_weights) * 127).to(torch.int8)
    print(f"Weight Small: {weights_small_quant.shape}")

    # Matrix multiplication with quantized values.
    result_small_int32 = torch.matmul(input_small_quant.to(torch.int32),
                                        weights_small_quant.to(torch.int32))
    # Dequantize: the effective scale is the product of the input and weight scales,
    result_small = result_small_int32.to(torch.float32) * (scale_input * scale_weights / (127 * 127))
    print(f"Result Small: {result_small.shape}")

    # Processing for Large Features
    # For columns with large magnitude, we use float16 for higher precision.
    input_large_fp16 = input_large.to(torch.float16)
    print(f"Input large: {input_large_fp16.shape}")
    weights_large_fp16 = weights_large.to(torch.float16)
    print(f"Weight large: {weights_large_fp16.shape}")
    result_large = torch.matmul(input_large_fp16, weights_large_fp16).to(torch.float32)

    # Combine the Results
    output = result_small + result_large
    return output

if __name__ == "__main__":
    # Dummy data: a sequence_length of 4 with 10 features and a weight matrix for 5 output neurons.
    sequence_length, feature_dim, output_dim = 4, 10, 5
    torch.manual_seed(0)
    input_tensor = torch.randn(sequence_length, feature_dim)
    weights = torch.randn(feature_dim, output_dim)
    alpha = 1.5  # Threshold for distinguishing large vs. small magnitude features
    output = llm_int8_quantization(input_tensor, weights, alpha)
    output_complete = torch.matmul(input_tensor, weights)
    #compute MSE b/w output and output_compelete
    mse = torch.mean((output - output_complete) ** 2)
    print("MSE: ", mse)

tensor([ True, False,  True, False,  True,  True,  True, False, False, False])
tensor([False,  True, False,  True, False, False, False,  True,  True,  True])
Scale input: torch.Size([4, 1])
Input Small: torch.Size([4, 5])
Weight Small: torch.Size([5, 5])
Result Small: torch.Size([4, 5])
Input large: torch.Size([4, 5])
Weight large: torch.Size([5, 5])
MSE:  tensor(1.5778e-05)


In [3]:
pip install -U bitsandbytes



In [4]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

try:
    # Load model and tokenizer
    model_id = 'gpt2'
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Print model size
    print(f"Model size: {model.get_memory_footprint():,} bytes")

    # Check if CUDA is available for quantization
    if not torch.cuda.is_available():
        print("Quantization skipped: 8-bit quantization with bitsandbytes requires a CUDA-enabled GPU.")
        print("For CPU-only environments, consider using other optimization methods or a smaller model.")
    else:
        try:
            import bitsandbytes
            # Configure 8-bit quantization
            quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

            # Load quantized model
            model_int8 = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map='auto',
                quantization_config=quantization_config
            )
            print(f"Model size Quantized: {model_int8.get_memory_footprint():,} bytes")

        except ImportError:
            print("Error: bitsandbytes package is not installed.")
            print("Please install it using: pip install -U bitsandbytes")
            print("Quantized model loading skipped.")

except Exception as e:
    print(f"An error occurred: {str(e)}")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Model size: 510,342,192 bytes
Model size Quantized: 176,527,896 bytes


In [5]:
### GPTQ Algorithm

In [6]:
import torch

class HessianApproximator:
    def __init__(self, hidden_dim):
        """
        Initialize Hessian approximation matrix H.
        Args:
        - hidden_dim (int): Size of the input activation vectors.
        """
        self.H = torch.zeros(hidden_dim, hidden_dim)  # Initialize Hessian approx
        self.nsamples = 0  # Track number of samples seen

    def update(self, activations):
        """
        Update the Hessian approximation using a batch of activations.
        Args:
        - activations (torch.Tensor): Shape (batch_size, hidden_dim)
        """
        batch_size = activations.shape[0]

        # Normalize activation input
        scale = torch.sqrt(torch.tensor(2.0 / (self.nsamples + batch_size)))
        activations = scale * activations.float()

        # Compute outer product
        self.H *= self.nsamples / (self.nsamples + batch_size)  # Decay old information
        self.H += activations.T @ activations  # Accumulate new information

        # Update sample count
        self.nsamples += batch_size

    def get_hessian(self):
        """
        Returns the approximated Hessian matrix.
        """
        return self.H

# Example Usage
hidden_dim = 128  # Example hidden size
hessian_approx = HessianApproximator(hidden_dim)

# Simulated activations (batch of size 32)
for _ in range(10):  # Simulate 10 batches
    activations = torch.randn(32, hidden_dim)  # Random activations
    hessian_approx.update(activations)
print(hessian_approx.get_hessian())

tensor([[ 1.9952, -0.1386,  0.1125,  ..., -0.2000,  0.0804,  0.0931],
        [-0.1386,  1.9721,  0.0205,  ...,  0.0940, -0.0426, -0.1067],
        [ 0.1125,  0.0205,  1.9457,  ..., -0.1442, -0.0173,  0.0911],
        ...,
        [-0.2000,  0.0940, -0.1442,  ...,  2.1793, -0.0777,  0.0616],
        [ 0.0804, -0.0426, -0.0173,  ..., -0.0777,  2.0420,  0.1914],
        [ 0.0931, -0.1067,  0.0911,  ...,  0.0616,  0.1914,  1.8760]])


In [7]:
### Quantization

In [8]:
import torch
import torch.nn as nn

def compute_hessian_inverse(H):
    """
    Compute the inverse of the Hessian matrix with handling for numerical stability.

    Args:
        H (torch.Tensor): Approximated Hessian matrix (in_features x in_features).

    Returns:
        torch.Tensor: Inverse of the Hessian matrix.
    """
    try:
        # Add small diagonal perturbation for numerical stability
        epsilon = 1e-6
        H_reg = H + torch.eye(H.shape[0], device=H.device) * epsilon
        Hinv = torch.linalg.inv(H_reg)
        return Hinv
    except torch.linalg.LinAlgError:
        # Fallback to pseudo-inverse if inversion fails
        return torch.linalg.pinv(H)

def zero_point_quantization(w, n_bits=8):
    """
    Perform zero-point quantization on a weight vector.

    Args:
        w (torch.Tensor): Weight vector to quantize.
        n_bits (int): Number of bits for quantization (default: 8).

    Returns:
        torch.Tensor: Quantized weight vector.
    """
    # Compute min and max of the weights
    w_min, w_max = w.min(), w.max()

    # Calculate scale and zero point
    q_min, q_max = -(2**(n_bits-1)), 2**(n_bits-1) - 1
    scale = (w_max - w_min) / (q_max - q_min) if w_max != w_min else 1.0
    zero_point = q_min - w_min / scale if scale != 0 else 0.0

    # Quantize
    q = torch.clamp(torch.round(w / scale + zero_point), q_min, q_max)

    # Dequantize back to float
    q = (q - zero_point) * scale
    return q

def iterative_quantization(layer, H, device='cpu'):
    """
    Args:
        layer (torch.nn.Module): The layer to be quantized. Its weight is expected to be 2D
                                 with shape (out_features, in_features).
        H (torch.Tensor): Approximated Hessian matrix (in_features x in_features).
        device (str or torch.device): Device for computations (e.g., 'cpu' or 'cuda').

    Returns:
        torch.nn.Module: The same layer with its weights quantized.
    """
    # Ensure device consistency
    device = torch.device(device)
    layer = layer.to(device)
    H = H.to(device)

    # Clone and convert the weight matrix to float
    W = layer.weight.data.clone().float()  # Shape: (out_features, in_features)

    # Handle potential "dead" weights by zeroing out columns where the Hessian diagonal is zero
    dead = torch.diag(H) == 0
    H[dead, dead] = 1
    W[:, dead] = 0

    # Compute the inverse Hessian for error compensation
    Hinv = compute_hessian_inverse(H)

    # Prepare an empty tensor for storing quantized weights
    Q = torch.zeros_like(W)

    # Number of columns (i.e., in_features)
    n_cols = W.shape[1]

    # Iteratively quantize each column
    for i in range(n_cols):
        # Get the current weight column
        w = W[:, i]

        # Quantize the column using the provided quantizer
        q = zero_point_quantization(w)
        Q[:, i] = q

        # Use the Hessian inverse to compute a sensitivity factor
        d = Hinv[i, i] if Hinv[i, i] != 0 else 1.0  # Avoid division by zero

        # Compute the quantization error (scaled by the sensitivity)
        error = (w - q) / d

        # Adjust subsequent columns based on error compensation
        if i < n_cols - 1:
            # Hinv[i, i+1:] is a row vector containing sensitivity info for later columns
            Hinv_damp = Hinv[i, i+1:] * 0.1
            # Compute compensation: error (out_features,) needs to be applied to each subsequent column
            # We outer product error with Hinv_damp to get a matrix of shape (out_features, n_cols-i-1)
            compensation = torch.outer(error, Hinv_damp)
            # Subtract the compensation from all subsequent columns
            W[:, i+1:] = W[:, i+1:] - compensation

    # Update the layer's weights with quantized values
    layer.weight.data = Q

    # Print the quantized weight shape for verification
    print("Quantized weight shape:", Q.shape)

    return layer

if __name__ == "__main__":
    # Set random seed for reproducibility
    torch.manual_seed(0)

    # Create a sample linear layer
    out_features, in_features = 10, 5
    layer = nn.Linear(in_features, out_features)

    # Create a synthetic Hessian matrix (positive semi-definite)
    H = torch.randn(in_features, in_features)
    H = H @ H.t() + torch.eye(in_features) * 0.1  # Ensure positive definite

    # Call the quantization function
    device = 'cpu'  # Use 'cuda' if GPU is available
    quantized_layer = iterative_quantization(layer, H, device=device)

    # Print the first few quantized weights for verification
    print("Sample quantized weights:\n", quantized_layer.weight.data[:3, :3])

Quantized weight shape: torch.Size([10, 5])
Sample quantized weights:
 tensor([[-0.0040,  0.2403, -0.3668],
        [ 0.1191, -0.0077,  0.3558],
        [-0.1362, -0.0887, -0.4272]])


In [9]:
### quant

In [10]:
pip install datasets



In [11]:
pip install auto-gptq



In [12]:
import random
import torch
from transformers import AutoTokenizer

try:
    from datasets import load_dataset
except ImportError:
    print("Error: datasets package is not installed.")
    print("Please install it using: pip install datasets")
    print("Exiting due to missing dependency.")
    exit(1)

try:
    from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
except ImportError:
    print("Error: auto_gptq package is not installed.")
    print("Please install it using: pip install auto-gptq")
    print("If you need GPU support with Triton, ensure you have a CUDA-enabled GPU and install the appropriate dependencies.")
    print("Exiting due to missing dependency.")
    exit(1)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define base model and output directory
model_id = "gpt2"
out_dir = model_id + "-GPTQ"

try:
    if not torch.cuda.is_available():
        print("Error: GPTQ quantization requires a CUDA-enabled GPU.")
        print("Quantization skipped. Please run this script on a machine with a compatible GPU.")
    else:
        # Load quantize config, model, and tokenizer
        quantize_config = BaseQuantizeConfig(
            bits=4,
            group_size=128,
            damp_percent=0.01,
            desc_act=False,
        )
        model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        # Load data and tokenize examples
        n_samples = 1024
        data = load_dataset("allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split=f"train[:{n_samples*5}]")
        tokenized_data = tokenizer("\n\n".join(data['text']), return_tensors='pt')

        # Format tokenized examples
        examples_ids = []
        for _ in range(n_samples):
            i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)
            j = i + tokenizer.model_max_length
            input_ids = tokenized_data.input_ids[:, i:j]
            attention_mask = torch.ones_like(input_ids)
            examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})

        # Quantize with GPTQ
        model.quantize(
            examples_ids,
            batch_size=1,
            use_triton=True,  # Triton is GPU-only, safe since we checked CUDA
        )

        # Save model and tokenizer
        model.save_quantized(out_dir, use_safetensors=True)
        tokenizer.save_pretrained(out_dir)
        print(f"Quantized model and tokenizer saved to {out_dir}")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure all dependencies are installed and your environment supports the requested operations.")

  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)


Using device: cuda


Token indices sequence length is longer than the specified maximum sequence length for this model (2441065 > 1024). Running this sequence through the model will result in indexing errors
INFO - Start quantizing layer 1/12
INFO:auto_gptq.modeling._base:Start quantizing layer 1/12
INFO - Quantizing attn.c_attn in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing attn.c_attn in layer 1/12...
INFO - Quantizing attn.c_proj in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing attn.c_proj in layer 1/12...
INFO - Quantizing mlp.c_fc in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing mlp.c_fc in layer 1/12...
INFO - Quantizing mlp.c_proj in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing mlp.c_proj in layer 1/12...
INFO - Start quantizing layer 2/12
INFO:auto_gptq.modeling._base:Start quantizing layer 2/12
INFO - Quantizing attn.c_attn in layer 2/12...
INFO:auto_gptq.modeling._base:Quantizing attn.c_attn in layer 2/12...
INFO - Quantizing attn.c_proj in layer 2/12...
INF

Quantized model and tokenizer saved to gpt2-GPTQ


In [13]:
### Comparing models

In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Sample text for perplexity calculation
sample_text = "The quick brown fox jumps over the lazy dog."

def calculate_perplexity(model, text, tokenizer):
    """
    Calculate the perplexity of a model on given text.

    Args:
        model: The language model (e.g., GPT-2).
        text (str): The input text to evaluate.
        tokenizer: The tokenizer corresponding to the model.

    Returns:
        float: The perplexity score.
    """
    # Encode the text
    encodings = tokenizer(text, return_tensors='pt').to(device)
    # Define input_ids and target_ids
    input_ids = encodings.input_ids
    target_ids = input_ids.clone()
    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
    # Loss calculation
    neg_log_likelihood = outputs.loss
    # Perplexity calculation
    ppl = torch.exp(neg_log_likelihood)
    return ppl

try:
    # Load tokenizer
    model_id = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Load original GPT-2 model
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    ppl = calculate_perplexity(model, sample_text, tokenizer)

    # Handle INT8 quantized model
    ppl_int8 = None
    if not torch.cuda.is_available():
        print("INT8 model loading skipped: bitsandbytes requires a CUDA-enabled GPU.")
    else:
        try:
            from transformers import BitsAndBytesConfig
            quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)
            model_int8 = AutoModelForCausalLM.from_pretrained(
                model_id,
                device_map='auto',
                quantization_config=quantization_config
            )
            ppl_int8 = calculate_perplexity(model_int8, sample_text, tokenizer)
        except ImportError:
            print("Error: bitsandbytes package is not installed for INT8 quantization.")
            print("Please install it using: pip install -U bitsandbytes")
        except Exception as e:
            print(f"Error loading INT8 model: {str(e)}")

    # Handle GPTQ quantized model
    ppl_gptq = None
    if not torch.cuda.is_available():
        print("GPTQ model loading skipped: Requires a CUDA-enabled GPU.")
    else:
        try:
            from auto_gptq import AutoGPTQForCausalLM
            model_gptq = AutoGPTQForCausalLM.from_quantized(
                model_id + "-GPTQ",
                use_safetensors=True,
                device_map='auto'
            )
            ppl_gptq = calculate_perplexity(model_gptq, sample_text, tokenizer)
        except ImportError:
            print("Error: auto_gptq package is not installed for GPTQ quantization.")
            print("Please install it using: pip install auto-gptq")
        except Exception as e:
            print(f"Error loading GPTQ model: {str(e)}")

    # Print results
    print(f"Original Model perplexity: {ppl.item():.2f}")
    if ppl_int8 is not None:
        print(f"INT8 perplexity:    {ppl_int8.item():.2f}")
    else:
        print("INT8 perplexity:    Not calculated (missing GPU or dependencies)")
    if ppl_gptq is not None:
        print(f"GPTQ perplexity:  {ppl_gptq.item():.2f}")
    else:
        print("GPTQ perplexity:  Not calculated (missing GPU or dependencies)")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure all dependencies are installed and models are correctly set up.")

Using device: cuda


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.
INFO:auto_gptq.modeling._base:The layer lm_head is not quantized.


Original Model perplexity: 162.47
INT8 perplexity:    173.23
GPTQ perplexity:  180.87


In [15]:
### know the efficiency in quantized model

In [18]:
import torch
import time
import os
import psutil
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model paths
model_id = "gpt2"
quantized_model_dir = "gpt2-GPTQ"

# Sample text for inference and perplexity
sample_text = "The quick brown fox jumps over the lazy dog."
max_new_tokens = 50  # Number of tokens to generate for inference speed test

def get_model_size(model_dir):
    """Calculate the total size of model files in MB."""
    total_size = 0
    for dirpath, _, filenames in os.walk(model_dir):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size / (1024 ** 2)  # Convert to MB

def get_memory_usage():
    """Get current GPU memory usage in MB (if CUDA) or CPU memory in MB."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        allocated = torch.cuda.memory_allocated() / (1024 ** 2)  # Convert to MB
        reserved = torch.cuda.memory_reserved() / (1024 ** 2)  # Convert to MB
        return allocated  # Use allocated memory for consistency
    else:
        process = psutil.Process()
        return process.memory_info().rss / (1024 ** 2)  # Convert to MB

def measure_inference_time(model, tokenizer, text, max_new_tokens, num_runs=10, is_gptq=False):
    """Measure average inference time for text generation."""
    # Tokenize input with attention mask
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Warm-up run
    with torch.no_grad():
        if is_gptq:
            # For GPTQ model, use generate with explicit kwargs
            _ = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens)
        else:
            _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens)

    times = []
    for _ in range(num_runs):
        start_time = time.time()
        with torch.no_grad():
            if is_gptq:
                _ = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens)
            else:
                _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens)
        times.append(time.time() - start_time)

    return np.mean(times), np.std(times)

def calculate_perplexity(model, text, tokenizer):
    """Calculate perplexity of the model on given text."""
    encodings = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    target_ids = input_ids.clone()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=target_ids)
    neg_log_likelihood = outputs.loss
    ppl = torch.exp(neg_log_likelihood)
    return ppl.item()

try:
    # Load tokenizer and set pad token
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token to avoid warnings

    # Clear GPU memory before starting
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    # --- Original Model ---
    print("\nLoading Original GPT-2 Model...")
    # Measure memory before loading
    memory_before = get_memory_usage()

    # Load model
    model_original = AutoModelForCausalLM.from_pretrained(model_id).to(device)

    # Measure memory after loading
    memory_after = get_memory_usage()
    memory_usage_original = memory_after - memory_before
    print(f"Original Model Memory Usage: {memory_usage_original:.2f} MB")

    # Measure inference time
    avg_time_original, std_time_original = measure_inference_time(
        model_original, tokenizer, sample_text, max_new_tokens, is_gptq=False
    )
    print(f"Original Model Inference Time: {avg_time_original:.4f} ± {std_time_original:.4f} seconds")

    # Calculate perplexity
    ppl_original = calculate_perplexity(model_original, sample_text, tokenizer)
    print(f"Original Model Perplexity: {ppl_original:.2f}")

    # Clear GPU memory
    del model_original
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

    # --- Quantized GPTQ Model ---
    print("\nLoading Quantized GPT-2 Model...")
    # Measure memory before loading
    memory_before = get_memory_usage()

    # Load quantized model
    model_gptq = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        use_safetensors=True,
        device_map='auto'
    )

    # Measure memory after loading
    memory_after = get_memory_usage()
    memory_usage_gptq = memory_after - memory_before
    print(f"GPTQ Model Memory Usage: {memory_usage_gptq:.2f} MB")

    # Measure inference time
    avg_time_gptq, std_time_gptq = measure_inference_time(
        model_gptq, tokenizer, sample_text, max_new_tokens, is_gptq=True
    )
    print(f"GPTQ Model Inference Time: {avg_time_gptq:.4f} ± {std_time_gptq:.4f} seconds")

    # Calculate perplexity
    ppl_gptq = calculate_perplexity(model_gptq, sample_text, tokenizer)
    print(f"GPTQ Model Perplexity: {ppl_gptq:.2f}")

    # --- Model Size Comparison ---
    original_model_size = get_model_size(model_id) if os.path.exists(model_id) else "Unknown (download size)"
    quantized_model_size = get_model_size(quantized_model_dir)
    print(f"\nOriginal Model Size: {original_model_size} MB")
    print(f"Quantized Model Size: {quantized_model_size:.2f} MB")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure the quantized model exists in 'gpt2-GPTQ' and all dependencies are installed.")

finally:
    # Clean up
    if 'model_gptq' in locals():
        del model_gptq
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Using device: cuda

Loading Original GPT-2 Model...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original Model Memory Usage: 487.47 MB


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 wh

Original Model Inference Time: 1.2108 ± 0.6665 seconds
Original Model Perplexity: 162.47

Loading Quantized GPT-2 Model...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GPTQ Model Memory Usage: -355.53 MB


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GPTQ Model Inference Time: 1.2667 ± 0.1180 seconds
GPTQ Model Perplexity: 180.87

Original Model Size: Unknown (download size) MB
Quantized Model Size: 195.92 MB


In [21]:
import torch
import time
import os
import psutil
import numpy as np
import subprocess
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model paths
model_id = "gpt2"
quantized_model_dir = "gpt2-GPTQ"

# Sample text for inference and perplexity
sample_text = "The quick brown fox jumps over the lazy dog."
max_new_tokens = 50  # Number of tokens to generate for inference speed test

def get_model_size(model_dir):
    """Calculate the total size of model files in MB."""
    total_size = 0
    for dirpath, _, filenames in os.walk(model_dir):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size / (1024 ** 2)  # Convert to MB

def get_gpu_memory_usage():
    """Get current GPU memory usage in MB using nvidia-smi, with torch fallback."""
    if torch.cuda.is_available():
        try:
            result = subprocess.check_output(
                ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"],
                encoding='utf-8'
            )
            time.sleep(0.1)  # Ensure measurement stability
            return float(result.strip())  # Memory used in MB
        except Exception as e:
            print(f"nvidia-smi failed: {e}, falling back to torch")
            torch.cuda.synchronize()
            return torch.cuda.memory_allocated() / (1024 ** 2)
    else:
        process = psutil.Process()
        return process.memory_info().rss / (1024 ** 2)

def reset_gpu_memory():
    """Clear GPU memory and reset stats."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        gc.collect()
        torch.cuda.synchronize()
        # Wait briefly to ensure memory is released
        time.sleep(0.1)

def measure_inference_time(model, tokenizer, text, max_new_tokens, num_runs=10, is_gptq=False):
    """Measure average inference time for text generation."""
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Warm-up run
    with torch.no_grad():
        if is_gptq:
            _ = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.pad_token_id)
        else:
            _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.pad_token_id)

    times = []
    for _ in range(num_runs):
        start_time = time.time()
        with torch.no_grad():
            if is_gptq:
                _ = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.pad_token_id)
            else:
                _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.pad_token_id)
        times.append(time.time() - start_time)

    return np.mean(times), np.std(times)

def calculate_perplexity(model, text, tokenizer):
    """Calculate perplexity of the model on given text."""
    encodings = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    target_ids = input_ids.clone()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=target_ids)
    neg_log_likelihood = outputs.loss
    ppl = torch.exp(neg_log_likelihood)
    return ppl.item()

try:
    # Load tokenizer and set pad token
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token

    # --- Original Model ---
    print("\nLoading Original GPT-2 Model...")
    # Reset GPU memory
    reset_gpu_memory()
    # Measure memory before loading
    memory_before = get_gpu_memory_usage()
    print(f"Memory before loading original: {memory_before:.2f} MB")

    # Load model
    model_original = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    torch.cuda.synchronize()  # Ensure model is fully loaded

    # Measure memory after loading
    memory_after = get_gpu_memory_usage()
    memory_usage_original = memory_after - memory_before
    print(f"Original Model Memory Usage: {memory_usage_original:.2f} MB")
    print(f"Memory after loading original: {memory_after:.2f} MB")

    # Measure inference time
    avg_time_original, std_time_original = measure_inference_time(
        model_original, tokenizer, sample_text, max_new_tokens, is_gptq=False
    )
    print(f"Original Model Inference Time: {avg_time_original:.4f} ± {std_time_original:.4f} seconds")

    # Calculate perplexity
    ppl_original = calculate_perplexity(model_original, sample_text, tokenizer)
    print(f"Original Model Perplexity: {ppl_original:.2f}")

    # Clear GPU memory
    del model_original
    reset_gpu_memory()
    memory_after_clear = get_gpu_memory_usage()
    print(f"Memory after clearing original: {memory_after_clear:.2f} MB")

    # --- Quantized GPTQ Model ---
    print("\nLoading Quantized GPT-2 Model...")
    # Reset GPU memory
    reset_gpu_memory()
    # Measure memory before loading
    memory_before = get_gpu_memory_usage()
    print(f"Memory before loading GPTQ: {memory_before:.2f} MB")

    # Load quantized model
    model_gptq = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        use_safetensors=True,
        device_map='auto'
    )
    torch.cuda.synchronize()  # Ensure model is fully loaded

    # Measure memory after loading
    memory_after = get_gpu_memory_usage()
    memory_usage_gptq = memory_after - memory_before
    print(f"GPTQ Model Memory Usage: {memory_usage_gptq:.2f} MB")
    print(f"Memory after loading GPTQ: {memory_after:.2f} MB")

    # Measure inference time
    avg_time_gptq, std_time_gptq = measure_inference_time(
        model_gptq, tokenizer, sample_text, max_new_tokens, is_gptq=True
    )
    print(f"GPTQ Model Inference Time: {avg_time_gptq:.4f} ± {std_time_gptq:.4f} seconds")

    # Calculate perplexity
    ppl_gptq = calculate_perplexity(model_gptq, sample_text, tokenizer)
    print(f"GPTQ Model Perplexity: {ppl_gptq:.2f}")

    # --- Model Size Comparison ---
    original_model_size = get_model_size(model_id) if os.path.exists(model_id) else "Unknown (download size)"
    quantized_model_size = get_model_size(quantized_model_dir)
    print(f"\nOriginal Model Size: {original_model_size} MB")
    print(f"Quantized Model Size: {quantized_model_size:.2f} MB")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure the quantized model exists in 'gpt2-GPTQ' and all dependencies are installed.")

finally:
    # Clean up
    if 'model_gptq' in locals():
        del model_gptq
    reset_gpu_memory()
    print(f"Memory after final cleanup: {get_gpu_memory_usage():.2f} MB")

Using device: cuda

Loading Original GPT-2 Model...
Memory before loading original: 1114.00 MB
Original Model Memory Usage: 276.00 MB
Memory after loading original: 1390.00 MB
Original Model Inference Time: 0.4738 ± 0.0067 seconds
Original Model Perplexity: 162.47
Memory after clearing original: 1390.00 MB

Loading Quantized GPT-2 Model...


1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.
INFO:auto_gptq.modeling._base:The layer lm_head is not quantized.


Memory before loading GPTQ: 1114.00 MB
GPTQ Model Memory Usage: 18.00 MB
Memory after loading GPTQ: 1132.00 MB
GPTQ Model Inference Time: 1.2407 ± 0.1117 seconds
GPTQ Model Perplexity: 180.87

Original Model Size: Unknown (download size) MB
Quantized Model Size: 195.92 MB
Memory after final cleanup: 1132.00 MB


In [22]:
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model and tokenizer paths
model_id = "gpt2"
quantized_model_dir = "gpt2-GPTQ"

# Define prompts to test the model
prompts = [
    "What is the capital of France, and what is its largest city?",
    "Write a short story about a robot exploring an abandoned city.",
    "If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?",
    "Explain why the sky appears blue.",
    "What’s your favorite book, and why?"
]

def query_model(model, tokenizer, prompt, max_new_tokens=100, temperature=0.7, top_p=0.9):
    """Query the model with a prompt and return the generated response."""
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True  # Enable sampling for varied responses
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.strip()

try:
    # Load tokenizer and set pad token
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token  # Avoid pad token warnings

    # Load quantized model
    print("\nLoading Quantized GPT-2 Model...")
    model_gptq = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        use_safetensors=True,
        device_map='auto'
    )

    # Query the model with each prompt
    print("\nQuerying Quantized GPT-2 Model...")
    for i, prompt in enumerate(prompts, 1):
        print(f"\nPrompt {i}: {prompt}")
        response = query_model(model_gptq, tokenizer, prompt)
        print(f"Response: {response}")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure the quantized model exists in 'gpt2-GPTQ' and all dependencies are installed.")

finally:
    # Clean up
    if 'model_gptq' in locals():
        del model_gptq
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Using device: cuda


1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.
INFO:auto_gptq.modeling._base:The layer lm_head is not quantized.



Loading Quantized GPT-2 Model...

Querying Quantized GPT-2 Model...

Prompt 1: What is the capital of France, and what is its largest city?
Response: What is the capital of France, and what is its largest city? How is it known, and why is it known, in France? And how does it look?

The French capital is the capital of France. It is the capital of the French Republic, France. It is the capital of France. It is the capital of the French republic. It is the capital of France. It is the capital of France.

Let us then look at the French capital of the United States. The United States is the capital of the United States. It is

Prompt 2: Write a short story about a robot exploring an abandoned city.
Response: Write a short story about a robot exploring an abandoned city. A group of young robots is trying to find the perfect place to live. The next day, they find a small town where there is no city, and they discover a city of the robots. They decide to live in a city of the robots.

This i

In [29]:
import torch
import os
import shutil
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model and output paths
model_id = "gpt2"
quantized_model_dir = "gpt2-GPTQ"

# Example texts for quantization calibration
def get_calibration_examples(num_examples=128):
    """Load example texts from C4 English dataset for quantization."""
    dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
    examples = []
    for i, example in enumerate(dataset):
        if i >= num_examples:
            break
        text = example['text'][:512]  # Limit to 512 characters
        examples.append(text)
    return examples

try:
    # Check if quantized_model_dir exists and remove it
    if os.path.exists(quantized_model_dir):
        print(f"\nDirectory '{quantized_model_dir}' already exists. Deleting to create a fresh quantized model...")
        shutil.rmtree(quantized_model_dir)

    # Load tokenizer and model
    print("\nLoading GPT-2 Model and Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token  # Avoid pad token warnings
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

    # Prepare quantization configuration
    quantize_config = BaseQuantizeConfig(
        bits=4,          # 4-bit quantization
        group_size=128,  # Group size for quantization
        damp_percent=0.01,  # Damping factor
        desc_act=False   # Disable act-order for stability
    )

    # Get calibration examples
    print("\nLoading calibration examples...")
    examples = get_calibration_examples()

    # Tokenize examples
    tokenized_examples = [tokenizer(ex, return_tensors='pt', padding=True, truncation=True).to(device) for ex in examples]

    # Quantize the model
    print("\nQuantizing GPT-2 Model...")
    quantized_model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, device_map='cuda')
    quantized_model.quantize(tokenized_examples, use_triton=False)  # Triton may require CUDA kernels

    # Save quantized model
    print(f"\nSaving quantized model to '{quantized_model_dir}'...")
    quantized_model.save_quantized(quantized_model_dir, use_safetensors=True, safetensors_metadata={'format': 'pt'})

    # Save tokenizer files
    print(f"\nSaving tokenizer to '{quantized_model_dir}'...")
    tokenizer.save_pretrained(quantized_model_dir)

    # Rename safetensors file to match expected name
    old_safetensors = os.path.join(quantized_model_dir, 'gptq_model-4bit-128g.safetensors')
    new_safetensors = os.path.join(quantized_model_dir, 'model.safetensors')
    if os.path.exists(old_safetensors):
        os.rename(old_safetensors, new_safetensors)
        print(f"Renamed '{old_safetensors}' to '{new_safetensors}' for compatibility.")

    # Verify saved files
    print("\nVerifying saved files...")
    required_files = ['model.safetensors', 'config.json', 'tokenizer.json', 'vocab.json', 'merges.txt']
    saved_files = os.listdir(quantized_model_dir)
    for f in required_files:
        if f in saved_files:
            print(f"Found: {f}")
        else:
            print(f"Missing: {f}")
            raise FileNotFoundError(f"Failed to save {f} in '{quantized_model_dir}'")

    print(f"\nQuantization complete. Model saved to '{quantized_model_dir}'.")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure all dependencies are installed and the model ID is correct.")

finally:
    # Clean up
    if 'model' in locals():
        del model
    if 'quantized_model' in locals():
        del quantized_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        import gc
        gc.collect()

Using device: cuda

Directory 'gpt2-GPTQ' already exists. Deleting to create a fresh quantized model...

Loading GPT-2 Model and Tokenizer...

Loading calibration examples...


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]


Quantizing GPT-2 Model...


INFO - Start quantizing layer 1/12
INFO:auto_gptq.modeling._base:Start quantizing layer 1/12
INFO - Quantizing attn.c_attn in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing attn.c_attn in layer 1/12...
INFO - Quantizing attn.c_proj in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing attn.c_proj in layer 1/12...
INFO - Quantizing mlp.c_fc in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing mlp.c_fc in layer 1/12...
INFO - Quantizing mlp.c_proj in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing mlp.c_proj in layer 1/12...
INFO - Start quantizing layer 2/12
INFO:auto_gptq.modeling._base:Start quantizing layer 2/12
INFO - Quantizing attn.c_attn in layer 2/12...
INFO:auto_gptq.modeling._base:Quantizing attn.c_attn in layer 2/12...
INFO - Quantizing attn.c_proj in layer 2/12...
INFO:auto_gptq.modeling._base:Quantizing attn.c_proj in layer 2/12...
INFO - Quantizing mlp.c_fc in layer 2/12...
INFO:auto_gptq.modeling._base:Quantizing mlp.c_fc in layer 2/12...
INFO - Qu


Saving quantized model to 'gpt2-GPTQ'...

Saving tokenizer to 'gpt2-GPTQ'...
Renamed 'gpt2-GPTQ/gptq_model-4bit-128g.safetensors' to 'gpt2-GPTQ/model.safetensors' for compatibility.

Verifying saved files...
Found: model.safetensors
Found: config.json
Found: tokenizer.json
Found: vocab.json
Found: merges.txt

Quantization complete. Model saved to 'gpt2-GPTQ'.


In [24]:
ls -l gpt2-GPTQ

total 200640
-rw-r--r-- 1 root root      1237 May  5 19:07 config.json
-rw-r--r-- 1 root root 200619312 May  5 19:07 gptq_model-4bit-128g.safetensors
-rw-r--r-- 1 root root    456318 May  5 19:07 merges.txt
-rw-r--r-- 1 root root       266 May  5 19:07 quantize_config.json
-rw-r--r-- 1 root root        99 May  5 19:07 special_tokens_map.json
-rw-r--r-- 1 root root       475 May  5 19:07 tokenizer_config.json
-rw-r--r-- 1 root root   3557680 May  5 19:07 tokenizer.json
-rw-r--r-- 1 root root    798156 May  5 19:07 vocab.json


In [31]:
import torch
import os
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model and tokenizer paths
model_id = "gpt2"
quantized_model_dir = "gpt2-GPTQ"

# Define prompts to test the model
prompts = [
    "What is the capital of France, and what is its largest city?",
    "Write a short story about a robot exploring an abandoned city.",
    "If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?",
    "Explain why the sky appears blue.",
    "What’s your favorite book, and why?"
]

def verify_model_directory(model_dir):
    """Verify that the model directory contains required files."""
    required_files = ['model.safetensors', 'config.json', 'tokenizer.json', 'vocab.json', 'merges.txt']
    return all(os.path.exists(os.path.join(model_dir, f)) for f in required_files)

def query_model(model, tokenizer, prompt, max_new_tokens=100, num_beams=5, temperature=0.5):
    """Query the model with a prompt and return the generated response with inference time."""
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id,
            early_stopping=True,
            no_repeat_ngram_size=2  # Prevent repetition
        )
    inference_time = time.perf_counter() - start_time

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.strip(), inference_time

try:
    # Load tokenizer and set pad token
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token  # Avoid pad token warnings

    # Verify quantized model directory
    if not verify_model_directory(quantized_model_dir):
        raise FileNotFoundError(f"Quantized model directory '{quantized_model_dir}' is missing required files.")

    # Measure baseline memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        baseline_memory = torch.cuda.memory_allocated() / (1024 ** 2)
        print(f"Baseline GPU memory usage: {baseline_memory:.2f} MB")

    # Load quantized model
    print("\nLoading Quantized GPT-2 Model...")
    model_gptq = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        use_safetensors=True,
        device_map='auto'  # Use 'auto' for proper device mapping
    )
    torch.cuda.synchronize()
    quantized_memory = torch.cuda.memory_allocated() / (1024 ** 2)
    print(f"Quantized model loaded. Memory usage: {quantized_memory:.2f} MB")

    # Load original model
    print("\nLoading Original GPT-2 Model...")
    model_original = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    torch.cuda.synchronize()
    original_memory = torch.cuda.memory_allocated() / (1024 ** 2)
    print(f"Original model loaded. Memory usage: {original_memory:.2f} MB")

    # Query both models
    for i, prompt in enumerate(prompts, 1):
        print(f"\nPrompt {i}: {prompt}")

        # Query original model
        print("Original Response:")
        response_original, time_original = query_model(model_original, tokenizer, prompt)
        print(f"Response: {response_original}")
        print(f"Inference Time: {time_original:.4f} seconds")

        # Query quantized model
        print("Quantized Response:")
        response_gptq, time_gptq = query_model(model_gptq, tokenizer, prompt)
        print(f"Response: {response_gptq}")
        print(f"Inference Time: {time_gptq:.4f} seconds")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure the quantized model exists in 'gpt2-GPTQ' and all dependencies are installed.")

finally:
    # Clean up
    if 'model_gptq' in locals():
        del model_gptq
    if 'model_original' in locals():
        del model_original
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        import gc
        gc.collect()

Using device: cuda


1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.
INFO:auto_gptq.modeling._base:The layer lm_head is not quantized.


Baseline GPU memory usage: 185.19 MB

Loading Quantized GPT-2 Model...
Quantized model loaded. Memory usage: 316.89 MB

Loading Original GPT-2 Model...
Original model loaded. Memory usage: 805.60 MB

Prompt 1: What is the capital of France, and what is its largest city?
Original Response:




Response: What is the capital of France, and what is its largest city?

The capital, Paris, is located in the center of the country. It has a population of 1.5 million people. The city is divided into two parts: the north and the south. In the northern part, there is a city called Marseille, which was founded by the French in 1789. There is also the city of Saint-Germain, where there are two cities, the Louvre and St. Peter's Basilica. On the other hand, on the
Inference Time: 1.8228 seconds
Quantized Response:
Response: What is the capital of France, and what is its largest city?

The French capital, Paris, has a population of 1.5 million people. It is located in the heart of the French Riviera, which is a UNESCO World Heritage Site. The city is home to a number of museums, including the Louvre, the Museum of Modern Art and the National Gallery of Art in Paris. In addition to the city's museums and galleries, there is also a museum dedicated to France's history and culture, known as t