In [None]:
### Quantization of Model Using AutoGPTQ

In [None]:
pip install torch transformers auto_gptq datasets

In [None]:
pip install -U bitsandbytes

In [None]:
pip install peft accelerate

In [None]:
!pip install --upgrade datasets

In [None]:
pip install --upgrade transformers peft auto-gptq accelerate

In [None]:
%env HF_TOKEN=

In [None]:
import torch
import os
import shutil
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset

# Seed PyTorch's RNG (kept from the original "for reproducibility" intent)
torch.manual_seed(0)

# Run on CUDA when PyTorch reports a GPU, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Hub ID of the source checkpoint and the local folder for the quantized output
model_id = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"  # Replace with your LLaMA model ID
quantized_model_dir = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1-GPTQ"

# Example texts for quantization calibration
def get_calibration_examples(num_examples=128, max_chars=512):
    """Collect short text snippets from the C4 English split for calibration.

    The dataset is opened with ``streaming=True`` and iterated from the start
    of the ``train`` split.

    Args:
        num_examples: Number of snippets to return. A value <= 0 returns an
            empty list without opening the dataset.
        max_chars: Maximum number of characters kept from each record's
            ``text`` field. This is a character limit, not a token limit.

    Returns:
        A list of at most ``num_examples`` strings, each at most
        ``max_chars`` characters long.
    """
    from itertools import islice

    if num_examples <= 0:
        return []

    dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
    # islice stops after exactly num_examples records, so no extra record is
    # pulled from the stream just to discover that the limit was reached.
    return [record['text'][:max_chars] for record in islice(dataset, num_examples)]

# Define prompts to test the model
prompts = [
    # Factual question
    "What is the capital of France, and what is its largest city?",
    # Creative writing
    "Write a short story about a robot exploring an abandoned city.",
    # Simple arithmetic word problem
    "If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?",
    # Explanation request
    "Explain why the sky appears blue.",
    # Open-ended / opinion question (note: contains a typographic apostrophe)
    "What’s your favorite book, and why?"
]

def verify_model_directory(model_dir):
    """Check that ``model_dir`` contains every file needed to reload the model.

    Args:
        model_dir: Path of the directory to inspect.

    Returns:
        True only if ``model.safetensors``, ``config.json`` and
        ``tokenizer.json`` all exist as regular files directly inside
        ``model_dir``; False otherwise (including when ``model_dir`` itself
        does not exist).
    """
    required_files = ('model.safetensors', 'config.json', 'tokenizer.json')
    # isfile (not exists) so that a directory that merely shares a required
    # name is not mistaken for the artifact itself.
    return all(os.path.isfile(os.path.join(model_dir, name)) for name in required_files)

def query_model(model, tokenizer, prompt, max_new_tokens=100, num_beams=5, temperature=0.5):
    """Generate a completion for ``prompt`` and time the ``generate`` call.

    Args:
        model: Object exposing ``generate(...)``; if it has a ``device``
            attribute the inputs are moved there, otherwise the module-level
            ``device`` is used.
        tokenizer: Tokenizer used both to encode ``prompt`` and to decode the
            first generated sequence. Its ``pad_token_id`` is passed to
            ``generate``.
        prompt: Text to feed to the model.
        max_new_tokens: Forwarded to ``generate``.
        num_beams: Forwarded to ``generate``.
        temperature: Forwarded to ``generate``.
            NOTE(review): ``do_sample`` is not set here, so whether this has
            any effect depends on the model's generation config - confirm.

    Returns:
        A ``(response, inference_time)`` tuple: the decoded first sequence
        (special tokens skipped, surrounding whitespace stripped) and the
        wall-clock duration of ``generate`` in seconds.
    """
    # Follow the model's own placement when it reports one (e.g. models loaded
    # with device_map='auto'); fall back to the notebook-wide device otherwise.
    target_device = getattr(model, 'device', None)
    if target_device is None:
        target_device = device

    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to(target_device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # CUDA kernels run asynchronously; synchronize on both sides of the timed
    # region so the measurement covers the work actually done by generate().
    cuda_in_use = torch.cuda.is_available()
    if cuda_in_use:
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
    if cuda_in_use:
        torch.cuda.synchronize()
    inference_time = time.perf_counter() - start_time

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.strip(), inference_time

import gc
import traceback


def _gpu_memory_mb():
    """Return the CUDA memory currently allocated by PyTorch in MB (0.0 without CUDA)."""
    if not torch.cuda.is_available():
        return 0.0
    # Wait for pending kernels so the allocator statistics are up to date.
    torch.cuda.synchronize()
    return torch.cuda.memory_allocated() / (1024 ** 2)


# End-to-end pipeline: quantize the model with GPTQ, save it, reload it and
# compare it against the original model on the prompts defined above.
try:
    # Start from a clean output directory so stale files cannot be picked up
    if os.path.exists(quantized_model_dir):
        print(f"\nDirectory '{quantized_model_dir}' already exists. Deleting to create a fresh quantized model...")
        shutil.rmtree(quantized_model_dir)

    # Load the tokenizer only. The full-precision model is NOT loaded here:
    # it was never used before quantization and only consumed GPU memory that
    # the quantization step needs.
    print("\nLoading nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1 Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)  # Set your HF token
    tokenizer.pad_token = tokenizer.eos_token  # Avoid pad token warnings

    # Prepare quantization configuration
    quantize_config = BaseQuantizeConfig(
        bits=4,          # 4-bit quantization
        group_size=128,  # Group size for quantization
        damp_percent=0.01,  # Damping factor
        desc_act=False   # Disable act-order for stability
    )

    # Get calibration examples
    print("\nLoading calibration examples...")
    examples = get_calibration_examples()

    # Tokenize each calibration text separately (one encoding per example)
    tokenized_examples = [tokenizer(ex, return_tensors='pt', padding=True, truncation=True).to(device) for ex in examples]

    # Quantize the model
    print("\nQuantizing nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1 Model...")
    quantized_model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config, device_map='cuda', use_auth_token=True)
    quantized_model.quantize(tokenized_examples, use_triton=False)  # Triton may require CUDA kernels

    # Save quantized model
    print(f"\nSaving quantized model to '{quantized_model_dir}'...")
    quantized_model.save_quantized(quantized_model_dir, use_safetensors=True, safetensors_metadata={'format': 'pt'})

    # Save tokenizer files
    print(f"\nSaving tokenizer to '{quantized_model_dir}'...")
    tokenizer.save_pretrained(quantized_model_dir)

    # Rename safetensors file to match expected name
    old_safetensors = os.path.join(quantized_model_dir, 'gptq_model-4bit-128g.safetensors')
    new_safetensors = os.path.join(quantized_model_dir, 'model.safetensors')
    if os.path.exists(old_safetensors):
        os.rename(old_safetensors, new_safetensors)
        print(f"Renamed '{old_safetensors}' to '{new_safetensors}' for compatibility.")

    # Report every expected file first, then fail; this way the "Missing"
    # lines are actually printed when something was not written.
    print("\nVerifying saved files...")
    saved_files = os.listdir(quantized_model_dir)
    for f in ['model.safetensors', 'config.json', 'tokenizer.json']:
        if f in saved_files:
            print(f"Found: {f}")
        else:
            print(f"Missing: {f}")
    if not verify_model_directory(quantized_model_dir):
        raise FileNotFoundError(f"Failed to save required files in '{quantized_model_dir}'")

    # Release the model used for quantization before measuring, otherwise the
    # "baseline" would already include its weights.
    del quantized_model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    baseline_memory = _gpu_memory_mb()
    if torch.cuda.is_available():
        print(f"\nBaseline GPU memory usage: {baseline_memory:.2f} MB")

    # Load quantized model for testing
    print("\nLoading Quantized nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1Model...")
    model_gptq = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir,
        use_safetensors=True,
        device_map='auto'
    )
    memory_after_quantized = _gpu_memory_mb()
    # Report the increase over the baseline, i.e. the footprint of this model
    quantized_memory = memory_after_quantized - baseline_memory
    print(f"Quantized model loaded. Memory usage: {quantized_memory:.2f} MB")

    # Load original model for comparison
    print("\nLoading Original nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1 Model...")
    model_original = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=True).to(device)
    # Both models are resident now; subtract what was allocated before this load
    original_memory = _gpu_memory_mb() - memory_after_quantized
    print(f"Original model loaded. Memory usage: {original_memory:.2f} MB")

    # Query both models
    for i, prompt in enumerate(prompts, 1):
        print(f"\nPrompt {i}: {prompt}")

        # Query original model
        print("Original Response:")
        response_original, time_original = query_model(model_original, tokenizer, prompt)
        print(f"Response: {response_original}")
        print(f"Inference Time: {time_original:.4f} seconds")

        # Query quantized model
        print("Quantized Response:")
        response_gptq, time_gptq = query_model(model_gptq, tokenizer, prompt)
        print(f"Response: {response_gptq}")
        print(f"Inference Time: {time_gptq:.4f} seconds")

    print("\nQuantization and testing complete.")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please ensure all dependencies are installed, the model ID is correct, and you have a valid Hugging Face token.")
    # Keep the full traceback visible; the one-line message alone rarely
    # identifies which step failed.
    traceback.print_exc()

finally:
    # Clean up: drop the references first, collect garbage, and only then ask
    # the CUDA allocator to release its cached blocks.
    if 'model' in locals():
        del model
    if 'quantized_model' in locals():
        del quantized_model
    if 'model_gptq' in locals():
        del model_gptq
    if 'model_original' in locals():
        del model_original
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
### Pushing Quantized model to Hugging Face

In [None]:
pip install huggingface_hub

In [None]:
!huggingface-cli login

In [None]:
import os
from huggingface_hub import HfApi, create_repo, upload_folder
import textwrap

# Paths and HF info
# Local folder with quantized model; the quantization cell saves to, and the
# push cell loads from, this same "nvidia/..." relative path.
quantized_model_dir = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1-GPTQ"
hf_username = "avinashhm"  # Replace with your Hugging Face username
repo_name = "Llama-3.1-Nemotron-Nano-4B-v1.1-GPTQ"
repo_id = f"{hf_username}/{repo_name}"
model_id = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1"  # Original model name

# Generate a README.md (Model Card)
# The Markdown code fence in the usage section is closed explicitly so the
# card renders correctly.
model_card_content = textwrap.dedent(f"""\
# `{repo_name}` - GPTQ Quantized LLaMA Model

This is a **4-bit GPTQ quantized version** of the model [`{model_id}`](https://huggingface.co/{model_id}).

## Model Details

- Original Model: `{model_id}`
- Quantization Method: GPTQ (`auto-gptq`)
- Quantization Config: 4-bit, group_size=128
- Tokenizer: Same as original
- File Format: `safetensors`

## Usage Instructions

To load the model using `auto_gptq`:

```python
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized("{repo_id}", device_map="auto")
```
""")

# Write the card next to the model files. Skipped when the quantized model
# folder does not exist (e.g. the quantization cell has not been run yet).
if os.path.isdir(quantized_model_dir):
    with open(os.path.join(quantized_model_dir, "README.md"), "w", encoding="utf-8") as readme_file:
        readme_file.write(model_card_content)

In [None]:
from huggingface_hub import HfApi, create_repo
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer
import os

# Paths and HF username
quantized_model_dir = "nvidia/Llama-3.1-Nemotron-Nano-4B-v1.1-GPTQ"
repo_name = "Llama-3.1-Nemotron-Nano-4B-v1.1-GPTQ"  # Your desired repo name on HF

# SECURITY: never embed an access token in the notebook. Read it from the
# HF_TOKEN environment variable; an unset or empty value becomes None so the
# Hub libraries fall back to the credentials stored by `huggingface-cli login`.
# Any token that was ever committed in a notebook must be revoked on the Hub.
token = os.environ.get("HF_TOKEN") or None

# Load tokenizer and quantized model
print("Loading quantized model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    use_safetensors=True,
    device_map="auto"
)

# Create repository on Hugging Face Hub (if it doesn't exist)
api = HfApi()
# Only prefix a namespace when HF_USERNAME is really set; otherwise the repo
# id would literally start with "None/".
hf_username = os.environ.get("HF_USERNAME")
repo_id = f"{hf_username}/{repo_name}" if hf_username else repo_name  # e.g., "username/Llama-3.1-Nemotron-Nano-4B-v1.1-GPTQ"

try:
    repo_url = create_repo(repo_id=repo_id, token=token, private=False, exist_ok=True)
    # create_repo reports the fully-qualified "<namespace>/<name>" id; use it
    # for the uploads and for the URL printed at the end.
    repo_id = repo_url.repo_id
    print(f"Repository {repo_id} created or already exists.")
except Exception as e:
    print(f"Error creating repository: {e}")
    repo_id = repo_name  # Fallback if username not set

# Save model and tokenizer again just in case (optional)
model.save_pretrained(quantized_model_dir)
tokenizer.save_pretrained(quantized_model_dir)

# Push to Hugging Face Hub
print(f"Pushing model to Hugging Face Hub under {repo_id}...")
model.push_to_hub(repo_id, token=token)
tokenizer.push_to_hub(repo_id, token=token)

print(f"\n✅ Model successfully pushed to: https://huggingface.co/{repo_id}")

In [None]:
!pip install transformers accelerate torch lm_eval datasets tqdm

In [None]:
!pip install lm_eval

In [None]:
!pip install git+https://github.com/PanQiWei/AutoGPTQ.git@main#egg=auto-gptq
!pip install optimum

In [None]:
### Testing

In [None]:
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Hub repository that hosts the quantized checkpoint
model_name = "avinashhm/Llama-3.1-Nemotron-Nano-4B-v1.1-GPTQ"

# -------------------------------
# Step 1: Load Tokenizer
# -------------------------------
print("\nLoading tokenizer...")
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception as tokenizer_error:
    # Report the failure, then let it propagate: nothing below works without it
    print(f"Error loading tokenizer: {tokenizer_error}")
    raise

# -------------------------------
# Step 2: Load Quantized Model
# -------------------------------
print("Loading quantized model...")
try:
    model = AutoGPTQForCausalLM.from_quantized(
        model_name,
        device_map="auto",       # Let the loader place layers on the available devices
        use_safetensors=True,    # Load weights from the safetensors file
        trust_remote_code=False  # Do not execute code shipped with the repository
    )
    print("Model loaded successfully.")
except Exception as model_error:
    # Same policy as for the tokenizer: log, then re-raise
    print(f"Error loading model: {model_error}")
    raise

# -------------------------------
# Step 3: Inference Function
# -------------------------------
def generate_response(prompt, max_new_tokens=100, temperature=0.7, num_beams=4):
    """Run the module-level ``model`` on ``prompt`` and return the decoded text.

    The prompt is encoded with the module-level ``tokenizer``, moved to
    ``model.device``, and passed to ``model.generate`` under ``torch.no_grad()``.
    The EOS token id is used both as ``pad_token_id`` and ``eos_token_id``.

    Args:
        prompt: Input text.
        max_new_tokens: Forwarded to ``generate``.
        temperature: Forwarded to ``generate``.
        num_beams: Forwarded to ``generate``.

    Returns:
        The first returned sequence decoded with special tokens skipped.
        NOTE(review): the caller slices off ``len(prompt)`` characters, which
        suggests the decoded text begins with the prompt - confirm.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generation_kwargs = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "max_new_tokens": max_new_tokens,
        "num_beams": num_beams,
        "temperature": temperature,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    with torch.no_grad():
        sequences = model.generate(**generation_kwargs)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# -------------------------------
# Step 4: Run Sample Prompts
# -------------------------------
# Prompts covering a factual question, an explanation, creative writing,
# simple arithmetic and a science question.
test_prompts = [
    "What is the capital of France?",
    "Explain quantum computing in simple terms.",
    "Write a short story about an AI gaining consciousness.",
    "If I have 3 apples and eat one, how many do I have left?",
    "Why is the sky blue?"
]

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n{'-'*40}\nPrompt {i}:\n{prompt}")
    try:
        response = generate_response(prompt)
        # Drop the echoed prompt only when the decoded text really starts with
        # it; slicing by length alone would cut into the answer whenever the
        # decoded text does not reproduce the prompt exactly.
        if response.startswith(prompt):
            completion = response[len(prompt):]
        else:
            completion = response
        print(f"\nResponse:\n{completion}")
    except Exception as e:
        # Keep going with the remaining prompts if one generation fails
        print(f"Error during generation: {e}")

# -------------------------------
# Step 5: Optional - Memory Usage
# -------------------------------
# Report PyTorch's CUDA allocator statistics (only meaningful on a GPU run)
if device == "cuda":
    bytes_per_mb = 1024 ** 2
    print("\nMemory Usage:")
    allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
    print(f"Allocated: {allocated_mb:.2f} MB")
    # "Cached" is the amount reported by torch.cuda.memory_reserved()
    reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
    print(f"Cached:    {reserved_mb:.2f} MB")

# -------------------------------
# Step 6: Cleanup
# -------------------------------
import gc

# Drop the model reference; a missing name only means there is nothing to free.
try:
    del model
except NameError:
    pass

# Collect garbage first so the tensors are really gone, then ask the CUDA
# allocator to hand its cached blocks back to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("\nCleanup complete.")