In [None]:
pip install torch transformers auto_gptq datasets

In [None]:
pip install -U bitsandbytes

In [None]:
pip install peft accelerate

In [None]:
!pip install --upgrade datasets

In [None]:
pip install --upgrade transformers peft auto-gptq accelerate

In [None]:
!pip install huggingface_hub[hf_xet]

In [None]:
# `!export` runs in a short-lived subshell, so it never changed the kernel's
# environment, and the literal token it carried was stored in the notebook.
# Prompt for the token instead (input is not echoed) and put it in os.environ.
# NOTE(review): the token that used to be written here is exposed and should be revoked.
import os
from getpass import getpass

if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
# A token written after `%env` is saved in the notebook and echoed into the
# cell output. Keep a value that is already present in the environment;
# otherwise ask for it without echoing.
# NOTE(review): the token that used to be written here is exposed and should be revoked.
import os
from getpass import getpass

os.environ["HF_TOKEN"] = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

In [None]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset

# Set random seed
torch.manual_seed(0)

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model and token
model_id = "Qwen/Qwen3-8B"
# Credentials must never be committed to a notebook, so the token is read from
# the environment. When the variable is unset this is None, which is passed on
# to from_pretrained(token=...) -- presumably the Hub client then uses its
# default credential lookup; confirm against the transformers documentation.
# NOTE(review): the token that used to be written here is exposed and should be revoked.
import os
HF_TOKEN = os.environ.get("HF_TOKEN")

# Define prompts
prompts = [
    "What is the capital of France, and what is its largest city?",
    "Write a short story about a robot exploring an abandoned city.",
    "If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?",
    "Explain why the sky appears blue.",
    "What’s your favorite book, and why?"
]

def query_model(model, tokenizer, prompt, max_new_tokens=200, num_beams=2, temperature=0.7):
    """Generate an answer to ``prompt`` and time the ``generate`` call.

    The prompt is wrapped in a fixed instruction plus a ``Question:/Answer:``
    template before being tokenised.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and ``decode``.
        prompt: The user question to insert into the template.
        max_new_tokens: Forwarded to ``generate``.
        num_beams: Forwarded to ``generate``.
        temperature: Forwarded to ``generate``.

    Returns:
        ``(response, inference_time)``: the stripped text of the newly
        generated tokens and the wall-clock seconds spent inside ``generate``
        (tokenisation and decoding are not timed).
    """
    system_prompt = "You are a helpful assistant. Provide a concise and accurate answer to the question without repeating the prompt or adding unrelated questions."
    full_prompt = f"{system_prompt}\n\nQuestion: {prompt}\nAnswer:"
    # Place the inputs on the model's own device instead of relying on a
    # notebook-level global that may not match where the weights were loaded.
    inputs = tokenizer(full_prompt, return_tensors='pt', padding=True, truncation=True).to(model.device)
    prompt_length = inputs['input_ids'].shape[1]
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id,
            early_stopping=False,
            no_repeat_ngram_size=0
        )
    inference_time = time.perf_counter() - start_time
    # Decode only what follows the prompt. Splitting the full text on
    # "Answer:" returned the wrong span whenever the question or the
    # generation itself contained that marker.
    # Assumes generate() returns the prompt tokens followed by the new tokens
    # (the usual decoder-only behaviour) -- TODO confirm for other model types.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    return response, inference_time

# Driver for the first experiment: load the 4-bit model, report the CUDA
# memory allocated after loading, answer every prompt, then release the model.
try:
    # Load tokenizer and model with 4-bit quantization
    print("\nLoading Qwen Model and Tokenizer with 4-bit quantization...")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",
        token=HF_TOKEN
    )

    # Measure memory usage
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        memory_usage = torch.cuda.memory_allocated() / (1024 ** 2)
        print(f"Model memory usage: {memory_usage:.2f} MB")

    # Query model
    for i, prompt in enumerate(prompts, 1):
        print(f"\nPrompt {i}: {prompt}")
        response, inference_time = query_model(model, tokenizer, prompt)
        print(f"Response: {response}")
        print(f"Inference Time: {inference_time:.4f} seconds")

    print("\nTesting complete.")

except Exception as e:
    # Keep the cell from aborting, but show the full traceback: the message
    # alone rarely says which step (download, quantisation, generation) failed.
    import traceback
    print(f"An error occurred: {str(e)}")
    traceback.print_exc()

finally:
    import gc
    if 'model' in locals():
        del model
    # Collect garbage first so the tensors of the deleted model are actually
    # freed, then hand the cached blocks back to the CUDA driver.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
### Testing the Qwen quantized model

In [None]:
import torch
import time
import numpy as np
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from torch.nn.functional import cross_entropy

# Set random seed for reproducibility
torch.manual_seed(0)

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Define model, token, and output directory
model_id = "Qwen/Qwen3-8B"
# Read the token from the environment instead of editing a placeholder string
# in the notebook: a placeholder is sent to the Hub as if it were a real
# token, and a real one pasted here would be committed with the file.
# None (variable unset) is passed on to from_pretrained(token=...).
HF_TOKEN = os.environ.get("HF_TOKEN")
output_dir = "Qwen3-8B-Quantized"

# Define prompts for testing
prompts = [
    "What is the capital of France, and what is its largest city?",
    "Write a short story about a robot exploring an abandoned city.",
    "If a car travels 60 miles in 1 hour, how far will it travel in 2.5 hours?",
    "Explain why the sky appears blue.",
    "What’s your favorite book, and why?",
    # Additional prompts for robustness
    "Solve the equation: 2x + 5 = 15.",
    "Describe the cultural significance of the Eiffel Tower.",
    "Generate a Python function to reverse a string."
]

# Load evaluation dataset for perplexity
def load_eval_dataset(num_examples=100):
    """Stream the wikitext-2 (raw) test split and return text snippets.

    Args:
        num_examples: Maximum number of snippets to return.

    Returns:
        A list of at most ``num_examples`` strings, each cut to 512 characters.
        Whitespace-only rows are skipped and do not count toward the limit;
        previously they did, which left fewer real passages and produced
        zero-token rows in the evaluation batches.
    """
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test", streaming=True)
    texts = []
    for example in dataset:
        if len(texts) >= num_examples:
            break
        raw_text = example['text']
        if not raw_text.strip():
            # Separator / blank row: nothing to evaluate.
            continue
        texts.append(raw_text[:512])  # Limit to 512 characters
    return texts

# Function to query model
def query_model(model, tokenizer, prompt, max_new_tokens=200, num_beams=2, temperature=0.7):
    """Generate an answer to ``prompt``, timing the call and counting new tokens.

    The prompt is wrapped in a fixed instruction plus a ``Question:/Answer:``
    template before being tokenised.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and ``decode``.
        prompt: The user question to insert into the template.
        max_new_tokens: Forwarded to ``generate``.
        num_beams: Forwarded to ``generate``.
        temperature: Forwarded to ``generate``.

    Returns:
        ``(response, inference_time, tokens_generated)``: the stripped text of
        the newly generated tokens, the wall-clock seconds spent inside
        ``generate``, and the number of tokens that follow the prompt in the
        returned sequence.
    """
    system_prompt = "You are a helpful assistant. Provide a concise and accurate answer to the question without repeating the prompt or adding unrelated questions."
    full_prompt = f"{system_prompt}\n\nQuestion: {prompt}\nAnswer:"
    # Place the inputs on the model's own device instead of relying on a
    # notebook-level global that may not match where the weights were loaded.
    inputs = tokenizer(full_prompt, return_tensors='pt', padding=True, truncation=True).to(model.device)
    prompt_length = inputs['input_ids'].shape[1]
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            temperature=temperature,
            pad_token_id=tokenizer.pad_token_id,
            early_stopping=False,
            no_repeat_ngram_size=0
        )
    inference_time = time.perf_counter() - start_time
    # Keep only what follows the prompt. Splitting the decoded text on
    # "Answer:" returned the wrong span whenever the question or the
    # generation contained that marker, and len(outputs[0]) counted the
    # prompt tokens as "generated".
    # Assumes generate() returns the prompt tokens followed by the new tokens
    # (the usual decoder-only behaviour) -- TODO confirm for other model types.
    new_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return response, inference_time, len(new_tokens)

# Function to compute perplexity
def compute_perplexity(model, tokenizer, texts, batch_size=8):
    """Compute token-level perplexity of ``model`` over ``texts``.

    Texts are tokenised in batches (padded, truncated to 512 tokens) and the
    model is called with ``labels`` so that it returns a loss per batch.

    Assumes the model follows the Hugging Face causal-LM convention: labels
    are shifted internally, positions labelled ``-100`` are ignored, and the
    returned loss is the mean over the remaining targets -- TODO confirm for
    models that are not ``AutoModelForCausalLM``.

    Args:
        model: Callable model exposing ``eval()``, ``device`` and returning an
            object with a scalar ``loss``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask``.
        texts: List of strings to score.
        batch_size: Number of texts per forward pass.

    Returns:
        ``exp`` of the average loss per scored token, as a Python float.

    Raises:
        ValueError: If no batch contains a scorable target token (for example
            when ``texts`` is empty).
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(model.device)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        # Padding must not be scored: mark it with the ignore index instead of
        # passing the raw ids (where pad positions look like real targets).
        labels = input_ids.masked_fill(attention_mask == 0, -100)
        # After the internal shift, every non-ignored label except column 0 is
        # a prediction target; this is the weight of the batch's mean loss.
        target_count = int((labels[:, 1:] != -100).sum().item())
        if target_count == 0:
            # Nothing to predict here (e.g. empty or single-token rows).
            continue
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Weight by scored targets, not by padded sequence width, so the
        # result is a true per-token average across batches.
        total_loss += outputs.loss.item() * target_count
        total_tokens += target_count
    if total_tokens == 0:
        raise ValueError("compute_perplexity needs at least one text with two or more tokens")
    avg_loss = total_loss / total_tokens
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    return perplexity

# Function to test batch processing
def test_batch_processing(model, tokenizer, prompts, batch_sizes=[1, 4, 8]):
    results = {}
    for batch_size in batch_sizes:
        start_time = time.perf_counter()
        inputs = tokenizer(prompts[:batch_size], return_tensors='pt', padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=200,
                num_beams=2,
                temperature=0.7,
                pad_token_id=tokenizer.pad_token_id,
                early_stopping=False,
                no_repeat_ngram_size=0
            )
        inference_time = time.perf_counter() - start_time
        tokens_per_second = len(outputs[0]) * batch_size / inference_time
        results[batch_size] = {"time": inference_time, "throughput": tokens_per_second}
    return results

# Driver for the quantized-model evaluation: load the 4-bit model, save it,
# answer every prompt, compute perplexity, time batched generation, print a
# summary, then release the model.
try:
    # Clear GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Load tokenizer
    print("\nLoading Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    tokenizer.pad_token = tokenizer.eos_token

    # Load quantized model
    print("\nLoading Quantized Model (4-bit)...")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    model_quantized = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
        device_map="auto",
        token=HF_TOKEN
    )
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        quantized_memory = torch.cuda.memory_allocated() / (1024 ** 2)
    else:
        # The CUDA calls above fail without a GPU; fall back to the model's
        # own size report so the summary below still has a value to print.
        quantized_memory = model_quantized.get_memory_footprint() / (1024 ** 2)
    print(f"Quantized Model Memory Usage: {quantized_memory:.2f} MB")

    # Save quantized model and tokenizer
    print(f"\nSaving Quantized Model to '{output_dir}'...")
    os.makedirs(output_dir, exist_ok=True)
    model_quantized.save_pretrained(output_dir, safe_serialization=True)
    tokenizer.save_pretrained(output_dir)
    print(f"Quantized model and tokenizer saved to '{output_dir}'.")

    # Load evaluation dataset
    print("\nLoading Evaluation Dataset...")
    eval_texts = load_eval_dataset()

    # Test quantized model
    results = {
        "quantized": {"times": [], "responses": [], "perplexity": None, "tokens": []}
    }

    # Query quantized model
    print("\nTesting Quantized Model...")
    for i, prompt in enumerate(prompts, 1):
        print(f"Prompt {i}: {prompt}")
        response, inference_time, tokens = query_model(model_quantized, tokenizer, prompt)
        print(f"Response: {response}")
        print(f"Inference Time: {inference_time:.4f} seconds")
        print(f"Tokens Generated: {tokens}")
        results["quantized"]["times"].append(inference_time)
        results["quantized"]["responses"].append(response)
        results["quantized"]["tokens"].append(tokens)

    # Compute perplexity for quantized model
    print("\nComputing Perplexity for Quantized Model...")
    results["quantized"]["perplexity"] = compute_perplexity(model_quantized, tokenizer, eval_texts)

    # Test batch processing for quantized model
    print("\nTesting Batch Processing for Quantized Model...")
    quantized_batch_results = test_batch_processing(model_quantized, tokenizer, prompts)

    # Summarize results
    print("\n=== Performance Summary ===")
    print(f"Quantized Model Memory Usage: {quantized_memory:.2f} MB")
    print(f"Average Inference Time: {np.mean(results['quantized']['times']):.4f} seconds")
    print(f"Average Tokens Generated: {np.mean(results['quantized']['tokens']):.2f}")
    print(f"Perplexity: {results['quantized']['perplexity']:.2f}")
    print("\nBatch Processing Results:")
    for batch_size, batch_stats in quantized_batch_results.items():
        print(f"Batch Size {batch_size}:")
        print(f"  Time: {batch_stats['time']:.4f}s, Throughput: {batch_stats['throughput']:.2f} tokens/s")
    print("\nResponses:")
    for i, (prompt, response) in enumerate(zip(prompts, results["quantized"]["responses"]), 1):
        print(f"\nPrompt {i}: {prompt}")
        print(f"Response: {response}")

except Exception as e:
    # Keep the cell from aborting, but show the full traceback: the message
    # alone rarely says which stage (load, save, evaluation) failed.
    import traceback
    print(f"An error occurred: {str(e)}")
    traceback.print_exc()

finally:
    # Clean up
    import gc
    if 'model_quantized' in locals():
        del model_quantized
    # Collect garbage first so the tensors of the deleted model are actually
    # freed, then hand the cached blocks back to the CUDA driver.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()