In [1]:
import os
# Install required packages
os.system("pip uninstall torch torchvision torchaudio -y")
os.system("pip install torch==2.5.1 transformers==4.51.3 accelerate==1.3.0 bitsandbytes==0.44.1")

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn.utils.prune as prune
import torch.nn as nn
try:
    from bitsandbytes.nn import modules as bnb_modules
except ImportError:
    print("bitsandbytes not installed, installing now...")
    os.system("pip install bitsandbytes==0.44.1")

model_id = "Qwen/Qwen3-4B"
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        attn_implementation="eager"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    print("Possible causes:")
    print("- Network issues. Ensure you can access Hugging Face servers.")
    print("- Package versions mismatch. Run: pip install torch==2.5.1 transformers==4.51.3 accelerate==1.3.0 bitsandbytes==0.44.1")
    exit(1)

def prune_model(model, amount=0.3):
    for name, module in model.named_modules():
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            try:
                prune.l1_unstructured(module, name='weight', amount=amount)
                prune.remove(module, 'weight')
            except Exception as e:
                print(f"Error pruning module {name}: {e}")
    return model

def quantize_model(model):
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear):
            try:
                parent_name = '.'.join(name.split('.')[:-1])
                layer_name = name.split('.')[-1]
                parent = model
                for part in parent_name.split('.'):
                    parent = getattr(parent, part)
                quantized_layer = bnb_modules.Linear4bit(
                    module.in_features,
                    module.out_features,
                    bias=module.bias is not None,
                    compute_dtype=torch.float16
                )
                quantized_layer.weight.data = module.weight.data
                if module.bias is not None:
                    quantized_layer.bias.data = module.bias.data
                setattr(parent, layer_name, quantized_layer)
            except Exception as e:
                print(f"Error quantizing module {name}: {e}")
    return model

try:
    model = prune_model(model, amount=0.3)
    print("Model pruned successfully (30% weights removed).")
    if device == "cuda":
        model = quantize_model(model)
        print("Model quantized to 4-bit successfully.")
    else:
        print("Skipping quantization (requires CUDA).")
except Exception as e:
    print(f"Error during pruning or quantization: {e}")
    exit(1)

output_dir = "./pruned_quantized_qwen3_4b_30"
try:
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Pruned and quantized model saved to {output_dir}")
except Exception as e:
    print(f"Error saving model: {e}")
    exit(1)

def test_model(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.6,
            top_p=0.85,
            do_sample=False
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during generation: {e}")
        return None

prompt = "Solve the equation 2x^2 + 3x - 5 = 0"
try:
    response = test_model(model, tokenizer, prompt)
    if response:
        print("Model response:", response)
except Exception as e:
    print(f"Error testing model: {e}")

def get_model_size(model):
    param_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return param_size / 1e6

original_size = 4.0 * 1000  # Qwen3-4B has ~4B parameters
pruned_size = get_model_size(model)
print(f"Original model size: {original_size:.2f}M parameters")
print(f"Pruned model size: {pruned_size:.2f}M parameters")
print(f"Reduction: {((original_size - pruned_size) / original_size * 100):.2f}%")
if device == "cuda":
    print("Note: Additional size reduction from 4-bit quantization not reflected in parameter count.")

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Model and tokenizer loaded successfully.
Model pruned successfully (30% weights removed).
Error quantizing module lm_head: 'Qwen3ForCausalLM' object has no attribute ''
Model quantized to 4-bit successfully.
Pruned and quantized model saved to ./pruned_quantized_qwen3_4b_30




Error during generation: 
Original model size: 4000.00M parameters
Pruned model size: 389.15M parameters
Reduction: 90.27%
Note: Additional size reduction from 4-bit quantization not reflected in parameter count.




In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "./pruned_quantized_qwen3_4b_30"
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        attn_implementation="eager"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    print("Pruned model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading pruned model or tokenizer: {e}")
    print("Possible causes:")
    print("- Insufficient GPU memory. Try running on CPU or increasing GPU memory.")
    print("- Corrupted model files in ./pruned_quantized_qwen3_4b_30. Verify directory contents.")
    exit(1)

def test_model(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.6,  # Lower temperature for deterministic output
            top_p=0.85,      # Stricter top-p sampling
            do_sample=False  # Use greedy decoding for coherence
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error during generation: {e}")
        return None

prompt = "Solve the equation 2x^2 + 3x - 5 = 0"
try:
    response = test_model(model, tokenizer, prompt)
    if response:
        print("Model response:", response)
except Exception as e:
    print(f"Error testing model: {e}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Pruned model and tokenizer loaded successfully.




Model response: Solve the equation 2x^2 + 3x - 5 = 0 using the quadratic formula.

To solve the quadratic equation 2x^2 + 3x - 5 = 0 using the quadratic formula, we first identify the coefficients a, b, and c from the standard form of a quadratic equation, which is ax^2 + bx + c = 0.

In this equation, a = 2, b = 3, and c = -5.

The quadratic formula is x = [-b ± √(b^2 - 4ac)] / (2a).

Plugging in the values of a, b, and c into the formula:

Discriminant (D) = b^2 - 4ac = (3)^2 - 4*(2)*(-5) = 9 + 40 = 49.

Since the discriminant is positive, there are two real solutions.

Now, calculate the two possible solutions:

x1 = [-b + √D] / (2a)
