In [None]:
!pip install torch_pruning

Collecting torch_pruning
  Downloading torch_pruning-1.6.0-py3-none-any.whl.metadata (31 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->torch_pruning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->torch_pruning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->torch_pruning)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->torch_pruning)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->torch_pruning)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->torch_pruning)
  Do

| Метод                                    | Библиотека                                              | Комментарий                                                                                     |
|------------------------------------------|---------------------------------------------------------|------------------------------------------------------------------------------------------------|
| Прунинг                                 | TorchPruning                                            | Лёгкая, гибкая. Работает прямо с PyTorch. Поддерживает структурный и неструктурный pruning.     |
| Квантизация                            | Optimum + ONNX/OpenVINO/Intel Neural Compressor        | Поддержка quantization-aware training (QAT) и post-training quantization (PTQ). Имеет интеграцию с HuggingFace. |
| Дистилляция                            | HuggingFace Transformers + DistillationTrainer        | Есть готовый код в примерах. Работает прямо с Trainer API.                                    |
| Низкоранговая факторизация (SVD/LoRA) | peft                                                    | Поддержка LoRA/IA3, особенно хороша для больших моделей. Для SVD можно использовать torch.linalg.svd или кастомные имплементации. |
| Все-в-одном (bonus)                     | Neural Compressor, SparseML, NNCF                     | Поддерживают pruning + quantization + distillation. Более комплексные. Лучше подходят под продакшн. |

### 0. Подготовка

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader
import torch_pruning as tp
import time
import os

# Seed PyTorch's global RNG so the newly initialised classification head and
# every later shuffling DataLoader are reproducible after a kernel restart.
# 42 is the same seed the training-subset shuffle uses further down.
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One checkpoint name drives both the tokenizer and the two-label classifier.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2).to(device)

# SST-2 configuration of the GLUE benchmark.
raw_dataset = load_dataset("glue", "sst2")

def preprocess(example):
    """Tokenize the ``sentence`` field of a dataset example (or batch of examples).

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the module-level ``tokenizer`` produces for that input.
    """
    sentences = example["sentence"]
    return tokenizer(
        sentences,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

# Tokenize every split, then drop the columns that are not fed to the model.
encoded_dataset = (
    raw_dataset
    .map(preprocess, batched=True)
    .remove_columns(["sentence", "idx"])
)
encoded_dataset.set_format("torch")

# Fixed subsets used by all experiments: the first 200 validation rows and
# 1000 training rows taken after a seeded shuffle.
val_dataset = encoded_dataset["validation"].select(range(200))
train_dataset = encoded_dataset["train"].shuffle(seed=42).select(range(1000))

collator = DataCollatorWithPadding(tokenizer)

def custom_collate(batch):
    """Collate a list of example dicts into one padded batch.

    Every key except ``"label"`` is handed to the module-level ``collator``;
    the labels are gathered separately into a tensor and attached to the
    collated batch under the ``"label"`` key.
    """
    feature_rows = []
    for example in batch:
        row = {}
        for key, value in example.items():
            if key != "label":
                row[key] = value
        feature_rows.append(row)

    label_tensor = torch.tensor([example["label"] for example in batch])

    collated = collator(feature_rows)
    collated["label"] = label_tensor
    return collated

def evaluate(model, dataset, batch_size=16):
    """Return the classification accuracy of ``model`` on ``dataset``.

    The model is put into eval mode and run without gradient tracking.
    Predictions are the argmax over the last dimension of ``outputs.logits``
    and are compared with each batch's ``"label"`` tensor.
    """
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=custom_collate)

    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch in loader:
            targets = batch.pop("label").to(device)
            model_inputs = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**model_inputs).logits
            hits = logits.argmax(dim=-1) == targets
            n_correct += hits.sum().item()
            n_seen += len(targets)

    return n_correct / n_seen

def measure_size(model):
    """Return the serialized size of ``model``'s state dict in megabytes (1 MB = 1e6 bytes).

    The state dict is written with ``torch.save`` into a private temporary
    directory, so the call never overwrites or deletes a ``temp.pt`` that
    already exists in the working directory, and the scratch file is removed
    even when saving or the size lookup raises.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same file name as before, just inside the scratch directory.
        scratch_path = os.path.join(scratch_dir, "temp.pt")
        torch.save(model.state_dict(), scratch_path)
        return os.path.getsize(scratch_path) / 1e6

def measure_inference_time(model, dataset, batch_size=16, runs=5, warmup=1):
    """Average forward-pass latency, in seconds per batch, over the first ``runs`` batches.

    Args:
        model: module called as ``model(**inputs)``; it is switched to eval mode.
        dataset: dataset fed through a ``DataLoader`` that uses ``custom_collate``.
        batch_size: number of examples per timed batch.
        runs: maximum number of batches to time.
        warmup: untimed forward passes executed on the first batch before
            timing starts, so one-off start-up work is not charged to the
            first measurement. Pass 0 to time the very first call as well.

    Returns:
        Mean wall-clock duration of the timed forward passes, or
        ``float("nan")`` when nothing was timed (empty dataset or ``runs <= 0``).
    """
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=custom_collate)
    model.eval()
    # Synchronise only when the tensors really live on a CUDA device.
    on_cuda = device.type == "cuda"

    times = []
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            if i >= runs:
                break
            # Only the forward pass is timed; the label and the transfer to
            # `device` stay outside the measured region.
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            if i == 0:
                for _ in range(warmup):
                    model(**inputs)
            if on_cuda:
                torch.cuda.synchronize()
            # perf_counter is monotonic and high-resolution, unlike time.time().
            start = time.perf_counter()
            model(**inputs)
            if on_cuda:
                torch.cuda.synchronize()
            times.append(time.perf_counter() - start)

    if not times:
        return float("nan")
    return sum(times) / len(times)

# Reference metrics for the model before any compression is applied.
baseline_acc = evaluate(model, val_dataset)
baseline_size = measure_size(model)
baseline_time = measure_inference_time(model, val_dataset)

baseline_report = "\n".join([
    f"📊 Baseline accuracy: {baseline_acc:.4f}",
    f"📦 Baseline size: {baseline_size:.2f} MB",
    f"⚡️ Baseline inference time: {baseline_time:.4f} sec",
])
print(baseline_report)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📊 Baseline accuracy: 0.4950
📦 Baseline size: 267.85 MB
⚡️ Baseline inference time: 0.0650 sec


### 1. Pruning (torch_pruning)

In [None]:
from torch.optim import AdamW

# Dummy batch (one sequence of 128 positions, all ones) passed to the pruner
# and to the MACs/parameter counter.
example_inputs = {
    key: torch.ones(1, 128, dtype=torch.long, device=device)
    for key in ("input_ids", "attention_mask")
}

class PrunableWrapper(nn.Module):
    """Adapter exposing a positional ``(input_ids, attention_mask)`` forward that returns logits."""

    def __init__(self, model):
        """Store the wrapped model under ``self.model``."""
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        """Call the wrapped model with keyword arguments and return its ``.logits``."""
        wrapped_output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return wrapped_output.logits

wrapped_model = PrunableWrapper(model)

# Linear layers whose module name mentions "classifier" or "attention" are
# handed to the pruner as layers it must leave alone.
ignored_layers = []
for name, m in model.named_modules():
    if not isinstance(m, nn.Linear):
        continue
    if "classifier" in name or "attention" in name:
        ignored_layers.append(m)

imp = tp.importance.MagnitudeImportance(p=2, group_reduction='mean')

# Dummy inputs are passed positionally, in the order of PrunableWrapper.forward.
pruner_options = dict(
    example_inputs=(example_inputs["input_ids"], example_inputs["attention_mask"]),
    importance=imp,
    global_pruning=False,
    pruning_ratio=0.5,
    iterative_steps=2,
    ignored_layers=ignored_layers,
)
pruner = tp.pruner.MagnitudePruner(wrapped_model, **pruner_options)

# Two pruner steps (matching iterative_steps=2); after each one, report the
# parameter count and MACs of the wrapped model.
dummy_args = (example_inputs["input_ids"], example_inputs["attention_mask"])
for step_no in (1, 2):
    pruner.step()
    macs, params = tp.utils.count_ops_and_params(wrapped_model, dummy_args)
    print(f"🧹 Iter {step_no}: Params {params / 1e6:.2f}M, MACs {macs / 1e9:.2f}G")

print("✂️ Pruning complete.")

# Fine-tune the pruned model on the training subset.
EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=3e-5)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate)
model.train()

for epoch in range(EPOCHS):
    running_loss = 0
    for batch in train_loader:
        labels = batch.pop("label").to(device)
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**inputs, labels=labels).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        running_loss += loss.item()
    # Mean loss per batch for this epoch.
    mean_loss = running_loss / len(train_loader)
    print(f"📚 Epoch {epoch+1} — Loss: {mean_loss:.4f}")

# Metrics of the pruned and fine-tuned model.
pruned_acc = evaluate(model, val_dataset)
pruned_size = measure_size(model)
pruned_time = measure_inference_time(model, val_dataset)
macs, params = tp.utils.count_ops_and_params(wrapped_model, (example_inputs["input_ids"], example_inputs["attention_mask"]))

# Structural pruning resized the feed-forward layers in place, but model.config
# still describes the original width. Sync it before saving so that
# from_pretrained("finetuned_pruned_model") rebuilds layers of the pruned shape
# instead of failing on a size mismatch.
# NOTE(review): assumes a DistilBERT-style model exposing
# distilbert.transformer.layer[*].ffn.lin1 — confirm if model_ckpt changes.
ffn_widths = {block.ffn.lin1.out_features for block in model.distilbert.transformer.layer}
if len(ffn_widths) == 1:
    model.config.hidden_dim = ffn_widths.pop()
else:
    # A single config value cannot describe layers of different widths.
    print(f"Warning: FFN widths differ across layers ({sorted(ffn_widths)}); "
          "the saved config will not match the pruned weights.")

model.save_pretrained("finetuned_pruned_model")
tokenizer.save_pretrained("finetuned_pruned_model")

print("\n✅ After Pruning & Fine-tuning:")
print(f"📊 Accuracy: {pruned_acc:.4f}")
print(f"📦 Size: {pruned_size:.2f} MB")
print(f"⚡️ Inference time: {pruned_time:.4f} sec")
print(f"🔢 MACs: {macs / 1e9:.2f} G, Params: {params / 1e6:.2f} M")

🧹 Iter 1: Params 59.87M, MACs 4.53G
🧹 Iter 2: Params 52.79M, MACs 3.63G
✂️ Pruning complete.
📚 Epoch 1 — Loss: 0.6441
📚 Epoch 2 — Loss: 0.4227
📚 Epoch 3 — Loss: 0.2114

✅ After Pruning & Fine-tuning:
📊 Accuracy: 0.7400
📦 Size: 211.19 MB
⚡️ Inference time: 0.0332 sec
🔢 MACs: 3.63 G, Params: 52.79 M


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch_pruning as tp
import gc, os, time
from transformers import AutoModelForSequenceClassification, get_scheduler

def measure_size(model):
    """Return the serialized size of ``model``'s state dict in megabytes (1 MB = 1e6 bytes).

    The state dict is saved into a private temporary directory that is removed
    on exit, so no ``temp_model.pth`` is left behind after the call and no
    fixed ``/tmp`` location is required.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = os.path.join(scratch_dir, "temp_model.pth")
        torch.save(model.state_dict(), scratch_path)
        return os.path.getsize(scratch_path) / 1e6

def measure_inference_time(model, dataset, batch_size=16):
    """Average wall-clock seconds per batch for one full pass over ``dataset``.

    The timed region covers the whole loop: collating each batch, moving its
    tensors to ``device`` and running the forward pass. When ``device`` is a
    CUDA device the clock is read only after ``torch.cuda.synchronize()``, so
    work that is still queued on the GPU is included in the measurement.

    Returns ``float("nan")`` when the dataset yields no batches.
    """
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=custom_collate)
    n_batches = len(dataloader)
    if n_batches == 0:
        return float("nan")

    on_cuda = device.type == "cuda"
    if on_cuda:
        # Drain earlier GPU work so it is not billed to this measurement.
        torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    with torch.no_grad():
        for batch in dataloader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            model(**inputs)
    if on_cuda:
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / n_batches

# Experiment configuration
ratios_to_try = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
ITERATIVE_STEPS = 2  # passed to the pruner and used as the number of pruner.step() calls
EPOCHS = 5  # fine-tuning epochs run after pruning, for every ratio
results = []


class PrunableWrapper(nn.Module):
    """Adapter giving the pruner a positional forward(input_ids, attention_mask) that returns logits.

    Declared once, outside the loop: the class does not depend on the pruning
    ratio, so re-creating it on every iteration was redundant.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask).logits


for ratio in ratios_to_try:
    print(f"\n🔧 Testing pruning_ratio = {ratio}")
    # Release memory held by the previous iteration before loading a new model.
    torch.cuda.empty_cache()
    gc.collect()

    # Same seed for every ratio: each run starts from the same RNG state (and
    # therefore the same randomly initialised classification head), so results
    # are comparable across ratios and reproducible across re-runs.
    torch.manual_seed(42)

    # Fresh, unpruned model for every ratio.
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=2).to(device)
    model.train()

    wrapped_model = PrunableWrapper(model)

    # Linear layers whose name mentions "classifier" or "attention" are left unpruned.
    ignored_layers = []
    for name, m in model.named_modules():
        if isinstance(m, nn.Linear) and ("classifier" in name or "attention" in name):
            ignored_layers.append(m)

    example_inputs = {
        "input_ids": torch.ones(1, 128, dtype=torch.long).to(device),
        "attention_mask": torch.ones(1, 128, dtype=torch.long).to(device)
    }

    imp = tp.importance.MagnitudeImportance(p=2, group_reduction='mean')
    pruner = tp.pruner.MagnitudePruner(
        wrapped_model,
        example_inputs=(example_inputs["input_ids"], example_inputs["attention_mask"]),
        importance=imp,
        global_pruning=False,
        pruning_ratio=ratio,
        iterative_steps=ITERATIVE_STEPS,
        ignored_layers=ignored_layers,
    )

    for _ in range(ITERATIVE_STEPS):
        pruner.step()

    # Fine-tune the pruned model on the training subset.
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=custom_collate)

    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for batch in train_loader:
            labels = batch.pop("label").to(device)
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        print(f"📚 Epoch {epoch+1} — Loss: {total_loss / len(train_loader):.4f}")

    # Collect the metrics for this ratio.
    model.eval()
    acc = evaluate(model, val_dataset)
    size_mb = measure_size(model)
    infer_time = measure_inference_time(model, val_dataset)
    macs, params = tp.utils.count_ops_and_params(wrapped_model, (example_inputs["input_ids"], example_inputs["attention_mask"]))

    result = {
        "ratio": ratio,
        "acc": acc,
        "size_mb": size_mb,
        "infer_time": infer_time,
        "macs": macs / 1e9,
        "params": params / 1e6
    }
    results.append(result)
    print(f"✅ Done: {result}")

    # Drop references to the per-ratio objects before the next iteration.
    del model, wrapped_model, pruner
    torch.cuda.empty_cache()
    gc.collect()

# One summary line per pruning ratio that was tried.
print("\n📊 All pruning results:")
for entry in results:
    columns = [
        f"Ratio {entry['ratio']:.1f}",
        f"Acc: {entry['acc']:.4f}",
        f"Size: {entry['size_mb']:.2f}MB",
        f"MACs: {entry['macs']:.2f}G",
        f"Params: {entry['params']:.2f}M",
        f"Inference: {entry['infer_time']:.3f}s",
    ]
    print(" | ".join(columns))


🔧 Testing pruning_ratio = 0.1


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

### 2. Quantisation (Optimum-'onnxruntime')

In [None]:
!pip install optimum[onnxruntime] onnx



In [None]:
# Path is used for the output directory below; it was never imported in this
# notebook, so the cell raised NameError on a fresh kernel.
from pathlib import Path

from optimum.exporters.onnx import main_export

# Export the checkpoint to ONNX; the files are written into ./onnx_model.
main_export(
    model_name_or_path='distilbert-base-uncased',
    output=Path("onnx_model"),
    task="sequence-classification",
    opset=17,
    library_name='transformers'
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamic quantization of the exported ONNX model: reads onnx_model/model.onnx
# and writes the quantized model next to it as model_quant.onnx, with QInt8
# selected as the weight type.
quantize_dynamic(
    model_input="onnx_model/model.onnx",
    model_output="onnx_model/model_quant.onnx",
    weight_type=QuantType.QInt8
)



In [None]:
import onnxruntime as ort
import numpy as np

# Run the quantized ONNX model on one sentence padded to 128 tokens and show
# the raw session outputs.
session = ort.InferenceSession("onnx_model/model_quant.onnx")

inputs = tokenizer(
    "This movie was great!",
    return_tensors="np",
    padding="max_length",
    max_length=128,
)
ort_inputs = dict(inputs.items())
outputs = session.run(None, ort_inputs)
print(outputs)

[array([[-0.1279223 ,  0.02694205]], dtype=float32)]


### 3. Knowledge distillation

In [None]:
# Clone the Hugging Face research-projects repository and switch the notebook's
# working directory to its distillation example. Relative paths used in later
# cells are resolved from this new directory.
!git clone https://github.com/huggingface/transformers-research-projects/
%cd transformers-research-projects/distillation

/content/transformers-research-projects/distillation


In [None]:
!python distiller.py \
  --teacher_model ./finetuned_pruned_model \
  --student_model distilbert-base-uncased \
  --task_name sst2 \
  --output_dir /content/distil \
  --num_train_epochs 3 \
  --max_seq_length 128 \
  --per_device_train_batch_size 16 \
  --learning_rate 5e-5 \
  --alpha_ce 0.5 \
  --alpha_hard 0.5 \
  --temperature 2.0 \
  --do_train \
  --do_eval \
  --eval_steps 100 \
  --save_steps 1000

2025-07-31 15:30:26.951328: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753975826.971068   27068 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753975826.977172   27068 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
07/31/2025 15:30:29 - INFO - numexpr.utils - PID: 27068 -  NumExpr defaulting to 2 threads.


### 4. Low-rank factorization