<a href="https://colab.research.google.com/github/1028Luo/LLAMA_Light_Workout/blob/main/compress.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -q transformers

In [None]:
# Authenticate this session with the Hugging Face Hub.
from huggingface_hub import login
from google.colab import userdata

# The token is read from the Colab secret named 'huggingface_token'
# rather than being written into the notebook.
my_hugging_face_token = userdata.get('huggingface_token')
login(token=my_hugging_face_token)

# Pruning

In [None]:
import torch
import torch.nn.utils.prune as prune
import torch.nn as nn
from transformers import AutoModelForCausalLM

# Checkpoint to load. Any causal LM identifier can be substituted here
# (for example a quantized or fine-tuned variant).
model_name = "Jiexing1028/llama-3-8b-workout-0.2-pruned"
model = AutoModelForCausalLM.from_pretrained(model_name)

# Parameter statistics: element counts over all parameters, and over the
# subset that still requires gradients.
total_params = sum(map(torch.numel, model.parameters()))
trainable_params = sum(
    map(torch.numel, filter(lambda p: p.requires_grad, model.parameters()))
)

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Switch to evaluation mode and keep the model on the CPU.
model.eval()
device = torch.device("cpu")
# GPU alternative, currently disabled:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

Total parameters: 8,037,076,992
Trainable parameters: 0


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): lora.Linear(
            (base_layer): Linear(in_features=4096, out_features=1024

In [None]:
# start pruning
def prune_linear_layers_one_by_one(model, amount=0.3):
    """Apply permanent L1 magnitude pruning to every ``nn.Linear`` in ``model``.

    Walks ``model.named_modules()`` and, for each ``nn.Linear`` instance,
    prunes its ``weight`` with ``prune.l1_unstructured`` and then calls
    ``prune.remove`` so the pruning is made permanent. Only the ``weight``
    tensor is targeted. One progress line is printed per layer.

    Args:
        model: Module whose linear layers are pruned in place.
        amount: Pruning amount, forwarded unchanged to
            ``prune.l1_unstructured`` for each layer.

    Returns:
        The same ``model`` object that was passed in.

    Note:
        Any exception raised while pruning a layer is caught and reported
        with a "Skipping ..." message; the loop then moves on to the next
        layer instead of aborting.
    """
    # Lazily select only the linear layers, in named_modules() order.
    linear_layers = (
        (name, layer)
        for name, layer in model.named_modules()
        if isinstance(layer, nn.Linear)
    )
    for name, layer in linear_layers:
        print(f"Pruning {name}...")
        try:
            prune.l1_unstructured(layer, name="weight", amount=amount)
            # Make the pruning permanent on this layer.
            prune.remove(layer, "weight")
            # (A torch.cuda.empty_cache() call used to sit here; it is disabled.)
        except Exception as e:
            print(f"Skipping {name} due to error: {e}")
    return model



# Apply pruning
prune_amount = 0.2  # pruning amount passed to every linear layer (0.2 = 20% sparsity)
model = prune_linear_layers_one_by_one(model, amount=prune_amount)

# prune_linear_layers_one_by_one already calls prune.remove() on each layer it
# prunes, so no separate wrapper-removal step is needed here. This cell does
# not write anything to disk, so the message below must not claim a save.
print(f"Pruning finished (amount={prune_amount}); the model has not been saved yet.")


In [None]:
# Write the in-memory model to ./pruned-model, then upload it to the Hub.
# NOTE(review): the upload log recorded for this cell lists only
# adapter_model.safetensors (27.3M) — confirm that the pruned base weights are
# actually part of what gets saved/pushed, and not just the LoRA adapter.
# NOTE(review): this repo id is the same one the model was loaded from, so the
# push commits on top of the source checkpoint — confirm that is intended.
model.save_pretrained("pruned-model")
model.push_to_hub("Jiexing1028/llama-3-8b-workout-0.2-pruned")

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Jiexing1028/llama-3-8b-workout-0.2-pruned/commit/8cedd9fe43d16cdd97774913327e0aa900198cf7', commit_message='Upload LlamaForCausalLM', commit_description='', oid='8cedd9fe43d16cdd97774913327e0aa900198cf7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Jiexing1028/llama-3-8b-workout-0.2-pruned', endpoint='https://huggingface.co', repo_type='model', repo_id='Jiexing1028/llama-3-8b-workout-0.2-pruned'), pr_revision=None, pr_num=None)

In [None]:
# Unstructured (magnitude-based) pruning did not help: it only zeroes
# individual weights, so the model becomes sparse but keeps the same shape.
# Without a sparsity-aware inference engine, those zeros give neither a
# speed-up nor a smaller memory footprint.

# Structured pruning: can be applied to both the attention blocks and the FFN.
# Attention pruning is tricky.
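# The FFN consists of gate_proj, up_proj, and down_proj, and looks like this:
# x ∈ [B, T, 4096]
# gate_proj(x) → [B, T, 14336]
# up_proj(x)   → [B, T, 14336]
# SwiGLU → [B, T, 14336]
# down_proj → [B, T, 4096], then added to the residual from x
# The output channels of gate_proj and up_proj can be pruned (the same
# channels in both), provided the matching input columns of down_proj are
# removed as well; PyTorch does not adjust the next layer automatically.
# The output size of down_proj cannot be pruned, because it must stay 4096
# to match the residual.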


# Hard Distillation + Pruning Workflow
# Teacher Inference (once only):

# Generate answers from your teacher model on your training set.

# Save them alongside the questions to form your new training dataset.

# Train Student (Hard Distillation):

# Use the (question, teacher_answer) pairs to fine-tune your student model.

# You don’t need logits—treat this like a regular supervised learning task.

# Apply Structured Pruning:

# After the student model is trained, apply structured pruning to MLPs and attention heads.

# Remove pruning hooks and export your pruned model.

# Export to ONNX (if CPU inference):

# Export the structured-pruned model to ONNX.

# Use ONNX Runtime with CPU optimizations.


# Names of the model's submodules, one per line (listing cut off at layers.4):
# model
# model.embed_tokens
# model.layers
# model.layers.0
# model.layers.0.self_attn
# model.layers.0.self_attn.q_proj
# model.layers.0.self_attn.q_proj.base_layer
# model.layers.0.self_attn.q_proj.lora_dropout
# model.layers.0.self_attn.q_proj.lora_dropout.default
# model.layers.0.self_attn.q_proj.lora_A
# model.layers.0.self_attn.q_proj.lora_A.default
# model.layers.0.self_attn.q_proj.lora_B
# model.layers.0.self_attn.q_proj.lora_B.default
# model.layers.0.self_attn.q_proj.lora_embedding_A
# model.layers.0.self_attn.q_proj.lora_embedding_B
# model.layers.0.self_attn.q_proj.lora_magnitude_vector
# model.layers.0.self_attn.k_proj
# model.layers.0.self_attn.v_proj
# model.layers.0.self_attn.v_proj.base_layer
# model.layers.0.self_attn.v_proj.lora_dropout
# model.layers.0.self_attn.v_proj.lora_dropout.default
# model.layers.0.self_attn.v_proj.lora_A
# model.layers.0.self_attn.v_proj.lora_A.default
# model.layers.0.self_attn.v_proj.lora_B
# model.layers.0.self_attn.v_proj.lora_B.default
# model.layers.0.self_attn.v_proj.lora_embedding_A
# model.layers.0.self_attn.v_proj.lora_embedding_B
# model.layers.0.self_attn.v_proj.lora_magnitude_vector
# model.layers.0.self_attn.o_proj
# model.layers.0.mlp
# model.layers.0.mlp.gate_proj
# model.layers.0.mlp.up_proj
# model.layers.0.mlp.down_proj
# model.layers.0.mlp.act_fn
# model.layers.0.input_layernorm
# model.layers.0.post_attention_layernorm
# model.layers.1
# model.layers.1.self_attn
# model.layers.1.self_attn.q_proj
# model.layers.1.self_attn.q_proj.base_layer
# model.layers.1.self_attn.q_proj.lora_dropout
# model.layers.1.self_attn.q_proj.lora_dropout.default
# model.layers.1.self_attn.q_proj.lora_A
# model.layers.1.self_attn.q_proj.lora_A.default
# model.layers.1.self_attn.q_proj.lora_B
# model.layers.1.self_attn.q_proj.lora_B.default
# model.layers.1.self_attn.q_proj.lora_embedding_A
# model.layers.1.self_attn.q_proj.lora_embedding_B
# model.layers.1.self_attn.q_proj.lora_magnitude_vector
# model.layers.1.self_attn.k_proj
# model.layers.1.self_attn.v_proj
# model.layers.1.self_attn.v_proj.base_layer
# model.layers.1.self_attn.v_proj.lora_dropout
# model.layers.1.self_attn.v_proj.lora_dropout.default
# model.layers.1.self_attn.v_proj.lora_A
# model.layers.1.self_attn.v_proj.lora_A.default
# model.layers.1.self_attn.v_proj.lora_B
# model.layers.1.self_attn.v_proj.lora_B.default
# model.layers.1.self_attn.v_proj.lora_embedding_A
# model.layers.1.self_attn.v_proj.lora_embedding_B
# model.layers.1.self_attn.v_proj.lora_magnitude_vector
# model.layers.1.self_attn.o_proj
# model.layers.1.mlp
# model.layers.1.mlp.gate_proj
# model.layers.1.mlp.up_proj
# model.layers.1.mlp.down_proj
# model.layers.1.mlp.act_fn
# model.layers.1.input_layernorm
# model.layers.1.post_attention_layernorm
# model.layers.2
# model.layers.2.self_attn
# model.layers.2.self_attn.q_proj
# model.layers.2.self_attn.q_proj.base_layer
# model.layers.2.self_attn.q_proj.lora_dropout
# model.layers.2.self_attn.q_proj.lora_dropout.default
# model.layers.2.self_attn.q_proj.lora_A
# model.layers.2.self_attn.q_proj.lora_A.default
# model.layers.2.self_attn.q_proj.lora_B
# model.layers.2.self_attn.q_proj.lora_B.default
# model.layers.2.self_attn.q_proj.lora_embedding_A
# model.layers.2.self_attn.q_proj.lora_embedding_B
# model.layers.2.self_attn.q_proj.lora_magnitude_vector
# model.layers.2.self_attn.k_proj
# model.layers.2.self_attn.v_proj
# model.layers.2.self_attn.v_proj.base_layer
# model.layers.2.self_attn.v_proj.lora_dropout
# model.layers.2.self_attn.v_proj.lora_dropout.default
# model.layers.2.self_attn.v_proj.lora_A
# model.layers.2.self_attn.v_proj.lora_A.default
# model.layers.2.self_attn.v_proj.lora_B
# model.layers.2.self_attn.v_proj.lora_B.default
# model.layers.2.self_attn.v_proj.lora_embedding_A
# model.layers.2.self_attn.v_proj.lora_embedding_B
# model.layers.2.self_attn.v_proj.lora_magnitude_vector
# model.layers.2.self_attn.o_proj
# model.layers.2.mlp
# model.layers.2.mlp.gate_proj
# model.layers.2.mlp.up_proj
# model.layers.2.mlp.down_proj
# model.layers.2.mlp.act_fn
# model.layers.2.input_layernorm
# model.layers.2.post_attention_layernorm
# model.layers.3
# model.layers.3.self_attn
# model.layers.3.self_attn.q_proj
# model.layers.3.self_attn.q_proj.base_layer
# model.layers.3.self_attn.q_proj.lora_dropout
# model.layers.3.self_attn.q_proj.lora_dropout.default
# model.layers.3.self_attn.q_proj.lora_A
# model.layers.3.self_attn.q_proj.lora_A.default
# model.layers.3.self_attn.q_proj.lora_B
# model.layers.3.self_attn.q_proj.lora_B.default
# model.layers.3.self_attn.q_proj.lora_embedding_A
# model.layers.3.self_attn.q_proj.lora_embedding_B
# model.layers.3.self_attn.q_proj.lora_magnitude_vector
# model.layers.3.self_attn.k_proj
# model.layers.3.self_attn.v_proj
# model.layers.3.self_attn.v_proj.base_layer
# model.layers.3.self_attn.v_proj.lora_dropout
# model.layers.3.self_attn.v_proj.lora_dropout.default
# model.layers.3.self_attn.v_proj.lora_A
# model.layers.3.self_attn.v_proj.lora_A.default
# model.layers.3.self_attn.v_proj.lora_B
# model.layers.3.self_attn.v_proj.lora_B.default
# model.layers.3.self_attn.v_proj.lora_embedding_A
# model.layers.3.self_attn.v_proj.lora_embedding_B
# model.layers.3.self_attn.v_proj.lora_magnitude_vector
# model.layers.3.self_attn.o_proj
# model.layers.3.mlp
# model.layers.3.mlp.gate_proj
# model.layers.3.mlp.up_proj
# model.layers.3.mlp.down_proj
# model.layers.3.mlp.act_fn
# model.layers.3.input_layernorm
# model.layers.3.post_attention_layernorm
# model.layers.4

# Quantization

# export to ONNX