In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!nvidia-smi

Thu Jul  3 19:31:39 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [3]:
!git clone https://github.com/AlphaAnas/DSnoT.git



Cloning into 'DSnoT'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 92 (delta 40), reused 1 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (92/92), 1.24 MiB | 10.12 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [4]:
%cd DSnoT

/kaggle/working/DSnoT


In [5]:
!ls -a

.   environment.yaml  imgs  main.py			   README.md
..  .git	      lib   pruning-dsnot-procedure.ipynb


In [6]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Only GPUs 0 and 1 will be visible

In [7]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `Access Token for Llama` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Access Token for Llama`


### DOWNLOAD THE DATASET

In [8]:
# Step 1: Download the dataset files
!wget -q https://huggingface.co/datasets/allenai/c4/resolve/main/en/c4-train.00000-of-01024.json.gz
!wget -q https://huggingface.co/datasets/allenai/c4/resolve/main/en/c4-validation.00000-of-00008.json.gz

# Step 2: Create the 'en' directory (if it doesn't exist)
!mkdir -p en

# Step 3: Move the downloaded files into the 'en' directory
!mv c4-train.00000-of-01024.json.gz en/
!mv c4-validation.00000-of-00008.json.gz en/

# Optional Step 4: Change directory to working directory (not needed unless your code explicitly requires it)
# %cd /kaggle/working


In [15]:
# from datasets import load_dataset

# traindata = load_dataset('json', data_files='en/c4-train.00000-of-01024.json.gz', split='train')
# valdata = load_dataset('json', data_files='en/c4-validation.00000-of-00008.json.gz', split='train')  # still 'train' because there's only one split

In [8]:
class Args:
    model = "meta-llama/llama-3.2-3b"
    model_type = "llama"  # will be inferred automatically
    seed = 0
    nsamples = 128
    eval_dataset = 'wikitext2'
    sparsity_ratio = 0.2
    sparsity_type = 'unstructured'
    prune_method = 'magnitude'
    initial_method = 'magnitude'
    max_cycle_time = 50
    without_DSnoT = True
    update_threshold = 0.1
    pow_of_var_regrowing = 1
    pow_of_var_pruning = 1  # default not overridden
    skip_layer = 'no_skip'
    skip_sub_layer = 'no_skip'
    without_same_sign = 'True'
    get_time_overhead = False
    output_results_file = 'results.txt'
    cache_dir = 'llm_weights'
    save_model = "pruned-llama-3b-L1"


In [9]:
import os
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from importlib.metadata import version

from lib.prune import check_sparsity, prune_DSnoT, prune_magnitude, prune_sparsegpt, prune_wanda
from lib.prune_opt import check_sparsity_opt, prune_DSnoT_opt
from lib.eval import eval_ppl
from lib.save_results import save_ppl_result

print('torch', version('torch'))
print('transformers', version('transformers'))
print('accelerate', version('accelerate'))
print('# of gpus: ', torch.cuda.device_count())





def get_llm(model, cache_dir="llm_weights"):
    model = AutoModelForCausalLM.from_pretrained(
        model, 
        torch_dtype=torch.float16, 
        cache_dir=cache_dir, 
        low_cpu_mem_usage=True, 
        device_map="auto"
    )

    model.seqlen = 2048
    return model


def main(args):
  

    # Set random seeds
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)

    # Determine model type
    if not args.model_type:
        if any(model_name in args.model for model_name in ["llama", "vicuna"]):
            args.model_type = "llama"
        elif "opt" in args.model:
            args.model_type = "opt"
        else:
            print("Warning: Could not determine model type from model name.")
            return
    print(f"model type: {args.model_type}")

    prune_n, prune_m = 0, 0
    if args.sparsity_type != "unstructured":
        assert args.sparsity_ratio == 0.5, "sparsity ratio must be 0.5 for structured N:M sparsity"
        prune_n, prune_m = map(int, args.sparsity_type.split(":"))

    print(f"loading llm model {args.model}")
    model = get_llm(args.model, args.cache_dir)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)

    device = torch.device("cuda:0")
    if "30b" in args.model or "65b" in args.model:
        device = model.hf_device_map["lm_head"]
    print("use device ", device)

    if args.sparsity_ratio != 0:
        print("pruning starts")
        if args.model_type == "llama":
            if args.prune_method == "wanda":
                prune_wanda(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "magnitude":
                prune_magnitude(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "sparsegpt":
                prune_sparsegpt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "DSnoT":
                prune_DSnoT(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
        elif args.model_type == "opt":
            if args.prune_method == "wanda":
                prune_wanda_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "magnitude":
                prune_magnitude_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "sparsegpt":
                prune_sparsegpt_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "DSnoT":
                prune_DSnoT_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)

    print("*" * 30)
    sparsity_ratio = check_sparsity(model) if args.model_type == "llama" else check_sparsity_opt(model)
    print(f"sparsity sanity check {sparsity_ratio:.4f}")
    print("*" * 30)

    dataset = 'wikitext2'
    ppl = eval_ppl(model, tokenizer, dataset, device)
    print(f"\nppl on {dataset}: {ppl}\n")

    save_ppl_result(args, args.output_results_file, sparsity_ratio, ppl)

    if args.save_model:
        model.save_pretrained(args.save_model)
        tokenizer.save_pretrained(args.save_model)





torch 2.6.0+cu124
transformers 4.51.3
accelerate 1.5.2
# of gpus:  2


### USING L1 PRUNING ON LLAMA 3B 

In [11]:
args = Args()
args.model = "meta-llama/llama-3.2-3b"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
args.prune_method = 'magnitude' # right now it only supports L1 pruning - will add L2 as well
args.initial_method = 'magnitude'
args.output_results_file = 'llama-3b-l1-results.txt'
args.cache_dir = 'llama-3b-weights'
args.save_model = "pruned-llama-3b-l1"


### Evaluate Perplxity before pruning

In [15]:
model = get_llm(args.model, args.cache_dir)
tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
dataset = 'wikitext2'
device = torch.device("cuda:0")
ppl = eval_ppl(model, tokenizer, dataset, device)
print(f"\nppl on {dataset}: {ppl}\n")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

evaluating on wikitext2


Token indices sequence length is longer than the specified maximum sequence length for this model (2458791 > 131072). Running this sequence through the model will result in indexing errors


nsamples 141
sample 0
sample 50
sample 100

ppl on wikitext2: 7.813808441162109



In [14]:
main(args)

model type: llama
loading llm model meta-llama/llama-3.2-3b


config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

2025-07-03 19:02:05.158500: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751569325.373730      57 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751569325.442251      57 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

use device  cuda:0
pruning starts
******************************
layer 0 sparsity 0.200414
layer 1 sparsity 0.200324
layer 2 sparsity 0.201053
layer 3 sparsity 0.200361
layer 4 sparsity 0.200813
layer 5 sparsity 0.200548
layer 6 sparsity 0.200536
layer 7 sparsity 0.200466
layer 8 sparsity 0.200384
layer 9 sparsity 0.200843
layer 10 sparsity 0.200952
layer 11 sparsity 0.200417
layer 12 sparsity 0.200430
layer 13 sparsity 0.200837
layer 14 sparsity 0.200600
layer 15 sparsity 0.200721
layer 16 sparsity 0.200388
layer 17 sparsity 0.200452
layer 18 sparsity 0.200737
layer 19 sparsity 0.200232
layer 20 sparsity 0.200574
layer 21 sparsity 0.200680
layer 22 sparsity 0.200832
layer 23 sparsity 0.200105
layer 24 sparsity 0.200637
layer 25 sparsity 0.200651
layer 26 sparsity 0.200598
layer 27 sparsity 0.200865
sparsity sanity check 0.2006
******************************
evaluating on wikitext2


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2458791 > 131072). Running this sequence through the model will result in indexing errors


nsamples 141
sample 0
sample 50
sample 100

ppl on wikitext2: 8.49386215209961

model: meta-llama/llama-3.2-3b
prune_method: magnitude
without_DSnoT: True
initial_method: magnitude
skip_layer no_skip, skip_sub_layer no_skip
max_cycle_time: 50, update_threshold: 0.1
pow_of_var_pruning:1, pow_of_var_regrowing:1
without_same_sign:True
sparse pattern: unstructured
sample: 128
sparsity sanity check 0.2006, ppl: 8.49386215209961




### USING WANDA BASED PRUNING ON LLAMA 3B (NOT WORKING - NEED FIX)

In [12]:
args = Args()
# args.model = "meta-llama/llama-3.2-3b"
args.model = "baffo32/decapoda-research-llama-7B-hf"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
args.prune_method = 'wanda'  #choices=["wanda", "sparsegpt", "magnitude", "DSnoT", "dense"]
args.initial_method = 'wanda'
args.output_results_file = 'llama-7b-wanda-results.txt'
args.cache_dir = 'llama-7b-weights'
args.save_model = "pruned-llama-7b-wanda"


In [13]:
main(args)

model type: llama
loading llm model baffo32/decapoda-research-llama-7B-hf


config.json:   0%|          | 0.00/428 [00:00<?, ?B/s]

2025-07-03 19:23:31.364467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751570611.632436      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751570611.700031      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 33 files:   0%|          | 0/33 [00:00<?, ?it/s]

pytorch_model-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

pytorch_model-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


use device  cuda:0
pruning starts
loading calibdation data


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataset loading complete


AttributeError: 'NoneType' object has no attribute 'to'

### USING DSnoT PRUNING ON LLAMA 3B 

In [14]:
args = Args()
args.model = "meta-llama/llama-3.2-3b"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
args.prune_method = 'DSnoT"' #choices=["wanda", "sparsegpt", "magnitude", "DSnoT", "dense"]
args.initial_method = 'DSnoT'
args.output_results_file = 'llama-3b-DSnoT-results.txt'
args.cache_dir = 'llama-3b-weights'
args.save_model = "pruned-llama-3b-DSnoT"


In [15]:
main(args)

model type: llama
loading llm model meta-llama/llama-3.2-3b


config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

use device  cuda:0
pruning starts
******************************
layer 0 sparsity 0.000002
layer 1 sparsity 0.000001
layer 2 sparsity 0.000002
layer 3 sparsity 0.000002
layer 4 sparsity 0.000001
layer 5 sparsity 0.000001
layer 6 sparsity 0.000001
layer 7 sparsity 0.000002
layer 8 sparsity 0.000001
layer 9 sparsity 0.000002
layer 10 sparsity 0.000001
layer 11 sparsity 0.000002
layer 12 sparsity 0.000001
layer 13 sparsity 0.000001
layer 14 sparsity 0.000002
layer 15 sparsity 0.000002
layer 16 sparsity 0.000001
layer 17 sparsity 0.000001
layer 18 sparsity 0.000001
layer 19 sparsity 0.000001
layer 20 sparsity 0.000001
layer 21 sparsity 0.000001
layer 22 sparsity 0.000001
layer 23 sparsity 0.000001
layer 24 sparsity 0.000001
layer 25 sparsity 0.000001
layer 26 sparsity 0.000001
layer 27 sparsity 0.000001
sparsity sanity check 0.0000
******************************
evaluating on wikitext2


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2458791 > 131072). Running this sequence through the model will result in indexing errors


nsamples 141
sample 0
sample 50
sample 100

ppl on wikitext2: 7.813808441162109

model: meta-llama/llama-3.2-3b
prune_method: DSnoT"
without_DSnoT: True
initial_method: DSnoT
skip_layer no_skip, skip_sub_layer no_skip
max_cycle_time: 50, update_threshold: 0.1
pow_of_var_pruning:1, pow_of_var_regrowing:1
without_same_sign:True
sparse pattern: unstructured
sample: 128
sparsity sanity check 0.0000, ppl: 7.813808441162109




SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

### USING SparseGPT PRUNING ON LLAMA 3B 

In [12]:
args = Args()
args.model = "meta-llama/llama-3.2-3b"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
args.prune_method = 'sparsegpt"' #choices=["wanda", "sparsegpt", "magnitude", "DSnoT", "dense"]
args.initial_method = 'sparsegpt'
args.output_results_file = 'llama-3b-sparsegpt-results.txt'
args.cache_dir = 'llama-3b-weights'
args.save_model = "pruned-llama-3b-sparsegpt"


In [13]:
main(args)

model type: llama
loading llm model meta-llama/llama-3.2-3b


config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

use device  cuda:0
pruning starts
******************************
layer 0 sparsity 0.000002
layer 1 sparsity 0.000001
layer 2 sparsity 0.000002
layer 3 sparsity 0.000002
layer 4 sparsity 0.000001
layer 5 sparsity 0.000001
layer 6 sparsity 0.000001
layer 7 sparsity 0.000002
layer 8 sparsity 0.000001
layer 9 sparsity 0.000002
layer 10 sparsity 0.000001
layer 11 sparsity 0.000002
layer 12 sparsity 0.000001
layer 13 sparsity 0.000001
layer 14 sparsity 0.000002
layer 15 sparsity 0.000002
layer 16 sparsity 0.000001
layer 17 sparsity 0.000001
layer 18 sparsity 0.000001
layer 19 sparsity 0.000001
layer 20 sparsity 0.000001
layer 21 sparsity 0.000001
layer 22 sparsity 0.000001
layer 23 sparsity 0.000001
layer 24 sparsity 0.000001
layer 25 sparsity 0.000001
layer 26 sparsity 0.000001
layer 27 sparsity 0.000001
sparsity sanity check 0.0000
******************************
evaluating on wikitext2


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2458791 > 131072). Running this sequence through the model will result in indexing errors


nsamples 141
sample 0
sample 50
sample 100

ppl on wikitext2: 7.813808441162109

model: meta-llama/llama-3.2-3b
prune_method: sparsegpt"
without_DSnoT: True
initial_method: sparsegpt
skip_layer no_skip, skip_sub_layer no_skip
max_cycle_time: 50, update_threshold: 0.1
pow_of_var_pruning:1, pow_of_var_regrowing:1
without_same_sign:True
sparse pattern: unstructured
sample: 128
sparsity sanity check 0.0000, ppl: 7.813808441162109




In [10]:
# !CUDA_VISIBLE_DEVICES=0,1 python main.py \
#     --model baffo32/decapoda-research-llama-7B-hf \
#     --prune_method DSnoT \
#     --initial_method wanda \
#     --sparsity_ratio 0.5 \
#     --sparsity_type unstructured \
#     --max_cycle_time 50 \
#     --update_threshold 0.1 \
#     --pow_of_var_regrowing 1

torch 2.6.0+cu124
transformers 4.51.3
accelerate 1.5.2
# of gpus:  2
model type: llama
loading llm model baffo32/decapoda-research-llama-7B-hf
config.json: 100%|█████████████████████████████| 428/428 [00:00<00:00, 2.82MB/s]
2025-07-03 18:01:37.113207: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751565697.300908     204 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751565697.357013     204 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
pytorch_model.bin.index.json: 25.5kB [00:00, 93.7MB/s]
Fetching 33 files:   0%|                                 | 0/33 [00:00<?, ?it/s]
pytorch_model-00002-of-00033.bin:   0%|              | 0.00/405M [00:00<?, ?B