In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [16]:
!nvidia-smi

Fri Jul  4 21:32:05 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0 Off |                  Off |
| 41%   70C    P2             260W / 450W |  16582MiB / 24564MiB |     73%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
!git clone https://github.com/AlphaAnas/DSnoT.git



Cloning into 'DSnoT'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 92 (delta 40), reused 1 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (92/92), 1.24 MiB | 10.12 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [4]:
%cd DSnoT

/kaggle/working/DSnoT


In [1]:
!ls -a

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [6]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Only GPUs 0 and 1 will be visible

In [None]:
!huggingface-cli login --token <your_token_here>


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `api-key` has been saved to /home/meesum/.cache/huggingface/stored_tokens
Your token has been saved to /home/meesum/.cache/huggingface/token
Login successful.
The current active token is: `api-key`


### DOWNLOAD THE DATASET

In [8]:
# Step 1: Download the dataset files
!wget -q https://huggingface.co/datasets/allenai/c4/resolve/main/en/c4-train.00000-of-01024.json.gz
!wget -q https://huggingface.co/datasets/allenai/c4/resolve/main/en/c4-validation.00000-of-00008.json.gz

# Step 2: Create the 'en' directory (if it doesn't exist)
!mkdir -p en

# Step 3: Move the downloaded files into the 'en' directory
!mv c4-train.00000-of-01024.json.gz en/
!mv c4-validation.00000-of-00008.json.gz en/

# Optional Step 4: Change directory to working directory (not needed unless your code explicitly requires it)
# %cd /kaggle/working


In [15]:
# from datasets import load_dataset

# traindata = load_dataset('json', data_files='en/c4-train.00000-of-01024.json.gz', split='train')
# valdata = load_dataset('json', data_files='en/c4-validation.00000-of-00008.json.gz', split='train')  # still 'train' because there's only one split

In [3]:
class Args:
    model = "meta-llama/llama-3.2-3b"
    model_type = "llama"  # will be inferred automatically
    seed = 0
    nsamples = 128
    eval_dataset = 'wikitext2'
    sparsity_ratio = 0.2
    sparsity_type = 'unstructured'
    prune_method = 'magnitude'
    initial_method = 'magnitude'
    max_cycle_time = 50
    without_DSnoT = True
    update_threshold = 0.1
    pow_of_var_regrowing = 1
    pow_of_var_pruning = 1  # default not overridden
    skip_layer = 'no_skip'
    skip_sub_layer = 'no_skip'
    without_same_sign = 'True'
    get_time_overhead = False
    output_results_file = 'results.txt'
    cache_dir = 'llm_weights'
    save_model = "pruned-llama-3b-L1"
args = Args

In [15]:
import os
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from importlib.metadata import version

from lib.prune import check_sparsity, prune_DSnoT, prune_magnitude, prune_sparsegpt, prune_wanda
from lib.prune_opt import check_sparsity_opt, prune_DSnoT_opt
from lib.eval import eval_ppl
from lib.save_results import save_ppl_result


from magnitude import prune_model

print('torch', version('torch'))
print('transformers', version('transformers'))
print('accelerate', version('accelerate'))
print('# of gpus: ', torch.cuda.device_count())





def get_llm(model, cache_dir="llm_weights"):
    model = AutoModelForCausalLM.from_pretrained(
        model, 
        torch_dtype=torch.float16, 
        cache_dir=cache_dir, 
        low_cpu_mem_usage=True, 
        device_map="auto"
    )

    model.seqlen = 2048
    return model

def main(args):
    print("DEBUG: Entered main()")

    # Set random seeds
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    print("DEBUG: Set random seeds")

    # Determine model type
    if not args.model_type:
        print("DEBUG: Inferring model type from name...")
        if any(model_name in args.model for model_name in ["llama", "vicuna"]):
            args.model_type = "llama"
        elif "opt" in args.model:
            args.model_type = "opt"
        else:
            print("Warning: Could not determine model type from model name.")
            return
    print(f"DEBUG: Model type set to {args.model_type}")

    prune_n, prune_m = 0, 0
    if args.sparsity_type != "unstructured":
        assert args.sparsity_ratio == 0.5, "sparsity ratio must be 0.5 for structured N:M sparsity"
        prune_n, prune_m = map(int, args.sparsity_type.split(":"))
        print(f"DEBUG: Using structured sparsity: {prune_n}:{prune_m}")

    print(f"DEBUG: Loading LLM model {args.model}")
    model = get_llm(args.model, args.cache_dir)
    print("DEBUG: LLM model loaded")
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
    print("DEBUG: Tokenizer loaded")

    device = torch.device("cuda:0")
    if "30b" in args.model or "65b" in args.model:
        device = model.hf_device_map["lm_head"]
    print(f"DEBUG: Using device: {device}")

    if args.sparsity_ratio != 0:
        print("DEBUG: Starting pruning...")
        if args.model_type == "llama":
            if args.prune_method == "wanda":
                print("DEBUG: Prune method = wanda")
                prune_wanda(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "magnitude_normal":
                print("DEBUG: Prune method = magnitude_normal")
                prune_magnitude(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "magnitude_torch_prune":
                print("DEBUG: Prune method = magnitude_torch_prune")
                prune_model(
                    model, tokenizer,
                    pruning_type=args.prune_method,
                    weight_metric="l2", model_type=args.model_type,
                    device=device,
                    max_seq_len=128, pruning_ratio=args.sparsity_ratio,
                    save_path=args.save_path
                )
            elif args.prune_method == "sparsegpt":
                print("DEBUG: Prune method = sparsegpt")
                prune_sparsegpt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m, save_path=args.save_path)
            elif args.prune_method == "DSnoT":
                print("DEBUG: Prune method = DSnoT")
                prune_DSnoT(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)

            else:
                print(f"DEBUG: Unsupported pruning method {args.prune_method} for model type {args.model_type}")
                return
        elif args.model_type == "opt":
            if args.prune_method == "wanda":
                print("DEBUG: Prune method = wanda (opt)")
                prune_wanda_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "magnitude":
                print("DEBUG: Prune method = magnitude (opt)")
                prune_magnitude_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "sparsegpt":
                print("DEBUG: Prune method = sparsegpt (opt)")
                prune_sparsegpt_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "DSnoT":
                print("DEBUG: Prune method = DSnoT (opt)")
                prune_DSnoT_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
        else:
            print(f"DEBUG: Unsupported model type {args.model_type} for pruning method {args.prune_method}")
            return

    print("*" * 30)
    print("DEBUG: Checking sparsity...")
    sparsity_ratio = check_sparsity(model) if args.model_type == "llama" else check_sparsity_opt(model)
    print(f"sparsity sanity check {sparsity_ratio:.4f}")
    print("*" * 30)

    # Debug: evaluation and saving (commented section)
    # print("DEBUG: Evaluating perplexity...")
    # dataset = 'wikitext2'
    # ppl = eval_ppl(model, tokenizer, dataset, device)
    # print(f"\nppl on {dataset}: {ppl}\n")

    # print("DEBUG: Saving PPL results...")
    # save_ppl_result(args, args.output_results_file, sparsity_ratio, ppl)

    # if args.save_model:
    #     print(f"DEBUG: Saving model to {args.save_model}")
    #     model.save_pretrained(args.save_model)
    #     tokenizer.save_pretrained(args.save_model)





torch 2.5.1
transformers 4.47.0.dev0
accelerate 1.1.1
# of gpus:  0


### USING Torch Pruning , Dep-Graph L1 PRUNING ON LLAMA 3B 

In [None]:
args = Args()
args.model = "meta-llama/llama-3.2-1b"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.5
args.sparsity_type = 'unstructured'
args.prune_method = 'magnitude_torch_prune' # right now it only supports L1 pruning - will add L2 as well
args.initial_method = 'magnitude_torch_prune'
args.output_results_file = 'llama-1b-l1-torch_prune-results.txt'
# args.cache_dir = r'C:\Users\hp-15\Disc D\scrapeyard\GSCP\pruning\DSNOT2\babylm-10m-weights'
# args.cache_dir = '/home/meesum/.cache/huggingface/hub'
args.save_model = "pruned-llama-1b_torch_prune-l1"


### Evaluate Perplxity before pruning

In [None]:
model = get_llm(args.model, args.cache_dir)
tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
dataset = 'wikitext2'
device = torch.device("cuda:0")
ppl = eval_ppl(model, tokenizer, dataset, device)
print(f"\nppl on {dataset}: {ppl}\n")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Fetching 2 files:   0%|          | 0/2 [02:27<?, ?it/s]


In [8]:
main(args)

model type: llama
loading llm model babylm/babyllama-10m-2024
use device  cuda:0
pruning starts
Original model parameters: 58.34M


AssertionError: Torch not compiled with CUDA enabled

### Using Normal L1 Pruning on LLAMA

In [None]:
args = Args()
args.model = "meta-llama/llama-3.2-3b"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
args.prune_method = 'magnitude_normal' # right now it only supports L1 pruning - will add L2 as well
args.initial_method = 'magnitude_normal'
args.output_results_file = 'llama-3b-l1-results.txt'
# args.cache_dir = 'llama-3b-weights'
args.cache_dir = '/home/meesum/.cache/huggingface/hub'
args.save_model = "pruned-llama-3b-l1"


In [None]:
### NOT - USING Normal L1 PRUNING ON LLAMA 3B  but dep_graph pruning above
# main(args)

model type: llama
loading llm model meta-llama/llama-3.2-3b


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Fetching 2 files: 100%|██████████| 2/2 [1:28:23<00:00, 2651.93s/it]  
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.23it/s]


use device  cuda:0
pruning starts
******************************
layer 0 sparsity 0.200414
layer 1 sparsity 0.200324
layer 2 sparsity 0.201053
layer 3 sparsity 0.200361
layer 4 sparsity 0.200813
layer 5 sparsity 0.200548
layer 6 sparsity 0.200536
layer 7 sparsity 0.200466
layer 8 sparsity 0.200384
layer 9 sparsity 0.200843
layer 10 sparsity 0.200952
layer 11 sparsity 0.200417
layer 12 sparsity 0.200430
layer 13 sparsity 0.200837
layer 14 sparsity 0.200600
layer 15 sparsity 0.200721
layer 16 sparsity 0.200388
layer 17 sparsity 0.200452
layer 18 sparsity 0.200737
layer 19 sparsity 0.200232
layer 20 sparsity 0.200574
layer 21 sparsity 0.200680
layer 22 sparsity 0.200832
layer 23 sparsity 0.200105
layer 24 sparsity 0.200637
layer 25 sparsity 0.200651
layer 26 sparsity 0.200598
layer 27 sparsity 0.200865
sparsity sanity check 0.2006
******************************
evaluating on wikitext2


Token indices sequence length is longer than the specified maximum sequence length for this model (2458791 > 131072). Running this sequence through the model will result in indexing errors


nsamples 141
sample 0
sample 50
sample 100

ppl on wikitext2: 8.493921279907227

model: meta-llama/llama-3.2-3b
prune_method: magnitude
without_DSnoT: True
initial_method: magnitude
skip_layer no_skip, skip_sub_layer no_skip
max_cycle_time: 50, update_threshold: 0.1
pow_of_var_pruning:1, pow_of_var_regrowing:1
without_same_sign:True
sparse pattern: unstructured
sample: 128
sparsity sanity check 0.2006, ppl: 8.493921279907227




### USING WANDA BASED PRUNING ON LLAMA 3B (NOT WORKING - NEED FIX)

In [12]:
args = Args()
# args.model = "meta-llama/llama-3.2-3b"
args.model = "baffo32/decapoda-research-llama-7B-hf"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
args.prune_method = 'wanda'  #choices=["wanda", "sparsegpt", "magnitude", "DSnoT", "dense"]
args.initial_method = 'wanda'
args.output_results_file = 'llama-7b-wanda-results.txt'
args.cache_dir = 'llama-7b-weights'
args.save_model = "pruned-llama-7b-wanda"


In [13]:
main(args)

model type: llama
loading llm model baffo32/decapoda-research-llama-7B-hf


config.json:   0%|          | 0.00/428 [00:00<?, ?B/s]

2025-07-03 19:23:31.364467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751570611.632436      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751570611.700031      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 33 files:   0%|          | 0/33 [00:00<?, ?it/s]

pytorch_model-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

pytorch_model-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


use device  cuda:0
pruning starts
loading calibdation data


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataset loading complete


AttributeError: 'NoneType' object has no attribute 'to'

### USING DSnoT PRUNING ON LLAMA 3B 

In [20]:
args = Args()
args.model = "meta-llama/llama-3.2-3b"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.5
args.sparsity_type = 'unstructured'
args.prune_method = 'DSnoT"' #choices=["wanda", "sparsegpt", "magnitude", "DSnoT", "dense"]
args.initial_method = 'DSnoT'
args.output_results_file = 'llama-3b-DSnoT-results.txt'
# args.cache_dir = 'llama-3b-weights'
args.cache_dir = '/home/meesum/.cache/huggingface/hub/models--meta-llama--llama-3.2-3b'
args.save_model = "50-percent-pruned-llama-3b-DSnoT"


In [None]:
main(args)

model type: llama
loading llm model meta-llama/llama-3.2-3b


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.44s/it]


use device  cuda:0
pruning starts
******************************
layer 0 sparsity 0.000002
layer 1 sparsity 0.000001
layer 2 sparsity 0.000002
layer 3 sparsity 0.000002
layer 4 sparsity 0.000001
layer 5 sparsity 0.000001
layer 6 sparsity 0.000001
layer 7 sparsity 0.000002
layer 8 sparsity 0.000001
layer 9 sparsity 0.000002
layer 10 sparsity 0.000001
layer 11 sparsity 0.000002
layer 12 sparsity 0.000001
layer 13 sparsity 0.000001
layer 14 sparsity 0.000002
layer 15 sparsity 0.000002
layer 16 sparsity 0.000001
layer 17 sparsity 0.000001
layer 18 sparsity 0.000001
layer 19 sparsity 0.000001
layer 20 sparsity 0.000001
layer 21 sparsity 0.000001
layer 22 sparsity 0.000001
layer 23 sparsity 0.000001
layer 24 sparsity 0.000001
layer 25 sparsity 0.000001
layer 26 sparsity 0.000001
layer 27 sparsity 0.000001
sparsity sanity check 0.0000
******************************
evaluating on wikitext2


Token indices sequence length is longer than the specified maximum sequence length for this model (2458791 > 131072). Running this sequence through the model will result in indexing errors


nsamples 141
sample 0
sample 50
sample 100

ppl on wikitext2: 7.813808441162109

model: meta-llama/llama-3.2-3b
prune_method: DSnoT"
without_DSnoT: True
initial_method: DSnoT
skip_layer no_skip, skip_sub_layer no_skip
max_cycle_time: 50, update_threshold: 0.1
pow_of_var_pruning:1, pow_of_var_regrowing:1
without_same_sign:True
sparse pattern: unstructured
sample: 128
sparsity sanity check 0.0000, ppl: 7.813808441162109




: 

### USING SparseGPT PRUNING ON LLAMA 3B 

In [22]:
args = Args()
args.model = "babylm/babyllama-10m-2024"
args.model_type = "llama"  # will be inferred automatically
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
args.prune_method = 'sparsegpt' #choices=["wanda", "sparsegpt", "magnitude", "DSnoT", "dense"]
args.initial_method = 'sparsegpt'
args.output_results_file = 'llama-babylm-sparsegpt-results.txt'
args.cache_dir = r'C:\Users\hp-15\Disc D\scrapeyard\GSCP\pruning\DSNOT2\babylm-10m-weights'
# args.cache_dir = '/home/meesum/.cache/huggingface/hub'
args.save_path = "pruned-llama-babylm-sparsegpt-l1"



In [25]:
main(args)

DEBUG: Entered main()
DEBUG: Set random seeds
DEBUG: Model type set to llama
DEBUG: Loading LLM model babylm/babyllama-10m-2024
DEBUG: LLM model loaded
DEBUG: Tokenizer loaded
DEBUG: Using device: cuda:0
DEBUG: Starting pruning...
DEBUG: Prune method = sparsegpt
Starting ...
Original model parameters: 58.34M


FileNotFoundError: Unable to find 'C:/Users/hp-15/Disc D/scrapeyard/GSCP/pruning/DSNOT2\en/c4-train.00000-of-01024.json.gz'

In [10]:
# !CUDA_VISIBLE_DEVICES=0,1 python main.py \
#     --model baffo32/decapoda-research-llama-7B-hf \
#     --prune_method DSnoT \
#     --initial_method wanda \
#     --sparsity_ratio 0.5 \
#     --sparsity_type unstructured \
#     --max_cycle_time 50 \
#     --update_threshold 0.1 \
#     --pow_of_var_regrowing 1

torch 2.6.0+cu124
transformers 4.51.3
accelerate 1.5.2
# of gpus:  2
model type: llama
loading llm model baffo32/decapoda-research-llama-7B-hf
config.json: 100%|█████████████████████████████| 428/428 [00:00<00:00, 2.82MB/s]
2025-07-03 18:01:37.113207: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751565697.300908     204 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751565697.357013     204 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
pytorch_model.bin.index.json: 25.5kB [00:00, 93.7MB/s]
Fetching 33 files:   0%|                                 | 0/33 [00:00<?, ?it/s]
pytorch_model-00002-of-00033.bin:   0%|              | 0.00/405M [00:00<?, ?B

In [17]:
print(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Cached: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

Allocated: 8.12 MB
Cached: 242.00 MB


In [6]:
import torch
import gc

# Delete all relevant variables
# del model  # Replace with your model variable name
# del inputs, outputs  # Replace with any other tensors or variables
gc.collect()  # Run Python garbage collector
torch.cuda.empty_cache()  # Clear PyTorch's GPU cache

In [21]:
!nvidia-smi


Fri Jul  4 18:48:12 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0 Off |                  Off |
| 41%   71C    P2             266W / 450W |  13398MiB / 24564MiB |     78%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
import os
import torch
from transformers import AutoModelForCausalLM

def load_model_stats(model_path):
    model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True, device_map="auto")

    total_params = sum(p.numel() for p in model.parameters()) / 1e9
    nonzero_params = sum((p != 0).sum().item() for p in model.parameters() if p.requires_grad) /1e9

    size_gb = sum(os.path.getsize(os.path.join(dp, f)) for dp, _, fs in os.walk(model_path) for f in fs) / 1e9

    print(f"Total Parameters      : {total_params:,} billion")
    print(f"Non-zero Parameters   : {nonzero_params:,} billion")
    print(f"Sparsity              : {(1 - nonzero_params / total_params) * 100:.2f}%")
    print(f"Model Size on Disk    : {size_gb:.2f} GB")

    return model


In [5]:
load_model_stats("/home/meesum/latest-research-work/DSnoT/pruned-llama-3b-DSnoT")
# Example usage
# print(f"Model size after DSnoT Pruning: {model_size_in_gb(model):.4f} GB")

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]


Total Parameters      : 3.212749824 billion
Non-zero Parameters   : 3.212745392 billion
Sparsity              : 0.00%
Model Size on Disk    : 6.82 GB


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

In [16]:
load_model_stats("/home/meesum/latest-research-work/DSnoT/pruned-llama-3b-l1")


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.39s/it]


Total Parameters      : 3.212749824 billion
Non-zero Parameters   : 2.647378627 billion
Sparsity              : 17.60%
Model Size on Disk    : 6.44 GB


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((3072,), eps=1e-05)
    (rotary_emb

In [7]:

model = load_model_stats("/home/meesum/latest-research-work/DSnoT/pruned-llama-3b-sparsegpt")
# Example usage

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.99s/it]


Total Parameters      : 3.212749824 billion
Non-zero Parameters   : 3.212745392 billion
Sparsity              : 0.00%
Model Size on Disk    : 6.44 GB


In [15]:

model = get_llm("meta-llama/llama-3.2-3b","/home/meesum/.cache/huggingface/hub")
# Example usage
total_params = sum(p.numel() for p in model.parameters()) / 1e9
nonzero_params = sum((p != 0).sum().item() for p in model.parameters() if p.requires_grad) /1e9

# size_gb = sum(os.path.getsize(os.path.join(dp, f)) for dp, _, fs in os.walk(model_path) for f in fs) / 1e9

print(f"Total Parameters      : {total_params:,} billion")
print(f"Non-zero Parameters   : {nonzero_params:,} billion")
print(f"Sparsity              : {(1 - nonzero_params / total_params) * 100:.2f}%")
# print(f"Model Size on Disk    : {size_gb:.2f} GB")

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.18s/it]


Total Parameters      : 3.212749824 billion
Non-zero Parameters   : 3.212745392 billion
Sparsity              : 0.00%
