In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Show the GPUs attached to this session (model, memory, utilisation) before any work starts.
!nvidia-smi

Thu Jul  3 17:58:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [3]:
# Fetch the DSnoT fork; it provides main.py and the `lib` package imported further down.
!git clone https://github.com/AlphaAnas/DSnoT.git



Cloning into 'DSnoT'...
remote: Enumerating objects: 89, done.[K
remote: Counting objects: 100% (89/89), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 89 (delta 39), reused 1 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (89/89), 1.22 MiB | 8.36 MiB/s, done.
Resolving deltas: 100% (39/39), done.


In [4]:
# Make the cloned repo the working directory so `lib.*` imports and the relative `en/` paths resolve.
# NOTE: the path is relative, so run this cell once per session; from inside DSnoT there is no
# nested DSnoT directory to enter.
%cd DSnoT

/kaggle/working/DSnoT


In [5]:
# Sanity check: confirm the clone contains main.py and the lib/ package.
!ls -a

.  ..  environment.yaml  .git  imgs  lib  main.py  README.md


In [6]:
import os

# Restrict CUDA in this process to GPU 0 and GPU 1.
os.environ.update(CUDA_VISIBLE_DEVICES="0,1")

In [None]:
!huggingface-cli login --token 

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `Access Token for Llama` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Access Token for Llama`


### DOWNLOAD THE DATASET

In [8]:
# Step 1: Download the dataset files
!wget -q https://huggingface.co/datasets/allenai/c4/resolve/main/en/c4-train.00000-of-01024.json.gz
!wget -q https://huggingface.co/datasets/allenai/c4/resolve/main/en/c4-validation.00000-of-00008.json.gz

# Step 2: Create the 'en' directory (if it doesn't exist)
!mkdir -p en

# Step 3: Move the downloaded files into the 'en' directory
!mv c4-train.00000-of-01024.json.gz en/
!mv c4-validation.00000-of-00008.json.gz en/

# Optional Step 4: Change directory to working directory (not needed unless your code explicitly requires it)
# %cd /kaggle/working


In [9]:
from datasets import load_dataset

# Each shard is loaded as a standalone JSON file, so both are requested through the
# 'train' split (the only split a single data file produces here).
c4_shard_paths = (
    'en/c4-train.00000-of-01024.json.gz',
    'en/c4-validation.00000-of-00008.json.gz',
)
traindata, valdata = [
    load_dataset('json', data_files=shard_path, split='train')
    for shard_path in c4_shard_paths
]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [16]:
class Args:
    """Default run configuration, used in place of command-line arguments.

    Instantiate it and override individual attributes on the instance before
    passing it to ``main``.
    """

    # --- model ---
    model = "meta-llama/llama-3.2-3b"
    model_type = "llama"  # main() only infers this from the model name when it is left empty
    seed = 0
    nsamples = 128
    eval_dataset = "wikitext2"

    # --- sparsity / pruning method ---
    sparsity_ratio = 0.5
    sparsity_type = "unstructured"
    prune_method = "magnitude"
    initial_method = "magnitude"

    # --- DSnoT options ---
    max_cycle_time = 50
    without_DSnoT = True
    update_threshold = 0.1
    pow_of_var_regrowing = 1
    pow_of_var_pruning = 1  # left at its default
    skip_layer = "no_skip"
    skip_sub_layer = "no_skip"
    without_same_sign = "True"  # kept as the string "True", not a bool
    get_time_overhead = False

    # --- outputs ---
    output_results_file = "results.txt"
    cache_dir = "llm_weights"
    save_model = "pruned-llama-3b-L1"


In [23]:
import os
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from importlib.metadata import version

from lib.prune import check_sparsity, prune_DSnoT, prune_magnitude, prune_sparsegpt, prune_wanda
from lib.prune_opt import check_sparsity_opt, prune_DSnoT_opt
from lib.eval import eval_ppl
from lib.save_results import save_ppl_result

# Report the installed versions of the key libraries and the number of GPUs torch can see.
for package_name in ('torch', 'transformers', 'accelerate'):
    print(package_name, version(package_name))
print('# of gpus: ', torch.cuda.device_count())





def get_llm(model, cache_dir="llm_weights", seqlen=2048):
    """Load a causal language model in float16 with automatic device placement.

    Args:
        model: Model identifier or path forwarded to
            ``AutoModelForCausalLM.from_pretrained``.
        cache_dir: Directory passed to ``from_pretrained`` as ``cache_dir``.
        seqlen: Value stored on the returned model as ``model.seqlen``.
            Defaults to 2048, the value that used to be hard-coded here.

    Returns:
        The loaded model, with a ``seqlen`` attribute attached.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        model,
        torch_dtype=torch.float16,
        cache_dir=cache_dir,
        low_cpu_mem_usage=True,
        device_map="auto",
    )

    # Not a transformers attribute: attached here for downstream code
    # (presumably the pruning/eval helpers in lib read it -- TODO confirm).
    llm.seqlen = seqlen
    return llm


def main(args):
    """Prune, evaluate and optionally save the causal LM described by ``args``.

    Steps: seed NumPy/PyTorch, resolve the model type, validate the pruning
    configuration, load the model and tokenizer, prune, print the measured
    sparsity, evaluate perplexity, record the result, and save the model if
    ``args.save_model`` is set.

    Args:
        args: Configuration object. Attributes read here: ``seed``, ``model``,
            ``model_type``, ``sparsity_type``, ``sparsity_ratio``,
            ``prune_method``, ``cache_dir``, ``output_results_file``,
            ``save_model`` and, if present, ``eval_dataset`` (defaults to
            ``"wikitext2"``). ``args`` is also passed to the pruning helpers
            and to ``save_ppl_result``. ``args.model_type`` is filled in when
            it is empty and can be inferred from the model name.

    Returns:
        None. Returns early (after printing a warning) when the model type is
        empty and cannot be inferred from ``args.model``.

    Raises:
        AssertionError: N:M sparsity requested with ``sparsity_ratio != 0.5``.
        ValueError: Pruning is requested (``sparsity_ratio != 0`` and
            ``prune_method != "dense"``) but ``model_type`` or
            ``prune_method`` is not one this function dispatches on.
            Previously such a value silently skipped pruning and the dense
            model was evaluated and saved as if it had been pruned.
    """
    # Set random seeds
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)

    # Determine model type (only when the caller left it empty)
    if not args.model_type:
        if any(model_name in args.model for model_name in ["llama", "vicuna"]):
            args.model_type = "llama"
        elif "opt" in args.model:
            args.model_type = "opt"
        else:
            print("Warning: Could not determine model type from model name.")
            return
    print(f"model type: {args.model_type}")

    # An "N:M" sparsity type carries the structured pattern; unstructured keeps (0, 0).
    prune_n, prune_m = 0, 0
    if args.sparsity_type != "unstructured":
        assert args.sparsity_ratio == 0.5, "sparsity ratio must be 0.5 for structured N:M sparsity"
        prune_n, prune_m = map(int, args.sparsity_type.split(":"))

    # Validate the pruning request *before* the expensive model load so that a typo
    # in the configuration fails immediately instead of producing a dense "pruned" model.
    # "dense" is kept as an explicit way to skip pruning.
    should_prune = args.sparsity_ratio != 0 and args.prune_method != "dense"
    if should_prune:
        if args.model_type not in ("llama", "opt"):
            raise ValueError(
                f"unsupported model_type {args.model_type!r}; expected 'llama' or 'opt'"
            )
        if args.prune_method not in ("wanda", "magnitude", "sparsegpt", "DSnoT"):
            raise ValueError(
                f"unknown prune_method {args.prune_method!r}; expected one of "
                "'wanda', 'magnitude', 'sparsegpt', 'DSnoT' or 'dense'"
            )

    print(f"loading llm model {args.model}")
    model = get_llm(args.model, args.cache_dir)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)

    device = torch.device("cuda:0")
    if "30b" in args.model or "65b" in args.model:
        # For the largest checkpoints use whichever device holds lm_head.
        device = model.hf_device_map["lm_head"]
    print("use device ", device)

    if should_prune:
        print("pruning starts")
        if args.model_type == "llama":
            if args.prune_method == "wanda":
                prune_wanda(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "magnitude":
                prune_magnitude(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "sparsegpt":
                prune_sparsegpt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "DSnoT":
                prune_DSnoT(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
        elif args.model_type == "opt":
            # NOTE(review): the import cell only imports prune_DSnoT_opt from lib.prune_opt;
            # the other *_opt helpers below must be imported before these branches can run.
            if args.prune_method == "wanda":
                prune_wanda_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "magnitude":
                prune_magnitude_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "sparsegpt":
                prune_sparsegpt_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)
            elif args.prune_method == "DSnoT":
                prune_DSnoT_opt(args, model, tokenizer, device, prune_n=prune_n, prune_m=prune_m)

    print("*" * 30)
    sparsity_ratio = check_sparsity(model) if args.model_type == "llama" else check_sparsity_opt(model)
    print(f"sparsity sanity check {sparsity_ratio:.4f}")
    print("*" * 30)

    # Honour the configured evaluation set; fall back to the previous fixed value.
    dataset = getattr(args, "eval_dataset", "wikitext2")
    ppl = eval_ppl(model, tokenizer, dataset, device)
    print(f"\nppl on {dataset}: {ppl}\n")

    save_ppl_result(args, args.output_results_file, sparsity_ratio, ppl)

    if args.save_model:
        model.save_pretrained(args.save_model)
        tokenizer.save_pretrained(args.save_model)





torch 2.6.0+cu124
transformers 4.51.3
accelerate 1.5.2
# of gpus:  2


### USING L1 PRUNING ON LLAMA 3B 

In [24]:
# Configuration for the magnitude-pruning run: start from the defaults and override per-run fields.
args = Args()
magnitude_overrides = {
    "model": "meta-llama/llama-3.2-3b",
    "model_type": "llama",  # set explicitly, so main() does not need to infer it
    "sparsity_ratio": 0.2,
    "sparsity_type": 'unstructured',
    # author's note: only L1 magnitude pruning is supported for now; L2 is planned
    "prune_method": 'magnitude',
    "initial_method": 'magnitude',
    "output_results_file": 'llama-3b-l1-results.txt',
    "cache_dir": 'llama-3b-l1-_weights',
    "save_model": "pruned-llama-3b-l1",
}
for option_name, option_value in magnitude_overrides.items():
    setattr(args, option_name, option_value)


In [None]:
# Run the prune -> evaluate -> save pipeline with the configuration built in the previous cell.
main(args)

### USING WANDA BASED PRUNING ON LLAMA 3B 

In [None]:
# Configuration for the Wanda run: start from the defaults and override per-run fields.
args = Args()
wanda_overrides = {
    "model": "meta-llama/llama-3.2-3b",
    "model_type": "llama",  # set explicitly, so main() does not need to infer it
    "sparsity_ratio": 0.2,
    "sparsity_type": 'unstructured',
    "prune_method": 'wanda',  # choices: "wanda", "sparsegpt", "magnitude", "DSnoT", "dense"
    "initial_method": 'wanda',
    "output_results_file": 'llama-3b-wanda-results.txt',
    "cache_dir": 'llama-3b-wanda-_weights',
    "save_model": "pruned-llama-3b-wanda",
}
for option_name, option_value in wanda_overrides.items():
    setattr(args, option_name, option_value)


In [None]:
# Run the prune -> evaluate -> save pipeline with the configuration built in the previous cell.
main(args)

### USING DSnoT PRUNING ON LLAMA 3B 

In [None]:
# Configuration for the DSnoT run: start from the defaults and override per-run fields.
args = Args()
args.model = "meta-llama/llama-3.2-3b"
args.model_type = "llama"  # set explicitly, so main() does not need to infer it
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
# Fixed: the value used to be 'DSnoT"' (stray double quote inside the literal), which
# matches none of the method names main() dispatches on, so no pruning was applied.
args.prune_method = 'DSnoT'  # choices: "wanda", "sparsegpt", "magnitude", "DSnoT", "dense"
# NOTE(review): the reference CLI example at the end of this notebook pairs
# --prune_method DSnoT with --initial_method wanda, and Args.without_DSnoT defaults
# to True. Confirm that initial_method='DSnoT' and without_DSnoT=True are intended here.
args.initial_method = 'DSnoT'
args.output_results_file = 'llama-3b-DSnoT-results.txt'
args.cache_dir = 'llama-3b-DSnoT-_weights'
args.save_model = "pruned-llama-3b-DSnoT"


In [None]:
# Run the prune -> evaluate -> save pipeline with the configuration built in the previous cell.
main(args)

### USING SparseGPT PRUNING ON LLAMA 3B 

In [None]:
# Configuration for the SparseGPT run: start from the defaults and override per-run fields.
args = Args()
args.model = "meta-llama/llama-3.2-3b"
args.model_type = "llama"  # set explicitly, so main() does not need to infer it
args.sparsity_ratio = 0.2
args.sparsity_type = 'unstructured'
# Fixed: the value used to be 'sparsegpt"' (stray double quote inside the literal), which
# matches none of the method names main() dispatches on, so no pruning was applied.
args.prune_method = 'sparsegpt'  # choices: "wanda", "sparsegpt", "magnitude", "DSnoT", "dense"
args.initial_method = 'sparsegpt'
args.output_results_file = 'llama-3b-sparsegpt-results.txt'
args.cache_dir = 'llama-3b-sparsegpt-_weights'
args.save_model = "pruned-llama-3b-sparsegpt"


In [None]:
# Run the prune -> evaluate -> save pipeline with the configuration built in the previous cell.
main(args)

In [10]:
# !CUDA_VISIBLE_DEVICES=0,1 python main.py \
#     --model baffo32/decapoda-research-llama-7B-hf \
#     --prune_method DSnoT \
#     --initial_method wanda \
#     --sparsity_ratio 0.5 \
#     --sparsity_type unstructured \
#     --max_cycle_time 50 \
#     --update_threshold 0.1 \
#     --pow_of_var_regrowing 1

torch 2.6.0+cu124
transformers 4.51.3
accelerate 1.5.2
# of gpus:  2
model type: llama
loading llm model baffo32/decapoda-research-llama-7B-hf
config.json: 100%|█████████████████████████████| 428/428 [00:00<00:00, 2.82MB/s]
2025-07-03 18:01:37.113207: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751565697.300908     204 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751565697.357013     204 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
pytorch_model.bin.index.json: 25.5kB [00:00, 93.7MB/s]
Fetching 33 files:   0%|                                 | 0/33 [00:00<?, ?it/s]
pytorch_model-00002-of-00033.bin:   0%|              | 0.00/405M [00:00<?, ?B