# Imports

In [1]:
import torch
from torch.nn.utils import prune

from tqdm import tqdm

from transformers import AutoTokenizer, OPTForCausalLM, pipeline
from datasets import load_dataset

from utils.mask_utils import calculate_mask
from utils.hessian_utils import calc_inverse_hessian
from utils.prehook_utils import put_input_hooks
import gc

# Constants

In [6]:
# --- Calibration settings (read by split_model_calibration) ---
CALIBRATION_SIZE = 128        # number of dataset samples used for calibration
TOKEN_LENGTH = 1024           # max_length used when tokenizing calibration text
CALIBRATION_BATCH_SIZE = 1    # sentences per calibration forward pass

# --- Values forwarded unchanged to sparsegpt_prune ---
EPSILON = 1e-8
B = 128
Bs = 128

# --- Fine-tuning: hyperparameter experiment, remove later ---
EPOCH_COUNT = 10

# --- Model / device selection ---
MODEL_NAME = "opt-125m"
# Use the GPU when torch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Tokenizer belonging to the facebook/<MODEL_NAME> checkpoint.
tokenizer = AutoTokenizer.from_pretrained(f'facebook/{MODEL_NAME}')

# 'c4' dataset, 'en' configuration, opened with streaming=True.
dataset = load_dataset('c4', 'en', streaming=True)

# Calibration

## Calibration function

In [4]:
# Run the model on batches of calibration data. The outputs are discarded; the
# forward passes are what matter (e.g. for input hooks installed by the caller).
def split_model_calibration(model):
    """Feed CALIBRATION_SIZE samples of ``dataset['train']`` through ``model``.

    The ``'text'`` field of each sample is collected into batches of
    CALIBRATION_BATCH_SIZE sentences. Each batch is tokenized with the
    module-level ``tokenizer`` (padded and truncated to TOKEN_LENGTH), moved
    to DEVICE and passed through ``model`` under ``torch.no_grad()``.

    Exactly CALIBRATION_SIZE samples are read from the stream. If
    CALIBRATION_BATCH_SIZE does not divide CALIBRATION_SIZE, the final,
    smaller batch is still processed instead of being dropped.

    Args:
        model: callable accepting the tokenizer output as keyword arguments
            plus a ``labels`` keyword argument.

    Returns:
        None. The model outputs are not kept.
    """
    # Local import keeps this cell self-contained.
    from itertools import islice

    def _forward_batch(sentences):
        # One no-grad forward pass over a list of raw text samples.
        with torch.no_grad():
            encoded_input = tokenizer(sentences, return_tensors="pt",
                                      padding="max_length", max_length=TOKEN_LENGTH,
                                      truncation=True).to(device=DEVICE)
            model(**encoded_input, labels=encoded_input.input_ids)
            # Release the batch before building the next one.
            del encoded_input
            torch.cuda.empty_cache()

    batch_sentences = []
    # islice stops after CALIBRATION_SIZE samples, so the progress bar total
    # matches the real iteration count and no extra sample is fetched.
    calibration_samples = islice(dataset['train'], CALIBRATION_SIZE)
    for data in tqdm(calibration_samples, total=CALIBRATION_SIZE):
        batch_sentences.append(data['text'])
        if len(batch_sentences) >= CALIBRATION_BATCH_SIZE:
            _forward_batch(batch_sentences)
            batch_sentences = []

    # Flush the trailing partial batch, if any.
    if batch_sentences:
        _forward_batch(batch_sentences)

# SparseGPT

## Prune models (SparseGPT)

In [None]:
from utils.prune_utils import sparsegpt_prune

SPARSITIES = [0.2,0.3,0.5,0.7,0.9,1]#0.1, 0.2,0.3,0.5,0.7,0.9,1
    
for i, SPARSITY in enumerate(tqdm(SPARSITIES, total=len(SPARSITIES))):
    model = OPTForCausalLM.from_pretrained(f'facebook/{MODEL_NAME}', 
                                       output_attentions=True, 
                                       output_hidden_states=True).to(device=DEVICE)
    model = torch.nn.DataParallel(model, device_ids=[0,1,2,3])
    !nvidia-smi
    # Calculate feature hessians only for the first iteration
    if i == 0:
        feature_hessians = {}
        #put_input_hooks(model=model, features=feature_hessians, storage_dir=storage_dir, offload_freq=10000, feature_storage_device='cpu')
        all_hooks = put_input_hooks(model=model, features=feature_hessians, feature_storage_device='cpu')
        split_model_calibration(model)
        for hook in all_hooks:
            hook.remove()
    
    # Prune using the sparseGPT method, saves as pruned_models/{model_name}-{SPARSENESS}.pt WITHOUT mask
    sparsegpt_prune(model, MODEL_NAME, feature_hessians, EPSILON, SPARSITY, B, Bs)
    !nvidia-smi
    del model
del feature_hessians

  0%|          | 0/6 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Sat Feb 25 13:12:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   28C    P0    60W / 250W |  26355MiB / 40960MiB |     43%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   28C    P0    36W / 250W |      2MiB / 40960MiB |      0%      Default |
|       


  0%|          | 0/128 [00:00<?, ?it/s][A

  2%|▏         | 2/128 [00:16<20:44,  9.88s/it][A
  2%|▏         | 3/128 [00:39<32:38, 15.67s/it][A
  3%|▎         | 4/128 [01:02<38:03, 18.41s/it][A
  4%|▍         | 5/128 [01:24<40:51, 19.93s/it][A
  5%|▍         | 6/128 [01:47<42:25, 20.86s/it][A
  5%|▌         | 7/128 [02:10<43:15, 21.45s/it][A
  6%|▋         | 8/128 [02:32<43:39, 21.83s/it][A
  7%|▋         | 9/128 [02:55<43:50, 22.10s/it][A
  8%|▊         | 10/128 [03:18<43:48, 22.27s/it][A
  9%|▊         | 11/128 [03:40<43:40, 22.40s/it][A
  9%|▉         | 12/128 [04:03<43:28, 22.49s/it][A
 10%|█         | 13/128 [04:26<43:11, 22.54s/it][A
 11%|█         | 14/128 [04:48<42:53, 22.58s/it][A
 12%|█▏        | 15/128 [05:11<42:34, 22.61s/it][A
 12%|█▎        | 16/128 [05:34<42:13, 22.62s/it][A
 13%|█▎        | 17/128 [05:56<41:53, 22.65s/it][A
 14%|█▍        | 18/128 [06:19<41:32, 22.66s/it][A
 15%|█▍        | 19/128 [06:42<41:10, 22.66s/it][A
 16%|█▌        | 20/128 [07

## Finetune models (SparseGPT)

In [16]:
!pip install accelerate

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-0.16.0-py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 28.4 MB/s eta 0:00:01
Installing collected packages: accelerate
Successfully installed accelerate-0.16.0
You should consider upgrading via the '/gs/gsfs0/hpc01/rhel8/apps/conda3/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [5]:
from utils.finetune_utils import finetune_model

# Sparsity levels to fine-tune; same values as the list in the pruning cell.
SPARSITIES = [0.2, 0.3, 0.5, 0.7, 0.9, 1]

# The loop index was never used, so iterate over the values directly.
for SPARSITY in tqdm(SPARSITIES):
    # Finetune, saves as pruned_models/{model_name}-{SPARSENESS}-finetuned.pt WITHOUT mask
    finetune_model(MODEL_NAME, tokenizer, SPARSITY, EPOCH_COUNT=EPOCH_COUNT)

  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s][AAsking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  0%|          | 0/10 [00:03<?, ?it/s]
  0%|          | 0/6 [00:26<?, ?it/s]


In [1]:
# Diagnostic only: print the current GPU status via the nvidia-smi command-line tool.
!nvidia-smi

Sun Feb 26 03:34:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   28C    P0    59W / 250W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  On   | 00000000:41:00.0 Off |                    0 |
| N/A   28C    P0    39W / 250W |      0MiB / 40960MiB |      0%      Default |
|       

## Test models (SparseGPT)

In [3]:
# Weights & Biases client; the test cells below pass this module to test_model.
import wandb
# Log in to W&B before any wandb.init call in the cells below.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maaquib111[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
import numpy as np
# NOTE(review): a later cell imports test_model from utils.test_utils and calls
# it with model_type=...; confirm which module/signature is current.
from testing_module import test_model

# Passed to test_model and recorded in each W&B run config.
TOKEN_LENGTH = 1024
STRIDE = 512

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(f'facebook/{MODEL_NAME}',
                                          padding_side='left',
                                          use_fast=False)
# Load dataset: the whole test split is joined into one string and tokenized once.
test_set = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
encodings = tokenizer("\n\n".join(test_set['text']), return_tensors='pt')

seq_len = encodings.input_ids.size(1)
SPARSITIES = [0.2, 0.3, 0.5, 0.7, 0.9, 1]

# One W&B run per model variant: first the pruned-only models, then the
# finetuned ones. Using a single loop keeps both run configs identical apart
# from the 'fine_tuned' label; the second, copy-pasted block previously
# referenced the undefined names token_length and stride.
for run_label, is_finetuned in (('not finetuned', False), ('finetuned', True)):
    wandb.init(project="ICLR",
               name=f'{MODEL_NAME} Wikitext Test',
               config={'token_length': TOKEN_LENGTH,
                       'model_name': MODEL_NAME,
                       'stride': STRIDE,
                       'fine_tuned': run_label})
    for SPARSITY in SPARSITIES:
        test_model(MODEL_NAME, encodings, TOKEN_LENGTH, seq_len, STRIDE, wandb, SPARSITY, is_finetuned=is_finetuned)
    # Close this run explicitly so the next wandb.init starts a separate one.
    wandb.finish()

Found cached dataset wikitext (/gs/gsfs0/users/asyed/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
100%|█████████▉| 560/562 [02:12<00:00,  4.21it/s]
  3%|▎         | 18/562 [00:04<02:11,  4.13it/s]

KeyboardInterrupt



In [7]:
import numpy as np
from utils.test_utils import test_model

# Passed to test_model and recorded in the W&B run config.
TOKEN_LENGTH = 1024
STRIDE = 512

# Start the W&B run for the 'iterative' variant.
wandb.init(
    project="ICLR",
    name=f'{MODEL_NAME} Wikitext Test',
    config=dict(
        token_length=TOKEN_LENGTH,
        model_name=MODEL_NAME,
        stride=STRIDE,
        fine_tuned='iterative',
    ),
)

# Tokenizer for facebook/<MODEL_NAME>, with left-side padding and use_fast=False.
tokenizer = AutoTokenizer.from_pretrained(
    f'facebook/{MODEL_NAME}',
    padding_side='left',
    use_fast=False,
)

# Wikitext test split: all 'text' entries joined with blank lines and
# tokenized in a single call.
test_set = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
encodings = tokenizer("\n\n".join(test_set['text']), return_tensors='pt')

# Size of input_ids along dimension 1.
seq_len = encodings.input_ids.shape[1]
SPARSITIES = [0.2, 0.3, 0.5, 0.7, 0.9, 1]

# Evaluate every sparsity level of the 'iterative' models.
for SPARSITY in SPARSITIES:
    test_model(
        MODEL_NAME, encodings, TOKEN_LENGTH, seq_len, STRIDE, wandb,
        SPARSITY, model_type='iterative',
    )

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668259616320333, max=1.0…

Found cached dataset wikitext (/gs/gsfs0/users/asyed/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
100%|█████████▉| 560/562 [00:19<00:00, 28.68it/s]
100%|█████████▉| 560/562 [00:19<00:00, 28.92it/s]
100%|█████████▉| 560/562 [00:19<00:00, 28.70it/s]
100%|█████████▉| 560/562 [00:19<00:00, 28.73it/s]
100%|█████████▉| 560/562 [00:19<00:00, 28.89it/s]
100%|█████████▉| 560/562 [00:19<00:00, 28.82it/s]


# Cerebras

In [None]:
for model_name in ['opt-125m', 'opt-350m', 'opt-1.3b', 'opt-2.7b']:
    for sparsity in [0.2,0.3,0.5,0.7,0.9,1]:
        