In [5]:
import gc
import os

import torch
from torch.nn.utils import prune

from tqdm import tqdm

from transformers import AutoTokenizer, OPTForCausalLM, pipeline
from datasets import load_dataset

from calculate_mask import calculate_mask
from inverse_hessian import inverse_hessian
from input_prehooks import put_input_hooks
from testing_module import calculate_perp

In [None]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Checkpoint shared by the tokenizer, the model and the generation pipeline.
model_name = "facebook/opt-125m"

# Calibration corpus: the English configuration of C4, opened in streaming mode.
dataset = load_dataset('c4', 'en', streaming=True)

# Tokenizer for the checkpoint; padding is added on the left.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

# Causal-LM model with its pre-trained head, configured to also return
# attention maps and hidden states, then moved to the selected device.
model = OPTForCausalLM.from_pretrained(
    model_name,
    output_attentions=True,
    output_hidden_states=True,
)
model = model.to(device=device) # type: ignore

#model = torch.nn.DataParallel(model, device_ids=[0,1,2,3])

# Text-generation pipeline built from the checkpoint name (not from `model`).
# NOTE(review): `generator` is not used anywhere else in the visible notebook.
generator = pipeline('text-generation', model=model_name)

In [None]:
# Calibrate model (get inputs to each layer with calibration data)

# Settings read by split_model_calibration (defined just below).
calibration_size=128  # number of calibration samples to draw from dataset['train']
token_length=1024  # every tokenized text is padded/truncated to this many tokens
calibrate_on_cpu = False  # NOTE(review): not read anywhere in the visible notebook
calibration_batch_size=2  # number of texts sent through the model per forward pass

# First, put in forward hooks
# `features` is handed to put_input_hooks together with the model; the pruning
# cell later reads features[module_name] as the recorded input of that module.
# feature_storage_device='cpu' presumably keeps the recordings off the GPU —
# TODO confirm in input_prehooks.
features = {}
put_input_hooks(model=model, features=features, feature_storage_device='cpu')

# Switch to evaluation mode before the calibration forward passes.
model.eval()
# run model on batches of calibration data, then concatenate inputs
def split_model_calibration(model):
    """Feed the calibration samples through ``model`` in small batches.

    Reads the notebook-level settings ``dataset``, ``tokenizer``, ``device``,
    ``calibration_size``, ``calibration_batch_size`` and ``token_length``.
    The first ``calibration_size`` entries of ``dataset['train']`` are
    tokenized (padded/truncated to ``token_length``) and run through the
    model under ``torch.no_grad()``.

    Every sample is used exactly once: a trailing batch smaller than
    ``calibration_batch_size`` is still run instead of being dropped, and no
    sample beyond ``calibration_size`` is pulled from the dataset.

    Args:
        model: Callable that accepts the tokenizer output as keyword
            arguments plus ``labels``.

    Returns:
        None. The function is called for the side effects of the forward
        passes (e.g. whatever hooks are registered on ``model``).
    """
    def run_batch(sentences):
        # Tokenize one batch of raw texts and push it through the model.
        encoded_input = tokenizer(sentences, return_tensors="pt", padding="max_length", max_length=token_length, truncation=True).to(device=device)
        with torch.no_grad():
            model(**encoded_input, labels=encoded_input.input_ids)

    batch_sentences = []
    # zip() stops as soon as range() is exhausted, so exactly calibration_size
    # samples are consumed from the (possibly streaming) dataset.
    samples = zip(range(calibration_size), dataset['train'])
    for _, data in tqdm(samples, total=calibration_size):
        batch_sentences.append(data['text'])
        if len(batch_sentences) >= calibration_batch_size:
            run_batch(batch_sentences)
            batch_sentences = []

    # Flush the remainder when calibration_size is not a multiple of the batch size.
    if batch_sentences:
        run_batch(batch_sentences)

# Run the calibration forward passes on the model that had the input hooks attached above.
split_model_calibration(model)

# Sparsify Model

In [4]:
# Index every sub-module by its dotted name so it can be fetched directly later.
module_lookup_dict = dict(model.named_modules())

# Pruning settings used by the sparsification cell:
#   EPSILON    -> passed as `epsilon` to inverse_hessian
#   SPARSENESS -> passed as `p` to calculate_mask (also used in the checkpoint file name)
#   B, Bs      -> passed as `B` / `Bs` to calculate_mask
EPSILON = 1e-8
SPARSENESS = .8
B = 128
Bs = 64

# function to get module name from parameter name
def get_module_name(param_name):
    """Split a parameter name into its owning module's name and the parameter kind.

    Args:
        param_name: Dotted parameter name, e.g. ``"decoder.fc1.weight"``.

    Returns:
        ``(module_name, kind)`` where ``kind`` is ``"bias"`` or ``"weight"``
        and ``module_name`` is ``param_name`` without the trailing ``".bias"``
        / ``".weight"``; ``(None, None)`` when the name ends in neither.
    """
    for kind in ("bias", "weight"):
        suffix = "." + kind
        if param_name.endswith(suffix):
            return param_name[:-len(suffix)], kind
    return None, None

In [5]:
# Parameters that are never pruned: the token and positional embedding tables.
layer_blacklist = ['model.decoder.embed_tokens.weight', 'model.decoder.embed_tokens.bias',
'model.decoder.embed_positions.weight']

# Using the calibration data (recorded inputs to each weight layer), compute an
# inverse Hessian per layer, derive a pruning mask from it and apply the mask.

# Snapshot the parameter names and tensors before pruning starts.
# NOTE(review): presumably needed because prune.custom_from_mask re-registers the
# pruned parameter under a new name, which would disturb a live
# named_parameters() iteration — confirm.
param_lookup_dict = dict(model.named_parameters())
param_names = list(param_lookup_dict)

with torch.no_grad():
    for name in tqdm(param_names):
        param = param_lookup_dict[name]

        # Leave the embeddings alone and skip tensors with fewer than two
        # dimensions (e.g. norm parameters).
        if name in layer_blacklist or param.dim() < 2:
            continue

        module_name, param_type = get_module_name(name)

        # Only ".weight" / ".bias" parameters are handled.
        if param_type not in ("weight", "bias"):
            continue

        # Recorded calibration input of the module that owns this parameter.
        layer_input = features[module_name].to(device=device)

        # A 2-D recording is already flattened (e.g. 4096,768 instead of
        # 8,512,768); either way the last two axes are swapped before the
        # inverse Hessian is computed.
        flattened = layer_input.dim() == 2
        swap_dims = (0, 1) if flattened else (1, 2)
        inv_hess = inverse_hessian(
            torch.transpose(layer_input, *swap_dims),
            epsilon=EPSILON,
            flattened=flattened,
        ).to(device=device)

        # Derive the mask, then attach it to the owning module.
        mask = calculate_mask(W=param, H_inv=inv_hess, p=SPARSENESS, B=B, Bs=Bs)
        module = module_lookup_dict[module_name]
        prune.custom_from_mask(module=module, name=param_type, mask=mask)

100%|██████████| 196/196 [02:14<00:00,  1.46it/s]


# Save Pruned Model

In [6]:
# SAVE PRUNED MODEL
# The checkpoint name encodes the sparsity level, e.g. 'opt-125m-0.8'.
pruned_model_name = f'opt-125m-{SPARSENESS}'
# torch.save(model,'pruned_models/' + pruned_model_name)
# model.save_pretrained(save_directory = 'pruned_models/' + pruned_model_name)

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing the state dict.
os.makedirs('pruned_models', exist_ok=True)
torch.save(model.state_dict(), f'pruned_models/{pruned_model_name}.pt')

In [7]:
# LOAD SAVED MODEL

from save_pruned_model import load_into_model
import torch
from torch.nn.utils import prune
from transformers import AutoTokenizer, OPTForCausalLM, pipeline
from datasets import load_dataset
# Same device selection as at the top of the notebook.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Start from a fresh copy of the pre-trained checkpoint on the selected device ...
loaded_model = OPTForCausalLM.from_pretrained(
    'facebook/opt-125m',
    output_attentions=True,
    output_hidden_states=True,
)
loaded_model = loaded_model.to(device=device) # type: ignore

# ... and let load_into_model apply the saved pruned checkpoint to it.
# Requires `pruned_model_name` from the "SAVE PRUNED MODEL" cell.
load_into_model(loaded_model, f'pruned_models/{pruned_model_name}.pt')

100%|██████████| 196/196 [00:00<00:00, 13909.81it/s]


In [10]:
# Disabled sanity check, kept as an inert string literal (none of it executes).
# When enabled it prints, for `loaded_model` and `model`, the fraction of
# exactly-zero entries in the first decoder layer's k_proj weight.
'''model_name = "facebook/opt-125m"

def get_prop_zeros(model):
    return torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight == 0) / (torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight == 0) + torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight != 0))

print(get_prop_zeros(loaded_model))
print(get_prop_zeros(model))'''

# Sample Testing

In [2]:
import wandb
# Log in to Weights & Biases and open a run named after the pruned checkpoint.
# NOTE: `pruned_model_name` is assigned in the "SAVE PRUNED MODEL" cell; running
# this cell before that one raises the NameError recorded in the output below.
wandb.login()
wandb.init(project="ICLR", name = pruned_model_name)

NameError: name 'pruned_model_name' is not defined

In [8]:
# Disabled dense-vs-sparse generation comparison, kept as an inert string
# literal (none of it executes). The output recorded below this cell appears to
# come from an earlier run in which the code was live: the dense model produced
# text, while the sparse-model generate call ended in the RuntimeError shown.
'''# REGULAR OUTPUT
dense_model = OPTForCausalLM.from_pretrained("facebook/opt-125m", output_attentions=True, output_hidden_states=True).to(device=device)
encoded_test_input = tokenizer('What did you just say to me? I will have you know', return_tensors="pt",
                                                                                    padding="max_length", 
                                                                                    max_length=token_length, 
                                                                                    truncation=True)
#print(encoded_test_input)
print('DENSE MODEL:')
with torch.no_grad():
    generated_ids = dense_model.generate(**encoded_test_input, max_new_tokens=30, num_beams=5, do_sample=True)
dense_output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(f'\tOutput: {dense_output}')

print('SPARSE MODEL: ')
with torch.no_grad():
    generated_ids = model.generate(**encoded_test_input, max_new_tokens=30, num_beams=5, do_sample=True)
sparse_output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f'\tOutput: {sparse_output}')'''

DENSE MODEL:
	Output: ["What did you just say to me? I will have you know that I am not the only one.\nI'm not the only one."]
SPARSE MODEL: 


RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 128 but got size 1 for tensor number 1 in the list.

In [11]:
# Disabled single-prompt check, kept as an inert string literal (none of it
# executes). When enabled it prints exp(loss) on one padded prompt for the dense
# model, the in-memory pruned model and the reloaded pruned model, in that
# order; the three tensors recorded below appear to come from such a run.
# NOTE(review): the prompt is padded to token_length and the labels are the raw
# input_ids, so padding positions presumably count toward the loss — confirm
# before reading these numbers as real perplexities.
'''dense_model = OPTForCausalLM.from_pretrained("facebook/opt-125m", output_attentions=True, output_hidden_states=True).to(device=device)
encoded_test_input = tokenizer('What did you just say to me? I will have you know', return_tensors="pt",
                                                                                    padding="max_length", 
                                                                                    max_length=token_length, 
                                                                                    truncation=True).to(device=device)
print(torch.exp(dense_model(**encoded_test_input, labels = encoded_test_input.input_ids).loss))
print(torch.exp(model(**encoded_test_input, labels = encoded_test_input.input_ids).loss))
print(torch.exp(loaded_model(**encoded_test_input, labels = encoded_test_input.input_ids).loss))'''

tensor(9591.2529, grad_fn=<ExpBackward0>)
tensor(219081.9531, grad_fn=<ExpBackward0>)
tensor(2541604.7500, grad_fn=<ExpBackward0>)


In [None]:
# Release the in-memory pruned model before perplexity testing.
# `del model` on its own frees nothing: module_lookup_dict maps '' to the root
# module, and param_lookup_dict plus the leftover pruning-loop variables still
# reference its tensors. Drop all of those names; pop() with a default keeps
# the cell re-runnable when they are already gone.
for stale_name in ('model', 'module_lookup_dict', 'param_lookup_dict',
                   'module_iter', 'module', 'param',
                   'layer_input', 'inv_hess', 'mask'):
    globals().pop(stale_name, None)

# Collect anything only kept alive by reference cycles, then hand the cached
# CUDA blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

# Perplexity Testing

In [None]:
# Replicate the model over at most the four GPUs the original run used, but
# never request device ids that do not exist on this machine, and do not wrap
# a second time when the cell is re-run.
if not isinstance(loaded_model, torch.nn.DataParallel):
    gpu_ids = list(range(min(4, torch.cuda.device_count())))
    # An empty list (no GPU) falls back to DataParallel's default device choice.
    loaded_model = torch.nn.DataParallel(loaded_model, device_ids=gpu_ids or None)

In [9]:
#test_set = load_dataset('wikitext', 'wikitext-2-v1', split='test[:10%]')
#test_set = load_dataset('wikitext', 'wikitext-2-v1', split='test[:10%]')
test_set = load_dataset('wikitext', 'wikitext-2-v1', split='test')
tokenized_test = tokenizer(test_set['text'])


def _pack_into_rows(per_text_values, row_length):
    """Concatenate per-text integer lists and cut them into fixed-length rows.

    Args:
        per_text_values: Iterable of lists of ints (one list per text).
        row_length: Number of values per output row.

    Returns:
        A ``torch.long`` tensor of shape ``(num_rows, row_length)``. Values
        that do not fill a complete final row are dropped; with fewer than
        ``row_length`` values the result has zero rows.
    """
    flat = [item for sublist in per_text_values for item in sublist]
    usable = len(flat) - (len(flat) % row_length)
    # Build the integer tensor directly: going through the float32
    # torch.Tensor constructor is only exact for values below 2**24.
    return torch.tensor(flat[:usable], dtype=torch.long).reshape(usable // row_length, row_length)


# Token ids and attention masks are packed identically so the rows stay aligned.
flattened_input_ids = _pack_into_rows(tokenized_test.input_ids, token_length).to(device=device)
flattened_masks = _pack_into_rows(tokenized_test.attention_mask, token_length).to(device=device)

test_dict = {'input_ids': flattened_input_ids, 'attention_mask': flattened_masks}

Found cached dataset wikitext (C:/Users/Aaquib/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [12]:
torch.cuda.empty_cache()
loaded_model.eval()

# Rows of test_dict per forward pass. Sending the whole test set through in a
# single call needs logits, attentions and hidden states for every row at once.
EVAL_BATCH_SIZE = 4

sum_perp_sparse = 0
sum_perp_dense = 0  # never updated below: no dense model is evaluated in this cell

num_rows = test_dict['input_ids'].shape[0]
weighted_loss = 0.0
with torch.no_grad():
    for start in range(0, num_rows, EVAL_BATCH_SIZE):
        batch_dict = {key: value[start:start + EVAL_BATCH_SIZE] for key, value in test_dict.items()}
        sparse_output = loaded_model(**batch_dict, labels=batch_dict['input_ids'])
        rows_in_batch = batch_dict['input_ids'].shape[0]
        # .mean() leaves a scalar loss unchanged and collapses the loss to a
        # scalar if a DataParallel wrapper returns one value per replica.
        # All rows have the same length, so weighting each batch's loss by its
        # row count gives the loss over the whole test set (assuming the model
        # reports a per-token mean).
        weighted_loss += sparse_output.loss.mean() * rows_in_batch

# Perplexity = exp(average loss); skipped when the test set has no rows.
if num_rows:
    sum_perp_sparse += torch.exp(weighted_loss / num_rows)
#sum_perp_sparse /= len(test_dict['input_ids'])
#sum_perp_dense /= len(test_dict['input_ids'])

In [None]:
#wandb.log({"sparse_perplexity": sum_perp_sparse, "dense_perplexity": sum_perp_dense})
# Only the sparse perplexity is logged; sum_perp_dense is initialised to 0 in
# the perplexity cell and never updated.
wandb.log({"sparse_perplexity": sum_perp_sparse})

In [None]:
# Show the results of the perplexity cell.
# NOTE: sum_perp_dense is still its initial 0 here — the notebook never
# evaluates a dense model in that cell.
print(sum_perp_sparse)
print(sum_perp_dense)