In [1]:
import gc
import os

import torch
from torch.nn.utils import prune
from tqdm import tqdm
from transformers import AutoTokenizer, OPTForCausalLM, pipeline
from datasets import load_dataset

from calculate_mask import calculate_mask
from inverse_hessian import inverse_hessian
from input_prehooks import put_input_hooks
from testing_module import calculate_perp

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def print_memory_summary(name=""):
    """Print how much CUDA memory is currently allocated on device 0, in GiB.

    Args:
        name: Label printed in front of the measurement so successive
            readings can be told apart (any value that can be formatted
            into an f-string works).

    Returns:
        None. The measurement is only printed.

    On a host without CUDA the reading is reported as 0.0 instead of raising,
    so the notebook can still run with ``device = 'cpu'``.
    """
    # Only query the CUDA allocator when a GPU is actually present.
    if torch.cuda.is_available():
        allocated_bytes = torch.cuda.memory_allocated(0)
    else:
        allocated_bytes = 0
    print(f"{name}, memory allocated: {allocated_bytes/1024/1024/1024} gb")

In [3]:
# Device: use the GPU when one is available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = "facebook/opt-350m"

# Load dataset (streamed, so nothing is downloaded up front).
# The bare 'c4' id is the deprecated legacy alias; 'allenai/c4' is the
# maintained repository and exposes the same 'en' configuration.
dataset = load_dataset('allenai/c4', 'en', streaming=True)
# Load tokenizer; pad on the left so real tokens sit at the end of each row.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')

In [10]:
# Calibration and pruning settings (inputs to each layer are collected from calibration data)
calibration_size = 16        # compared against the stream index in split_model_calibration
token_length = 512           # passed to the tokenizer as max_length
calibrate_on_cpu = False     # not referenced by the code visible in this notebook
calibration_batch_size = 2   # number of texts tokenized per forward pass
EPSILON = 1e-8               # forwarded to inverse_hessian as `epsilon`
B = 8                        # forwarded to calculate_mask as `B`
Bs = 4                       # forwarded to calculate_mask as `Bs`
# Parameter names the pruning loop skips outright.
layer_blacklist = [
    'model.decoder.embed_tokens.weight',
    'model.decoder.embed_tokens.bias',
    'model.decoder.embed_positions.weight',
]

# run model on batches of calibration data
def split_model_calibration(model):
    """Push the first `calibration_size` training texts of `dataset` through `model`.

    Texts are tokenized in groups of `calibration_batch_size`, padded or
    truncated to `token_length` tokens, moved to `device`, and run through
    `model` under `torch.no_grad()`. The model outputs are discarded; the
    function is called for the side effects of the forward pass (the caller
    registers input hooks on the model before calling it).

    Exactly `calibration_size` texts are read from the stream, and a trailing
    partial batch is also run, so every requested sample is used even when
    `calibration_size` is not a multiple of `calibration_batch_size`.

    Args:
        model: The model to calibrate; called as ``model(**encoded, labels=...)``.

    Returns:
        None.
    """
    def _forward(sentences):
        # Tokenize one batch of raw texts and run a gradient-free forward pass.
        encoded_input = tokenizer(sentences, return_tensors="pt", padding="max_length",
                                  max_length=token_length, truncation=True).to(device=device)
        with torch.no_grad():
            model(**encoded_input, labels=encoded_input.input_ids)

    batch_sentences = []
    # zip() asks range() for its next item first, so iteration stops after
    # calibration_size texts without pulling an extra sample from the stream.
    samples = zip(range(calibration_size), dataset['train'])
    for _, data in tqdm(samples, total=calibration_size):
        batch_sentences.append(data['text'])
        if len(batch_sentences) >= calibration_batch_size:
            _forward(batch_sentences)
            batch_sentences = []

    # Flush whatever is left when the sample count is not a multiple of the batch size.
    if batch_sentences:
        _forward(batch_sentences)
            
# function to get module name from parameter name
def get_module_name(param_name):
    """Split a parameter name into its owning module name and parameter kind.

    Args:
        param_name: Dotted parameter name, e.g. ``"a.b.weight"``.

    Returns:
        ``(module_name, "bias")`` or ``(module_name, "weight")`` when the name
        ends in ``.bias`` / ``.weight``; ``(None, None)`` for anything else.
    """
    for suffix in (".bias", ".weight"):
        if param_name.endswith(suffix):
            # Drop the suffix for the module name; drop the leading dot for the kind.
            return param_name[:-len(suffix)], suffix[1:]
    return None, None
    
# Prune a fresh copy of the model at each sparsity level and save the result.
for SPARSENESS in [0.3]:#0.2,
    print_memory_summary(0)
    print(f'On SPASENESS {SPARSENESS}')
    # Load model with pre-trained head
    model = OPTForCausalLM.from_pretrained(model_name, output_attentions=True,
                                           output_hidden_states=True).to(device=device) # type: ignore

    model.eval()
    # First, put in hooks that record module inputs into `features`,
    # then run the calibration data through the model to fill it.
    features = {}
    put_input_hooks(model=model, features=features, feature_storage_device='cpu')
    split_model_calibration(model)

    # Name -> object lookups. The parameter names are snapshotted up front so
    # the pruning loop never iterates over the model while it is being edited.
    module_lookup_dict = dict(model.named_modules())
    param_lookup_dict = dict(model.named_parameters())
    param_names = list(param_lookup_dict)
    print_memory_summary(1)
    print(f'SPARSIFYING SPASENESS {SPARSENESS}')
    with torch.no_grad():
        for name in param_names:
            param = param_lookup_dict[name]

            # skip the embed layer
            if name in layer_blacklist:
                continue

            # skip norms which have 1 dimension
            if len(param.shape) < 2:
                continue

            module_name, param_type = get_module_name(name)
            print_memory_summary(2)
            # only weight and bias parameters are pruned
            if param_type not in ("weight", "bias"):
                continue

            # input to parameter, move to gpu
            layer_input = features[module_name].to(device=device)
            print_memory_summary("before hessian")
            # check if input is flattened e.g. from 8,512,768 to 4096,768;
            # either way the last two dimensions are swapped before the call
            flattened = len(layer_input.shape) == 2
            swap_dims = (0, 1) if flattened else (1, 2)
            inv_hess = inverse_hessian(torch.transpose(layer_input, *swap_dims), epsilon=EPSILON,
                                       flattened=flattened).to(device=device)
            print_memory_summary("after hessian")
            # No need for the layer input now: drop both the GPU copy and the stored one
            del layer_input
            del features[module_name]

            # calculate mask
            # NOTE(review): prune.custom_from_mask zeroes the entries where mask == 0.
            # The recorded check at the end of this notebook reports ~0.70 zeros in
            # layer 0 k_proj for a model loaded from the 0.3 checkpoint - confirm
            # whether calculate_mask's `p` is the pruned or the kept fraction.
            mask = calculate_mask(W=param.to(device=device), H_inv=inv_hess, p=SPARSENESS, B=B, Bs=Bs)
            del inv_hess
            del param
            print_memory_summary("after mask")

            # get module from lookup dictionary by module name and apply the mask
            module = module_lookup_dict[module_name]
            prune.custom_from_mask(module=module, name=param_type, mask=mask)

            # masks add memory, remove the mask and replace masked weights with 0
            prune.remove(module=module, name=param_type)
            del module
            del mask
            print_memory_summary("after pruning")

    print(f'SAVING SPASENESS {SPARSENESS}')
    pruned_model_name = f'opt-350m-{SPARSENESS}'
    # Create the output directory first so a long pruning run is not lost
    # to a missing folder at save time.
    os.makedirs('pruned_models', exist_ok=True)
    torch.save(model.state_dict(), f'pruned_models/{pruned_model_name}.pt')

    # Drop every remaining reference to the model before collecting. The lookup
    # dicts hold all modules and parameters, so `del model` alone would leave
    # the whole model alive and its GPU memory could not be released.
    module_lookup_dict.clear()
    param_lookup_dict.clear()
    features.clear()
    param = None
    del model
    gc.collect()
    torch.cuda.empty_cache()

0, memory allocated: 2.7602057456970215 gb
On SPASENESS 0.3


17it [00:21,  1.28s/it]


1, memory allocated: 1.5034546852111816 gb
HI
SPARSIFYING SPASENESS 0.3
2, memory allocated: 1.5034546852111816 gb
before hessian, memory allocated: 1.4097046852111816 gb
after hessian, memory allocated: 1.4175171852111816 gb
after mask, memory allocated: 1.4101929664611816 gb
after pruning, memory allocated: 1.4097046852111816 gb
2, memory allocated: 1.4097046852111816 gb
before hessian, memory allocated: 1.3940796852111816 gb
after hessian, memory allocated: 1.3960328102111816 gb
after mask, memory allocated: 1.3945679664611816 gb
after pruning, memory allocated: 1.3940796852111816 gb
2, memory allocated: 1.3940796852111816 gb
before hessian, memory allocated: 1.4097046852111816 gb
after hessian, memory allocated: 1.4175171852111816 gb
after mask, memory allocated: 1.4106812477111816 gb
after pruning, memory allocated: 1.4097046852111816 gb
2, memory allocated: 1.4097046852111816 gb
before hessian, memory allocated: 1.4097046852111816 gb
after hessian, memory allocated: 1.41751718521

In [1]:
# load pruned model

from save_pruned_model import load_unmasked_model, load_masked_model
import torch
from torch.nn.utils import prune
from transformers import AutoTokenizer, OPTForCausalLM, pipeline
from datasets import load_dataset
# Use the GPU when one is available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): the two commented-out lines below are the only place in the
# visible notebook where `loaded_model` is assigned, yet a later cell calls
# get_prop_zeros(loaded_model). Re-enable or remove both together.
# loaded_model = OPTForCausalLM.from_pretrained('facebook/opt-350m', output_attentions=True, output_hidden_states=True).to(device=device) # type: ignore

# load_unmasked_model(loaded_model, 'pruned_models/opt-350m-0.3.pt')

# Start from the pretrained OPT-350m weights, then hand the model and the saved
# 0.3 checkpoint path to load_masked_model (presumably restores the pruned
# weights into the model - verify against save_pruned_model).
loaded_model2 = OPTForCausalLM.from_pretrained('facebook/opt-350m', output_attentions=True, output_hidden_states=True).to(device=device) # type: ignore
load_masked_model(loaded_model2, 'pruned_models/opt-350m-0.3.pt')

loaded_model2.eval()
# Smoke test: one forward pass on a random (1, 10) batch of token ids in [0, 20).
# NOTE(review): the recorded output of this cell ends in
# "ValueError: too many values to unpack (expected 2)" with no traceback,
# so which of the calls above raised cannot be told from here.
loaded_model2(torch.randint(high=20, size=(1,10)).to(device=device))

  from .autonotebook import tqdm as notebook_tqdm


ValueError: too many values to unpack (expected 2)

In [14]:
def get_prop_zeros(model):
    """Return the fraction of exactly-zero entries in layer 0's k_proj weight.

    Args:
        model: Object exposing ``get_decoder().layers[0].self_attn.k_proj.weight``.

    Returns:
        A 0-dim tensor: zero count divided by (zero count + non-zero count).
    """
    k_proj_weight = model.get_decoder().layers[0].self_attn.k_proj.weight
    zero_count = torch.sum(k_proj_weight == 0)
    nonzero_count = torch.sum(k_proj_weight != 0)
    return zero_count / (zero_count + nonzero_count)

# Fraction of zero entries in layer 0's k_proj weight for the model that went
# through load_masked_model.
print(get_prop_zeros(loaded_model2))
# NOTE(review): `loaded_model` is only assigned in commented-out code in the
# loading cell; unless it is defined elsewhere this line raises NameError on a
# fresh kernel.
print(get_prop_zeros(loaded_model))
# print(get_prop_zeros(model2))

tensor(0.7005, device='cuda:0')
