In [10]:
import torch
from torch.nn.utils import prune

from transformers import AutoTokenizer, OPTForCausalLM, pipeline
from datasets import load_dataset

from calculate_mask import calculate_mask
from inverse_hessian import inverse_hessian
from input_prehooks import put_input_hooks

In [11]:
#DEVICE
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = "facebook/opt-125m"

#Load dataset
dataset = load_dataset('c4', 'en', streaming=True)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load model with pre-trained head

model = OPTForCausalLM.from_pretrained(model_name, output_attentions=True, output_hidden_states=True).to(device=device) # type: ignore

# Load generator
generator = pipeline('text-generation', model=model_name)
# Create calibration data

calibration_size=4
token_length=512

calibration_data_array = []
for i, data in enumerate(iter(dataset['train'])): # type: ignore
    if i > calibration_size:
        break
    tokenized = tokenizer.encode(data['text'], return_tensors="pt", padding="max_length", truncation=True, max_length=token_length)
    calibration_data_array.append(tokenized)
calibration_data = torch.squeeze(torch.stack(calibration_data_array)).to(device=device)
calibration_data.double()

tensor([[2.0000e+00, 4.8290e+04, 7.1300e+03,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [2.0000e+00, 4.8763e+04, 1.1000e+01,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [2.0000e+00, 5.9700e+02, 1.4189e+04,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [2.0000e+00, 6.1790e+03, 1.7100e+02,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [2.0000e+00, 1.3300e+02, 4.4650e+03,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00]], device='cuda:0', dtype=torch.float64)

In [12]:
# First, put in forward hooks
features = {}
put_input_hooks(model=model, features=features)

# Run calibration data through model at first to calculate features dictionary with
# input tensors to each intermediate layer
model(calibration_data)

# function to get module name from parameter name
def get_module_name(param_name):
    if param_name[-5:] == ".bias":
        return param_name[:-5], "bias"
    elif param_name[-7:] == ".weight":
        return param_name[:-7], "weight"
    else:
        return None, None

In [13]:
# Re-load model with pre-trained head
# model = OPTForCausalLM.from_pretrained("facebook/opt-125m", output_attentions=True, output_hidden_states=True)

# make a dictionary to access module by name
module_lookup_dict = {}
for module_name, module_iter in model.named_modules():
    module_lookup_dict[module_name] = module_iter
EPSILON = 0.0001
SPARSENESS = .9
B = 128
Bs = 64

In [14]:
from tqdm import tqdm
layer_blacklist = ['model.decoder.embed_tokens.weight', 'model.decoder.embed_tokens.bias',
'model.decoder.embed_positions.weight']

# Using calibration data (inputs to each intermediate weight layer)
# Iterate through named parameters, calculate inverse hessian and calculate mask

# without this
param_lookup_dict = {}
param_names = []
for name, param in model.named_parameters():
    param_names.append(name)
    param_lookup_dict[name] = param

with torch.no_grad():
    for name in tqdm(param_names):
        param = param_lookup_dict[name]

        # skip the embed layer
        if name in layer_blacklist:
            continue
        
        # skip norms which have 1 dimension
        if len(param.shape) < 2:
            continue

        module_name, param_type = get_module_name(name)

        # apply to weight and bias layers
        if param_type == "weight" or param_type == "bias":
            # input to parameter
            layer_input = features[module_name][0]

            # calculate inverse hessian
            # check if input is flattened e.g. from 8,512,768 to 4096,768
            if len(layer_input.shape) == 2:
                inv_hess = inverse_hessian(torch.transpose(layer_input, 0, 1), epsilon=EPSILON, 
                flattened=True)

            else:
                inv_hess = inverse_hessian(torch.transpose(layer_input, 1, 2), epsilon=EPSILON,
                flattened=False)

            # calculate mask
            mask = calculate_mask(W=param, H_inv=inv_hess, p=SPARSENESS, B=B, Bs=Bs)
            
            # get module from lookup dictionary by module name
            module = module_lookup_dict[module_name]
            # apply mask
            prune.custom_from_mask(module=module, name=param_type, mask=mask)

100%|██████████| 196/196 [00:28<00:00,  6.95it/s]


In [19]:
# SAVE PRUNED MODEL
pruned_model_name = 'opt-125m'
# torch.save(model,'pruned_models/' + pruned_model_name)
# model.save_pretrained(save_directory = 'pruned_models/' + pruned_model_name)

torch.save(model.state_dict(), 'pruned_models/opt-125m')

In [2]:
# LOAD PRUNED MODEL

from transformers import AutoTokenizer, OPTForCausalLM
from datasets import load_dataset
import torch
from torch.nn.utils import prune
import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m')

loaded_model = OPTForCausalLM.from_pretrained('facebook/opt-125m', output_attentions=True, output_hidden_states=True).to(device=device) # type: ignore

def apply_identity_prune(model):
    layer_blacklist = ['loaded_model.decoder.embed_tokens.weight', 'loaded_model.decoder.embed_tokens.bias',
    'loaded_model.decoder.embed_positions.weight']

    # Using calibration data (inputs to each intermediate weight layer)
    # Iterate through named parameters, calculate inverse hessian and calculate mask

    # without this
    param_lookup_dict = {}
    param_names = []
    for name, param in model.named_parameters():
        param_names.append(name)
        param_lookup_dict[name] = param

    with torch.no_grad():
        for name in tqdm(param_names):
            param = param_lookup_dict[name]

            # skip the embed layer
            # if name in layer_blacklist:
            #     continue

            if 'embed' in name:
                continue
            
            # skip norms which have 1 dimension
            if len(param.shape) < 2:
                continue
        
        prune.custom_from_mask(module=module, name=param_type, mask=mask)

apply_identity_prune(model=loaded_model)
for n, m in loaded_model.named_parameters():
    print(n)

# loaded_model.load_state_dict(torch.load('pruned_models/opt-125m'))

# pretrained_model = 'pruned_models/opt-125m'

# model = OPTForCausalLM.from_pretrained(model_name, output_attentions=True, output_hidden_states=True).to(device=device) # type: ignore

# loaded_model = OPTForCausalLM.from_pretrained(pretrained_model).to(device=device) # type: ignore



TypeError: 'module' object is not callable

In [7]:
input1 = tokenizer("Hello, my dog is cute", return_tensors="pt", padding="max_length", truncation=True).to(device=device)
input2 = tokenizer("What the fuck did you just fucking say about me, you little bitch?", return_tensors="pt", padding="max_length", truncation=True).to(device=device)
output = loaded_model.generate(input1.input_ids, max_length=100, num_return_sequences=1, temperature=0.9, top_p=0.95)
tokenizer.decode(output[0], skip_special_tokens=True)

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


"Hello, my dog is cute and I love her. I'm a little nervous about her because she's a little bit shy and I'm not sure if she's going to be able to handle it. I'm not sure if she's going to be able to handle it. I'm not sure if she's going to be able to handle it. I'm not sure if she's going to be able to handle it. I'm not sure if she's going to be able to handle"

In [9]:
# Proportion of weights that are 0:

def get_prop_zeros(model):
    return torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight == 0) / (torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight == 0) + torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight != 0))

print(get_prop_zeros(loaded_model))

tensor(0., device='cuda:0')


In [9]:
# REGULAR OUTPUT
model2 = OPTForCausalLM.from_pretrained("facebook/opt-125m", output_attentions=True, output_hidden_states=True)

input1 = tokenizer("Hello, my dog is cute", return_tensors="pt", padding="max_length", truncation=True)
input2 = tokenizer("What the fuck did you just fucking say about me, you little bitch?", return_tensors="pt", padding="max_length", truncation=True)
output = model2.generate(input1.input_ids, max_length=100, num_return_sequences=1, temperature=0.9, top_p=0.95)
tokenizer.decode(output[0], skip_special_tokens=True)

"Hello, my dog is cute and I love her. I'm a little nervous about her because she's a little bit shy and I'm not sure if she's going to be able to handle it. I'm not sure if she's going to be able to handle it. I'm not sure if she's going to be able to handle it. I'm not sure if she's going to be able to handle it. I'm not sure if she's going to be able to handle"

In [13]:
from testing_module import calculate_perp

# def calculate_perp(model, input_data, device):
#     input_data = torch.squeeze(torch.stack(input_data)).to(device=device)
#     input_data.double()
#     outputs = model(input_data)[0] 
#     log_probs = outputs[0, -1, :].log_softmax(-1)
#     neg_log_likelihood = -log_probs.mean()
#     perplexity = torch.exp(neg_log_likelihood)      
#     return perplexity.item()

# print(calculate_perp(model, input_data, device))

# lmao out of memory
print(calculate_perp(model, calibration_data_array, 'cpu'))

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)