In [1]:
import torch
from torch.nn.utils import prune

from transformers import AutoTokenizer, OPTForCausalLM, pipeline
from datasets import load_dataset

from calculate_mask import calculate_mask
from inverse_hessian import inverse_hessian
from input_prehooks import put_input_hooks

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_memory_free_MiB(gpu_index):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_index))
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.free // 1024 ** 2
    
def print_memory_summary(name=""):
    t = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    f = r-a  # free inside reserved
    print(f"{name}, memory allocated: {a/1024/1024/1024} gb")

In [3]:
#DEVICE
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = "facebook/opt-350m"

#Load dataset
dataset = load_dataset('c4', 'en', streaming=True)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load model with pre-trained head

print_memory_summary('before loading model')

model = OPTForCausalLM.from_pretrained(model_name, output_attentions=True, output_hidden_states=True).to(device=device) # type: ignore

print_memory_summary('after loading model')

# Load generator
generator = pipeline('text-generation', model=model_name)
# Create calibration data

before loading model, memory allocated: 0.0 gb
after loading model, memory allocated: 1.2338027954101562 gb


In [4]:
# ACTIVATE TORCH.NO_GRAD FROM HERE UNTIL MASKS IN PLACE
torch.no_grad()

<torch.autograd.grad_mode.no_grad at 0x1f124db3be0>

In [6]:
# Calibrate model (get inputs to each layer with calibration data)

calibration_size=8
token_length=2048
calibrate_on_cpu = True
calibration_batch_size=1

calibration_data = []
for i, data in enumerate(iter(dataset['train'])): # type: ignore
    if i >= calibration_size:
        break
    tokenized = tokenizer.encode(data['text'], return_tensors="pt", padding="max_length", truncation=True, max_length=token_length)
    calibration_data.append(tokenized)

print_memory_summary(3)
calibration_data = torch.squeeze(torch.stack(calibration_data)).to(device=device)
print_memory_summary(4)
# calibration_data.double()

# First, put in forward hooks
features = {}
store_features_cpu = True # if you want to store features on the cpu or on device, cpu for less vram demand

if store_features_cpu:
    # store features on cpu device with shared memory so vram isn't hogged
    put_input_hooks(model=model, features=features, feature_storage_device='cpu')
else:
    put_input_hooks(model=model, features=features, feature_storage_device=device)

# run model on batches of calibration data, then concatenate inputs
def split_model_calibration(model, calibration_data, execution_device, batch_size=2):
    # split into batches of batch_size
    split_data = torch.split(calibration_data, split_size_or_sections=batch_size, dim=0)

    # iterate through split_data and calibrate
    for batch in split_data:
        print_memory_summary('batch start')
        model(batch)
        print_memory_summary('batch end')

with torch.no_grad():
    # calibrate model on cpu for less vram
    if calibrate_on_cpu:
        model.to(device='cpu')

        # Run calibration data through model at first to calculate features dictionary with
        # input tensors to each intermediate layer
        # at first, run on cpu for less memory?
        calibration_data.to(device='cpu')
        # calibration_data.double()
        
        split_model_calibration(model, calibration_data, execution_device='cpu', batch_size=calibration_batch_size)

        # send model back to cpu
        model.to(device=device)
    else:
        split_model_calibration(model, calibration_data, execution_device=device, batch_size=calibration_batch_size)
print_memory_summary(5)

del calibration_data

3, memory allocated: 0.0 gb
4, memory allocated: 0.0001220703125 gb
batch start, memory allocated: 0.0001220703125 gb


KeyboardInterrupt: 

In [None]:
# make a dictionary to access module by name
module_lookup_dict = {}
for module_name, module_iter in model.named_modules():
    module_lookup_dict[module_name] = module_iter
EPSILON = 0.0001
SPARSENESS = .9
B = 128
Bs = 64

# function to get module name from parameter name
def get_module_name(param_name):
    if param_name[-5:] == ".bias":
        return param_name[:-5], "bias"
    elif param_name[-7:] == ".weight":
        return param_name[:-7], "weight"
    else:
        return None, None

In [None]:
from tqdm import tqdm
layer_blacklist = ['model.decoder.embed_tokens.weight', 'model.decoder.embed_tokens.bias',
'model.decoder.embed_positions.weight']

# Using calibration data (inputs to each intermediate weight layer)
# Iterate through named parameters, calculate inverse hessian and calculate mask

# without this
param_lookup_dict = {}
param_names = []
for name, param in model.named_parameters():
    param_names.append(name)
    param_lookup_dict[name] = param

with torch.no_grad():
    for name in tqdm(param_names):
        param = param_lookup_dict[name]

        # skip the embed layer
        if name in layer_blacklist:
            continue
        
        # skip norms which have 1 dimension
        if len(param.shape) < 2:
            continue

        module_name, param_type = get_module_name(name)

        # apply to weight and bias layers
        if param_type == "weight" or param_type == "bias":
            # input to parameter
            layer_input = features[module_name].to(device=device)

            # calculate inverse hessian
            # check if input is flattened e.g. from 8,512,768 to 4096,768
            if len(layer_input.shape) == 2:
                inv_hess = inverse_hessian(torch.transpose(layer_input, 0, 1), epsilon=EPSILON, 
                flattened=True)

            else:
                inv_hess = inverse_hessian(torch.transpose(layer_input, 1, 2), epsilon=EPSILON,
                flattened=False)

            # calculate mask
            mask = calculate_mask(W=param, H_inv=inv_hess, p=SPARSENESS, B=B, Bs=Bs)
            
            # get module from lookup dictionary by module name
            module = module_lookup_dict[module_name]
            # apply mask
            prune.custom_from_mask(module=module, name=param_type, mask=mask)

In [None]:
# SAVE PRUNED MODEL
pruned_model_name = 'opt-125m'
# torch.save(model,'pruned_models/' + pruned_model_name)
# model.save_pretrained(save_directory = 'pruned_models/' + pruned_model_name)

torch.save(model.state_dict(), 'pruned_models/opt-125m')

In [None]:
# LOAD SAVED MODEL

from save_pruned_model import load_into_model
import torch
from torch.nn.utils import prune
from transformers import AutoTokenizer, OPTForCausalLM, pipeline
from datasets import load_dataset
device = 'cuda' if torch.cuda.is_available() else 'cpu'

loaded_model = OPTForCausalLM.from_pretrained('facebook/opt-125m', output_attentions=True, output_hidden_states=True).to(device=device) # type: ignore

load_into_model(loaded_model, 'pruned_models/opt-125m')

In [None]:
model_name = "facebook/opt-125m"

#Load dataset
dataset = load_dataset('c4', 'en', streaming=True)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

input1 = tokenizer("Hello, my dog is cute", return_tensors="pt", padding="max_length", truncation=True).to(device=device)
input2 = tokenizer("What the fuck did you just fucking say about me, you little bitch?", return_tensors="pt", padding="max_length", truncation=True).to(device=device)
output = loaded_model.generate(input1.input_ids, max_length=100, num_return_sequences=1, temperature=0.9, top_p=0.95)
tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Proportion of weights that are 0:

def get_prop_zeros(model):
    return torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight == 0) / (torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight == 0) + torch.sum(model.get_decoder().layers[0].self_attn.k_proj.weight != 0))

print(get_prop_zeros(loaded_model))
# print(get_prop_zeros(model))

In [None]:
# REGULAR OUTPUT
model2 = OPTForCausalLM.from_pretrained("facebook/opt-125m", output_attentions=True, output_hidden_states=True).to(device=device)

In [None]:
from testing_module import calculate_perp

# def calculate_perp(model, input_data, device):
#     input_data = torch.squeeze(torch.stack(input_data)).to(device=device)
#     input_data.double()
#     outputs = model(input_data)[0] 
#     log_probs = outputs[0, -1, :].log_softmax(-1)
#     neg_log_likelihood = -log_probs.mean()
#     perplexity = torch.exp(neg_log_likelihood)      
#     return perplexity.item()

# print(calculate_perp(model, input_data, device))

dataset = load_dataset('c4', 'en', streaming=True)
input_data = []
for i, data in enumerate(iter(dataset['train'])):
    if i > 7:
        break
    tokenized = tokenizer.encode(data['text'], return_tensors="pt", padding="max_length", truncation=True, max_length=512)
    input_data.append(tokenized)
input_data = torch.squeeze(torch.stack(input_data)).to(device=device)
input_data.double()

# lmao out of memory
print(calculate_perp(loaded_model, input_data))