# Calibration Data, Model

In [122]:
%%capture
from transformers import AutoTokenizer, OPTForCausalLM, OPTModel, pipeline
from datasets import load_dataset

dataset = load_dataset('c4', 'en', streaming=True)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Load model with pre-trained head
model = OPTModel.from_pretrained("facebook/opt-125m", output_attentions=True, output_hidden_states=True)

generator = pipeline('text-generation', model="facebook/opt-125m")

calibration_data = []
for i, data in enumerate(iter(dataset['train'])):
    if i > 1:
        break
    tokenized = tokenizer.encode(data['text'], return_tensors="pt", padding="max_length", truncation=True, max_length=2048)
    calibration_data.append(tokenized)
calibration_data = torch.squeeze(torch.stack(calibration_data)).to(device=device)

In [59]:
# calibration_data.shape
for i, data in enumerate(iter(dataset['train'])):
    print(data['text'])
    break
print()

output = model.generate(calibration_data, max_length=500, num_return_sequences=1, temperature=0.9, top_p=0.95)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Beginners BBQ Class Taking Place in Missoula!
Do you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Rangers. He will be teaching a beginner level class for everyone who wants to get better with their culinary skills.
He will teach you everything you need to know to compete in a KCBS BBQ competition, including techniques, recipes, timelines, meat selection and trimming, plus smoker and fire information.
The cost to be in the class is $35 per person, and for spectators it is free. Included in the cost will be either a t-shirt or apron and you will be tasting samples of each meat that is prepared.

Beginners BBQ Class Taking Place in Missoula!
Do you want to get better at making delicious BBQ? You will have the opportunity, put this on your calendar now. Thursday, September 22nd join World Class BBQ Champion, Tony Balay from Lonestar Smoke Range

In [1]:
import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def inverse_hessian(X, epsilon=0.5):
    """
    Calculate the inverse of a positive-definite matrix using the Cholesky decomposition.
    Args:
    - Hessian (torch.Tensor): positive-definite matrix to be inverted
    - epsilon (float): small constant to prevent Hessian from being singular
    Returns:
    - torch.Tensor: inverted matrix
    """
    X = X.float()
    X = torch.transpose(X, 0, 1)
    X = (2 * (X @ torch.transpose(X, 0, 1)))
    X = X + torch.eye(X.shape[0]) * epsilon
    
    # print(f"num 0s: {torch.sum(X.diag()==0)}")
    
    print(f"sum of diagonal {torch.sum(X.diag())}")
    # print(f"determinant: {torch.linalg.det(X)}")

    hessian = np.linalg.inv(X)
    # Decompose the matrix into a upper triangular matrix
    inverse_hessian = torch.transpose(torch.cholesky(hessian, upper=True), 0, 1)
    return inverse_hessian


  from .autonotebook import tqdm as notebook_tqdm


In [121]:
inv_h = inverse_hessian(calibration_data)
print(inv_h.shape)

sum of diagonal 165617991680.0


LinAlgError: Singular matrix

In [85]:
# W is weights matrix for one layer
# H_inv is inverse hessian for one layer
# p is proportion of weights to 0
# B is lazy block size, low B helps to reduce memory use
# Bs is inverse of how often to make masks (e.g. when Bs is 4, make new masks with 20% sparseness every 4 columns)
def calculate_mask(W, H_inv, p, B, Bs):
    # Get the number of rows and columns in W
    d_row, d_col = W.shape
    
    # Initialize the pruning mask M and block quantization errors E to all zeros
    M = torch.zeros(d_row, d_col, dtype=torch.bool)
    E = torch.zeros(d_row, B)

    # only need to calculate w_square and h_square once
    # w_square = torch.square(W)
    # h_square = torch.square(H_inv)

    # Loop over blocks of columns of W
    for i in range(0, d_col, B):
        # Loop over columns within a block
        for j in range(i, min(i + B, d_col)):
            # If j is a multiple of Bs, prune a portion of the weights
            if j % Bs == 0:
                # Get the mask for the largest (1 - p)% of weights based on squared value and inverse hessian

                # prune_values is matrix of w^2/H^(-1)_cc
                
                w_square_section = torch.square(W[:, j:j+Bs])
                h_square_section = torch.square(H_inv[j:j+Bs, j:j+Bs]).diag() # 1 dimensional vector

                # print("weights squared and h_inv:")
                # print(w_square_section)
                # print(h_square_section)

                prune_values = w_square_section / h_square_section.unsqueeze(0)
                # print("prune values: ")
                # print(prune_values)

                cutoff_value = torch.kthvalue(prune_values, int((1 - p) * d_row), dim=0)[0]
                # print("cutoff value: ")
                # print(cutoff_value)
    
                # print("mask: ")
                mask = prune_values > cutoff_value
            
                M[:, j:j+Bs] = mask

            # Calculate the pruning error for this column
            E[:, j-i] = W[:, j] / H_inv[j, j]
            # Freeze the weights that are not pruned by multiplying by the pruning mask
            # Invert mask (~M equivalent to 1 - M)
            E[:, j-i] = ~M[:, j] * E[:, j-i]
            # Update the weights in this block based on the pruning error and inverse hessian information
            W[:, j:i+B] -= torch.ger(E[:, j-i], H_inv[j, j:i+B])
        # Update all remaining weights
        W[:, i+B:] -= torch.matmul(E, H_inv[i:i+B, i+B:])
    
    # return mask
    return M

In [123]:
# Load model with pre-trained head
model = OPTModel.from_pretrained("facebook/opt-125m", output_attentions=True, output_hidden_states=True)

# test on one particular layer: 
test_param = model.get_decoder().layers[11]
print(test_param)

for name, param in model.named_parameters():
    print(name)
    print(param.shape)

OPTDecoderLayer(
  (self_attn): OPTAttention(
    (k_proj): Linear(in_features=768, out_features=768, bias=True)
    (v_proj): Linear(in_features=768, out_features=768, bias=True)
    (q_proj): Linear(in_features=768, out_features=768, bias=True)
    (out_proj): Linear(in_features=768, out_features=768, bias=True)
  )
  (activation_fn): ReLU()
  (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=768, out_features=3072, bias=True)
  (fc2): Linear(in_features=3072, out_features=768, bias=True)
  (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
decoder.embed_tokens.weight
torch.Size([50272, 768])
decoder.embed_positions.weight
torch.Size([2050, 768])
decoder.final_layer_norm.weight
torch.Size([768])
decoder.final_layer_norm.bias
torch.Size([768])
decoder.layers.0.self_attn.k_proj.weight
torch.Size([768, 768])
decoder.layers.0.self_attn.k_proj.bias
torch.Size([768])
decoder.layers.0.self_attn.v_proj.weight
to

In [47]:
print(model)
# print(tokenizer)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0): OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

In [88]:
w_matrix = torch.rand(8, 5)
h_matrix = torch.rand(5,5)
# print(w_matrix)
# print(h_matrix)

# print(w_matrix / h_matrix.diag()unsqueeze(0))
print(calculate_mask(W=w_matrix, H_inv=h_matrix, p=.2, B=1, Bs=1))

tensor([[False, False, False, False, False],
        [ True, False, False, False, False],
        [False,  True, False, False, False],
        [False,  True, False,  True, False],
        [False, False,  True, False, False],
        [False, False, False, False,  True],
        [ True, False,  True, False, False],
        [False, False, False,  True,  True]])
