In [14]:
import torch
import torch.nn as nn

def quantize_model(model, num_bits, method='uniform', block_size=128, lambda_param=1e-5,
                   num_samples=100):
    """Fake-quantize every ``nn.Linear`` in ``model`` in place and return the model.

    Parameters keep their floating-point dtype: each tensor is rounded onto an
    asymmetric grid with ``2**num_bits`` levels and immediately mapped back,
    so only the stored values change.

    Args:
        model: ``nn.Module`` whose ``nn.Linear`` submodules are rewritten.
        num_bits: Bit width of the quantization grid; must be at least 1.
        method: ``'uniform'`` rounds each weight tensor with a single min/max
            range. ``'gptq'`` rounds weights one input column at a time (each
            column with its own min/max range) and spreads the rounding error
            over the not-yet-quantized columns. ``'awq'`` is recognised but
            not implemented.
        block_size: (gptq only) Number of columns handled per block before the
            accumulated error is pushed to the remaining columns.
        lambda_param: (gptq only) Absolute damping added to the diagonal of
            the Hessian before it is factorised.
        num_samples: (gptq only) Number of synthetic calibration rows drawn
            for each layer.

    Returns:
        The same ``model`` object that was passed in.

    Raises:
        NotImplementedError: If ``method == 'awq'``.
        ValueError: If ``method`` is unknown, ``num_bits < 1``, or (gptq)
            ``block_size < 1`` or ``num_samples < 1``.

    Notes:
        * The gptq calibration input is ``torch.randn`` noise, not activations
          recorded from real data, so the Hessian only reflects an isotropic
          Gaussian input distribution.
        * When a layer has more input features than ``num_samples`` the
          undamped Hessian is rank deficient; the factorisation then relies
          entirely on ``lambda_param`` and is carried out in float64 for that
          reason. A non-positive ``lambda_param`` in that situation makes the
          Cholesky factorisation raise.
        * Biases are quantized with the uniform scheme under both methods.
    """
    # Reject unusable arguments up front so a bad call cannot leave the model
    # half rewritten.
    if method == 'awq':
        raise NotImplementedError("AWQ quantization not implemented")
    if method not in ('uniform', 'gptq'):
        raise ValueError("Unsupported quantization method")
    if num_bits < 1:
        raise ValueError("num_bits must be at least 1")

    def quantize_uniform(tensor, num_bits):
        """Round ``tensor`` onto a min/max grid and return the dequantized copy."""
        if tensor.numel() == 0:
            return tensor.clone()

        qmin, qmax = 0, 2**num_bits - 1
        t_min, t_max = tensor.min(), tensor.max()
        scale = (t_max - t_min) / (qmax - qmin)
        if scale == 0:
            # A constant tensor (zero-initialised bias, single element, ...)
            # has no range to divide by; dividing anyway would yield NaN.
            # Every element already sits on one grid point, so it is exact.
            return tensor.clone()

        zero_point = qmin - torch.round(t_min / scale)
        quantized = torch.clamp(torch.round(tensor / scale + zero_point), qmin, qmax)
        return (quantized - zero_point) * scale

    def inverse_hessian_factor(X, lambda_param):
        """Return the upper Cholesky factor ``U`` of ``(2 X^T X + lambda I)^-1``.

        ``X`` is ``(num_samples, in_features)``. GPTQ consumes the rows of
        ``U`` (with ``U^T U`` equal to the inverse Hessian) rather than the
        inverse itself: row ``j`` already accounts for columns ``< j`` having
        been eliminated.
        """
        X64 = X.to(torch.float64)
        H = 2 * (X64.T @ X64)
        H.diagonal().add_(lambda_param)
        H_inv = torch.cholesky_inverse(torch.linalg.cholesky(H))
        return torch.linalg.cholesky(H_inv, upper=True)

    def gptq(W, U, num_bits, block_size):
        """Quantize ``W`` (out_features x in_features) column by column.

        ``U`` is the factor produced by ``inverse_hessian_factor``. ``W`` is
        left untouched; a new tensor with ``W``'s dtype is returned.
        """
        half_types = (torch.float16, torch.bfloat16)
        # Work on a private copy (float32 for half-precision weights) so the
        # layer's live storage is not overwritten with intermediate values.
        work = W.float() if W.dtype in half_types else W.clone()
        U = U.to(dtype=work.dtype, device=work.device)

        d_row, d_col = work.shape
        Q = torch.zeros_like(work)

        for start in range(0, d_col, block_size):
            stop = min(start + block_size, d_col)
            width = stop - start
            U_block = U[start:stop, start:stop]
            err = torch.zeros(d_row, width, dtype=work.dtype, device=work.device)

            for j in range(width):
                col = start + j
                w = work[:, col]
                q = quantize_uniform(w, num_bits)
                Q[:, col] = q
                # Normalise by the diagonal entry: without this the
                # compensation below is scaled by an arbitrary factor.
                e = (w - q) / U_block[j, j]
                err[:, j] = e
                # Compensate the remaining columns of the current block.
                work[:, col:stop] -= e.unsqueeze(1) * U_block[j, j:].unsqueeze(0)

            if stop < d_col:
                # Lazy batched update of every column to the right of the block.
                work[:, stop:] -= err @ U[start:stop, stop:]

        return Q.to(W.dtype)

    def quantize_layer_gptq(layer, X):
        if isinstance(layer, nn.Linear):
            U = inverse_hessian_factor(X, lambda_param)
            layer.weight.data = gptq(layer.weight.data, U, num_bits, block_size)
            if layer.bias is not None:
                layer.bias.data = quantize_uniform(layer.bias.data, num_bits)
        return layer

    def quantize_layer_uniform(layer):
        if isinstance(layer, nn.Linear):
            layer.weight.data = quantize_uniform(layer.weight.data, num_bits)
            if layer.bias is not None:
                layer.bias.data = quantize_uniform(layer.bias.data, num_bits)
        return layer

    if method == 'uniform':
        return model.apply(quantize_layer_uniform)

    # method == 'gptq'
    if block_size < 1:
        raise ValueError("block_size must be at least 1")
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    for module in model.modules():
        if isinstance(module, nn.Linear):
            # Synthetic calibration batch, created on the layer's own device so
            # models spread over several devices also work.
            X = torch.randn(num_samples, module.in_features, device=module.weight.device)
            quantize_layer_gptq(module, X)
    return model

# Example usage:
# quantized_model = quantize_model(model, num_bits=8, method='gptq', block_size=128, lambda_param=1e-5)

In [10]:
import torch
from torch import nn
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from datasets import COCODataset
from tqdm import tqdm
import json
import time


# Remember which accelerator is available (CUDA if present, CPU otherwise).
# NOTE(review): `device` is not used further down in this cell; the model is
# explicitly placed on the CPU below.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# BLIP-2 processor and model from the same checkpoint; the model is kept on CPU.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b").to("cpu")

# COCO captions dataset built from the val2017 annotation file and image folder.
coco_dataset = COCODataset(
    ann_file='./data/coco/annotations/captions_val2017.json',
    img_dir='./data/coco/val2017',
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

loading annotations into memory...
Done (t=0.08s)
creating index...
index created!


In [None]:
# Apply 8-bit GPTQ quantization to the loaded model. quantize_model rewrites
# the nn.Linear parameters of `model` itself and returns that same object, so
# `quantized_model` and `model` refer to one module afterwards.
quantized_model = quantize_model(
    model,
    num_bits=8,
    method='gptq',
    block_size=128,
    lambda_param=1e-5,
)