### **Importing Libraries**

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from datasets import load_dataset
import math
import gc
import psutil
from tqdm import tqdm
import time
import copy
from copy import deepcopy
import torch.nn as nn
import torch.nn.functional as F
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from gensim.models import Word2Vec
import numpy as np
import torch.optim as optim
from torch.autograd import Variable
import string
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

2024-11-16 20:08:01.014541: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/chetan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


cuda


### **Printing few Info**

In [2]:
model_name = "gpt2"

model_org = AutoModelForCausalLM.from_pretrained(
    "gpt2"
)

memory_footprint_bytes = model_org.get_memory_footprint()

memory_footprint_mb = memory_footprint_bytes / (1024 ** 2)
memory_footprint_gb = memory_footprint_bytes / (1024 ** 3)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 510342192 bytes
Memory footprint: 486.70 MB
Memory footprint: 0.48 GB


In [3]:
# import torch
# import torch.nn as nn
# from transformers import GPT2Model

# # Load the GPT-2 model
# model_new = GPT2Model.from_pretrained('gpt2')

# # Function to quantize weights to 8-bit
# def quantize_weights(module):
#     if isinstance(module, (nn.Linear, nn.Conv2d)):
#         # Quantize weights
#         module.weight.data = torch.quantize_per_tensor(module.weight.data, scale=0.1, zero_point=0, dtype=torch.qint8)
#         if module.bias is not None:
#             module.bias.data = torch.quantize_per_tensor(module.bias.data, scale=0.1, zero_point=0, dtype=torch.qint8)

# # Apply quantization to the model
# model_new.apply(quantize_weights)

# # Verify the quantization
# print(model_new)

# memory_footprint_bytes = model_new.get_memory_footprint()

# memory_footprint_mb = memory_footprint_bytes / (1024 ** 2)
# memory_footprint_gb = memory_footprint_bytes / (1024 ** 3)

# print(f"Memory footprint: {memory_footprint_bytes} bytes")
# print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
# print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

### **Quantized Model**

In [4]:
def quantize8bit(tenosrs):
    qmin = tenosrs.min().item()
    qmax = tenosrs.max().item()

    n=8
    scale = (qmax - qmin) / (2. ** n - 1.)
    if scale == 0:
        scale = 1e-8
    zero_point = round(-1*(2**n)-qmin / scale)
    zero_point = int(zero_point)

    quant_tensor =torch.round(tenosrs / scale + zero_point).to(torch.int8)
    return quant_tensor, scale, zero_point

def dequantize8bit(quant_tensor, scale, zero_point):
    return scale * (quant_tensor.to(torch.float32) - zero_point)

def quantize_int8(tensor):
    """
    Quantize tensor to INT8 with proper scaling
    """
    tensor = tensor.to(torch.float32)
    
    qmin = tensor.min()
    qmax = tensor.max()
    
    scale = (qmax - qmin) / 255.0  # 255 = 2^8 - 1
    zero_point = (-128 - qmin / scale).round().clamp(-128, 127).to(torch.int8)
    
    quant_tensor = (tensor / scale + zero_point).round().clamp(-128, 127).to(torch.int8)
    
    return quant_tensor, scale, zero_point

def dequantize_int8(quant_tensor, scale, zero_point):
    """
    Dequantize INT8 tensor back to floating point
    """
    return scale * (quant_tensor.to(torch.float32) - zero_point)

class Int8LinearLayer(nn.Module):
    def __init__(self, input_features, output_features, bias=True, dtype=torch.float32):
        super().__init__()
        
        self.register_buffer("quant_weights", 
                           torch.zeros((output_features, input_features), dtype=torch.int8))
        self.register_buffer("scales", 
                           torch.ones(output_features, dtype=torch.float32))
        self.register_buffer("zero_points",
                           torch.zeros(output_features, dtype=torch.int8))
        
        if bias:
            self.register_buffer("bias", 
                               torch.zeros(output_features, dtype=dtype))
        else:
            self.bias = None
            
    def forward(self, inputs):
        """
        Forward pass with INT8 quantized weights
        """
        inputs = inputs.to(torch.float16)
        
        # Dequantize weights for computation
        dequantized_weights = (
            (self.quant_weights.float() - self.zero_points.unsqueeze(1)) * 
            self.scales.unsqueeze(1)
        ).to(inputs.dtype)
        
        output = F.linear(inputs, dequantized_weights)
        
        if self.bias is not None:
            output = output + self.bias.to(inputs.dtype)
            
        return output
    
    def quantize(self, weights):
        """
        Quantize weights to INT8 with proper scaling
        """
        w_fp32 = weights.detach().clone().to(torch.float32)
        
        for idx in range(w_fp32.size(0)):
            row = w_fp32[idx]
            quant_row, scale, zero_point = quantize_int8(row)
            
            self.quant_weights[idx] = quant_row
            self.scales[idx] = scale
            self.zero_points[idx] = zero_point

class Int8QuantModel(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.exclude_layers = ['wte', 'wpe']
        
        self._replace_linear_layers()
    
    def _replace_linear_layers(self):
        """
        Replace standard linear layers with INT8 quantized layers
        """
        for name, module in self.model.named_modules():
            if isinstance(module, nn.Linear) and not any(exclude in name for exclude in self.exclude_layers):
                parent = self._get_parent_module(name)
                if parent is not None:
                    layer_name = name.split('.')[-1]
                    new_layer = Int8LinearLayer(
                        module.in_features,
                        module.out_features,
                        bias=module.bias is not None,
                        dtype=module.weight.dtype
                    )
                    new_layer.quantize(module.weight)
                    if module.bias is not None:
                        new_layer.bias = module.bias
                    
                    setattr(parent, layer_name, new_layer)
    
    def _get_parent_module(self, name):
        """
        Get parent module for a given module name
        """
        parent_path = '.'.join(name.split('.')[:-1])
        if not parent_path:
            return self.model
            
        try:
            parent = self.model
            for part in parent_path.split('.'):
                parent = getattr(parent, part)
            return parent
        except AttributeError:
            return None

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        if input_ids is not None:
            input_ids = input_ids.long()
        if attention_mask is not None:
            attention_mask = attention_mask.to(torch.float16)
            
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs
        )
    
    def get_memory_footprint(self):
        """
        Calculate total memory footprint of the quantized model using numel().
        Returns total memory usage in bytes.
        """
        total_bytes = 0
        
        # For each named parameter in the model
        for name, param in self.model.named_parameters():
            total_bytes += param.numel()*param.element_size()
                
        return total_bytes

model_name = "gpt2"

model = AutoModelForCausalLM.from_pretrained(
    "gpt2"
)
# Quantize all linear layers in the GPT-2 model
model_quant=Int8QuantModel(model)
print(model_quant)

memory_footprint_bytes = model_quant.get_memory_footprint()

memory_footprint_mb = memory_footprint_bytes / (1024 ** 2)
memory_footprint_gb = memory_footprint_bytes / (1024 ** 3)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")


Int8QuantModel(
  (model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Int8LinearLayer()
  )
)
Memory

### **Partial Quantization Model**

In [None]:
def quantize_to_int8(tensor):
    """Quantize tensor to int8 with proper scaling"""
    tensor = tensor.detach().float()
    qmin, qmax = -128, 127
    min_val, max_val = tensor.min().item(), tensor.max().item()
    
    scale = (max_val - min_val) / (qmax - qmin)
    scale = max(scale, 1e-8)
    zero_point = qmin - round(min_val / scale)
    
    # Quantize
    quantized = torch.round(tensor / scale + zero_point).clamp(qmin, qmax).to(torch.int8)
    return quantized, scale, zero_point

def dequantize_from_int8(quantized_tensor, scale, zero_point):
    """Dequantize int8 tensor back to floating point"""
    return (quantized_tensor.float() - zero_point) * scale

class Decoder_QuantizedModel(torch.nn.Module):
    def __init__(self, original_model, quantized_blocks):
        super().__init__()
        self.quantization_params = {}
        self.model_size = 0
        self.quantized_blocks = quantized_blocks
        
        self.device = next(original_model.parameters()).device
        
        self.original_model = self.original_model.to(self.device)
        
        with torch.no_grad():
            for name, param in self.original_model.named_parameters():
                param.data = param.data.to(self.device) 
                is_decoder_block = any(f"transformer.h.{idx}." in name for idx in self.quantized_blocks)
                if is_decoder_block:
                    quantized_tensor, scale, zero_point = quantize_to_int8(param.data)
                    self.model_size += quantized_tensor.numel()
                    
                    self.quantization_params[name] = {
                        "quantized": quantized_tensor.to(self.device),
                        "scale": scale,
                        "zero_point": zero_point,
                    }
                    
                    dequantized = dequantize_from_int8(
                        quantized_tensor, 
                        scale, 
                        zero_point
                    ).to(self.device)
                    
                    param.data.copy_(dequantized)
                else:
                    self.model_size += param.numel() * param.element_size()
        
        self.model_size = self.model_size / (1024 ** 2)
    
    def to(self, device):
        """Override to method to handle device movement"""
        self.device = device
        self.original_model = self.original_model.to(device)
        return super().to(device)
    
    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        input_ids = input_ids.to(self.device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)
        if labels is not None:
            labels = labels.to(self.device)
            
        with torch.no_grad():
            outputs = self.original_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                **kwargs
            )
        return outputs

# Usage example
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Quantize middle layers
num_layers = len(model.transformer.h)
quantized_blocks = list(range(1, num_layers-1))
quantized_model = Decoder_QuantizedModel(model, quantized_blocks)
print(quantized_model.model_size)


AttributeError: 'Decoder_QuantizedModel' object has no attribute 'original_model'

### **Corpus Retrieval**

In [5]:
def retrieve_corpus(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        corpus = []
        for line in file:
            # print(line.strip().lower())
            corpus.append(line.lower())
        return corpus
    
def remove_punctuation(tokenized_sentence):
    return [word for word in tokenized_sentence if word not in string.punctuation]
    
# corpus_train = retrieve_corpus("./Dataset/ptb.train.txt")
# corpus_test = retrieve_corpus("./Dataset/ptb.test.txt")
# corpus_val = retrieve_corpus("./Dataset/ptb.valid.txt")

### **Dataset Loading**

In [6]:

# test_data = retrieve_corpus("./Dataset/ptb.test.txt")
# test_data = [remove_punctuation(word_tokenize(sentence)) for sentence in test_data]

# test_size = int(1 * len(test_data))
# test_data = test_data[:test_size]

# print(f"Testing data size: {len(test_data)}")

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

eval_texts = []
for item in dataset:
    if item['text'].strip():
        eval_texts.append(item['text'])
        if len(eval_texts) >= 3000:
            break
print(f"Loaded {len(eval_texts)} samples for evaluation")
test_data = eval_texts

Loaded 2891 samples for evaluation


### **Testing**

In [7]:
def memory_usage(device=0):
    if torch.cuda.is_available():
        allocated_memory = torch.cuda.memory_allocated(device) / 1024 / 1024  # in MB
        reserved_memory = torch.cuda.max_memory_allocated(device) / 1024 / 1024  # in MB
        memory_used = reserved_memory - allocated_memory
        return allocated_memory, reserved_memory, memory_used
    else:
        process = psutil.Process()
        return process.memory_info().rss / 1024 / 1024  # Memory in MB

def clear_memory(device=0):
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

def compute_perplexity(model, tokenizer, dataset, max_length=1024, device='cpu'):
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    loss_fn = nn.CrossEntropyLoss(reduction='sum')

    with torch.no_grad():
        for example in tqdm(dataset, desc="Calculating perplexity"):
            # Tokenize and prepare inputs
            encodings = tokenizer(example, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            
            labels = input_ids.clone()
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            total_loss += loss.item()
            total_tokens += attention_mask.sum().item() - 1

        perplexity = math.exp(total_loss / total_tokens)

    return perplexity

def measure_latency(model, tokenizer, num_trials=10, max_length=1024, device='cpu'):
    model.eval()
    latencies = []
    example = "This is a sample text to measure inference performance. " * 10 
    inputs = tokenizer(example, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    
    with torch.no_grad():
        for _ in range(num_trials):
            inputs = {key: value.to(device) for key, value in inputs.items()}
            
            start_time = time.time()
            _ = model(**inputs)
            end_time = time.time()
            
            latencies.append(end_time - start_time)
    
    avg_latency = sum(latencies) / len(latencies)
    return avg_latency

def evaluate_model(model,model_name="gpt2", dataset=None, device='cpu'):

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    print(device)
    model.to(device) 
    
    memory_before = memory_usage()

    perplexity_before = compute_perplexity(model, tokenizer, dataset, device=device)

    latency_before = measure_latency(model, tokenizer, device=device)
    
    
    return {
        "Memory Usage": memory_before,
        "Latency": latency_before,
        "Perplexity": perplexity_before,
    }

### **Results**

In [16]:
# results = evaluate_model(model_org, model_name="gpt2", dataset=test_data, device=device)
# print("Original Model")
# print(results)
# clear_memory(device)

results = evaluate_model(quantized_model, model_name="gpt2", dataset=test_data, device=device)
print("8 Bit Partial Quantized Model")
print(results)
clear_memory(device)

cuda


Calculating perplexity: 100%|██████████| 2891/2891 [00:39<00:00, 72.35it/s] 


8 Bit Partial Quantized Model
{'Memory Usage': (1995.67626953125, 1995.67626953125, 0.0), 'Latency': 0.011525678634643554, 'Perplexity': 51.41608254277834}
