### **Importing Libraries**

In [1]:
import torch
import math
import gc
import psutil
import time
import copy
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
import nltk
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from gensim.models import Word2Vec
import numpy as np
import torch.optim as optim
from torch.autograd import Variable
import string
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
    
## Task Specific Libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import BitsAndBytesConfig


[nltk_data] Downloading package punkt to /home/chetan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


cuda


2024-11-16 12:01:38.773715: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### **Loading Model**

In [2]:
# Full-precision baseline: load the checkpoint named by `model_name` and report
# how much memory the model occupies, in bytes, MB and GB.
model_name = "gpt2"

# Use the variable instead of repeating the literal so the checkpoint id is
# defined in exactly one place in this cell.
model_org = AutoModelForCausalLM.from_pretrained(model_name)

memory_footprint_bytes = model_org.get_memory_footprint()

memory_footprint_mb = memory_footprint_bytes / (1024 ** 2)
memory_footprint_gb = memory_footprint_bytes / (1024 ** 3)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 510342192 bytes
Memory footprint: 486.70 MB
Memory footprint: 0.48 GB


### **8-Bit Quantized Model**

In [3]:

# Load GPT-2 with 8-bit weights through bitsandbytes and report its footprint.
# NOTE(review): llm_int8_threshold is set to 128.0, far above the documented
# library default of 6.0 -- confirm this outlier threshold is intentional.
quantization_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=128.0)

model_8bit = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    device_map="auto",
    quantization_config=quantization_config,
)

# Footprint in bytes, then converted to binary megabytes / gigabytes.
memory_footprint_bytes = model_8bit.get_memory_footprint()
memory_footprint_mb, memory_footprint_gb = (
    memory_footprint_bytes / 2 ** 20,
    memory_footprint_bytes / 2 ** 30,
)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 176527896 bytes
Memory footprint: 168.35 MB
Memory footprint: 0.16 GB


### **4-Bit (FP4) Quantized Model**

In [4]:
# Load GPT-2 with 4-bit FP4 weights (float16 compute dtype) through
# bitsandbytes and report its footprint.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="fp4",
)

model_4bit = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    device_map="auto",
    quantization_config=quantization_config,
)

# Footprint in bytes, then converted to binary megabytes / gigabytes.
memory_footprint_bytes = model_4bit.get_memory_footprint()
memory_footprint_mb, memory_footprint_gb = (
    memory_footprint_bytes / 2 ** 20,
    memory_footprint_bytes / 2 ** 30,
)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 134060568 bytes
Memory footprint: 127.85 MB
Memory footprint: 0.12 GB


### **NF4 Quantized Model**

In [5]:
# Load GPT-2 with 4-bit NF4 weights through bitsandbytes and report its footprint.
# NOTE(review): no bnb_4bit_compute_dtype is given here, whereas the FP4 cell
# passes torch.float16 -- confirm the two configurations are meant to differ
# before comparing their latency numbers.
nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")

model_nf4bit = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    device_map="auto",
    quantization_config=nf4_config,
)

# Footprint in bytes, then converted to binary megabytes / gigabytes.
memory_footprint_bytes = model_nf4bit.get_memory_footprint()
memory_footprint_mb, memory_footprint_gb = (
    memory_footprint_bytes / 2 ** 20,
    memory_footprint_bytes / 2 ** 30,
)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 134060568 bytes
Memory footprint: 127.85 MB
Memory footprint: 0.12 GB


### **Corpus Retrieval**

In [6]:
def retrieve_corpus(filename):
    """Read a UTF-8 text file and return its lines, lower-cased.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order. Lines are
        lower-cased but not stripped, so any trailing newline is kept.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        return [raw_line.lower() for raw_line in handle]
    
def remove_punctuation(tokenized_sentence):
    """Drop tokens that consist solely of ASCII punctuation characters.

    The previous implementation tested ``word not in string.punctuation``.
    Because ``string.punctuation`` is a ``str``, that is a *substring* test:
    a token such as "()" was removed only because those two characters happen
    to be adjacent in the constant, while "...", "--", "``" and "''" were kept.
    Each character is now checked individually against a set.

    Args:
        tokenized_sentence: Iterable of string tokens.

    Returns:
        A new list with the tokens that contain at least one non-punctuation
        character, in their original order. Tokens that merely contain
        punctuation ("n't", "e.g.", "3.5") are kept. Empty-string tokens are
        dropped, as they were before.
    """
    punctuation_chars = set(string.punctuation)
    return [
        word
        for word in tokenized_sentence
        # all() over an empty token is True, so "" is filtered out as well.
        if not all(char in punctuation_chars for char in word)
    ]
    
# corpus_train = retrieve_corpus("./Dataset/ptb.train.txt")
# corpus_test = retrieve_corpus("./Dataset/ptb.test.txt")
# corpus_val = retrieve_corpus("./Dataset/ptb.valid.txt")

### **Loading Test Data**

In [7]:

# test_data = retrieve_corpus("./Dataset/ptb.test.txt")
# test_data = [remove_punctuation(word_tokenize(sentence)) for sentence in test_data]

# test_size = int(1 * len(test_data))
# test_data = test_data[:test_size]

# print(f"Testing data size: {len(test_data)}")

# Pull the WikiText-2 (raw) test split and keep its non-blank text rows,
# stopping once 3000 rows have been collected.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

eval_texts = []
for item in dataset:
    # Rows whose text is empty or whitespace-only are skipped.
    if not item['text'].strip():
        continue
    eval_texts.append(item['text'])
    if len(eval_texts) >= 3000:
        break

print(f"Loaded {len(eval_texts)} samples for evaluation")

# The evaluation cells below read the samples through this name.
test_data = eval_texts

Loaded 2891 samples for evaluation


### **Testing**

In [8]:
def memory_usage(device=0):
    """Report current memory usage in megabytes.

    Args:
        device: CUDA device passed to the torch.cuda memory queries. Ignored
            when CUDA is not available.

    Returns:
        With CUDA available, a 3-tuple ``(allocated, peak, peak - allocated)``
        where ``allocated`` comes from ``torch.cuda.memory_allocated`` and
        ``peak`` from ``torch.cuda.max_memory_allocated``. Without CUDA, a
        single float: the resident set size of the current process.
    """
    if not torch.cuda.is_available():
        rss_bytes = psutil.Process().memory_info().rss
        return rss_bytes / 1024 / 1024  # Memory in MB

    current_mb = torch.cuda.memory_allocated(device) / 1024 / 1024
    # Despite the old local name ("reserved"), this is the peak *allocated*
    # figure, not the caching allocator's reserved memory.
    peak_mb = torch.cuda.max_memory_allocated(device) / 1024 / 1024
    return current_mb, peak_mb, peak_mb - current_mb

def clear_memory(device=0):
    """Run the garbage collector, then release cached CUDA memory.

    The collector runs first so that tensors kept alive only by unreachable
    Python objects are freed before the CUDA cache is emptied. In the previous
    order those blocks were returned to the cache after it had already been
    flushed.

    Args:
        device: Accepted for call-site compatibility; not used by this
            function.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def compute_perplexity(model, tokenizer, dataset, max_length=1024, device='cpu'):
    """Compute token-level perplexity of a causal LM over a collection of texts.

    Each element of ``dataset`` is tokenized (truncated to ``max_length``) and
    run through the model. The next-token cross-entropy is summed over all
    scored positions, and the result is ``exp(total_loss / total_tokens)``.

    Changes from the previous version:
      * Positions whose attention mask is 0 (padding) are excluded from both
        the loss and the token count. Previously padded positions were scored
        in the loss but not counted, which skewed the result whenever an
        element was tokenized into a padded batch. For a single unpadded
        sequence the result is unchanged.
      * ``labels`` are no longer passed to the model, so it does not compute an
        internal loss that was discarded anyway.
      * Inputs with fewer than two tokens are skipped, because they have no
        next-token target.
      * An empty result raises ``ValueError`` rather than ``ZeroDivisionError``.

    Args:
        model: Callable returning an object with a ``logits`` attribute when
            invoked with ``input_ids`` and ``attention_mask``; must provide
            ``eval()``.
        tokenizer: Callable producing a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        dataset: Iterable of inputs accepted by ``tokenizer``.
        max_length: Truncation length handed to the tokenizer.
        device: Device the encoded tensors are moved to.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If no token could be scored (for example, empty dataset).
    """
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    loss_fn = nn.CrossEntropyLoss(reduction='sum')
    # Targets equal to this value are skipped by CrossEntropyLoss.
    ignore_index = loss_fn.ignore_index

    with torch.no_grad():
        for example in tqdm(dataset, desc="Calculating perplexity"):
            # Tokenize and prepare inputs
            encodings = tokenizer(example, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            # A sequence needs at least two tokens to have one prediction target.
            if input_ids.size(-1) < 2:
                continue

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Logits at position i predict the token at position i + 1.
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            # clone() so masking the targets never writes into input_ids.
            shift_labels = input_ids[..., 1:].clone()
            shift_labels[attention_mask[..., 1:] == 0] = ignore_index

            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
            total_loss += loss.item()
            # Count exactly the positions that contributed to the summed loss.
            total_tokens += int((shift_labels != ignore_index).sum().item())

    if total_tokens == 0:
        raise ValueError("compute_perplexity: no tokens were scored; dataset is empty or every example is too short")

    return math.exp(total_loss / total_tokens)

def measure_latency(model, tokenizer, num_trials=10, max_length=1024, device='cpu', warmup_trials=1):
    """Measure the average wall-clock time of one forward pass.

    A fixed sample sentence is tokenized once, moved to ``device`` once, and
    fed to the model ``num_trials`` times.

    Changes from the previous version:
      * On a CUDA device the GPU is synchronized before and after each timed
        call. CUDA kernels are launched asynchronously, so without this the
        timer mostly captures launch overhead rather than execution time.
      * ``time.perf_counter()`` replaces ``time.time()``; it is monotonic and
        has higher resolution.
      * ``warmup_trials`` untimed passes run first so one-time initialization
        cost does not inflate the average.
      * Inputs are transferred to the device once instead of on every trial.
      * ``num_trials < 1`` raises ``ValueError`` rather than ``ZeroDivisionError``.

    Args:
        model: Callable accepting the tokenizer output as keyword arguments;
            must provide ``eval()``.
        tokenizer: Callable returning a mapping of tensors.
        num_trials: Number of timed forward passes (must be >= 1).
        max_length: Truncation length handed to the tokenizer.
        device: Device the inputs are moved to.
        warmup_trials: Number of untimed passes executed before timing.

    Returns:
        Mean latency in seconds across the timed trials.

    Raises:
        ValueError: If ``num_trials`` is less than 1.
    """
    if num_trials < 1:
        raise ValueError("num_trials must be at least 1")

    model.eval()
    example = "This is a sample text to measure inference performance. " * 10
    inputs = tokenizer(example, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    target_device = torch.device(device)
    needs_sync = target_device.type == 'cuda' and torch.cuda.is_available()

    def _sync():
        # Block until queued GPU work is finished; no-op for non-CUDA devices.
        if needs_sync:
            torch.cuda.synchronize(target_device)

    latencies = []
    with torch.no_grad():
        for _ in range(max(warmup_trials, 0)):
            _ = model(**inputs)

        for _ in range(num_trials):
            _sync()
            start_time = time.perf_counter()
            _ = model(**inputs)
            _sync()
            latencies.append(time.perf_counter() - start_time)

    return sum(latencies) / len(latencies)

def evaluate_model(model, model_name="gpt2", dataset=None, device='cpu', quantized=False):
    """Collect memory, latency and perplexity figures for one model.

    Args:
        model: The model to evaluate.
        model_name: Checkpoint name used to load the GPT-2 tokenizer.
        dataset: Texts forwarded to ``compute_perplexity``.
        device: Device used for evaluation; also printed for reference.
        quantized: When exactly ``False`` the model is moved to ``device``;
            for any other value the model is left where it is (presumably
            because quantized models are already placed at load time --
            confirm against the loading cells).

    Returns:
        A dict with the keys ``"Memory Usage"``, ``"Latency"`` and
        ``"Perplexity"``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # The tokenizer's EOS token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    print(device)

    if quantized is False:
        model.to(device)

    # Memory is sampled once the model is in place, before any inference runs.
    memory_snapshot = memory_usage()
    perplexity_score = compute_perplexity(model, tokenizer, dataset, device=device)
    mean_latency = measure_latency(model, tokenizer, device=device)

    return {
        "Memory Usage": memory_snapshot,
        "Latency": mean_latency,
        "Perplexity": perplexity_score,
    }

In [None]:
# Evaluate every model variant with the same routine. The four copy-pasted
# stanzas are replaced by one loop so the variants cannot drift apart, and each
# result is kept in `all_results` instead of being overwritten by the next run.
evaluation_runs = [
    # (label printed with the results, model, quantized flag)
    ("Original Model", model_org, False),
    ("8 Bit Quantized Model", model_8bit, True),
    ("4 Bit Quantized Model", model_4bit, True),
    ("NF4 Quantized Model", model_nf4bit, True),
]

all_results = {}
for run_label, run_model, run_quantized in evaluation_runs:
    results = evaluate_model(
        run_model,
        model_name="gpt2",
        dataset=test_data,
        device=device,
        quantized=run_quantized,
    )
    print(run_label)
    print(results)
    all_results[run_label] = results
    # Release cached memory between runs.
    clear_memory(device)

cuda


Calculating perplexity: 100%|██████████| 2891/2891 [00:39<00:00, 73.11it/s]


8 Bit Quantized Model
{'Memory Usage': (443.5751953125, 443.5751953125, 0.0), 'Latency': 0.011621427536010743, 'Perplexity': 49.854691724482834}
