### **Importing Libraries**

In [1]:
import torch
import math
import gc
import psutil
import time
import copy
import torch.nn as nn
import torch.nn.functional as F
import nltk
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from gensim.models import Word2Vec
import numpy as np
import torch.optim as optim
from torch.autograd import Variable
import string
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
    
## Task Specific Libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import BitsAndBytesConfig


[nltk_data] Downloading package punkt to /home/chetan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


cuda


2024-11-16 03:24:50.731366: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### **Loading Model**

In [2]:
model_name = "gpt2"

# Load the model and tokenizer
model_org = GPT2LMHeadModel.from_pretrained(
    "gpt2"
)

memory_footprint_bytes = model_org.get_memory_footprint()

memory_footprint_mb = memory_footprint_bytes / (1024 ** 2)
memory_footprint_gb = memory_footprint_bytes / (1024 ** 3)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 510342192 bytes
Memory footprint: 486.70 MB
Memory footprint: 0.48 GB


### **Quantization 8-bit Model**

In [3]:

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model_8bit = AutoModelForCausalLM.from_pretrained(
    "gpt2", 
    quantization_config=quantization_config,
    device_map="auto"
)

memory_footprint_bytes = model_8bit.get_memory_footprint()

memory_footprint_mb = memory_footprint_bytes / (1024 ** 2)
memory_footprint_gb = memory_footprint_bytes / (1024 ** 3)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 176527896 bytes
Memory footprint: 168.35 MB
Memory footprint: 0.16 GB


### **Quantization 4-Bit Model**

In [None]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model_4bit = AutoModelForCausalLM.from_pretrained(
    "gpt2", 
    quantization_config=quantization_config,
    device_map="auto"
)

memory_footprint_bytes = model_4bit.get_memory_footprint()

memory_footprint_mb = memory_footprint_bytes / (1024 ** 2)
memory_footprint_gb = memory_footprint_bytes / (1024 ** 3)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 134060568 bytes
Memory footprint: 127.85 MB
Memory footprint: 0.12 GB


### **NF-4 Quantization Model**

In [5]:
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
)

model_nf4bit = AutoModelForCausalLM.from_pretrained(
    "gpt2", 
    quantization_config=nf4_config,
    device_map="auto"
)

memory_footprint_bytes = model_nf4bit.get_memory_footprint()

memory_footprint_mb = memory_footprint_bytes / (1024 ** 2)
memory_footprint_gb = memory_footprint_bytes / (1024 ** 3)

print(f"Memory footprint: {memory_footprint_bytes} bytes")
print(f"Memory footprint: {memory_footprint_mb:.2f} MB")
print(f"Memory footprint: {memory_footprint_gb:.2f} GB")

Memory footprint: 134060568 bytes
Memory footprint: 127.85 MB
Memory footprint: 0.12 GB


### **Corpus Retrieval**

In [6]:
def retrieve_corpus(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        corpus = []
        for line in file:
            # print(line.strip().lower())
            corpus.append(line.lower())
        return corpus
    
def remove_punctuation(tokenized_sentence):
    return [word for word in tokenized_sentence if word not in string.punctuation]
    
# corpus_train = retrieve_corpus("./Dataset/ptb.train.txt")
# corpus_test = retrieve_corpus("./Dataset/ptb.test.txt")
# corpus_val = retrieve_corpus("./Dataset/ptb.valid.txt")

### **Loading Test Data**

In [7]:

test_data = retrieve_corpus("./Dataset/ptb.test.txt")
test_data = [remove_punctuation(word_tokenize(sentence)) for sentence in test_data]

test_size = int(1 * len(test_data))
test_data = test_data[:test_size]

print(f"Testing data size: {len(test_data)}")

Testing data size: 3761


### **Testing**

In [8]:
def memory_usage(device=0):
    if torch.cuda.is_available():
        allocated_memory = torch.cuda.memory_allocated(device) / 1024 / 1024  # in MB
        reserved_memory = torch.cuda.memory_reserved(device) / 1024 / 1024  # in MB
        return allocated_memory, reserved_memory
    else:
        process = psutil.Process()
        return process.memory_info().rss / 1024 / 1024  # Memory in MB

def clear_memory(device=0):
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

def compute_perplexity(model, tokenizer, dataset, max_length=40, device='cpu'):
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    for example in dataset:
        inputs = tokenizer(example, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss
            if( not math.isnan(loss.item())):
                total_loss += loss.item() * inputs['input_ids'].size(1)
                total_tokens += inputs['input_ids'].size(1)
    
    perplexity = torch.exp(torch.tensor(total_loss / total_tokens))
    return perplexity.item()

def measure_latency(model, tokenizer, dataset, num_trials=10, max_length=40, device='cpu'):
    model.eval()
    latencies = []
    
    for _ in range(num_trials):
        example = dataset[0]  
        inputs = tokenizer(example, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        start_time = time.time()
        with torch.no_grad():
            model(**inputs)
        end_time = time.time()
        
        latencies.append(end_time - start_time)
    
    avg_latency = sum(latencies) / len(latencies)
    return avg_latency

def evaluate_model(model,model_name="gpt2", dataset=None, device='cpu',quantized=False):

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as padding token
    if quantized is False:
        model.to(device) 
    
    memory_before = memory_usage()

    perplexity_before = compute_perplexity(model, tokenizer, dataset, device=device)

    latency_before = measure_latency(model, tokenizer, dataset, device=device)
    
    
    return {
        "Memory Usage": memory_before,
        "Latency": latency_before,
        "Perplexity": perplexity_before,
    }

In [9]:
results = evaluate_model(model_org,model_name="gpt2", dataset=test_data, device=device)
print("Original Model")
print(results)
clear_memory(device)

results = evaluate_model(model_8bit,model_name="gpt2", dataset=test_data, device=device, quantized=True)
print("8 Bit Quantized Model")
print(results)
clear_memory(device)

results = evaluate_model(model_4bit,model_name="gpt2", dataset=test_data, device=device, quantized=True)
print("4 Bit Quantized Model")
print(results)
clear_memory(device)

results = evaluate_model(model_nf4bit,model_name="gpt2", dataset=test_data, device=device, quantized=True)
print("NF4 Quantized Model")
print(results)
clear_memory(device)

Original Model
{'Memory Usage': (931.9189453125, 984.0), 'Latency': 0.002531099319458008, 'Perplexity': 960.97509765625}
8 Bit Quantized Model
{'Memory Usage': (940.0439453125, 984.0), 'Latency': 0.017486882209777833, 'Perplexity': 972.1326904296875}




4 Bit Quantized Model
{'Memory Usage': (939.0693359375, 988.0), 'Latency': 0.007877540588378907, 'Perplexity': 1537.63427734375}
NF4 Quantized Model
{'Memory Usage': (939.0693359375, 988.0), 'Latency': 0.007396888732910156, 'Perplexity': 1472.7154541015625}
