### **Importing Libraries**

In [None]:
from transformers import GPT2Model, GPT2Tokenizer

# Load pre-trained model and tokenizer
model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

### **Printing few Info**

In [None]:
print(model)
print("Next")
print(model.wte)  # Token embeddings
print("Next")
print(model.wpe)  # Position embeddings
print("Next")
print(model.ln_f)  # Final layer normalization
print("Next")
print(model.h[0].attn)  # Attention layer of the first block
print("Next")
print(model.h[0].mlp)  # Feedforward layer of the first block
print("Next")


total_params = sum(p.numel() for p in model.parameters())
param_size = 4  # float32 is 4 bytes
total_size = total_params * param_size

# Convert to megabytes (MB) and gigabytes (GB)
total_size_mb = total_size / (1024 ** 2)  # MB
total_size_gb = total_size / (1024 ** 3)  # GB

print(f"Model size: {total_size_mb:.2f} MB ({total_size_gb:.2f} GB)")

In [None]:
import torch
import torch.nn as nn
from transformers import GPT2Model

# Load the GPT-2 model
model_new = GPT2Model.from_pretrained('gpt2')

# Function to quantize weights to 8-bit
def quantize_weights(module):
    if isinstance(module, (nn.Linear, nn.Conv2d)):
        # Quantize weights
        module.weight.data = torch.quantize_per_tensor(module.weight.data, scale=0.1, zero_point=0, dtype=torch.qint8)
        if module.bias is not None:
            module.bias.data = torch.quantize_per_tensor(module.bias.data, scale=0.1, zero_point=0, dtype=torch.qint8)

# Apply quantization to the model
model_new.apply(quantize_weights)

# Verify the quantization
print(model_new)
print("Next")
print(model_new.wte)  # Token embeddings
print("Next")
print(model_new.wpe)  # Position embeddings
print("Next")
print(model_new.ln_f)  # Final layer normalization
print("Next")
print(model_new.h[0].attn)  # Attention layer of the first block
print("Next")
print(model_new.h[0].mlp)  # Feedforward layer of the first block
print("Next")

# Calculate the model size after quantization
total_params = sum(p.numel() for p in model_new.parameters())
param_size = 1  # int8 is 1 byte
total_size = total_params * param_size

# Convert to megabytes (MB) and gigabytes (GB)
total_size_mb = total_size / (1024 ** 2)  # MB
total_size_gb = total_size / (1024 ** 3)  # GB

print(f"Model size after quantization: {total_size_mb:.2f} MB ({total_size_gb:.2f} GB)")

### **Importing Libraries**

In [None]:
import torch
import math
import time
import copy
import torch.nn as nn
import torch.nn.functional as F
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from gensim.models import Word2Vec
import numpy as np
import torch.optim as optim
from torch.autograd import Variable
import string
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

### **Importing and Cleaning Dataset**

In [None]:
def load_dataset(file_path):    
    with open(file_path, 'r', encoding='utf-8') as file:
        corpus = file.read()

    print("Dataset Loaded")

    corpus = corpus.lower()
    clean_text = sent_tokenize(corpus)
    translator = str.maketrans('', '', string.punctuation)
    clean_text = [sentence.translate(translator) for sentence in clean_text]
    return clean_text

### **Tokenization**

In [None]:
def prepare_data(clean_text, word_to_ind):
    tokenized_corpus = [word_tokenize(sentence) for sentence in clean_text]
    for i in range(len(tokenized_corpus)):
        token_arr = tokenized_corpus[i]

        #Vocabulary
        for tokken in token_arr:
            if tokken not in word_to_ind:
                word_to_ind[tokken] = len(word_to_ind)

        token_arr = ['<sos>'] * 5 + token_arr + ['<eos>'] * 5
        tokenized_corpus[i] = token_arr

    # print(tokenized_corpus[2])

    return tokenized_corpus, word_to_ind
# print(len(word_to_ind))

word2idx = {}
word2idx["<sos>"] = len(word2idx)
word2idx["<eos>"] = len(word2idx)
word2idx["<pad>"] = len(word2idx)
word2idx["<unk>"] = len(word2idx)

train_corpus = load_dataset("./Dataset/ptb.train.txt")
valid_corpus = load_dataset("./Dataset/ptb.valid.txt")
test_corpus = load_dataset("./Dataset/ptb.test.txt")

token_train, word2idx = prepare_data(train_corpus, word2idx)
token_valid, word2idx = prepare_data(valid_corpus, word2idx)

### **Training Set**

In [None]:
print(f"Training data size: {len(token_train)}")
print(f"Validation data size: {len(token_valid)}")

### **Model**

In [None]:
class GPT2LMHead(nn.Module):
    def __init__(self, base_model, vocab_size):
        super(GPT2LMHead, self).__init__()
        self.gpt2 = base_model
        self.lm_head = nn.Linear(base_model.config.hidden_size, vocab_size, bias=False)
    
    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = self.lm_head(hidden_states)
        return logits

### **Creating Dataset**

In [None]:
class LM_Dataset(torch.utils.data.Dataset):
    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        return (self.data.size(0) - 1) // self.seq_len

    def __getitem__(self, idx):
        start = idx * self.seq_len
        end = start + self.seq_len
        src = self.data[start:end]
        target = self.data[start+1:end+1]
        return src, target
    


def prepare_data(sentences, word_to_index, max_len=None):
    def words_to_indices(words, word_to_index):
        return [word_to_index.get(word, 0) for word in words]

    all_indices = []

    for sentence in sentences:
        word_indices = words_to_indices(sentence, word_to_index)

        if max_len is not None:
            word_indices = word_indices[:max_len]

        all_indices.extend(word_indices)

    data = torch.LongTensor(all_indices)
    return data

train_gram_inp = prepare_data(token_train, word2idx, max_len=100)
val_gram_inp= prepare_data(token_valid, word2idx, max_len=100)

print("Created input for loading")

In [None]:
# print("Training Begins")

# dataset_train = LM_Dataset(train_gram_inp, 200)
# dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=4, shuffle=True)

# dataset_val = LM_Dataset(val_gram_inp, 200)
# dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=4)

# vocab_size = tokenizer.vocab_size
# custom_model = GPT2LMHead(model, vocab_size)
# custom_model = custom_model.to(device)

# start_time = time.time()

# num_epochs = 5
# learning_rate = 0.001
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(custom_model.parameters(), lr=learning_rate)
# perp_vis_t = []
# perp_vis_val = []

# for epoch in range(num_epochs):
#     custom_model.train()
#     total_loss = 0

#     for batch_index, batch in enumerate(dataloader_train):
#         context_words, target_words = batch
#         context_words = context_words.to(device)
#         target_words = target_words.to(device)

#         optimizer.zero_grad()

#         outputs = custom_model(context_words)
#         # print(f"Outputs shape: {outputs.shape}")
#         # print(f"Outputs sample: {outputs[0, :5]}")

#         outputs = outputs.view(-1, outputs.size(-1))
#         target_words = target_words.view(-1)

#         loss = criterion(outputs, target_words)
#         # print(f"Loss: {loss.item()}")

#         loss.backward()
#         optimizer.step()
#         torch.nn.utils.clip_grad_norm_(custom_model.parameters(), max_norm=1.0)

#         total_loss += loss.item()
#         batch_perplexity_t = math.exp(loss.item())

#     avg_train_loss = total_loss / len(dataloader_train)
#     train_perplexity = math.exp(avg_train_loss)


#     # Validation loop
#     custom_model.eval()
#     total_val_loss = 0
#     correct = 0
#     total = 0

#     with torch.no_grad():
#         for batch in dataloader_val:
#             context_words, target_words = batch
#             context_words = context_words.to(device)
#             target_words = target_words.to(device)

#             outputs = custom_model(context_words)
#             outputs = outputs.view(-1, outputs.size(-1))
#             target_words = target_words.view(-1)
#             loss = criterion(outputs, target_words)
#             total_val_loss += loss.item()
#             batch_perplexity = math.exp(loss.item())

#             _, predicted = torch.max(outputs, 1)
#             total += target_words.size(0)
#             correct += (predicted == target_words).sum().item()


#     avg_val_loss = total_val_loss / len(dataloader_val)
#     val_perplexity = math.exp(avg_val_loss)
#     accuracy = 100 * correct / total
#     perp_vis_t.append(train_perplexity)
#     perp_vis_val.append(val_perplexity)
#     print(f'Train Perplexity is : {train_perplexity}, Validation Perplexity is : {val_perplexity}')

#     print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {accuracy:.2f}%')

# print("Training and Validation Complete.")

# end_time = time.time()
# training_time = end_time - start_time
# print("Time taken to train is: ", training_time)
# peak_memory_allocated = torch.cuda.max_memory_allocated() / 1024**2
# print(f'Peak GPU memory allocated: {peak_memory_allocated:.2f} MB')

In [None]:
print("Training Begins")

dataset_train = LM_Dataset(train_gram_inp, 100)
dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=4, shuffle=True)

dataset_val = LM_Dataset(val_gram_inp, 100)
dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=4)

vocab_size = tokenizer.vocab_size
custom_model = GPT2LMHead(model_new, vocab_size)
custom_model = custom_model.to(device)

start_time = time.time()

num_epochs = 5
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(custom_model.parameters(), lr=learning_rate)
perp_vis_t = []
perp_vis_val = []

for epoch in range(num_epochs):
    custom_model.train()
    total_loss = 0

    for batch_index, batch in enumerate(dataloader_train):
        context_words, target_words = batch
        context_words = context_words.to(device)
        target_words = target_words.to(device)

        optimizer.zero_grad()

        outputs = custom_model(context_words)
        # print(f"Outputs shape: {outputs.shape}")
        # print(f"Outputs sample: {outputs[0, :5]}")

        outputs = outputs.view(-1, outputs.size(-1))
        target_words = target_words.view(-1)

        loss = criterion(outputs, target_words)
        # print(f"Loss: {loss.item()}")

        loss.backward()
        optimizer.step()
        torch.nn.utils.clip_grad_norm_(custom_model.parameters(), max_norm=1.0)

        total_loss += loss.item()
        batch_perplexity_t = math.exp(loss.item())

    avg_train_loss = total_loss / len(dataloader_train)
    train_perplexity = math.exp(avg_train_loss)


    # Validation loop
    custom_model.eval()
    total_val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader_val:
            context_words, target_words = batch
            context_words = context_words.to(device)
            target_words = target_words.to(device)

            outputs = custom_model(context_words)
            outputs = outputs.view(-1, outputs.size(-1))
            target_words = target_words.view(-1)
            loss = criterion(outputs, target_words)
            total_val_loss += loss.item()
            batch_perplexity = math.exp(loss.item())

            _, predicted = torch.max(outputs, 1)
            total += target_words.size(0)
            correct += (predicted == target_words).sum().item()


    avg_val_loss = total_val_loss / len(dataloader_val)
    val_perplexity = math.exp(avg_val_loss)
    accuracy = 100 * correct / total
    perp_vis_t.append(train_perplexity)
    perp_vis_val.append(val_perplexity)
    print(f'Train Perplexity is : {train_perplexity}, Validation Perplexity is : {val_perplexity}')

    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {accuracy:.2f}%')

print("Training and Validation Complete.")

end_time = time.time()
training_time = end_time - start_time
print("Time taken to train is: ", training_time)
peak_memory_allocated = torch.cuda.max_memory_allocated() / 1024**2
print(f'Peak GPU memory allocated: {peak_memory_allocated:.2f} MB')


In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import time
import numpy as np

# Load model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Function to load PTB dataset from file
def load_ptb_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f.readlines()]

# Load datasets
train_texts = load_ptb_data('./Dataset/ptb.train.txt')
valid_texts = load_ptb_data('./Dataset/ptb.valid.txt')
test_texts = load_ptb_data('./Dataset/ptb.test.txt')

# Function to calculate memory usage
def get_memory_usage():
    if torch.cuda.is_available():
        return torch.cuda.memory_allocated() / (1024 ** 2)  # Convert to MB
    else:
        import psutil
        return psutil.Process().memory_info().rss / (1024 ** 2)  # Convert to MB

# Function to compute latency and perplexity
def compute_perplexity_and_latency(model, texts):
    model.eval()
    total_loss = 0
    total_time = 0
    with torch.no_grad():
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt", truncation=True)
            if torch.cuda.is_available():
                inputs = {key: value.to('cuda') for key, value in inputs.items()}
                model.to('cuda')

            start_time = time.time()
            outputs = model(**inputs, labels=inputs['input_ids'])
            total_time += time.time() - start_time

            loss = outputs.loss.item()
            total_loss += loss

    avg_latency = total_time / len(texts)
    perplexity = np.exp(total_loss / len(texts)) if total_loss < 700 else float("inf")

    return avg_latency, perplexity

# Baseline metrics
baseline_memory_usage = get_memory_usage()
print(f"Baseline memory usage: {baseline_memory_usage:.2f} MB")

baseline_latency, baseline_perplexity = compute_perplexity_and_latency(model, test_texts)
print(f"Baseline latency: {baseline_latency:.4f} seconds")
print(f"Baseline perplexity: {baseline_perplexity:.4f}")

# Quantization function using PyTorch quantization API
def quantize_model(model):
    # Set up quantization configuration
    model.qconfig = torch.quantization.default_dynamic_qconfig
    # Prepare model for quantization
    model = torch.quantization.prepare(model)
    # Apply quantization
    quantized_model = torch.quantization.convert(model)
    return quantized_model

# Apply quantization
quantized_model = quantize_model(model)

# Quantized metrics
quantized_memory_usage = get_memory_usage()
print(f"Quantized memory usage: {quantized_memory_usage:.2f} MB")

quantized_latency, quantized_perplexity = compute_perplexity_and_latency(quantized_model, test_texts)
print(f"Quantized latency: {quantized_latency:.4f} seconds")
print(f"Quantized perplexity: {quantized_perplexity:.4f}")

# Summary of results
print("\n--- Summary of Results ---")
print(f"Baseline Memory Usage: {baseline_memory_usage:.2f} MB")
print(f"Quantized Memory Usage: {quantized_memory_usage:.2f} MB")
print(f"Memory Reduction: {baseline_memory_usage - quantized_memory_usage:.2f} MB")

print(f"Baseline Latency: {baseline_latency:.4f} seconds")
print(f"Quantized Latency: {quantized_latency:.4f} seconds")
print(f"Latency Reduction: {baseline_latency - quantized_latency:.4f} seconds")

print(f"Baseline Perplexity: {baseline_perplexity:.4f}")
print(f"Quantized Perplexity: {quantized_perplexity:.4f}")




Baseline memory usage: 0.00 MB
Baseline latency: 0.0024 seconds
Baseline perplexity: inf


AssertionError: Embedding quantization is only supported with float_qparams_weight_only_qconfig.