In [None]:
!pip install datasets

In [None]:
# Mount Google Drive at /content/drive; the checkpoint path used below lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import torch.nn as nn
import os
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset

# Load the best model
model_path = r'/content/drive/MyDrive/LLM Models/FLAN-T5-base x GSM8K/best_model.pth'
# Deserialize onto the CPU so the checkpoint also loads on a CPU-only runtime
# and its tensors do not occupy GPU memory; the model itself is moved to the
# target device later.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
checkpoint = torch.load(model_path, map_location="cpu")
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# The checkpoint is a dict; the fine-tuned weights are stored under 'model_state_dict'.
model.load_state_dict(checkpoint['model_state_dict'])

# Evaluate the size of the model before quantization
def get_model_size(model):
    """Return the total size of the model's parameters in megabytes (1 MB = 1e6 bytes).

    Every tensor yielded by ``model.parameters()`` contributes
    ``nelement() * element_size()`` bytes exactly once.

    Args:
        model: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).

    Returns:
        float: Summed parameter size in MB; ``0.0`` when there are no parameters.

    NOTE(review): only ``model.parameters()`` is inspected. Dynamically
    quantized Linear layers appear to keep their packed int8 weights outside
    ``parameters()``, so the figure for a quantized model may under-report its
    real footprint -- confirm before relying on the size comparison.
    """
    total_bytes = sum(
        param.nelement() * param.element_size() for param in model.parameters()
    )
    return total_bytes / 1e6  # Convert to MB

original_model_size = get_model_size(model)
print(f"Original model size: {original_model_size:.2f} MB")

# Prepare dataset and dataloader
dataset = load_dataset("gsm8k", "main")
# The tokenizer created alongside the model earlier in this cell is reused for
# preprocessing; loading the same pretrained tokenizer a second time was redundant.

def preprocess_function(examples):
    """Tokenize a batch of examples into fixed-length model inputs and labels.

    Args:
        examples: Mapping with parallel sequences under the keys "question"
            and "answer".

    Returns:
        The tokenizer output for the questions (padded/truncated to 512
        tokens) with an added "labels" entry holding the answer token ids
        (padded/truncated to 128 tokens). Padding positions in the labels are
        set to -100 so that the model's loss skips them.

    Note:
        Losses computed from these labels are not directly comparable with
        numbers obtained when padding tokens were left in the labels.
    """
    model_inputs = tokenizer(
        list(examples["question"]), max_length=512, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        list(examples["answer"]), max_length=128, truncation=True, padding="max_length"
    )
    # -100 is the label value the Hugging Face seq2seq loss ignores; without this
    # replacement the loss is also averaged over the padding positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Run the preprocessing in batches and drop the raw text columns so that only
# the tokenizer outputs and labels remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["question", "answer"],
)

def collate_fn(batch):
    """Stack a list of tokenized examples into int64 tensors.

    Args:
        batch: Sequence of mappings, each holding equal-length integer lists
            under "input_ids", "attention_mask" and "labels".

    Returns:
        dict: The same three keys, each mapped to a ``torch.long`` tensor with
        one row per example.
    """
    fields = ("input_ids", "attention_mask", "labels")
    return {
        field: torch.tensor([example[field] for example in batch], dtype=torch.long)
        for field in fields
    }

# Evaluation runs over the tokenized "test" split, eight examples per batch.
eval_dataset = tokenized_datasets["test"]
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=8,
    collate_fn=collate_fn,
)

# Function to evaluate the model and compute validation loss
def evaluate_model(model, dataloader, device):
    """Compute the mean evaluation loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch's loss is weighted by the number of examples in that batch, so
    a shorter final batch does not count as much as a full one. The dataloader
    only needs to be iterable; ``len()`` is not required.

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returning an object with a scalar
            ``loss`` tensor. It must already reside on ``device``; this
            function does not move it.
        dataloader: Iterable of dicts holding "input_ids", "attention_mask"
            and "labels" tensors whose first dimension is the batch dimension.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Example-weighted average of the per-batch losses.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no examples.
    """
    model.eval()
    total_loss = 0.0
    total_examples = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # outputs.loss is a per-batch mean; scale it back up before accumulating.
            batch_size = labels.size(0)
            total_loss += outputs.loss.item() * batch_size
            total_examples += batch_size

    return total_loss / total_examples

# Evaluate the original model on the GPU when one is available, else on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

original_eval_loss = evaluate_model(model, eval_dataloader, device)
print(f"Original Model Evaluation Loss: {original_eval_loss:.4f}")

# Move the model to the CPU and dynamically quantize its nn.Linear layers to int8.
cpu_device = torch.device("cpu")
quantized_model = torch.quantization.quantize_dynamic(
    model.cpu(),
    {nn.Linear},
    dtype=torch.qint8,
)

# Size of the quantized model as measured by get_model_size.
quantized_model_size = get_model_size(quantized_model)
print(f"Quantized model size: {quantized_model_size:.2f} MB")

# The quantized model is evaluated on the CPU.
quantized_eval_loss = evaluate_model(quantized_model, eval_dataloader, cpu_device)
print(f"Quantized Model Evaluation Loss: {quantized_eval_loss:.4f}")

# Relative change, in percent, of size and loss versus the original model.
size_reduction = 100 * (original_model_size - quantized_model_size) / original_model_size
print(f"Size reduction: {size_reduction:.2f}%")
loss_increase = 100 * (quantized_eval_loss - original_eval_loss) / original_eval_loss
print(f"Loss increase: {loss_increase:.2f}%")

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the best model
model_path = r'/content/drive/MyDrive/LLM Models/FLAN-T5-base x GSM8K/best_model.pth'
# Deserialize onto the CPU so the checkpoint also loads on a CPU-only runtime
# and its tensors do not occupy GPU memory; the model is moved to the target
# device right after the weights are restored.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
checkpoint = torch.load(model_path, map_location="cpu")
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# The checkpoint is a dict; the fine-tuned weights are stored under 'model_state_dict'.
model.load_state_dict(checkpoint['model_state_dict'])

# Move the original model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example from GSM8K dataset
example = "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"

# Function to generate output from the model
def generate_answer(model, tokenizer, example, device):
    """Generate the model's text answer for a single prompt.

    Args:
        model: Object with ``eval()`` and ``generate(inputs, max_length=...)``.
        tokenizer: Object with ``encode`` and ``decode`` methods.
        example: Prompt text; encoded with truncation at 512 tokens.
        device: Device the encoded prompt is moved to before generation.

    Returns:
        The first generated sequence (at most 128 tokens) decoded with special
        tokens skipped.
    """
    model.eval()
    with torch.no_grad():
        prompt_ids = tokenizer.encode(
            example, return_tensors="pt", max_length=512, truncation=True
        )
        generated = model.generate(prompt_ids.to(device), max_length=128)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Baseline: answer generated by the original model on `device`.
original_output = generate_answer(model, tokenizer, example, device)
print(f"Original Model Output: {original_output}")

# Move the model to the CPU and dynamically quantize its Linear layers to int8;
# the quantized model is then run on the CPU as well.
cpu_device = torch.device("cpu")
quantized_model = torch.quantization.quantize_dynamic(
    model.cpu(),
    {torch.nn.Linear},
    dtype=torch.qint8,
)

# Same prompt through the quantized model for a side-by-side comparison.
quantized_output = generate_answer(quantized_model, tokenizer, example, cpu_device)
print(f"Quantized Model Output: {quantized_output}")