In [None]:
# Install the necessary libraries, including SentencePiece
%pip install torch 
%pip install transformers 
%pip install sentencepiece

# Import required libraries
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Step 1: Fetch the pretrained T5-large tokenizer and model.
# Both objects are loaded from the same checkpoint identifier.
checkpoint_name = "t5-large"
print("Loading the T5-large model and tokenizer...")
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

# Step 2: Dynamically quantize the model.
# Only torch.nn.Linear modules are targeted, and the requested quantized
# dtype is qint8; `model` itself is passed in unchanged and the result is
# bound to a separate name.
print("Applying quantization...")
linear_layer_types = {torch.nn.Linear}
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec=linear_layer_types,
    dtype=torch.qint8,
)

# Step 3: Save the quantized model and tokenizer to the quantization folder
#
# A dynamically quantized model must NOT be round-tripped through
# save_pretrained() / from_pretrained(): from_pretrained() rebuilds a float
# T5 with ordinary nn.Linear layers, so the packed int8 parameters of the
# quantized Linear modules do not map onto it and the reloaded model is not
# the quantized one (depending on library versions the Linear weights end up
# uninitialized or the save/load fails outright).
# Instead, follow the PyTorch-recommended flow: persist the state_dict, and on
# reload rebuild the architecture, quantize it the same way, then load the
# state_dict into that quantized skeleton.
from pathlib import Path

from transformers import T5Config

save_directory = "quantization/quantized_t5_large"
quantized_weights_path = Path(save_directory) / "quantized_state_dict.pt"
print(f"Saving the quantized model to {save_directory}...")
# torch.save does not create missing parent directories.
Path(save_directory).mkdir(parents=True, exist_ok=True)
torch.save(quantized_model.state_dict(), quantized_weights_path)
# The config is all that is needed to rebuild the architecture later.
quantized_model.config.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Step 4: Load and use the quantized model from the quantization folder
print("Loading the quantized model for inference...")
quantized_tokenizer = T5Tokenizer.from_pretrained(save_directory)

# Rebuild the float architecture from the saved config (weights are
# placeholders at this point) and put it in inference mode, since a model
# constructed directly from a config starts in training mode.
reloaded_config = T5Config.from_pretrained(save_directory)
float_skeleton = T5ForConditionalGeneration(reloaded_config).eval()

# Apply the exact same quantization recipe as in Step 2 so the module
# structure (and therefore the state_dict keys) matches what was saved.
quantized_model = torch.quantization.quantize_dynamic(
    float_skeleton, {torch.nn.Linear}, dtype=torch.qint8
)
del float_skeleton  # release the float copy before inference

# weights_only=False: the checkpoint was written by this script a few lines
# above, so it is trusted. Do not reuse this call for files from untrusted
# sources, because full unpickling can execute arbitrary code.
quantized_state_dict = torch.load(
    quantized_weights_path, map_location="cpu", weights_only=False
)
quantized_model.load_state_dict(quantized_state_dict)

# Example: run an English-to-French translation prompt through the model.
# The prompt is tokenized into PyTorch tensors, passed to generate() with a
# cap of 50 new tokens, and the first returned sequence is decoded with
# special tokens stripped.
input_text = "translate English to French: My name is Alana"
encoded_prompt = quantized_tokenizer(input_text, return_tensors="pt")
input_ids = encoded_prompt.input_ids

print("Generating translation...")
outputs = quantized_model.generate(input_ids, max_new_tokens=50)
first_sequence = outputs[0]
translation = quantized_tokenizer.decode(
    first_sequence,
    skip_special_tokens=True,
)
print(f"Translation: {translation}")