In [27]:
import time
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [28]:
# Select the backend used for quantized kernels. QNNPACK is the preferred
# engine for this notebook, but not every PyTorch build ships it, and assigning
# an engine the build does not support raises RuntimeError. Only switch when it
# is actually available; otherwise keep the build's default engine.
if 'qnnpack' in torch.backends.quantized.supported_engines:
    torch.backends.quantized.engine = 'qnnpack'
else:
    print(
        "QNNPACK is not available in this PyTorch build; "
        f"keeping default quantized engine '{torch.backends.quantized.engine}'."
    )


In [29]:
# Load the pretrained T5-large tokenizer and seq2seq model. Both are resolved
# from the same checkpoint identifier so they cannot drift apart.
_checkpoint_id = 't5-large'
tokenizer = T5Tokenizer.from_pretrained(_checkpoint_id)
model = T5ForConditionalGeneration.from_pretrained(_checkpoint_id)

In [30]:


# Apply dynamic quantization: every torch.nn.Linear in `model` is targeted and
# its weights are stored as qint8. The result is bound to `quantized_model`.
print("Applying quantization...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8  # Quantize the linear layers
)
print(f"Applied quantization in {time.perf_counter() - start_time:.2f} seconds")

# Save the quantized model's state dictionary.
# NOTE(review): this stores only the state dict, not the module structure.
# Reloading presumably requires rebuilding the model and applying the same
# quantize_dynamic call before load_state_dict -- confirm before relying on it.
torch.save(quantized_model.state_dict(), 'quantized_t5_large.pth')
print("Quantized model saved successfully.")

# Alias the tokenizer for use alongside the quantized model. This is the same
# tokenizer object, not a quantized one; the name only mirrors `quantized_model`.
quantized_tokenizer = tokenizer

Applying quantization...
Applied quantization in 1.71 seconds
Quantized model saved successfully.


In [31]:

# Translate one English sentence to French with the quantized model.
# The "translate English to French: " prefix is part of the prompt text that is
# tokenized and fed to the model.
input_text = "translate English to French: My name is Alana"
input_ids = quantized_tokenizer(input_text, return_tensors="pt").input_ids

print("Generating translation...")
# Time generation plus decoding with perf_counter(), a monotonic clock meant
# for interval measurement (time.time() can jump if the wall clock is adjusted).
start_time = time.perf_counter()
# Cap the output at 50 newly generated tokens.
outputs = quantized_model.generate(input_ids, max_new_tokens=50)
# Decode the first (and only) returned sequence, dropping special tokens.
translation = quantized_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated translation in {time.perf_counter() - start_time:.2f} seconds")
print("Translation:", translation)

Generating translation...




Generated translation in 1.86 seconds
Translation: Mon nom est Alana
