# FlanT5-large on cpu

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration,pipeline
from cached_path import cached_path
import os
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large").cpu()
translation = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer)
text = "He never went out without a book under his arm, and he often came back with two."
result = translation(text)



  from .autonotebook import tqdm as notebook_tqdm
2023-07-07 01:47:38.862984: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Your input_length: 22 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


In [3]:
import tqdm
import time
seq_len = [8, 32, 128, 512]
for i in range(4):
    start = time.time()
    for j in tqdm.tqdm(range(100)):
        input_text =  "house "* seq_len[i]
        result = translation(text, max_length=520)
        
    end = time.time()
    print("For Flan-T5 large: orignal:")
    print("seq_len: ", seq_len[i], "time: ", (end - start)/100)

100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


For Flan-T5 large: orignal:
seq_len:  8 time:  0.7376394391059875


100%|██████████| 100/100 [01:13<00:00,  1.35it/s]


For Flan-T5 large: orignal:
seq_len:  32 time:  0.7389604616165161


100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


For Flan-T5 large: orignal:
seq_len:  128 time:  0.734180109500885


100%|██████████| 100/100 [01:13<00:00,  1.36it/s]

For Flan-T5 large: orignal:
seq_len:  512 time:  0.7366388320922852





# FlanT5-large onnx (cpu)

In [None]:

# %%
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM

model_name = "google/flan-t5-large"
model = ORTModelForSeq2SeqLM.from_pretrained(model_name,export =True)
export_dir='/home/meeami/aparna/onnx_model'
#model.export(output_dir = export_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name)

onnx_translation = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer)
text = "He never went out without a book under his arm, and he often came back with two."
result = onnx_translation(text,max_length=520)
# [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}]



In [None]:
# %%
import tqdm
import time
seq_len = [8, 32, 128, 512]
for i in range(4):
    start = time.time()
    for j in tqdm.tqdm(range(100)):
        input_text =  "house "* seq_len[i]
        result = onnx_translation(text,max_length=520)
    print("For Flan-T5 large: onnx:")
    end = time.time()
    print("seq_len: ", seq_len[i], "time: ", (end - start)/100)

# FlanT5-large GPU

In [None]:

from transformers import T5Tokenizer, T5ForConditionalGeneration,pipeline
from cached_path import cached_path
import os
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large").cuda()

In [None]:
import tqdm
import time
seq_len = [8, 32, 128, 512,1024]
for i in range(5):
    start = time.time()
    for j in tqdm.tqdm(range(100)):
        input_text =  "house "* seq_len[i]
        input = tokenizer(input_text, return_tensors='pt').input_ids.cuda()

        outputs = model.generate(input, max_length=5)

        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
    end = time.time()
    print("For Flan-T5 large: orignal:")
    print("seq_len: ", seq_len[i], "time: ", (end - start)/100)

# Flan-T5-large onnx GPU

In [None]:

from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM
import numpy as np 
import onnx

model_name = "google/flan-t5-large"
model = ORTModelForSeq2SeqLM.from_pretrained(model_name,export =True, provider="CUDAExecutionProvider")

export_dir='/home/meeami/aparna/onnx_model/'
model.save_pretrained(export_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = "He never went out without a book under his arm, and he often came back with two."
#result = onnx_translation(text,max_length=520)
[{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}]
print(model.providers)



In [None]:
import tqdm
import time
seq_len = [ 512,1048]
for i in range(4):
    start = time.time()
    for j in tqdm.tqdm(range(100)):
        start1  = time.time()
        input_text =  "house "* seq_len[i]
        input = tokenizer(input_text, return_tensors='pt').input_ids.cuda()

        outputs =model.generate(input, max_length=5)

        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        end1= time.time()
        print(end1-start1)
    print("For Flan-T5 large: onnx:")
    end = time.time()
    print("seq_len: ", seq_len[i], "time: ", (end - start)/99)

# Flan-T5-large onnx Tensorrt

In [None]:

from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM
import numpy as np 
import onnx

model_name = "google/flan-t5-large"
model = ORTModelForSeq2SeqLM.from_pretrained(model_name,export =True, provider="TensorrtExecutionProvider")

export_dir='/home/meeami/aparna/onnx_model/'
model.save_pretrained(export_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = "He never went out without a book under his arm, and he often came back with two."
#result = onnx_translation(text,max_length=520)
[{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}]
print(model.providers)


In [None]:
import tqdm
import time
seq_len = [ 512,1048]
for i in range(4):
    start = time.time()
    for j in tqdm.tqdm(range(100)):
        start1  = time.time()
        input_text =  "house "* seq_len[i]
        input = tokenizer(input_text, return_tensors='pt').input_ids.cuda()

        outputs =model.generate(input, max_length=5)

        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        end1= time.time()
        print(end1-start1)
    print("For Flan-T5 large: onnx:")
    end = time.time()
    print("seq_len: ", seq_len[i], "time: ", (end - start)/99)