In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

def get_t5_pipeline(model_name="t5-3b", device=-1):
    """
    Create a HuggingFace pipeline for T5 model for text2text-generation.
    Args:
        model_name (str): Name or path of the T5 model.
        device (int): Device to run the pipeline on. -1 for CPU, >=0 for GPU.
    Returns:
        transformers.Pipeline: The text2text-generation pipeline.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    t5_pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device)
    return t5_pipe

# Example usage:
t5_pipe = get_t5_pipeline("t5-base", device=0)
result = t5_pipe("get the domains for this protein sequence: MCCLTSILPLAALAADAEKAPATTEAPAAEAPRPPLLERSQEDALALERLVPRAEQQTLQAGADSFLALWKPANDSDPQGAVIIVPGAGETADWPNAVGPLRQKFPDVGWHSLSLSLPDLLADSPQARVEAKPAAEPEKTKGESAPAKDVPADANANVAQATAADADTAESTDAEQASEQTDTADAERIFARLDAAVAFAQQHNARSIVLIGHGSGAYWAARYLSEKQPPHVQKLVMVAAQTPARVEHDLESLAPTLKVPTADIYYATRSQDRSAAQQRLQASKRQKDSQYRQLSLIAMPGNKAAEQEQLFRRVRGWMSPQG")
print(result)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


[{'generated_text': 'of this protein sequence:  of this protein sequence: MCCLTSILPLA'}]


In [2]:
result = t5_pipe("What is the next number after 8?")
print(result)

[{'generated_text': '... ... ... ... ... ... ... ... ... ...'}]


In [3]:
result = t5_pipe("how to bake a cake?")
print(result)

[{'generated_text': ', bake a cake?,,,,, bake'}]


In [8]:
Prmpt = """The high similarities between protein sequences and natural language, particularly in their sequential data structures, have driven parallel advancements in deep
learning models for both domains. In natural language processing (NLP), large
language models (LLMs) have achieved remarkable success in tasks such as text
generation, translation, and conversational agents, owing to their extensive training
on diverse datasets that enable them to capture complex language patterns and generate human-like text. Inspired by these advancements, researchers have attempted
to adapt LLMs for protein understanding by integrating a protein sequence encoder"""

result = t5_pipe(Prmpt)
print(result)

[{'generated_text': 'and..... (...'}]


In [None]:
prot_t5 = get_t5_pipeline("Rostlab/prot_t5_xl_uniref50", device=0)
result = prot_t5("get the domains for this protein sequence: MCCLTSILPLAALAADAEKAPATTEAPAAEAPRPPLLERSQEDALALERLVPRAEQQTLQAGADSFLALWKPANDSDPQGAVIIVPGAGETADWPNAVGPLRQKFPDVGWHSLSLSLPDLLADSPQARVEAKPAAEPEKTKGESAPAKDVPADANANVAQATAADADTAESTDAEQASEQTDTADAERIFARLDAAVAFAQQHNARSIVLIGHGSGAYWAARYLSEKQPPHVQKLVMVAAQTPARVEHDLESLAPTLKVPTADIYYATRSQDRSAAQQRLQASKRQKDSQYRQLSLIAMPGNKAAEQEQLFRRVRGWMSPQG")
print(result)

ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']