In [None]:
import os
import re

import pandas as pd
import torch
from transformers import T5Tokenizer, T5EncoderModel

In [None]:
# Report whether a CUDA device is visible before deciding where to run.
print('torch.cuda.is_available() ' + str(torch.cuda.is_available()))

# Use the first GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (much more slowly) on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Checkpoint shared by the tokenizer and the encoder; kept in one place so the
# two cannot drift apart.
MODEL_NAME = 'Rostlab/prot_t5_xl_half_uniref50-enc'

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, do_lower_case=False)

# Load the model
model = T5EncoderModel.from_pretrained(MODEL_NAME).to(device)

# only GPUs support half-precision currently; if you want to run on CPU use full-precision (not recommended, much slower)
# `device` is a torch.device, so its `.type` is compared rather than the object
# itself against a string. `.float()` is the full-precision cast on nn.Module.
# Assigning the result also keeps the (very long) model repr out of the cell output.
model = model.float() if device.type == 'cpu' else model.half()

## Load dataset

In [None]:
# Load 2_model_df.pkl from the interim data directory. The path is relative, so
# it resolves against the notebook's current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# was produced by a trusted upstream step.
INPUT_PATH = os.path.join('..', 'data', 'interim')
protein_pairs = pd.read_pickle(os.path.join(INPUT_PATH, '2_model_df.pkl'))

## Embed

In [None]:
# Pull the 'seqID_phage' column out of the pairs table as a plain Python list.
# NOTE(review): the column name suggests IDs, but the values are processed as
# amino-acid strings below; confirm this column really holds sequences.
sequence_examples = protein_pairs['seqID_phage'].tolist()

# Map the rare/ambiguous residues U, Z, O and B to X, then separate every
# residue with a single space before tokenization.
sequence_examples = [
    " ".join(re.sub(r"[UZOB]", "X", residues))
    for residues in sequence_examples
]

# Tokenize everything at once, padding each entry to the longest sequence.
ids = tokenizer(sequence_examples, add_special_tokens=True, padding="longest")

# Move the token ids and the padding mask onto the model's device.
input_ids, attention_mask = (
    torch.tensor(ids[key]).to(device) for key in ('input_ids', 'attention_mask')
)

# Run the encoder without tracking gradients (inference only).
# NOTE(review): all sequences go through the model in a single batch; for a
# large table this may exhaust device memory. Consider batching if it does.
with torch.no_grad():
    embedding_repr = model(attention_mask=attention_mask, input_ids=input_ids)