### Protein Sequence Embedding Generation with ProtT5-XL-BFD
**Reference:** https://github.com/agemagician/ProtTrans/blob/master/Embedding/PyTorch/Advanced/ProtT5-XL-BFD.ipynb

In [1]:
!pip install -q SentencePiece git+https://github.com/huggingface/transformers.git@40ecaf0c2b1c0b3894e9abf619f32472c5a3b3ca

In [2]:
!pip install transformers -U

Collecting transformers
  Using cached transformers-4.30.2-py3-none-any.whl (7.2 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.9.4
    Uninstalling tokenizers-0.9.4:
      Successfully uninstalled tokenizers-0.9.4
  Attempting uninstall: transformers
    Found existing installation: transformers 4.0.0rc1
    Uninstalling transformers-4.0.0rc1:
      Successfully uninstalled transformers-4.0.0rc1
Successfully installed tokenizers-0.13.3 transformers-4.30.2


In [3]:
import gc
import numpy as np
import re
import torch
from tqdm import tqdm
from transformers import T5EncoderModel, T5Tokenizer

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer and encoder-only model for the Rostlab/prot_t5_xl_bfd checkpoint.
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_bfd", do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_bfd")
gc.collect()

# Place the model on the chosen device and switch it to evaluation mode.
model = model.to(device).eval()

# Datasets whose files are read from / written to the SPOT-1D-Single tree.
single_datasets = [
    "Training",
    "Validation",
    "TEST2018",
    "SPOT-2018",
    "SPOT-2018-HQ",
    "CASP12-FM",
    "CASP13-FM",
]

# Datasets whose files are read from / written to the SPOT-1D-LM tree.
lm_datasets = [
    "SPOT-2018-Neff1",
    "CASP14-FM",
]

# Feature roots, resolved relative to the current working directory.
single_path = "../datasets/SPOT-1D-Single/Features"
lm_path = "../datasets/SPOT-1D-LM/Features"

def _read_accessions(list_path):
    """Return the protein accessions listed in ``list_path``.

    Every non-blank line contributes the text before its first comma, with
    surrounding whitespace removed. Blank or whitespace-only lines are skipped
    so they cannot turn into bogus protein names.
    """
    with open(list_path, 'r') as accessions_file:
        return [row.split(',')[0].strip() for row in accessions_file if row.strip()]


def _read_fasta_sequence(fasta_path):
    """Return the sequence of the first record in the FASTA file ``fasta_path``.

    The first line is treated as the header. All following lines up to the
    next ``>`` header (or end of file) are stripped and concatenated, so a
    sequence wrapped over several lines is read in full.

    Raises:
        ValueError: if no sequence characters follow the header line.
    """
    with open(fasta_path, 'r') as fasta_file:
        lines = [line.strip() for line in fasta_file]

    residue_lines = []
    for line in lines[1:]:
        if line.startswith('>'):
            # A second record begins here; only the first record is used.
            break
        residue_lines.append(line)

    sequence = ''.join(residue_lines)
    if not sequence:
        raise ValueError(f"No sequence found in FASTA file: {fasta_path}")
    return sequence


def _embed_sequence(sequence):
    """Run one sequence through ``model`` and return its embeddings.

    The residues are space-separated and the letters U, Z, O and B are
    replaced with X before tokenisation. The returned NumPy array is the
    model's last hidden state for the attended positions, without the final
    one (presumably the end-of-sequence token added because
    ``add_special_tokens=True`` -- the caller checks the resulting row count
    against the sequence length).
    """
    spaced = re.sub(r"[UZOB]", 'X', ' '.join(sequence))

    ids = tokenizer.batch_encode_plus([spaced], add_special_tokens=True, padding=True)
    input_ids = torch.tensor(ids["input_ids"]).to(device)
    attention_mask = torch.tensor(ids["attention_mask"]).to(device)

    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)

    # The batch always holds exactly one sequence, so take row 0.
    hidden_states = output.last_hidden_state[0].cpu().numpy()

    # Convert to a plain int so the slice bound is not a device tensor.
    attended_tokens = int((attention_mask[0] == 1).sum())
    return hidden_states[:attended_tokens - 1]


for dataset in single_datasets + lm_datasets:
    # Resolve the dataset directory once instead of re-branching for every read/write.
    features_root = lm_path if dataset in lm_datasets else single_path
    dataset_dir = f"{features_root}/{dataset}"

    proteins = _read_accessions(f"{dataset_dir}/{dataset}_below_700_proteins.txt")

    for protein in tqdm(iterable=proteins, desc=f"{dataset} in Progress", ncols=100, unit="protein"):
        protein_dir = f"{dataset_dir}/Rawdata/{protein}"

        pseq = _read_fasta_sequence(f"{protein_dir}/{protein}.fasta")
        residue_embeddings = _embed_sequence(pseq)

        # One embedding row per residue is required before anything is written.
        assert len(pseq) == len(residue_embeddings), (
            f"{protein}: sequence has {len(pseq)} residues "
            f"but {len(residue_embeddings)} embedding rows were produced"
        )

        np.save(file=f"{protein_dir}/{protein}_bfd.npy", arr=residue_embeddings.astype(np.float32))

Some weights of the model checkpoint at Rostlab/prot_t5_xl_bfd were not used when initializing T5EncoderModel: ['decoder.block.20.layer.2.DenseReluDense.wo.weight', 'decoder.block.9.layer.0.SelfAttention.q.weight', 'decoder.block.15.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.3.layer.0.SelfAttention.k.weight', 'decoder.block.13.layer.0.SelfAttention.v.weight', 'decoder.block.22.layer.1.EncDecAttention.k.weight', 'decoder.block.13.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.3.layer.1.EncDecAttention.q.weight', 'decoder.block.6.layer.1.EncDecAttention.k.weight', 'decoder.block.10.layer.1.EncDecAttention.k.weight', 'decoder.block.6.layer.1.EncDecAttention.v.weight', 'decoder.block.16.layer.1.EncDecAttention.v.weight', 'decoder.block.14.layer.0.SelfAttention.q.weight', 'decoder.block.11.layer.0.SelfAttention.q.weight', 'decoder.block.13.layer.1.layer_norm.weight', 'decoder.block.

Training in Progress: 100%|██████████████████████████████| 38211/38211 [48:38<00:00, 13.09protein/s]
Validation in Progress: 100%|██████████████████████████████████| 96/96 [00:07<00:00, 13.54protein/s]
TEST2018 in Progress: 100%|██████████████████████████████████| 250/250 [00:20<00:00, 12.16protein/s]
SPOT-2018 in Progress: 100%|█████████████████████████████████| 646/646 [00:42<00:00, 15.33protein/s]
SPOT-2018-HQ in Progress: 100%|██████████████████████████████| 121/121 [00:07<00:00, 16.84protein/s]
CASP12-FM in Progress: 100%|███████████████████████████████████| 22/22 [00:02<00:00,  8.03protein/s]
CASP13-FM in Progress: 100%|███████████████████████████████████| 17/17 [00:01<00:00,  9.23protein/s]
SPOT-2018-Neff1 in Progress: 100%|█████████████████████████████| 46/46 [00:02<00:00, 21.54protein/s]
CASP14-FM in Progress: 100%|███████████████████████████████████| 15/15 [00:00<00:00, 15.56protein/s]
