# **Description of Notebook:**
This notebook uses the pretrained protein language models described in https://www.biorxiv.org/content/10.1101/2020.07.12.199554v1.full.pdf and available at https://github.com/agemagician/ProtTrans to create embedding vectors for each sequence in the input dataset. Below is an example of how to create word embeddings with the T5 UniRef50 pre-trained model, using the `LM_EMBED` class defined in this notebook.

# **Imports:**

In [1]:
!pip install -q transformers
!pip install -q transformers sentencePiece

In [2]:
import torch
from transformers import BertModel, BertTokenizer, XLNetModel, XLNetTokenizer, T5EncoderModel, T5Tokenizer
import re
import os
import requests
import gc
from tqdm.auto import tqdm
import pandas as pd
import numpy as np

# **Class To Extract Word Embeddings:**

In [3]:
class LM_EMBED:
  """Create zero-padded per-residue embeddings with a ProtTrans (Rostlab) model.

  Supported `language_model` names and the checkpoints they load are listed in
  `_CHECKPOINTS`.

  Args:
    language_model: One of the keys of `_CHECKPOINTS`.
    max_len: Tokenised length every sequence is padded to (special tokens
      included). The saved arrays have `max_len - 2` positions per sequence.
    rare_aa: When truthy, the letters U, Z, O and B are replaced by X before
      tokenisation.

  Raises:
    ValueError: If `language_model` is not a supported name.
  """

  # Supported model names -> Hugging Face checkpoint identifiers.
  _CHECKPOINTS = {
    'BERT-BFD': "Rostlab/prot_bert_bfd",
    'BERT': "Rostlab/prot_bert",
    'T5-XL-BFD': "Rostlab/prot_t5_xl_bfd",
    'T5-XL-UNI': "Rostlab/prot_t5_xl_uniref50",
    'XLNET': "Rostlab/prot_xlnet",
  }

  def __init__(self, language_model, max_len, rare_aa):
    # Fail fast on a typo instead of leaving tokenizer/model undefined and
    # crashing later with an AttributeError.
    if language_model not in self._CHECKPOINTS:
      raise ValueError(
        "Unknown language_model %r; expected one of %s"
        % (language_model, sorted(self._CHECKPOINTS)))

    self.lang_model = language_model
    self.max_len = max_len
    self.rare_aa = rare_aa

    # Pick the tokenizer/model classes for the requested family.
    checkpoint = self._CHECKPOINTS[language_model]
    model_kwargs = {}
    if language_model in ('BERT-BFD', 'BERT'):
      tokenizer_cls, model_cls = BertTokenizer, BertModel
    elif language_model in ('T5-XL-BFD', 'T5-XL-UNI'):
      tokenizer_cls, model_cls = T5Tokenizer, T5EncoderModel
    else:  # 'XLNET'
      tokenizer_cls, model_cls = XLNetTokenizer, XLNetModel
      model_kwargs['mem_len'] = 512

    # Import tokenizer and model from ProtTrans Pre-Trained Rostlab:
    self.tokenizer = tokenizer_cls.from_pretrained(checkpoint, do_lower_case=False)
    self.model = model_cls.from_pretrained(checkpoint, **model_kwargs)

    # Release temporaries left over from checkpoint loading.
    gc.collect()

  def extract_word_embs(self, seq_df, filename, output_dir=None, batch_size=10):
    """Embed every sequence in `seq_df` and save the padded result with `np.save`.

    Args:
      seq_df: DataFrame with a `SEQUENCE` column of amino-acid strings.
      filename: Name of the `.npy` file to write.
      output_dir: Optional directory to write into (created if missing). When
        omitted, `filename` is resolved against the current working directory;
        the working directory itself is never changed.
      batch_size: Number of sequences per forward pass (default 10).

    Raises:
      ValueError: If `batch_size` is not positive, `seq_df` has no sequences,
        or a tokenised sequence is longer than `max_len`.
    """
    if batch_size < 1:
      raise ValueError("batch_size must be a positive integer")

    # Setting device to GPU if available:
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Assigning model to GPU if available, and setting to eval mode:
    self.model = self.model.to(device)
    self.model = self.model.eval()

    # Making a list of sequences from the df:
    seqs_list = seq_df.SEQUENCE.to_list()
    if not seqs_list:
      raise ValueError("seq_df contains no sequences to embed")

    # Adding spaces in between sequence letters (amino acids):
    seqs_spaced = self.add_spaces(seqs_list)

    # Map rarely occurring amino acids (U, Z, O, B) to (X) if requested:
    if self.rare_aa:
      seqs_spaced = [re.sub(r"[UZOB]", "X", sequence) for sequence in seqs_spaced]

    # ID list tokenized:
    ids = self.tokenizer.batch_encode_plus(
      seqs_spaced, add_special_tokens=True,
      padding='max_length', max_length=self.max_len)

    # No truncation is requested, so an over-long sequence yields a row longer
    # than max_len; report it explicitly rather than failing on a ragged tensor.
    too_long = [row for row, tokens in enumerate(ids['input_ids'])
                if len(tokens) > self.max_len]
    if too_long:
      raise ValueError(
        "%d sequence(s) exceed max_len=%d after tokenisation (first rows: %s)"
        % (len(too_long), self.max_len, too_long[:5]))

    # Kept on the CPU; only the current batch is moved to the device.
    input_ids = torch.tensor(ids['input_ids'])
    attention_mask = torch.tensor(ids['attention_mask'])

    # Emptying cache to ensure enough memory:
    torch.cuda.empty_cache()

    # Process the sequences in batches, collecting the outputs in a list so
    # the result is concatenated once instead of re-copied every iteration.
    batch_outputs = []
    with torch.no_grad():
      for start in range(0, len(input_ids), batch_size):
        end = start + batch_size
        if end % 100 == 0:
          print("Initial Embedding Batch Ending with...", end)
        hidden = self.model(input_ids=input_ids[start:end].to(device),
                            attention_mask=attention_mask[start:end].to(device))[0]
        batch_outputs.append(hidden.cpu().numpy())
    embedding_res = np.concatenate(batch_outputs)

    # Strip special-token positions, then pad every sequence with zeros:
    features = self.extract_features(embedding_res, attention_mask.numpy())
    padded_arr = self.pad(features)

    # Resolve the destination without changing the working directory:
    if output_dir is not None:
      os.makedirs(output_dir, exist_ok=True)
      filename = os.path.join(output_dir, filename)

    # Saving array:
    print("Saving Embeddings...")
    np.save(filename, padded_arr)

  def add_spaces(self, df_col):
    """Return each sequence with a single space between consecutive characters."""
    return [" ".join(x) for x in df_col]

  def extract_features(self, emb_res, att_msk):
    """Slice the special-token positions out of each sequence's embeddings.

    Args:
      emb_res: Array indexed as [sequence, position, feature].
      att_msk: Attention mask (NumPy array, nested list or torch tensor) with
        1 at attended positions.

    Returns:
      A 1-D object array with one 2-D array per sequence. The kept positions
      depend on the model family:
        * BERT: attended positions without the first and the last one.
        * T5: attended positions without the last one.
        * XLNET: attended positions taken from the end of the padded row,
          without the final two.

    Raises:
      ValueError: If `self.lang_model` is not a supported name.
    """
    # Accept torch tensors (possibly on GPU) as well as NumPy arrays/lists.
    if hasattr(att_msk, "detach"):
      att_msk = att_msk.detach().cpu().numpy()
    att_msk = np.asarray(att_msk)

    # Filled element by element so the result is always a 1-D object array,
    # whether or not the sequences happen to share a length.
    features_arr = np.empty(len(emb_res), dtype=object)

    for seq_num in range(len(emb_res)):
      seq_len = int((att_msk[seq_num] == 1).sum())

      if self.lang_model in ('BERT-BFD', 'BERT'):
        seq_emd = emb_res[seq_num][1:seq_len-1]

      elif self.lang_model in ('T5-XL-BFD', 'T5-XL-UNI'):
        seq_emd = emb_res[seq_num][:seq_len-1]

      elif self.lang_model == 'XLNET':
        padded_seq_len = len(att_msk[seq_num])
        seq_emd = emb_res[seq_num][padded_seq_len-seq_len:padded_seq_len-2]

      else:
        raise ValueError("Unknown language_model %r" % (self.lang_model,))

      features_arr[seq_num] = seq_emd

    return features_arr

  def pad(self, features):
    """Zero-pad every feature matrix to `max_len - 2` rows and stack them.

    Args:
      features: Sequence of 2-D arrays that share their second dimension.

    Returns:
      A float64 array of shape (len(features), max_len - 2, feature_dim). The
      leading axis is present even when there is only one sequence.

    Raises:
      ValueError: If `features` is empty or a matrix has more than
        `max_len - 2` rows.
    """
    if len(features) == 0:
      raise ValueError("cannot pad an empty collection of features")

    dim1 = self.max_len-2   # reducing by 2 for CLS and SEP tokens which have already been removed
    dim2 = features[0].shape[1]

    # Allocate the result once; rows not overwritten below stay zero.
    padded_arr = np.zeros((len(features), dim1, dim2))

    for i in range(len(features)):
      if i%100 == 0:
        print("Padding Batch: ", i)

      n_rows = features[i].shape[0]
      if n_rows > dim1:
        raise ValueError(
          "feature %d has %d rows but only %d fit (max_len - 2)"
          % (i, n_rows, dim1))
      padded_arr[i, :n_rows, :features[i].shape[1]] = features[i]

    return padded_arr
    

# **My Dataset:**

In [4]:
# Each dataset prefix (EC, SA, PA) has three CSV files, read in train / test / val order.

# EC
EC_X_train, EC_X_test, EC_X_val = (
    pd.read_csv('My_paper/EC_X_train_40.csv'),
    pd.read_csv('My_paper/EC_X_test_40.csv'),
    pd.read_csv('My_paper/EC_X_val_40.csv'),
)

# SA
SA_X_train, SA_X_test, SA_X_val = (
    pd.read_csv('My_paper/SA_X_train_40.csv'),
    pd.read_csv('My_paper/SA_X_test_40.csv'),
    pd.read_csv('My_paper/SA_X_val_40.csv'),
)

# PA
PA_X_train, PA_X_test, PA_X_val = (
    pd.read_csv('My_paper/PA_X_train_40.csv'),
    pd.read_csv('My_paper/PA_X_test_40.csv'),
    pd.read_csv('My_paper/PA_X_val_40.csv'),
)

# **Example Embeddings:**
- Example below of how language model word embeddings were created using the T5 Model that had been pre-trained by Elnaggar et al. (Source: https://github.com/agemagician/ProtTrans) on the UniRef50 protein dataset (see here for more information on the dataset: https://www.uniprot.org/help/uniref).

## **T5-XL UniRef50 Language Model:**

In [5]:
# Tokenised length handed to LM_EMBED: the longest sequence in the dataset plus 2
# positions for the special tokens added by the language models.
# NOTE(review): 40 is presumably the maximum sequence length of the "_40" CSV files
# (the original LMPred dataset used 255) - confirm against the data.
max_seq_len = 40 + 2

# T5-XL (UniRef50) embedder with rare amino acids mapped to X.
T5XL_UNI_EMBED_40 = LM_EMBED(
    language_model='T5-XL-UNI',
    max_len=max_seq_len,
    rare_aa=True,
)


Some weights of the model checkpoint at Rostlab/prot_t5_xl_uniref50 were not used when initializing T5EncoderModel: ['decoder.block.1.layer.0.layer_norm.weight', 'decoder.block.7.layer.0.SelfAttention.q.weight', 'decoder.block.4.layer.0.SelfAttention.o.weight', 'decoder.block.2.layer.2.layer_norm.weight', 'decoder.block.17.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.15.layer.1.EncDecAttention.o.weight', 'decoder.block.18.layer.0.SelfAttention.v.weight', 'decoder.block.23.layer.0.SelfAttention.o.weight', 'decoder.block.21.layer.1.EncDecAttention.k.weight', 'decoder.block.8.layer.1.EncDecAttention.k.weight', 'decoder.block.10.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.2.layer.0.layer_norm.weight', 'decoder.block.23.layer.0.SelfAttention.v.weight', 'decoder.block.2.layer.1.EncDecAttention.q.weight', 'decoder.block.10.layer.2.Dense

## My dataset: T5-XL embeddings (maximum sequence length 40)

In [6]:
# Embed the EC train / test / val splits, in that order, one .npy file per split.
for ec_frame, ec_file in (
    (EC_X_train, "EC_X_TRAIN.npy"),
    (EC_X_test, "EC_X_TEST.npy"),
    (EC_X_val, "EC_X_VAL.npy"),
):
    T5XL_UNI_EMBED_40.extract_word_embs(ec_frame, ec_file)

KeyboardInterrupt: 

In [9]:
# Embed the SA train / test / val splits, in that order, one .npy file per split.
for sa_frame, sa_file in (
    (SA_X_train, "SA_X_TRAIN.npy"),
    (SA_X_test, "SA_X_TEST.npy"),
    (SA_X_val, "SA_X_VAL.npy"),
):
    T5XL_UNI_EMBED_40.extract_word_embs(sa_frame, sa_file)

Initial Embedding Batch Ending with... 100
Initial Embedding Batch Ending with... 200
Initial Embedding Batch Ending with... 300
Initial Embedding Batch Ending with... 400
Initial Embedding Batch Ending with... 500
Initial Embedding Batch Ending with... 600
Initial Embedding Batch Ending with... 700
Initial Embedding Batch Ending with... 800
Initial Embedding Batch Ending with... 900
Initial Embedding Batch Ending with... 1000
Initial Embedding Batch Ending with... 1100
Initial Embedding Batch Ending with... 1200
Initial Embedding Batch Ending with... 1300
Initial Embedding Batch Ending with... 1400
Initial Embedding Batch Ending with... 1500
Initial Embedding Batch Ending with... 1600
Initial Embedding Batch Ending with... 1700
Initial Embedding Batch Ending with... 1800
Initial Embedding Batch Ending with... 1900
Initial Embedding Batch Ending with... 2000
Initial Embedding Batch Ending with... 2100
Initial Embedding Batch Ending with... 2200
Initial Embedding Batch Ending with... 23

In [10]:
# Embed the PA train / test / val splits, in that order, one .npy file per split.
for pa_frame, pa_file in (
    (PA_X_train, "PA_X_TRAIN.npy"),
    (PA_X_test, "PA_X_TEST.npy"),
    (PA_X_val, "PA_X_VAL.npy"),
):
    T5XL_UNI_EMBED_40.extract_word_embs(pa_frame, pa_file)

Initial Embedding Batch Ending with... 100
Initial Embedding Batch Ending with... 200
Initial Embedding Batch Ending with... 300
Initial Embedding Batch Ending with... 400
Initial Embedding Batch Ending with... 500
Initial Embedding Batch Ending with... 600
Initial Embedding Batch Ending with... 700
Initial Embedding Batch Ending with... 800
Initial Embedding Batch Ending with... 900
Initial Embedding Batch Ending with... 1000
Initial Embedding Batch Ending with... 1100
Initial Embedding Batch Ending with... 1200
Initial Embedding Batch Ending with... 1300
Initial Embedding Batch Ending with... 1400
Initial Embedding Batch Ending with... 1500
Initial Embedding Batch Ending with... 1600
Initial Embedding Batch Ending with... 1700
Initial Embedding Batch Ending with... 1800
Initial Embedding Batch Ending with... 1900
Initial Embedding Batch Ending with... 2000
Initial Embedding Batch Ending with... 2100
Initial Embedding Batch Ending with... 2200
Initial Embedding Batch Ending with... 23

- N.B. Embedding arrays for the T5 model have been saved as NumPy `.npy` files (written with `np.save`) to the 'Embeddings' folder.
- These can be loaded as numpy arrays using the following functions: