In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn import metrics
import torch.nn.functional as F
import torch.optim as opt
from transformers import T5EncoderModel, T5Tokenizer
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import StratifiedKFold
# Prefer the GPU at index 7 when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:7")
else:
    device = torch.device("cpu")


In [None]:
# Read the dataset and pull out the two columns used below as NumPy arrays.
data = pd.read_csv('Alternate/alternate.csv')
sequences, labels = (data[column].values for column in ('sequences', 'labels'))

In [None]:
# Load the tokenizer and the T5 encoder from the same pretrained checkpoint,
# `Rostlab/prot_t5_xl_uniref50`. `do_lower_case=False` is passed so the
# tokenizer is not asked to lower-case its input.
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_uniref50', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")

In [None]:
# Move the encoder onto the selected device and switch it to evaluation mode.
model = model.to(device).eval()

In [None]:
# ProtT5 expects residues separated by single spaces, with the rare/ambiguous
# amino acids U, Z, O and B mapped to X (see the Rostlab model card) before
# tokenization.
X_ = [' '.join(re.sub(r"[UZOB]", "X", sequence)) for sequence in sequences]

In [None]:
def make_vector(seqs, batch_size=1):
    """Embed each sequence with the encoder and mean-pool over its tokens.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    seqs : sequence of str
        Space-separated residue strings, one per protein/peptide.
    batch_size : int, optional
        Number of sequences pushed through the encoder per forward pass.
        The default of 1 processes one sequence at a time; larger values
        pad each batch to its longest member.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(seqs), model.config.d_model)``. Row ``k`` is the
        mean of the encoder's last hidden states over the attended positions
        of ``seqs[k]``, excluding the final one. A sequence with no residue
        tokens has nothing to average, so its row is NaN.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Take the feature width from the model instead of hard-coding 1024.
    hidden_dim = model.config.d_model
    seq_feats = np.zeros((len(seqs), hidden_dim))

    for start in range(0, len(seqs), batch_size):
        batch = seqs[start:start + batch_size]
        ids = tokenizer.batch_encode_plus(batch, add_special_tokens=True, padding=True)
        input_ids = torch.tensor(ids['input_ids']).to(device)
        attention_mask = torch.tensor(ids['attention_mask']).to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            embedding = model(input_ids=input_ids, attention_mask=attention_mask)

        encoder_embedding = embedding.last_hidden_state.cpu().numpy()
        # Per-sequence count of attended positions, as plain Python ints so
        # they can index the NumPy array without touching the device again.
        seq_lens = (attention_mask == 1).sum(dim=1).tolist()

        for offset, seq_len in enumerate(seq_lens):
            # Drop the last attended position (presumably the end-of-sequence
            # token added by add_special_tokens=True) and ignore any padding.
            seq_emd = encoder_embedding[offset][:seq_len - 1]
            # Write to the row of this sequence, not the row of the batch start.
            seq_feats[start + offset, :] = np.mean(seq_emd, axis=0)

    return seq_feats

In [None]:
# Embed every space-separated sequence; the result has one feature row per sequence.
T5_matrix = make_vector(X_)

In [None]:
# Sanity check: display the (number of sequences, feature width) of the embedding matrix.
T5_matrix.shape

In [None]:
def normalization(data):
    """Min-max scale ``data`` into the interval [0, 1].

    Parameters
    ----------
    data : array_like
        Values to rescale; the minimum and maximum are taken over all elements.

    Returns
    -------
    numpy.ndarray
        ``(data - min) / (max - min)``. When every element is equal the range
        is zero, so an all-zero array of the same shape is returned instead of
        dividing by zero and producing NaN.

    Raises
    ------
    ValueError
        If ``data`` is empty (raised by ``np.min``).
    """
    lowest = np.min(data)
    _range = np.max(data) - lowest
    if _range == 0:
        # Constant input: there is no spread to scale by.
        return np.zeros(np.shape(data), dtype=float)
    return (data - lowest) / _range

In [None]:
# Min-max scale each embedding row independently. The output is sized from
# T5_matrix itself rather than a hard-coded (21164, 1024), so the cell works
# for any number of sequences and any feature width.
T5_norm = np.zeros(T5_matrix.shape)
for row_idx in range(T5_matrix.shape[0]):
    T5_norm[row_idx, :] = normalization(T5_matrix[row_idx, :])

In [None]:
# Uncomment to save the normalized embeddings to disk:
# np.save('Alternate/t5_mean.npy',T5_norm)