In [1]:
#@title Install requirements. { display-mode: "form" }
# Install requirements
!pip install torch transformers sentencepiece h5py

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
#@title Set up working directories and download files/checkpoints. { display-mode: "form" }
# Create directory for storing model weights (2.3GB) and example sequences.
# Here we use the encoder-part of ProtT5-XL-U50 in half-precision (fp16) as
# it performed best in our benchmarks (also outperforming ProtBERT-BFD).
# Also download secondary structure prediction checkpoint to show annotation extraction from embeddings
!mkdir protT5 # root directory for storing checkpoints, results etc
!mkdir protT5/protT5_checkpoint # directory holding the ProtT5 checkpoint
!mkdir protT5/sec_struct_checkpoint # directory storing the supervised classifier's checkpoint
!mkdir protT5/output # directory for storing your embeddings & predictions
!wget -nc -P protT5/ https://rostlab.org/~deepppi/example_seqs.fasta
# Huge kudos to the bio_embeddings team here! We will integrate the new encoder, half-prec ProtT5 checkpoint soon
!wget -nc -P protT5/sec_struct_checkpoint http://data.bioembeddings.com/public/embeddings/feature_models/t5/secstruct_checkpoint.pt

--2024-05-10 04:15:16--  https://rostlab.org/~deepppi/example_seqs.fasta
Resolving rostlab.org (rostlab.org)... 172.67.219.184, 104.21.38.66, 2606:4700:3030::ac43:dbb8, ...
Connecting to rostlab.org (rostlab.org)|172.67.219.184|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.rostlab.org/~deepppi/example_seqs.fasta [following]
--2024-05-10 04:15:16--  https://files.rostlab.org/~deepppi/example_seqs.fasta
Resolving files.rostlab.org (files.rostlab.org)... 131.159.28.73
Connecting to files.rostlab.org (files.rostlab.org)|131.159.28.73|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /deepppi/example_seqs.fasta [following]
--2024-05-10 04:15:16--  https://files.rostlab.org/deepppi/example_seqs.fasta
Reusing existing connection to files.rostlab.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 627
Saving to: ‘protT5/example_seqs.fasta’


2024-05-10 04:15:17 (73.1 MB/s) - ‘protT5/

In [3]:
from google.colab import files
uploaded = files.upload()

Saving PLM_ARG_DB.fasta to PLM_ARG_DB.fasta


In [4]:
seq_path = "./PLM_ARG_DB.fasta"
per_protein = True
per_protein_path = "./protT5/output/per_protein_embeddings.h5" # where to store the embeddings
sec_struct = False
sec_struct_path = "./protT5/output/ss3_preds.fasta" # file for storing predictions

# make sure that either per-residue or per-protein embeddings are stored
assert per_protein is True  or sec_struct is True, print(
    "Minimally, you need to active per_residue, per_protein or sec_struct. (or any combination)")


In [5]:
#@title Import dependencies and check whether GPU is available. { display-mode: "form" }
from transformers import BertModel, BertTokenizer
import torch
import h5py
import time
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Using {}".format(device))

Using cuda:0


In [6]:
#@title Load encoder-part of ProtT5 in half-precision. { display-mode: "form" }
# Load ProtT5 in half-precision (more specifically: the encoder-part of ProtT5-XL-U50)
def get_T5_model():
    tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
    model = BertModel.from_pretrained("Rostlab/prot_bert")
    
    model = model.to(device) # move model to GPU
    model = model.eval() # set model to evaluation model  

    return model, tokenizer

In [7]:
#@title Read in file in fasta format. { display-mode: "form" }
def read_fasta( fasta_path, split_char="!", id_field=0):
    '''
        Reads in fasta file containing multiple sequences.
        Split_char and id_field allow to control identifier extraction from header.
        E.g.: set split_char="|" and id_field=1 for SwissProt/UniProt Headers.
        Returns dictionary holding multiple sequences or only single
        sequence, depending on input file.
    '''

    seqs = dict()
    with open( fasta_path, 'r' ) as fasta_f:
        for line in fasta_f:
            # get uniprot ID from header and create new entry
            if line.startswith('>'):
                uniprot_id = line.replace('>', '').strip().split(split_char)[id_field]
                # replace tokens that are mis-interpreted when loading h5
                uniprot_id = uniprot_id.replace("/","_").replace(".","_")
                seqs[ uniprot_id ] = ''
            else:
                # repl. all whie-space chars and join seqs spanning multiple lines, drop gaps and cast to upper-case
                seq= ''.join( line.split() ).upper().replace("-","")
                # repl. all non-standard AAs and map them to unknown/X
                seq = seq.replace('U','X').replace('Z','X').replace('O','X')
                seqs[ uniprot_id ] += seq
    example_id=next(iter(seqs))
    print("Read {} sequences.".format(len(seqs)))
    print("Example:\n{}\n{}".format(example_id,seqs[example_id]))
    seqs_headers, seqs_strings = list(seqs.keys()), list(seqs.values())
    return seqs_headers, seqs_strings, seqs

In [8]:
#@title Generate embeddings. { display-mode: "form" }
# Generate embeddings via batch-processing
# per_residue indicates that embeddings for each residue in a protein should be returned.
# per_protein indicates that embeddings for a whole protein should be returned (average-pooling)
# max_residues gives the upper limit of residues within one batch
# max_seq_len gives the upper sequences length for applying batch-processing
# max_batch gives the upper number of sequences per batch

from tqdm import tqdm

def get_embeddings( model, tokenizer, seqs, per_protein, sec_struct,
                   max_residues=4000, max_seq_len=1000, max_batch=100 ):

    results = {
               "protein_embs" : dict()
               }

    # sort sequences according to length (reduces unnecessary padding --> speeds up embedding)
    seq_dict   = sorted( seqs.items(), key=lambda kv: len( seqs[kv[0]] ), reverse=True )
    start = time.time()
    batch = list()

    for seq_idx, (pdb_id, seq) in enumerate(seq_dict,1):
          seq = seq
          seq_len = len(seq)
          seq = ' '.join(list(seq))
          batch.append((pdb_id,seq,seq_len))
          # count residues in current batch and add the last sequence length to
          # avoid that batches with (n_res_batch > max_residues) get processed
          n_res_batch = sum([ s_len for  _, _, s_len in batch ]) + seq_len
          if len(batch) >= max_batch or n_res_batch>=max_residues or seq_idx==len(seq_dict) or seq_len>max_seq_len:
              pdb_ids, seqs, seq_lens = zip(*batch)
              batch = list()

              # add_special_tokens adds extra token at the end of each sequence
              token_encoding = tokenizer.batch_encode_plus(seqs, add_special_tokens=True, padding="longest")
              input_ids      = torch.tensor(token_encoding['input_ids']).to(device)
              attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)

              try:
                  with torch.no_grad():
                      # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
                      embedding_repr = model(input_ids, attention_mask=attention_mask)
              except RuntimeError:
                  print("RuntimeError during embedding for {} (L={})".format(pdb_id, seq_len))
                  continue


              for batch_idx, identifier in enumerate(pdb_ids): # for each protein in the current mini-batch
                  s_len = seq_lens[batch_idx]
                  # slice off padding --> batch-size x seq_len x embedding_dim
                  emb = embedding_repr.last_hidden_state[batch_idx,:s_len]
                  #if sec_struct: # get classification results
                  ##   results["sec_structs"][identifier] = torch.max( d3_Yhat[batch_idx,:s_len], dim=1 )[1].detach().cpu().numpy().squeeze()
                  if per_protein: # apply average-pooling to derive per-protein embeddings (1024-d)
                      protein_emb = emb.mean(dim=0)
                      results["protein_embs"][identifier] = protein_emb.detach().cpu().numpy().squeeze()
                      print("1 completed")





    passed_time=time.time()-start
    avg_time =  passed_time/len(results["protein_embs"])
    print('\n############# EMBEDDING STATS #############')
    print('Total number of per-protein embeddings: {}'.format(len(results["protein_embs"])))
    print("Time for generating embeddings: {:.1f}[m] ({:.3f}[s/protein])".format(
        passed_time/60, avg_time ))
    print('\n############# END #############')
    return results["protein_embs"]

In [9]:
#@title Write embeddings to disk. { display-mode: "form" }
def save_embeddings(emb_dict,out_path):
    with h5py.File(str(out_path), "w") as hf:
        for sequence_id, embedding in emb_dict.items():
            # noinspection PyUnboundLocalVariable
            hf.create_dataset(sequence_id, data=embedding)
    return None

In [10]:
import numpy as np
import torch
import joblib
import numpy as np
from xgboost import XGBClassifier
#from utility import extract, get_label
from sklearn.multioutput import MultiOutputClassifier
from math import ceil
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader


In [11]:
AA_list = ('G','A','V','L','I','P','F','Y','W','R'
           'S','T','C','M','N','Q','D','E','K','H')

def AA_replace(seq):
    odd_AAs = set()
    for s in seq:
        if s not in AA_list:
            odd_AAs.add(s)
    for k in odd_AAs:
        seq = seq.replace(k,'X')
    return seq

In [12]:
def extract(seq_path, batch_size=500, max_len = 200):
    #model, alphabet = pretrained.load_model_and_alphabet_local(model_location)
    seqs_headers, seqs_strings, seqs = read_fasta(seq_path) #dictionary
    seq_num = len(seqs_headers)

    #for i in range(seq_num):
     #   seqs_strings[i] = AA_replace(seqs_strings[i])

    model, tokenizer = get_T5_model()

    seqs_sliced_keys = [key for key in np.random.choice(list(seqs.keys()),10)]
    seqs_sliced = {key:seqs[key] for key in seqs_sliced_keys}
    embedding_dict = get_embeddings( model, tokenizer, seqs, per_protein, sec_struct = False)
    seq_id, embed_res = list(embedding_dict.keys()), list(embedding_dict.values())

    return seq_id, embed_res

In [13]:
def get_label(seq_id, min_seq =50):
    Label_ID = []

    for ID in seq_id:
        protein_id, src, arg_classes = ID.split("|")
        Label_ID.append(arg_classes.split(";"))
    mlb = MultiLabelBinarizer()
    Label_ID = mlb.fit_transform(Label_ID)
    ARG_Category = mlb.classes_
    if(ARG_Category.shape[0] < 2):
        print("Error: The number of category is less than 2!")
        return
    if(ARG_Category.shape[0] > 2):
        arg_freq = Label_ID.sum(axis = 0)
        rare_id = np.where(arg_freq < min_seq)[0]]
        nonarg_id = np.where(ARG_Category=="nonARG")[0]
        multi_drug_id = np.where(ARG_Category=="multi-drug")[0]
        others_id = np.where(ARG_Category == "antibiotic without defined classification")[0]
        others_id = np.append(np.append(rare_id, multi_drug_id), others_id)

        others_arg = Label_ID[:,others_id].sum(axis=1)
        others_arg = np.where(others_arg>0, 1, others_arg)
        nonarg = Label_ID[:,nonarg_id]
        arg = 1-nonarg
        # delete nonARG and rare arg colunms
        Label_ID = np.delete(Label_ID, np.append(others_id,nonarg_id), axis=1)
        ARG_Category = np.delete(ARG_Category, np.append(others_id,nonarg_id), axis=0)
        # add others, ARG, nonARG colums
        Label_ID = np.insert(Label_ID, Label_ID.shape[1], values=[others_arg],axis = 1)
        #Label_ID = np.insert(Label_ID, 0, values=[[nonarg]],axis = 1)
        Label_ID = np.insert(Label_ID, 0, values=[[arg]],axis = 1)
        ARG_Category = np.insert(ARG_Category, ARG_Category.shape[0], "others",axis=0)
    return Label_ID, ARG_Category



In [14]:
def train(train_seq_path, maxlen = 200, min_seq = 50, batch_size = 10,
          arg_model = './protT5/output/arg_model.pkl', cat_model = './protT5/output/cat_model.pkl',
          cat_index = './protT5/output/Category_Index.csv'):

    print("Generating embedding representation for each protein sequence ...")
    seq_id, embedding_res = extract(train_seq_path,batch_size = batch_size, max_len= maxlen)
    # get categories for training

    Label_ID, ARG_Category = get_label(seq_id)
    np.savetxt(cat_index, ARG_Category, delimiter=",", fmt='%s')
    # training with XGBoost
    X = embedding_res
    Y = Label_ID
    print("Training for ARG identification ...")
    model1 = XGBClassifier(learning_rate=0.1, objective='binary:logistic',
                            max_depth = 7, n_estimators = 200)
    model1.fit(X, Y[:,0])
    joblib.dump(model1, arg_model)
    print("Training for resistance category classification ...")
    arg_ind = Y[:,0] == 1
    X, Y = np.array(X),np.array(Y)
    ARG_X = X[arg_ind,:]
    ARG_Y = Y[arg_ind,1:]
    model2 = MultiOutputClassifier(XGBClassifier(learning_rate=0.1,
                                                    objective='binary:logistic',
                                                    max_depth = 7, n_estimators = 200))
    model2.fit(ARG_X, ARG_Y)
    joblib.dump(model2, cat_model)


In [15]:
with open(seq_path, 'r') as fp:
    count = 0
    strings = ''
    for line in fp:
      strings += line
      if line.startswith('>'):
        count += 1
print('Total Number of lines:', count)
tot_sequences = count
train_length= int(count*0.8)
fp.close()

Total Number of lines: 57156


In [16]:
groups = strings.split(">")

out_1 = ">".join(groups[:train_length])
out_2 = ">".join(groups[train_length:])
#
train_set = groups[1:train_length]
test_set = groups[train_length:]

with open("train_seq.txt","w") as file:
  file.writelines('>')
  file.writelines('>'.join(train_set))

with open("test_seq.txt","w") as file:
  file.writelines('>')
  file.writelines('>'.join(test_set))

In [17]:
train_seq_path = "train_seq.txt"
test_seq_path = "test_seq.txt"

In [18]:
train(train_seq_path, maxlen = 200, min_seq = 50, batch_size = 10,arg_model = './protT5/output/arg_model.pkl', cat_model = './protT5/output/cat_model.pkl',
          cat_index = './protT5/output/Category_Index.csv')

Generating embedding representation for each protein sequence ...
Read 45723 sequences.
Example:
B1L9B4|UNIPROT|nonARG
MAEKKEFVVGIDLGTTNSVIAWMKPDGTVEVIPNAEGSRVTPSVVAFTKSGEILVGEPAKRQMILNPERTIKSIKRKMGTDYKVRIDDKEYTPQEISAFILKKLKNDAEAYLGGEIKKAVITCPAYFNDAQRQATKEAGIIAGLEVLRIINEPTAAALAYGLDKAGKEEKVLVYDLGGGTFDVSILEIGEGVIEVIATAGNNHLGGDDFDQRLIDWMAEEFKKQHGIDLREDRQALQRLRDAAEKAKIELSTKMETDVSLPFIAVSPSGQPLHLEMRITRSLFESLTRDLVEMTRGPIEQALNDAKLSPQDIDEIILVGGMTRVPMVQRFIKEFFGKEPNKSVNPDEAVAIGAAIQAAILAGTEGAKGRDIVLVDVTPLTLGIEVKGGLFEPIIPRNTKIPVRKSKIFTTVEDGQTEVEIRVYQGERPIARENIFLGSFKLVGIPPAPRGVPQIEVTFDIDSDGIVHVSAKDLGSGKEQSMVVTGRHKLSEDEIKRMIEDAKRYEEQDKRLKEEIELKNRADDLAYSVEKTLKEHGDKIPADLKSRLEDMIRELRDAINRNDIPKVKMLFDDLQKESMKIGEYLYKSATGGETSNQ


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed

In [19]:
import pandas as pd

In [20]:
def predict(test_seq_path, batch_size=10, maxlen = 200, min_prob = 0.5, arg_model='./protT5/output/arg_model.pkl',
            cat_model='./protT5/output/cat_model.pkl',cat_index='./protT5/output/Category_Index.csv',output_file='higarg_out.csv'):
    ## 1. load arg model and category model and category index
    arg_model = joblib.load(arg_model)
    cat_model = joblib.load(cat_model)
    cat_index = np.loadtxt(cat_index,dtype = str,delimiter = ",").tolist()#np.loadtxt('models/Category_Index.csv',dtype = str,delimiter = ",").tolist()

    seq_id, embedding_res = extract(test_seq_path,
                                    batch_size = batch_size, max_len= maxlen)
    seq_num = len(seq_id)
    cat_num = len(cat_index)
    pred_res = pd.DataFrame({'seq_id':seq_id, 'pred':''})
    pred_res = pd.concat([pred_res, pd.DataFrame(data = np.zeros((seq_num,cat_num+1),dtype='float64'),
                     columns= ['ARG']+cat_index)], axis = 1)

    pred_res['ARG'] = arg_model.predict_proba(embedding_res)[:,1]
    # predict Category
    arg_ind = np.where(pred_res['ARG']>min_prob)[0].tolist()
    if len(arg_ind) > 0:
        cat_out = cat_model.predict_proba(np.array(embedding_res)[arg_ind,])
    for i in range(len(cat_out)):
        pred_res.iloc[arg_ind, i + 3] = cat_out[i][:, 1]

    for i in arg_ind:
        cats = [cat_index[k] for k, v in enumerate(pred_res.iloc[i, 3:]) if v >= 0.5]
        pred_res.iloc[i, 1] = ';'.join(cats)
    pred_res.to_csv(output_file, sep = '\t', index=0)
    return pred_res





In [21]:
predict(test_seq_path, maxlen = 200,batch_size = 10)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed
1 completed

Unnamed: 0,seq_id,pred,ARG,alkaloids with antibiotic activity,aminocoumarin,aminoglycoside,beta-lactam,diaminopyrimidine,disinfecting agents and antiseptics,fluoroquinolone,...,nucleoside,peptide,phenicol,pleuromutilin,rifamycin,streptogramin,sulfonamide,sulfone,tetracycline,others
0,Q46342|UNIPROT|nonARG,,0.005892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Q8VQ99|UNIPROT|nonARG,,0.003166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,P45385|UNIPROT|nonARG,,0.001440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,B3E7S8|UNIPROT|nonARG,,0.009244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,B2TUF5|UNIPROT|nonARG,,0.000551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11428,P50071|UNIPROT|nonARG,,0.002482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11429,P83153|UNIPROT|nonARG,,0.002396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11430,P0AD77|UNIPROT|nonARG,,0.064011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11431,P81411|UNIPROT|nonARG,,0.025116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
