# Embedding protein sequences:

In this notebook we will embed our protein sequences using different approaches.

In [4]:
# lets import essential library
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
sequences = pd.read_csv('../../cleaned_data/seq_data.csv', index_col='Unnamed: 0')

In [4]:
sequences.head()

Unnamed: 0,target_id,Sequence
0,P06213,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...
1,P78368,MDFDKKGGKGETEEGRRMSKAGGGRSSHGIRSSGTSSGVLMVGPNF...
2,Q9H2K8,MRKGVLKDPEIADLFYKDDPEELFIGLHEIGHGSFGAVYFATNAHT...
3,P49336,MDYDFKVKLSSERERVEDLFEYEGCKVGRGTYGHVYKAKRKDGKDD...
4,Q6DT37,MERRLRALEQLARGEAGGCPGLDGLLDLLLALHHELSSGPLRRERS...


# SGT embeddings

SGT stands for Sequence Graph Transform. It is a feature extraction method for sequence mining that can extract useful information from the amino acid sequences passed into it. You can find more information about it in _arXiv:1608.03533_.

Using it is very easy as we can simply use the library associated with it.

In [3]:
# SGT works with lists of individual characters, but the sequences in our data
# are stored as continuous strings. Split every sequence into a list of
# single-character strings so it can be fed into the SGT model.
seqlist = [list(residues) for residues in sequences['Sequence']]

In [9]:
# Import the Sgt class from the sgt library; it is the model that will compute the embeddings.
from sgt import Sgt

# NOTE(review): kappa=10 and lengthsensitive=False are hard-coded SGT settings -- confirm they suit this dataset.
sgt = Sgt(kappa=10, lengthsensitive = False)
embedding = sgt.fit_transform(corpus=seqlist) # one embedding vector per input sequence (lengths are checked in the next cells)

In [10]:
len(embedding), len(seqlist) 

(204, 204)

In [12]:
# lets look at the length of individual embeds
print(len(embedding[0]))

400


Now that we have a list containing the embedding vectors, we can put them into a pandas DataFrame and rename the columns.

In [13]:
encode = pd.DataFrame(embedding)

In [14]:
encode.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,0.215062,0.203091,0.180683,0.198066,0.171366,0.188681,0.193831,0.172092,0.199483,0.19077,...,0.068085,0.187856,0.169934,0.198071,0.190562,0.177598,0.187509,0.19823,0.196648,0.192867
1,0.079285,0.004321,0.236577,0.21302,0.004,0.244418,0.089572,0.252362,0.208389,0.22098,...,0.240204,0.223962,0.030206,0.22078,0.230988,0.08021,0.206182,0.247487,0.095402,0.217688
2,0.202843,0.078705,0.204041,0.194296,0.071651,0.214227,0.203855,0.206597,0.179541,0.206352,...,0.073583,0.198795,0.01131,0.182349,0.214013,0.1922,0.026675,0.010142,0.257809,0.21154
3,0.207967,0.237654,0.230337,0.223728,0.214036,0.204381,0.084948,0.222819,0.199139,0.212795,...,0.225932,0.085967,0.215828,0.221116,0.215934,0.221184,0.22342,0.029879,0.004575,0.010838
4,0.188301,0.187365,0.179571,0.192665,0.183457,0.190285,0.174676,0.185862,0.19237,0.190364,...,0.028361,0.073716,0.173434,0.177409,0.062972,0.068286,0.19722,0.19404,0.001428,0.206309


In [5]:
encode = encode.rename(columns = lambda x : 'sgtencode_' + str(x)) 

In [15]:
# we concat it to our original csv file so we know the target id associated with the embeds
protdata = pd.concat([sequences, encode], axis=1)
protdata.head()

Unnamed: 0,target_id,Sequence,0,1,2,3,4,5,6,7,...,390,391,392,393,394,395,396,397,398,399
0,P06213,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...,0.215062,0.203091,0.180683,0.198066,0.171366,0.188681,0.193831,0.172092,...,0.068085,0.187856,0.169934,0.198071,0.190562,0.177598,0.187509,0.19823,0.196648,0.192867
1,P78368,MDFDKKGGKGETEEGRRMSKAGGGRSSHGIRSSGTSSGVLMVGPNF...,0.079285,0.004321,0.236577,0.21302,0.004,0.244418,0.089572,0.252362,...,0.240204,0.223962,0.030206,0.22078,0.230988,0.08021,0.206182,0.247487,0.095402,0.217688
2,Q9H2K8,MRKGVLKDPEIADLFYKDDPEELFIGLHEIGHGSFGAVYFATNAHT...,0.202843,0.078705,0.204041,0.194296,0.071651,0.214227,0.203855,0.206597,...,0.073583,0.198795,0.01131,0.182349,0.214013,0.1922,0.026675,0.010142,0.257809,0.21154
3,P49336,MDYDFKVKLSSERERVEDLFEYEGCKVGRGTYGHVYKAKRKDGKDD...,0.207967,0.237654,0.230337,0.223728,0.214036,0.204381,0.084948,0.222819,...,0.225932,0.085967,0.215828,0.221116,0.215934,0.221184,0.22342,0.029879,0.004575,0.010838
4,Q6DT37,MERRLRALEQLARGEAGGCPGLDGLLDLLLALHHELSSGPLRRERS...,0.188301,0.187365,0.179571,0.192665,0.183457,0.190285,0.174676,0.185862,...,0.028361,0.073716,0.173434,0.177409,0.062972,0.068286,0.19722,0.19404,0.001428,0.206309


In [16]:
# The sequences are already represented by their embeddings, so the raw 'Sequence'
# column is no longer needed. Reassign instead of mutating in place, and ignore a
# missing column so that re-running this cell does not raise a KeyError.
protdata = protdata.drop(columns='Sequence', errors='ignore')

In [17]:
protdata.head()

Unnamed: 0,target_id,0,1,2,3,4,5,6,7,8,...,390,391,392,393,394,395,396,397,398,399
0,P06213,0.215062,0.203091,0.180683,0.198066,0.171366,0.188681,0.193831,0.172092,0.199483,...,0.068085,0.187856,0.169934,0.198071,0.190562,0.177598,0.187509,0.19823,0.196648,0.192867
1,P78368,0.079285,0.004321,0.236577,0.21302,0.004,0.244418,0.089572,0.252362,0.208389,...,0.240204,0.223962,0.030206,0.22078,0.230988,0.08021,0.206182,0.247487,0.095402,0.217688
2,Q9H2K8,0.202843,0.078705,0.204041,0.194296,0.071651,0.214227,0.203855,0.206597,0.179541,...,0.073583,0.198795,0.01131,0.182349,0.214013,0.1922,0.026675,0.010142,0.257809,0.21154
3,P49336,0.207967,0.237654,0.230337,0.223728,0.214036,0.204381,0.084948,0.222819,0.199139,...,0.225932,0.085967,0.215828,0.221116,0.215934,0.221184,0.22342,0.029879,0.004575,0.010838
4,Q6DT37,0.188301,0.187365,0.179571,0.192665,0.183457,0.190285,0.174676,0.185862,0.19237,...,0.028361,0.073716,0.173434,0.177409,0.062972,0.068286,0.19722,0.19404,0.001428,0.206309


In [18]:
#protdata.to_csv('sgt_embeds.csv')

# Elmo encoder

Elmo embedding, developed by Allen NLP, is a state-of-the-art pre-trained model available on Tensorflow Hub. Elmo embeddings are learned from the internal state of a bidirectional LSTM and represent contextual features of the input text. It’s been shown to outperform previously existing pre-trained word embeddings like word2vec and glove on a wide variety of NLP tasks. Some of those tasks are Question Answering, Named Entity Extraction and Sentiment Analysis.

I have used the pre-trained model from the paper published by Michael Heinzinger and colleagues, [Modeling aspects of the language of life through transfer-learning protein sequences](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3220-8). The repository accompanying their paper also holds the pre-trained SeqVec model for creating embeddings of amino acid sequences. I used their ELMo model trained on UniRef50 (= SeqVec) to embed each of my protein sequences in a 1024-dimensional vector.

In [23]:
from allennlp.commands.elmo import ElmoEmbedder
import torch

Once we have downloaded the pre-trained model, we need to store the paths to the weights and options files in variables and pass them in when initialising the ElmoEmbedder.

In [20]:
# Paths to the downloaded pre-trained weights and options files.
weights = '../../weights.hdf5'
options = '../../options.json'
# Use the first GPU when CUDA is available, otherwise fall back to the CPU
# (cuda_device=-1) so the notebook also runs on machines without a GPU.
cuda_device = 0 if torch.cuda.is_available() else -1
seqvec  = ElmoEmbedder(options,weights,cuda_device=cuda_device)

In [21]:
#sequences = pd.read_csv('../../cleaned_data/seq_data.csv', index_col='Unnamed: 0')

In [22]:
# testing
seq = 'SEQWENCE' # your amino acid sequence
embedding = seqvec.embed_sentence(list(seq)) # List-of-Lists with shape [3,L,1024]

In [24]:
protein_embd = torch.tensor(embedding).sum(dim=0).mean(dim=0)
protein_embd.numpy()

array([ 0.12736754, -0.02345606, -0.04605505, ..., -0.08782069,
       -0.15530579,  0.07202841], dtype=float32)

In [25]:
len(protein_embd.numpy()) # creates a embed with dimension of 1024

1024

In [26]:
# Collect the raw sequence strings that will be embedded, and check that
# there is one entry per row of the sequences table.
seqlist = sequences['Sequence'].tolist()
len(seqlist), sequences.shape

(204, (204, 2))

In [27]:
# Helper that turns one amino acid sequence into a single embedding vector
def elmo(sequence):
    """Return one embedding vector for a single amino acid sequence.

    The sequence is split into single characters and embedded with the
    module-level ``seqvec`` embedder. The embedder output is summed over its
    first axis and then averaged over the next one.

    Parameters
    ----------
    sequence : str
        Amino acid sequence, one character per residue.

    Returns
    -------
    numpy.ndarray
        The reduced embedding. Assumes ``embed_sentence`` returns an array of
        shape [3, L, 1024], giving a vector of length 1024 -- TODO confirm.
    """
    residues = list(sequence)
    layer_embeddings = torch.tensor(seqvec.embed_sentence(residues))
    per_residue = layer_embeddings.sum(dim=0)
    return per_residue.mean(dim=0).numpy()

In [28]:
embed = elmo(seqlist[0])

In [29]:
dfconcat = pd.DataFrame(embed, columns=[sequences['target_id'][0]])
dfconcat.head()

Unnamed: 0,P06213
0,-0.067821
1,-0.038488
2,-0.098161
3,0.0287
4,-0.054495


In [6]:
# Embed the remaining sequences (index 0 is already stored in dfconcat) and add
# each one as a column named after its target_id.
# The per-sequence frames are collected first and concatenated once at the end:
# calling pd.concat inside the loop re-copies the growing frame on every iteration.
remaining_frames = []
for ind in range(1, len(seqlist)):
    encode = elmo(seqlist[ind])
    df = pd.DataFrame(encode, columns=[sequences['target_id'][ind]])
    remaining_frames.append(df)
dfconcat = pd.concat([dfconcat] + remaining_frames, axis=1)

In [21]:
# Transpose so that each row is one protein (indexed by target_id) and each column
# is one embedding dimension.
# NOTE(review): this rebinds the name `elmo`, shadowing the elmo() function defined
# earlier; re-running the embedding loop after this cell fails until the function
# cell is executed again.
elmo = dfconcat.T
elmo.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
P06213,-0.067821,-0.038488,-0.098161,0.0287,-0.054495,-0.047325,0.054231,0.164506,0.007704,0.0047,...,-0.00981,0.091093,-0.018362,0.12701,0.038153,0.105289,0.01704,-0.030812,-0.048926,0.016598
P78368,-0.047268,0.094689,-0.022238,0.05451,-0.062499,-0.148772,0.121522,0.098777,0.076298,-0.075349,...,0.141718,0.044936,0.088765,0.084455,0.003178,0.081679,0.029849,-0.097444,-0.116359,0.007632
Q9H2K8,-0.009615,0.057865,-0.048387,-0.02453,-0.083673,-0.199012,0.172937,-0.121133,0.059123,-0.041587,...,0.037315,0.060867,0.082201,0.199571,-0.010445,-0.088625,-0.010882,-0.136455,-0.069648,0.060995
P49336,0.112151,0.106191,-0.089974,0.071485,-0.008234,-0.12011,0.077033,0.06359,-0.006657,0.015028,...,0.227605,0.015405,0.024191,0.237119,0.033725,0.001307,0.004316,-0.096539,-0.091975,-0.113692
Q6DT37,0.179675,0.088809,-0.028415,0.040988,-0.02064,-0.09496,-0.105356,0.152026,0.239712,-0.107756,...,-0.015777,0.042666,0.040231,0.049055,0.099729,-0.099466,0.026947,-0.120531,-0.042015,-0.003135


In [22]:
# rename the columns
elmo = elmo.rename(columns = lambda x : 'elmocode_' + str(x))
elmo.head()

Unnamed: 0,elmocode_0,elmocode_1,elmocode_2,elmocode_3,elmocode_4,elmocode_5,elmocode_6,elmocode_7,elmocode_8,elmocode_9,...,elmocode_1014,elmocode_1015,elmocode_1016,elmocode_1017,elmocode_1018,elmocode_1019,elmocode_1020,elmocode_1021,elmocode_1022,elmocode_1023
P06213,-0.067821,-0.038488,-0.098161,0.0287,-0.054495,-0.047325,0.054231,0.164506,0.007704,0.0047,...,-0.00981,0.091093,-0.018362,0.12701,0.038153,0.105289,0.01704,-0.030812,-0.048926,0.016598
P78368,-0.047268,0.094689,-0.022238,0.05451,-0.062499,-0.148772,0.121522,0.098777,0.076298,-0.075349,...,0.141718,0.044936,0.088765,0.084455,0.003178,0.081679,0.029849,-0.097444,-0.116359,0.007632
Q9H2K8,-0.009615,0.057865,-0.048387,-0.02453,-0.083673,-0.199012,0.172937,-0.121133,0.059123,-0.041587,...,0.037315,0.060867,0.082201,0.199571,-0.010445,-0.088625,-0.010882,-0.136455,-0.069648,0.060995
P49336,0.112151,0.106191,-0.089974,0.071485,-0.008234,-0.12011,0.077033,0.06359,-0.006657,0.015028,...,0.227605,0.015405,0.024191,0.237119,0.033725,0.001307,0.004316,-0.096539,-0.091975,-0.113692
Q6DT37,0.179675,0.088809,-0.028415,0.040988,-0.02064,-0.09496,-0.105356,0.152026,0.239712,-0.107756,...,-0.015777,0.042666,0.040231,0.049055,0.099729,-0.099466,0.026947,-0.120531,-0.042015,-0.003135


In [23]:
elmo.to_csv('elmo_embeds.csv')

# Tape embeds

TAPE (Tasks Assessing Protein Embeddings) is based on the paper available at [https://arxiv.org/abs/1906.08230](https://arxiv.org/abs/1906.08230). Their GitHub repository, which you can find [here](https://github.com/songlab-cal/tape/), provides two pre-trained models: one based on bert-base (a Transformer model) and the other on babbler-1900 (a UniRep model). I have used the bert-base pre-trained model to create embeddings for my sequences.

We follow steps similar to those taken for the ELMo embedding.

In [30]:
from tape import ProteinBertModel, TAPETokenizer
model = ProteinBertModel.from_pretrained('bert-base')
tokenizer = TAPETokenizer(vocab='iupac')

In [31]:
def tape(sequence):
    """Return one TAPE embedding vector for a single amino acid sequence.

    The sequence is encoded with the module-level ``tokenizer``, passed through
    the module-level ``model`` as a batch of one, and the first model output is
    averaged over its token axis.

    Parameters
    ----------
    sequence : str
        Amino acid sequence to embed.

    Returns
    -------
    numpy.ndarray
        Mean of the first model output over axis 0 of the single batch entry.
        NOTE(review): the average covers every token the tokenizer emits,
        including any special tokens it may add -- confirm this is intended.
    """
    # Wrap the token ids in a list so the model receives a batch of size one.
    token_ids = torch.tensor([tokenizer.encode(sequence)])
    # Inference only: disable autograd so no computation graph is built and kept
    # in memory for each sequence.
    with torch.no_grad():
        output = model(token_ids)
    seq_output = output[0]
    # Take the single batch entry and average over its first axis.
    numarr = seq_output.detach().numpy()[0]
    mean = numarr.mean(axis=0)
    return mean

In [32]:
# Embed every sequence with TAPE and collect one column per protein, named after
# its target_id.
# The frame is built from scratch and starts at index 0. Previously the loop began
# at index 11 and appended onto the existing dfconcat, which still held the ELMo
# embeddings, so the saved TAPE file mixed both embeddings and skipped the first
# 11 proteins.
tape_frames = []
for ind in range(len(seqlist)):
    encode = tape(seqlist[ind])
    df = pd.DataFrame(encode, columns=[sequences['target_id'][ind]])
    tape_frames.append(df)
    print(f'done with {ind} indexed sequence')
# Concatenate once at the end instead of growing the frame inside the loop.
dfconcat = pd.concat(tape_frames, axis=1)

In [33]:
# One row per protein and one 'encode_<i>' column per embedding dimension, saved as CSV.
# Note: assigning to `tape` rebinds the name of the tape() function defined earlier.
tape = dfconcat.T.rename(columns=lambda col: 'encode_' + str(col))
tape.to_csv('tape_embeds.csv')