In [1]:

import pandas as pd
import time
import numpy as np
from collections import Counter, defaultdict

In [61]:
def load_data(filepath):
    '''
    Read the embedding table from a tab-separated file without a header row.

    filepath -- path (or file-like object) handed straight to pandas.read_csv

    Returns a DataFrame whose columns are labelled 0, 1, 2, ... in file order.
    '''
    table = pd.read_csv(
        filepath,
        sep='\t',
        header=None,
    )
    return table



def round_number(x, factor=100):
    '''
    Scale ``x`` by ``factor``, round to the nearest whole number and
    hand the result back as an int.
    '''
    scaled = x * factor
    return int(round(scaled))



def score_seq_original(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    The upper-cased sequence is cut into overlapping 3-mers, each 3-mer is
    joined to its row in ``enbedding`` and the matched rows are summed
    column by column.

    seq        -- protein sequence string
    enbedding  -- DataFrame whose column ``0`` holds the 3-mer and whose other
                  columns hold the vector components
    to_int     -- when exactly True, every component is passed through
                  round_number before being returned

    Returns a plain list with one entry per vector component. 3-mers without
    a row in ``enbedding`` are dropped by the (inner) merge and do not
    contribute to the sum.
    '''
    seq = seq.upper()

    # make all the three-mers (every overlapping window of length three)
    three_mers = [seq[i:i+3] for i in range(0, len(seq)-2)]

    # convert the sequence 3-mer list to a data frame with the key in column 0
    three_mers_df = pd.DataFrame({0: three_mers})

    # merge the two data frames, the merge happens on the sequence trimers
    merged_df = pd.merge(three_mers_df, enbedding, on=[0])

    # remove the sequence information (first column) BEFORE summing, so that
    # the 3-mer strings are not concatenated into one long throw-away string
    vector_sums = merged_df.drop(columns=[0]).sum(axis=0, skipna=False)

    # to save on memory we may want to convert the float to an int
    if to_int is True:
        vector_sums = vector_sums.apply(round_number)

    return vector_sums.tolist()



In [63]:

    
# get the vector data; assigned here so this cell does not rely on a
# variable that is only created by a cell further down the notebook
filename = 'protVec_100d_3grams.tsv'

# load up the embedding data
enbedding_df = load_data(filename)





seq = "MAVSAGSARTSPSSDKVQKDKAELISGPRQDSRIGKLLGFEWTDLSSWRRLVTLLNRPTDPASLAVFRFLFGFLMVLDIPQERGLSSLDRKYLDGLDVCRFPLLDALRPLPLDWMYLVYTIMFLGALGMMLGLCYRISCVLFLLPYWYVFLLDKTSWNNHSYLYGLLAFQLTFMDANHYWSVDGLLNAHRRNAHVPLWNYAVLRGQIFIVYFIAGVKKLDADWVEGYSMEYLSRHWLFSPFKLLLSEELTSLLVVHWGGLLLDLSAGFLLFFDVSRSIGLFFVSYFHCMNSQLFSIGMFSYVMLASSPLFCSPEWPRKLVSYCPRRLQQLLPLKAAPQPSVSCVYKRSRGKSGQKPGLRHQLGAAFTLLYLLEQLFLPYSHFLTQGYNNWTNGLYGYSWDMMVHSRSHQHVKITYRDGRTGELGYLNPGVFTQSRRWKDHADMLKQYATCLSRLLPKYNVTEPQIYFDIWVSINDRFQQRIFDPRVDIVQAAWSPFQRTSWVQPLLMDLSPWRAKLQEIKSSLDNHTEVVFIADFPGLHLENFVSEDLGNTSIQLLQGEVTVELVAEQKNQTLREGEKMQLPAGEYHKVYTTSPSPSCYMYVYVNTTELALEQDLAYLQELKEKVENGSETGPLPPELQPLLEGEVKGGPEPTPLVQTFLRRQQRLQEIERRRNTPFHERFFRFLLRKLYVFRRSFLMTCISLRNLILGRPSLEQLAQEVTYANLRPFEAVGELNPSNTDSSHSNPPESNPDPVHSEF"

# number of repetitions the timing is averaged over
n = 100

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the adjustable wall clock
t1 = time.perf_counter()

for i in range(n):
    vector = score_seq_original(seq, enbedding_df, to_int=False)

t2 = time.perf_counter()

print('time per seq: ', (t2-t1)/n)

print(vector)


time per seq:  0.013067100048065185
[-58.42623600000004, -10.759072000000009, -13.220239000000014, -58.864606999999985, 5.2938949999999965, -5.665261000000001, 21.071880000000018, -6.472677000000012, -15.875790999999998, 45.233404999999934, -16.031058999999985, -0.7588179999999976, -1.8375699999999993, 25.224292000000016, -3.6504390000000058, 3.459047999999995, 22.93561100000002, -26.155718000000004, -0.3219120000000005, -8.340253999999998, -5.105318999999998, -25.539444000000003, -23.195275999999986, 8.482181999999995, -14.642847999999983, -16.008416999999987, -6.310184, -11.114567000000012, 1.3573780000000015, -11.257596000000005, 3.773756000000001, -40.655131999999966, -4.725194000000002, -18.26120500000001, 28.149715999999987, 10.776019, -30.35872500000001, 3.4987909999999935, 12.193397000000006, -11.294633, 19.950574000000007, 8.672497000000005, 17.572024999999993, -1.9167749999999986, 7.114125999999992, 1.085876000000003, -16.64910699999999, 7.0591729999999995, -9.182861000000003

(9048, 100)
          1         2         3         4         5         6         7    \
0                                                                           
AAA -0.174060 -0.095756  0.059515  0.039673 -0.375934 -0.115415  0.090725   
ALA -0.114085 -0.093288  0.155800 -0.037351 -0.121446  0.084037  0.023819   
LLL -0.075594 -0.100834 -0.046616 -0.208980 -0.008596 -0.038612 -0.049360   
LAA -0.137546 -0.135425  0.121566 -0.038295 -0.212129  0.040009  0.078545   
AAL -0.156112 -0.133524  0.114426 -0.020264 -0.058513  0.057005  0.076881   

          8         9         10     ...          91        92        93   \
0                                    ...                                    
AAA  0.173422  0.292520  0.190375    ...     0.244482  0.015974  0.012903   
ALA  0.093442  0.143256  0.044627    ...     0.075584 -0.139661  0.034863   
LLL  0.060720 -0.062662 -0.155879    ...     0.174677 -0.175961 -0.193242   
LAA  0.029837  0.138343  0.049377    ...     0.133947 -0.156484

In [4]:
def round_number(x, factor=100):
    '''
    Return ``x * factor`` rounded to the nearest whole number, as an int.
    '''
    nearest = round(x * factor)
    return int(nearest)

In [81]:
def load_data(filename):
    '''
    Read the embedding table from a tab-separated file without a header row,
    using the first column (the 3-mer) as the row index.

    filename -- path (or file-like object) handed straight to pandas.read_csv

    Returns a DataFrame indexed by the first column; the remaining columns
    keep their positional labels 1, 2, ...
    '''
    read_options = {'header': None, 'sep': '\t', 'index_col': 0}
    return pd.read_csv(filename, **read_options)


def score_seq_gang(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    The overlapping 3-mers of the upper-cased sequence are counted, the counts
    are lined up with the rows of ``enbedding`` and the feature vector is the
    count-weighted sum of those rows, computed as a single dot product.

    seq        -- protein sequence string
    enbedding  -- table whose ``index`` holds the 3-mers and whose columns hold
                  the vector components (e.g. a DataFrame read with
                  ``index_col=0``)
    to_int     -- when exactly True, every component is passed through
                  round_number before being returned

    Returns a plain list with one entry per column of ``enbedding``. 3-mers
    that do not appear in the index do not contribute.
    '''
    seq = seq.upper()

    # count every overlapping window of length three
    three_mers = Counter(seq[i:i+3] for i in range(0, len(seq)-2))

    # occurrence count for each row of the embedding, in row order
    # (a Counter answers 0 for 3-mers the sequence does not contain)
    c = [three_mers[kmer] for kmer in enbedding.index]

    # np.dot hands back a numpy array, not a pandas object
    vector_sums = np.asarray(np.dot(np.array(c), enbedding))

    if to_int is True:
        # numpy arrays have no .apply, so round component by component
        return [round_number(value) for value in vector_sums]

    return vector_sums.tolist()


In [82]:
seq = "MAVSAGSARTSPSSDKVQKDKAELISGPRQDSRIGKLLGFEWTDLSSWRRLVTLLNRPTDPASLAVFRFLFGFLMVLDIPQERGLSSLDRKYLDGLDVCRFPLLDALRPLPLDWMYLVYTIMFLGALGMMLGLCYRISCVLFLLPYWYVFLLDKTSWNNHSYLYGLLAFQLTFMDANHYWSVDGLLNAHRRNAHVPLWNYAVLRGQIFIVYFIAGVKKLDADWVEGYSMEYLSRHWLFSPFKLLLSEELTSLLVVHWGGLLLDLSAGFLLFFDVSRSIGLFFVSYFHCMNSQLFSIGMFSYVMLASSPLFCSPEWPRKLVSYCPRRLQQLLPLKAAPQPSVSCVYKRSRGKSGQKPGLRHQLGAAFTLLYLLEQLFLPYSHFLTQGYNNWTNGLYGYSWDMMVHSRSHQHVKITYRDGRTGELGYLNPGVFTQSRRWKDHADMLKQYATCLSRLLPKYNVTEPQIYFDIWVSINDRFQQRIFDPRVDIVQAAWSPFQRTSWVQPLLMDLSPWRAKLQEIKSSLDNHTEVVFIADFPGLHLENFVSEDLGNTSIQLLQGEVTVELVAEQKNQTLREGEKMQLPAGEYHKVYTTSPSPSCYMYVYVNTTELALEQDLAYLQELKEKVENGSETGPLPPELQPLLEGEVKGGPEPTPLVQTFLRRQQRLQEIERRRNTPFHERFFRFLLRKLYVFRRSFLMTCISLRNLILGRPSLEQLAQEVTYANLRPFEAVGELNPSNTDSSHSNPPESNPDPVHSEF"

# get the vector data
filename = 'protVec_100d_3grams.tsv'

# load up the embedding data
enbedding_df = load_data(filename)


# calculate an embedded representation of the protein

# number of repetitions the timing is averaged over (set before the clock starts)
n = 1000

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the adjustable wall clock
t1 = time.perf_counter()

for i in range(n):
    vector = score_seq_gang(seq, enbedding_df, to_int=False)

t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector)



time per seq: 0.06602551245689392 


['', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [67]:
def load_data_alternate(filename):
    '''
    Read the tab-separated embedding file into a plain dictionary.

    The first field of every line becomes the key; the remaining fields are
    parsed as floats and stored as a numpy array under that key.
    '''
    vectors = {}
    with open(filename, 'r') as handle:
        for row in handle:
            key, *fields = row.split('\t')
            vectors[key] = np.array([float(field) for field in fields])
    return vectors





def score_seq_4(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    The overlapping 3-mers of the upper-cased sequence are counted, each
    distinct 3-mer's vector is multiplied by its count and the scaled vectors
    are summed.

    In the timings recorded in this notebook this runs at roughly 1.4 ms per
    sequence (versus roughly 13 ms for the DataFrame-merge version); the
    author's estimate is about 4.2 hours for 10 million sequences.

    seq        -- protein sequence string
    enbedding  -- mapping from 3-mer to its numpy vector
    to_int     -- when exactly True, every component is passed through
                  round_number before being returned

    Returns a plain list with one entry per vector component. Every 3-mer of
    the sequence is looked up with ``enbedding[k]``, so a 3-mer that is
    missing from a dict raises KeyError.
    '''
    seq = seq.upper()

    # make all the three-mers and count how often each one occurs
    three_mers = Counter([seq[i:i+3] for i in range(0, len(seq)-2)])

    # multiply the occurrences with the feature vector
    c = [enbedding[k] * v for k, v in three_mers.items()]

    # sum the columns of the generated vectors
    vector_sums = np.array(c).sum(axis=0)

    # convert to int if desired; numpy arrays have no .apply, so the
    # components are rounded one by one
    if to_int is True:
        return [round_number(value) for value in vector_sums]

    return vector_sums.tolist()






In [69]:

# calculate an embedded representation of the protein

dict_enbedding = load_data_alternate(filename)

# number of repetitions the timing is averaged over (set before the clock starts)
n = 1000

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the adjustable wall clock
t1 = time.perf_counter()

for i in range(n):
    vector = score_seq_4(seq, dict_enbedding, to_int=False)

t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector)



time per seq: 0.0014328484535217284 


[-58.42623600000004, -10.759072000000014, -13.220239000000015, -58.86460700000002, 5.293894999999993, -5.665261000000001, 21.071880000000025, -6.472677000000013, -15.875790999999998, 45.233404999999955, -16.031058999999978, -0.758817999999997, -1.837570000000002, 25.22429200000001, -3.6504390000000013, 3.459047999999994, 22.935611000000005, -26.155718000000014, -0.32191200000000003, -8.340253999999996, -5.105318999999998, -25.539444000000003, -23.19527599999999, 8.482181999999993, -14.642847999999969, -16.00841699999999, -6.310183999999995, -11.114567000000005, 1.3573779999999984, -11.257596000000012, 3.773756000000001, -40.655131999999966, -4.725194, -18.261205000000004, 28.14971599999999, 10.776018999999987, -30.358725000000007, 3.498790999999996, 12.19339700000001, -11.294632999999994, 19.950574000000007, 8.672497, 17.57202499999999, -1.9167750000000021, 7.114125999999995, 1.0858759999999994, -16.649106999999987, 7.0591729999999995, -9.18286100

In [33]:
def load_data_alternate_2(filename, max_count=9):
    '''
    Read the tab-separated embedding file into a nested lookup table with
    pre-multiplied vectors.

    The result is indexed as ``data[first][second][third][count]``, where the
    first three levels are the three characters of the key in the first field
    and ``count`` runs from 1 to ``max_count``. The stored value is the line's
    vector multiplied by ``count``. The ``<unk>`` line is skipped.

    filename   -- path of the embedding file
    max_count  -- largest multiple to pre-compute (default 9)

    A key that is not exactly three characters long makes the unpacking fail
    with ValueError.
    '''
    data = {}
    with open(filename, 'r') as f:
        for line in f:
            elements = line.split('\t')

            if elements[0] == '<unk>':
                continue

            one, two, three = elements[0]

            # parse the numbers once per line instead of once per multiple
            base = np.array([float(i) for i in elements[1:]])

            # create the nested levels on first use, reuse them afterwards
            multiples = data.setdefault(one, {}).setdefault(two, {}).setdefault(three, {})

            for num in range(1, max_count + 1):
                multiples[num] = base * num

    return data





def score_seq_5(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    The overlapping 3-mers of the upper-cased sequence are counted and, for
    each distinct 3-mer, the vector already multiplied by its count is looked
    up as ``enbedding[first][second][third][count]``; the vectors are then
    summed. In the timings recorded in this notebook this runs at roughly
    0.6 ms per sequence.

    seq        -- protein sequence string
    enbedding  -- nested mapping: three character levels, then a mapping from
                  count to the pre-multiplied numpy vector
    to_int     -- when exactly True, every component is passed through
                  round_number before being returned

    Returns a plain list with one entry per vector component. A 3-mer whose
    characters are missing from a dict-based ``enbedding`` raises KeyError.
    '''
    seq = seq.upper()

    # make all the three-mers and count how often each one occurs
    three_mers = Counter([seq[i:i+3] for i in range(0, len(seq)-2)])

    # pick the pre-multiplied feature vector for every distinct three-mer
    c = []
    for k, v in three_mers.items():
        multiples = enbedding[k[0]][k[1]][k[2]]
        if v in multiples:
            c.append(multiples[v])
        else:
            # the count is larger than the pre-computed multiples: scale the
            # single-occurrence vector instead (assumes an entry for count 1)
            c.append(multiples[1] * v)

    # sum the columns of the generated vectors
    vector_sums = np.array(c).sum(axis=0)

    # convert to int if desired; numpy arrays have no .apply, so the
    # components are rounded one by one
    if to_int is True:
        return [round_number(value) for value in vector_sums]

    return vector_sums.tolist()




In [49]:

# calculate an embedded representation of the protein

dict_enbedding = load_data_alternate_2(filename)

# number of repetitions the timing is averaged over (set before the clock starts)
n = 1000

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the adjustable wall clock
t1 = time.perf_counter()

for i in range(n):
    vector = score_seq_5(seq, dict_enbedding, to_int=False)

t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector)

time per seq: 0.0005858440399169922 


[-58.42623600000004, -10.759072000000014, -13.220239000000015, -58.86460700000002, 5.293894999999993, -5.665261000000001, 21.071880000000025, -6.472677000000013, -15.875790999999998, 45.233404999999955, -16.031058999999978, -0.758817999999997, -1.837570000000002, 25.22429200000001, -3.6504390000000013, 3.459047999999994, 22.935611000000005, -26.155718000000014, -0.32191200000000003, -8.340253999999996, -5.105318999999998, -25.539444000000003, -23.19527599999999, 8.482181999999993, -14.642847999999969, -16.00841699999999, -6.310183999999995, -11.114567000000005, 1.3573779999999984, -11.257596000000012, 3.773756000000001, -40.655131999999966, -4.725194, -18.261205000000004, 28.14971599999999, 10.776018999999987, -30.358725000000007, 3.498790999999996, 12.19339700000001, -11.294632999999994, 19.950574000000007, 8.672497, 17.57202499999999, -1.9167750000000021, 7.114125999999995, 1.0858759999999994, -16.649106999999987, 7.0591729999999995, -9.18286100

In [76]:
def load_data_alternate_3(filename, max_count=9):
    '''
    Read the tab-separated embedding file into a two-level lookup table with
    pre-multiplied vectors.

    The result is indexed as ``data[key][count]``, where ``key`` is the first
    field of a line and ``count`` runs from 1 to ``max_count``. The stored
    value is the line's vector multiplied by ``count``. The ``<unk>`` line is
    skipped.

    filename   -- path of the embedding file
    max_count  -- largest multiple to pre-compute (default 9)
    '''
    data = {}
    with open(filename, 'r') as f:
        for line in f:
            elements = line.split('\t')

            if elements[0] == '<unk>':
                continue

            kmer = elements[0]

            # parse the numbers once per line instead of once per multiple
            base = np.array([float(i) for i in elements[1:]])

            data[kmer] = {num: base * num for num in range(1, max_count + 1)}

    return data





def score_seq_6(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    The overlapping 3-mers of the upper-cased sequence are counted and, for
    each distinct 3-mer, the vector already multiplied by its count is looked
    up as ``enbedding[three_mer][count]``; the vectors are then summed. In the
    timings recorded in this notebook this is the quickest variant, at
    roughly 0.5 ms per sequence.

    seq        -- protein sequence string
    enbedding  -- mapping from 3-mer to a mapping from count to the
                  pre-multiplied numpy vector
    to_int     -- when exactly True, every component is passed through
                  round_number before being returned

    Returns a plain list with one entry per vector component. A 3-mer that is
    missing from a dict-based ``enbedding`` raises KeyError.
    '''
    seq = seq.upper()

    # make all the three-mers and count how often each one occurs
    three_mers = Counter([seq[i:i+3] for i in range(0, len(seq)-2)])

    # pick the pre-multiplied feature vector for every distinct three-mer
    c = []
    for k, v in three_mers.items():
        multiples = enbedding[k]
        if v in multiples:
            c.append(multiples[v])
        else:
            # the count is larger than the pre-computed multiples: scale the
            # single-occurrence vector instead (assumes an entry for count 1)
            c.append(multiples[1] * v)

    # sum the columns of the generated vectors
    vector_sums = np.array(c).sum(axis=0)

    # convert to int if desired; numpy arrays have no .apply, so the
    # components are rounded one by one
    if to_int is True:
        return [round_number(value) for value in vector_sums]

    return vector_sums.tolist()

In [78]:

# calculate an embedded representation of the protein

dict_enbedding = load_data_alternate_3(filename)

# number of repetitions the timing is averaged over (set before the clock starts)
n = 1000

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() follows the adjustable wall clock
t1 = time.perf_counter()

for i in range(n):
    vector2 = score_seq_6(seq, dict_enbedding, to_int=False)

t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector2)

# check that this result matches `vector`, the result left over from the
# previous benchmark cell (print() accepts no `vector=` keyword argument)
print(vector == vector2)

time per seq: 0.0005088441371917724 


[-58.42623600000004, -10.759072000000014, -13.220239000000015, -58.86460700000002, 5.293894999999993, -5.665261000000001, 21.071880000000025, -6.472677000000013, -15.875790999999998, 45.233404999999955, -16.031058999999978, -0.758817999999997, -1.837570000000002, 25.22429200000001, -3.6504390000000013, 3.459047999999994, 22.935611000000005, -26.155718000000014, -0.32191200000000003, -8.340253999999996, -5.105318999999998, -25.539444000000003, -23.19527599999999, 8.482181999999993, -14.642847999999969, -16.00841699999999, -6.310183999999995, -11.114567000000005, 1.3573779999999984, -11.257596000000012, 3.773756000000001, -40.655131999999966, -4.725194, -18.261205000000004, 28.14971599999999, 10.776018999999987, -30.358725000000007, 3.498790999999996, 12.19339700000001, -11.294632999999994, 19.950574000000007, 8.672497, 17.57202499999999, -1.9167750000000021, 7.114125999999995, 1.0858759999999994, -16.649106999999987, 7.0591729999999995, -9.18286100

In [56]:
# speed-up factor: ratio of two per-sequence timings (in seconds) pasted in by hand
# NOTE(review): presumably score_seq_original vs. score_seq_6 from an earlier run;
# the values are close to, but not identical with, the timings recorded above
0.013638761043548584 / 0.0005056767463684082

26.971303587711613