In [1]:
# get the vector data

import pandas as pd

# file handed to load_data below, which reads it as a tab-separated table with no header row
filename = 'protVec_100d_3grams.tsv'

    
def load_data(filepath):
    '''
    Read a tab-separated table that has no header row into a DataFrame.

    filepath is passed straight through to pandas.read_csv. Because no
    header row is read, the columns are labelled with integers from 0.
    '''
    read_options = {'sep': '\t', 'header': None}
    table = pd.read_csv(filepath, **read_options)
    return table
    
# read the embedding table from disk; later cells refer to it as enbedding_df
enbedding_df = load_data(filepath=filename)

# preview the first five rows
print(enbedding_df.head(n=5))



   0         1         2         3         4         5         6         7    \
0  AAA -0.174060 -0.095756  0.059515  0.039673 -0.375934 -0.115415  0.090725   
1  ALA -0.114085 -0.093288  0.155800 -0.037351 -0.121446  0.084037  0.023819   
2  LLL -0.075594 -0.100834 -0.046616 -0.208980 -0.008596 -0.038612 -0.049360   
3  LAA -0.137546 -0.135425  0.121566 -0.038295 -0.212129  0.040009  0.078545   
4  AAL -0.156112 -0.133524  0.114426 -0.020264 -0.058513  0.057005  0.076881   

        8         9      ...          91        92        93        94   \
0  0.173422  0.292520    ...     0.244482  0.015974  0.012903  0.137528   
1  0.093442  0.143256    ...     0.075584 -0.139661  0.034863  0.056078   
2  0.060720 -0.062662    ...     0.174677 -0.175961 -0.193242 -0.072965   
3  0.029837  0.138343    ...     0.133947 -0.156484 -0.048541  0.141848   
4  0.054781  0.129436    ...     0.154597 -0.050440  0.054866  0.066185   

        95        96        97        98        99        100  
0  0

In [3]:

# example protein sequence that the timing loop further down passes to score_seq
seq = "MAVSAGSARTSPSSDKVQKDKAELISGPRQDSRIGKLLGFEWTDLSSWRRLVTLLNRPTDPASLAVFRFLFGFLMVLDIPQERGLSSLDRKYLDGLDVCRFPLLDALRPLPLDWMYLVYTIMFLGALGMMLGLCYRISCVLFLLPYWYVFLLDKTSWNNHSYLYGLLAFQLTFMDANHYWSVDGLLNAHRRNAHVPLWNYAVLRGQIFIVYFIAGVKKLDADWVEGYSMEYLSRHWLFSPFKLLLSEELTSLLVVHWGGLLLDLSAGFLLFFDVSRSIGLFFVSYFHCMNSQLFSIGMFSYVMLASSPLFCSPEWPRKLVSYCPRRLQQLLPLKAAPQPSVSCVYKRSRGKSGQKPGLRHQLGAAFTLLYLLEQLFLPYSHFLTQGYNNWTNGLYGYSWDMMVHSRSHQHVKITYRDGRTGELGYLNPGVFTQSRRWKDHADMLKQYATCLSRLLPKYNVTEPQIYFDIWVSINDRFQQRIFDPRVDIVQAAWSPFQRTSWVQPLLMDLSPWRAKLQEIKSSLDNHTEVVFIADFPGLHLENFVSEDLGNTSIQLLQGEVTVELVAEQKNQTLREGEKMQLPAGEYHKVYTTSPSPSCYMYVYVNTTELALEQDLAYLQELKEKVENGSETGPLPPELQPLLEGEVKGGPEPTPLVQTFLRRQQRLQEIERRRNTPFHERFFRFLLRKLYVFRRSFLMTCISLRNLILGRPSLEQLAQEVTYANLRPFEAVGELNPSNTDSSHSNPPESNPDPVHSEF"


def round_number(x, factor=100):
    '''
    Scale x by factor, round to the nearest whole value and return it as an int
    '''
    scaled = x * factor
    nearest = round(scaled)
    return int(nearest)



def score_seq(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence

    The sequence is upper-cased and cut into overlapping 3-mers. Every 3-mer
    is looked up in `enbedding` through an inner join on the column labelled
    0, and the remaining columns of the matched rows are summed column by
    column.

    Parameters
    ----------
    seq : str
        Protein sequence; case is ignored.
    enbedding : pandas.DataFrame
        Table whose column labelled 0 holds the 3-mer key; every other
        column is summed.
    to_int : bool
        When truthy, each summed value is passed through round_number
        before being returned.

    Returns
    -------
    list
        One entry per non-key column of `enbedding`, in column order.

    Notes
    -----
    * A 3-mer that occurs several times in the sequence contributes once
      per occurrence.
    * 3-mers that are missing from `enbedding` are dropped by the inner
      join and contribute nothing; a sequence shorter than three characters
      therefore gives a sum of zero in every column.
    * Missing values are not skipped (skipna=False), so a NaN in any
      matched row makes that column's sum NaN.
    '''
    seq = seq.upper()

    # every overlapping window of length three (empty when len(seq) < 3)
    three_mers = [seq[i:i + 3] for i in range(len(seq) - 2)]

    # one row per 3-mer occurrence, keyed by the same column label (0)
    # that the embedding table uses
    three_mers_df = pd.DataFrame({0: three_mers})

    # merge the two data frames, the merge happens on the sequence trimers
    merged_df = pd.merge(three_mers_df, enbedding, on=[0])

    # drop the key column BEFORE summing: summing it would concatenate all
    # matched 3-mers into one throwaway string and turn the whole result
    # into an object-dtype Series
    vector_sums = merged_df.drop(columns=[0]).sum(axis=0, skipna=False)

    # to save on memory we may want to convert the floats to ints
    if to_int:
        vector_sums = vector_sums.apply(round_number)

    return vector_sums.tolist()




# calculate an embedded representation of the protein and time how long
# a single score_seq call takes on average
import time

n_repeats = 100  # number of timed calls; the mean is printed below

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump when the
# system time is adjusted
t1 = time.perf_counter()

for i in range(n_repeats):
    # vector keeps the result of the last call so later cells can inspect it
    vector = score_seq(seq, enbedding_df, to_int=False)
t2 = time.perf_counter()

print('time per seq: ', (t2-t1)/n_repeats)

time per seq:  0.013712997436523438


In [155]:
# show the feature vector left over from the last score_seq call in the timing loop
print(vector)

[-58.42623600000004, -10.759072000000009, -13.220239000000014, -58.864606999999985, 5.2938949999999965, -5.665261000000001, 21.071880000000018, -6.472677000000012, -15.875790999999998, 45.233404999999934, -16.031058999999985, -0.7588179999999976, -1.8375699999999993, 25.224292000000016, -3.6504390000000058, 3.459047999999995, 22.93561100000002, -26.155718000000004, -0.3219120000000005, -8.340253999999998, -5.105318999999998, -25.539444000000003, -23.195275999999986, 8.482181999999995, -14.642847999999983, -16.008416999999987, -6.310184, -11.114567000000012, 1.3573780000000015, -11.257596000000005, 3.773756000000001, -40.655131999999966, -4.725194000000002, -18.26120500000001, 28.149715999999987, 10.776019, -30.35872500000001, 3.4987909999999935, 12.193397000000006, -11.294633, 19.950574000000007, 8.672497000000005, 17.572024999999993, -1.9167749999999986, 7.114125999999992, 1.085876000000003, -16.64910699999999, 7.0591729999999995, -9.182861000000003, 13.477858000000007, -29.4556639999