In [1]:
import os
from dotenv import load_dotenv, find_dotenv
from os.path import join, dirname, basename, exists, isdir

### Load environmental variables from the project root directory ###
# find .env automagically by walking up directories until it's found
# (find_dotenv() returns an empty string when no .env exists, in which case
# every path below becomes relative to the current working directory)
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# Check whether a network drive has been specified.
# os.environ.get() returns None (not the string 'None') for an unset variable,
# so treat unset, empty and the literal 'None' all as "no network drive".
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE not in (None, '', 'None'):
    pass  # TODO: mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path) # project root directory

DATA = join(PROJ, 'data') #data directory
RAW_EXTERNAL = join(DATA, 'raw_external') # external data raw directory
RAW_INTERNAL = join(DATA, 'raw_internal') # internal data raw directory
INTERMEDIATE = join(DATA, 'intermediate') # intermediate data directory
FINAL = join(DATA, 'final') # final data directory

RESULTS = join(PROJ, 'results') # output directory
FIGURES = join(RESULTS, 'figures') # figure output directory
PICTURES = join(RESULTS, 'pictures') # picture output directory


# make folders specific for certain data
folder_name = 'BRENDA_2019.1_ec_data'
if folder_name != '':
    # exist_ok=True makes this idempotent without a separate exists() check,
    # which avoids the check-then-create race of the exists()/makedirs() pair
    for parent_dir in (RAW_EXTERNAL, INTERMEDIATE, FINAL):
        os.makedirs(join(parent_dir, folder_name), exist_ok=True)

print('Standard variables loaded, you are good to go!')

Standard variables loaded, you are good to go!


In [2]:

import pandas as pd
import time
import numpy as np
from collections import Counter, defaultdict


# example protein sequence used as the input for every timing cell in this notebook
seq = "MAVSAGSARTSPSSDKVQKDKAELISGPRQDSRIGKLLGFEWTDLSSWRRLVTLLNRPTDPASLAVFRFLFGFLMVLDIPQERGLSSLDRKYLDGLDVCRFPLLDALRPLPLDWMYLVYTIMFLGALGMMLGLCYRISCVLFLLPYWYVFLLDKTSWNNHSYLYGLLAFQLTFMDANHYWSVDGLLNAHRRNAHVPLWNYAVLRGQIFIVYFIAGVKKLDADWVEGYSMEYLSRHWLFSPFKLLLSEELTSLLVVHWGGLLLDLSAGFLLFFDVSRSIGLFFVSYFHCMNSQLFSIGMFSYVMLASSPLFCSPEWPRKLVSYCPRRLQQLLPLKAAPQPSVSCVYKRSRGKSGQKPGLRHQLGAAFTLLYLLEQLFLPYSHFLTQGYNNWTNGLYGYSWDMMVHSRSHQHVKITYRDGRTGELGYLNPGVFTQSRRWKDHADMLKQYATCLSRLLPKYNVTEPQIYFDIWVSINDRFQQRIFDPRVDIVQAAWSPFQRTSWVQPLLMDLSPWRAKLQEIKSSLDNHTEVVFIADFPGLHLENFVSEDLGNTSIQLLQGEVTVELVAEQKNQTLREGEKMQLPAGEYHKVYTTSPSPSCYMYVYVNTTELALEQDLAYLQELKEKVENGSETGPLPPELQPLLEGEVKGGPEPTPLVQTFLRRQQRLQEIERRRNTPFHERFFRFLLRKLYVFRRSFLMTCISLRNLILGRPSLEQLAQEVTYANLRPFEAVGELNPSNTDSSHSNPPESNPDPVHSEF"

# path of the tab-separated embedding table (inside the raw external data directory)
# that the load_data* functions below read
filename = join(RAW_EXTERNAL, 'protvec', 'protVec_100d_3grams.tsv')

In [None]:
def load_data(filepath):
    '''
    Read the tab-separated vector file into a data frame.

    The file is read without a header row, so the columns are labelled
    0, 1, 2, ... in file order.
    '''
    read_options = {'sep': '\t', 'header': None}
    vectors = pd.read_csv(filepath, **read_options)
    return vectors



def round_number(x, factor=100):
    '''
    Scale a number by ``factor``, round it to the nearest whole number
    and return the result as an int.
    '''
    scaled = x * factor
    return int(round(scaled))



def score_seq_original(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    The sequence is upper-cased and cut into overlapping 3-mers. Each 3-mer is
    looked up in ``enbedding`` through an inner merge on column 0 and the
    matched rows are summed column-wise. 3-mers that do not occur in
    ``enbedding`` are dropped by the merge and therefore contribute nothing.

    Parameters
    ----------
    seq : str
        Protein sequence.
    enbedding : pandas.DataFrame
        Column 0 holds the 3-mer strings, the remaining columns the numeric
        vector components.
    to_int : bool
        When exactly ``True``, every summed component is passed through
        ``round_number`` before being returned.

    Returns
    -------
    list
        One summed value per vector column of ``enbedding``.
    '''
    seq = seq.upper()

    # one row per (overlapping) 3-mer; repeated 3-mers stay as repeated rows so
    # that the merge below counts them with their multiplicity
    three_mers_df = pd.DataFrame({0: [seq[i:i+3] for i in range(0, len(seq)-2)]})

    # merge the two data frames, the merge happens on the sequence trimers
    merged_df = pd.merge(three_mers_df, enbedding, on=[0])

    # Drop the 3-mer text column BEFORE summing. Summing it first concatenates
    # every matched string only to throw the result away, and turns the whole
    # result into an object-dtype Series.
    vector_sums = merged_df.drop(columns=[0]).sum(axis=0, skipna=False)

    # to save on memory we may want to convert the float to an int
    if to_int is True:
        vector_sums = vector_sums.apply(round_number)

    return vector_sums.tolist()



In [None]:

    
# load the embedding table as a data frame (3-mers in column 0)
enbedding_df = load_data(filename)

# number of repetitions the timing is averaged over
n = 100

# perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock and can jump when the system time is adjusted
t1 = time.perf_counter()
for i in range(n):
    vector = score_seq_original(seq, enbedding_df, to_int=False)
t2 = time.perf_counter()

print('time per seq: ', (t2-t1)/n)

print(vector)


In [None]:
def round_number(x, factor=100):
    '''
    Return ``x * factor`` rounded to the nearest whole number, as an int.
    '''
    nearest = round(x * factor)
    return int(nearest)

In [None]:
def load_data(filename):
    '''
    Read the tab-separated vector file into a data frame.

    The file has no header row; its first column becomes the index of the
    returned frame and the remaining columns keep their positional labels.
    '''
    read_options = {'sep': '\t', 'header': None, 'index_col': 0}
    vectors = pd.read_csv(filename, **read_options)
    return vectors


def score_seq_gang(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    Counts every overlapping 3-mer of the upper-cased sequence, lines the
    counts up with the rows of ``enbedding`` and takes the dot product, i.e.
    the count-weighted sum of the embedding rows. 3-mers that are not in
    ``enbedding.index`` are ignored.

    Parameters
    ----------
    seq : str
        Protein sequence.
    enbedding : pandas.DataFrame
        Indexed by 3-mer; every column is one numeric vector component.
    to_int : bool
        When exactly ``True``, each component is multiplied by 100 and rounded
        to an int (the same result as ``round_number`` with its default factor).

    Returns
    -------
    list
        One value per column of ``enbedding``.
    '''
    seq = seq.upper()

    # count the occurrences of every overlapping three-mer
    three_mers = Counter(seq[i:i+3] for i in range(0, len(seq)-2))

    # occurrence count of each embedding row's 3-mer, in row order
    c = [three_mers.get(kmer, 0) for kmer in enbedding.index]

    vector_sums = np.dot(np.array(c), np.asarray(enbedding))

    if to_int is True:
        # np.dot returns a plain ndarray, which has no .apply(); round element-wise instead
        return [int(round(value * 100)) for value in vector_sums]

    return vector_sums.tolist()


In [None]:


# load the embedding table as a data frame indexed by 3-mer
enbedding_df = load_data(filename)

# number of repetitions the timing is averaged over
n = 1000

# perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock and can jump when the system time is adjusted
t1 = time.perf_counter()
for i in range(n):
    vector = score_seq_gang(seq, enbedding_df, to_int=False)
t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector)



In [None]:
def load_data_alternate(filename):
    '''
    Load the vector data into a plain dictionary.

    Every non-blank line of the tab-separated file is split into a key (first
    field) and its vector (remaining fields, parsed as floats).

    Parameters
    ----------
    filename : str
        Path of the tab-separated embedding file.

    Returns
    -------
    dict
        Maps each key to a 1-D numpy array of floats.
    '''
    data = {}
    with open(filename, 'r') as f:
        for line in f:
            # skip blank lines (e.g. a trailing empty line), which would
            # otherwise be stored under a '\n' key with an empty vector
            if not line.strip():
                continue
            # strip the line terminator so it can never end up inside a key
            elements = line.rstrip('\r\n').split('\t')
            data[elements[0]] = np.array([float(i) for i in elements[1:]])
    return data





def score_seq_4(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    Counts the overlapping 3-mers of the upper-cased sequence, multiplies each
    3-mer's vector by its count and sums the results.

    Original author's note (not re-measured here): about 4.2 hours for
    10 mio sequences.

    Parameters
    ----------
    seq : str
        Protein sequence.
    enbedding : dict
        Maps 3-mer -> numpy vector. A 3-mer missing from the dict raises KeyError.
    to_int : bool
        When exactly ``True``, each component is multiplied by 100 and rounded
        to an int (the same result as ``round_number`` with its default factor).

    Returns
    -------
    list
        The summed vector.
    '''
    seq = seq.upper()

    # make all the three-mers and count them
    three_mers = Counter([seq[i:i+3] for i in range(0, len(seq)-2)])

    # multiply the occurrences with the feature vector
    c = [enbedding[k] * v for k, v in three_mers.items()]

    # sum the columns of the generated vectors
    vector_sums = np.array(c).sum(axis=0)

    # convert to int if desired
    if to_int is True:
        # vector_sums is an ndarray, which has no .apply(); round element-wise instead
        return [int(round(value * 100)) for value in vector_sums]

    return vector_sums.tolist()






In [None]:

# load the embedding as a dict (3-mer -> vector) and time score_seq_4 on it
dict_enbedding = load_data_alternate(filename)

# number of repetitions the timing is averaged over
n = 1000

# perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock and can jump when the system time is adjusted
t1 = time.perf_counter()
for i in range(n):
    vector = score_seq_4(seq, dict_enbedding, to_int=False)
t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector)



In [None]:
def load_data_alternate_2(filename, max_count=9):
    '''
    Load the vector data into a nested lookup table with pre-multiplied vectors.

    The result is indexed letter by letter and then by occurrence count:
    ``data[first][second][third][count]`` is the vector of the 3-mer multiplied
    by ``count``, for every count from 1 to ``max_count``.

    Parameters
    ----------
    filename : str
        Path of the tab-separated embedding file. The first field of each line
        must be a three-character key; the ``<unk>`` line and blank lines are
        skipped.
    max_count : int
        Largest occurrence count to pre-compute (default 9).

    Returns
    -------
    dict
        Nested dictionary as described above.

    Raises
    ------
    ValueError
        If a key other than ``<unk>`` is not exactly three characters long.
    '''
    data = {}
    with open(filename, 'r') as f:
        for line in f:
            # a blank line (e.g. at the end of the file) cannot be unpacked
            # into three letters below, so skip it
            if not line.strip():
                continue

            elements = line.rstrip('\r\n').split('\t')

            if elements[0] == '<unk>':
                continue

            one, two, three = elements[0]

            # parse the vector once, then scale it for every count
            base_vector = np.array([float(i) for i in elements[1:]])

            # create the intermediate levels on demand
            per_count = data.setdefault(one, {}).setdefault(two, {}).setdefault(three, {})

            for num in range(1, max_count + 1):
                per_count[num] = base_vector * num

    return data





def score_seq_5(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    Counts the overlapping 3-mers of the upper-cased sequence and sums, for
    each distinct 3-mer, its vector pre-multiplied by the occurrence count.

    Original author's note (not re-measured here): about 4.2 hours for
    10 mio sequences.

    Parameters
    ----------
    seq : str
        Protein sequence.
    enbedding : dict
        Nested table ``enbedding[first][second][third][count]`` -> vector
        already multiplied by ``count``. A 3-mer missing from the table raises
        KeyError.
    to_int : bool
        When exactly ``True``, each component is multiplied by 100 and rounded
        to an int (the same result as ``round_number`` with its default factor).

    Returns
    -------
    list
        The summed vector.
    '''
    seq = seq.upper()

    # make all the three-mers and count them
    three_mers = Counter([seq[i:i+3] for i in range(0, len(seq)-2)])

    # look up the pre-multiplied vector for every (3-mer, count) pair
    c = []
    for k, v in three_mers.items():
        per_count = enbedding[k[0]][k[1]][k[2]]
        scaled = per_count.get(v)
        if scaled is None:
            # only a limited range of counts is pre-computed; for a larger
            # count scale the count-1 vector instead of raising KeyError
            scaled = per_count[1] * v
        c.append(scaled)

    # sum the columns of the generated vectors
    vector_sums = np.array(c).sum(axis=0)

    # convert to int if desired
    if to_int is True:
        # vector_sums is an ndarray, which has no .apply(); round element-wise instead
        return [int(round(value * 100)) for value in vector_sums]

    return vector_sums.tolist()




In [None]:

# load the nested, pre-multiplied lookup table and time score_seq_5 on it
dict_enbedding = load_data_alternate_2(filename)

# number of repetitions the timing is averaged over
n = 1000

# perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock and can jump when the system time is adjusted
t1 = time.perf_counter()
for i in range(n):
    vector = score_seq_5(seq, dict_enbedding, to_int=False)
t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector)

In [None]:
import numpy as np

def load_data_alternate_3(filename, max_count=9):
    '''
    Load the vector data into a two-level lookup table with pre-multiplied vectors.

    ``data[key][count]`` is the vector of ``key`` multiplied by ``count``, for
    every count from 1 to ``max_count``.

    Parameters
    ----------
    filename : str
        Path of the tab-separated embedding file. The ``<unk>`` line and blank
        lines are skipped.
    max_count : int
        Largest occurrence count to pre-compute (default 9).

    Returns
    -------
    dict
        Maps key -> {count -> numpy vector}.
    '''
    data = {}
    with open(filename, 'r') as f:
        for line in f:
            # skip blank lines, which would otherwise create a junk '\n' entry
            if not line.strip():
                continue

            elements = line.rstrip('\r\n').split('\t')

            if elements[0] == '<unk>':
                continue

            kmer = elements[0]

            # parse the vector once, then scale it for every count
            base_vector = np.array([float(i) for i in elements[1:]])

            data[kmer] = {num: base_vector * num for num in range(1, max_count + 1)}

    return data





def score_seq_6(seq, enbedding, to_int=True):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    Counts the overlapping 3-mers of the upper-cased sequence and sums, for
    each distinct 3-mer, its vector pre-multiplied by the occurrence count.

    Original author's note (not re-measured here): about 4.2 hours for
    10 mio sequences.

    Parameters
    ----------
    seq : str
        Protein sequence.
    enbedding : dict
        Table ``enbedding[three_mer][count]`` -> vector already multiplied by
        ``count``. A 3-mer missing from the table raises KeyError.
    to_int : bool
        When exactly ``True``, each component is multiplied by 100 and rounded
        to an int (the same result as ``round_number`` with its default factor).

    Returns
    -------
    list
        The summed vector.
    '''
    seq = seq.upper()

    # make all the three-mers and count them
    three_mers = Counter([seq[i:i+3] for i in range(0, len(seq)-2)])

    # look up the pre-multiplied vector for every (3-mer, count) pair
    c = []
    for k, v in three_mers.items():
        per_count = enbedding[k]
        scaled = per_count.get(v)
        if scaled is None:
            # only a limited range of counts is pre-computed; for a larger
            # count scale the count-1 vector instead of raising KeyError
            scaled = per_count[1] * v
        c.append(scaled)

    # sum the columns of the generated vectors
    vector_sums = np.array(c).sum(axis=0)

    # convert to int if desired
    if to_int is True:
        # vector_sums is an ndarray, which has no .apply(); round element-wise instead
        return [int(round(value * 100)) for value in vector_sums]

    return vector_sums.tolist()

In [None]:

# build the per-count lookup table that score_seq_6 expects
# (this cell only loads the data; the scoring call is timed in the cells below)

dict_enbedding = load_data_alternate_3(filename)
      
# profiling magics kept for reference, currently disabled:
# %timeit score_seq_6(seq, dict_enbedding, to_int=False)

# %prun score_seq_6(seq, dict_enbedding, to_int=False)


In [None]:
# micro-benchmark: time score_seq_6 on the example sequence, without the int conversion
%timeit score_seq_6(seq, dict_enbedding, to_int=False)

In [None]:

# load the per-count lookup table and time score_seq_6 on it
dict_enbedding = load_data_alternate_3(filename)

# number of repetitions the timing is averaged over
n = 10000

# perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock and can jump when the system time is adjusted
t1 = time.perf_counter()
for i in range(n):
    vector = score_seq_6(seq, dict_enbedding, to_int=False)
t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector)

In [7]:

%load_ext cythonmagic


The cythonmagic extension is already loaded. To reload it, use:
  %reload_ext cythonmagic


In [8]:
%%cython
cimport cython
from collections import Counter
import numpy as np
cimport numpy as np
@cython.infer_types(True)
def load_data_alternate_4(str filename):
    '''
    Load the vector data into a two-level lookup table with pre-multiplied vectors.

    ``data[key][count]`` is the vector of ``key`` multiplied by ``count`` for
    count = 1..9. The ``<unk>`` line and blank lines are skipped.

    Declared with ``def`` (not ``cdef``) so that the notebook cells that call
    it from Python can see it; a ``cdef`` function is only visible to other
    Cython code in the same module.
    '''
    cdef unsigned int num
    cdef dict data
    cdef str codon
    data = {}
    with open(filename, 'r') as f:
        for line in f:
            # skip blank lines, which would otherwise create a junk '\n' entry
            if not line.strip():
                continue
            elements = line.rstrip('\r\n').split('\t')
            if elements[0] == '<unk>':
                continue
            codon = elements[0]
            # The fields are strings, so the loop variable must stay an untyped
            # Python object (it was previously declared as a C double).
            base_vector = np.array([float(field) for field in elements[1:]])
            data[codon] = {}
            for num in range(1, 10):
                data[codon][num] = base_vector * num
    return data
@cython.infer_types(True)
def score_seq_7(str seq, dict enbedding):
    '''
    Use the embedding to calculate a feature vector for a protein sequence.

    Counts the overlapping 3-mers of the upper-cased sequence and sums, for
    each distinct 3-mer, its vector pre-multiplied by the occurrence count
    (``enbedding[three_mer][count]``). A 3-mer missing from ``enbedding``
    raises KeyError.

    Original author's note (not re-measured here): about 4.2 hours for
    10 mio sequences.

    Declared with ``def`` (not ``cdef``) so that the notebook cells that call
    it from Python can see it.
    '''
    cdef list c
    seq = seq.upper()
    # make all the three-mers; the index is left as a Python int so that a
    # sequence shorter than two residues yields an empty range instead of
    # wrapping around in unsigned arithmetic
    three_mers = Counter([seq[i:i+3] for i in range(0, len(seq)-2)])
    # look up the pre-multiplied vector for every (3-mer, count) pair
    c = []
    for k, v in three_mers.items():
        per_count = enbedding[k]
        if v in per_count:
            c.append(per_count[v])
        else:
            # only counts 1..9 are pre-computed by the loader; scale the
            # count-1 vector for anything larger instead of raising KeyError
            c.append(per_count[1] * v)
    # sum the columns of the generated vectors
    vector_sums = np.array(c).sum(axis=0)
    return vector_sums.tolist()



UsageError: Cell magic `%%cython` not found.


In [None]:

# load the lookup table with the Cython loader and time the Cython scorer on it
dict_enbedding = load_data_alternate_4(filename)

# number of repetitions the timing is averaged over
n = 10000

# perf_counter() is the clock intended for measuring short durations;
# time.time() is a wall clock and can jump when the system time is adjusted
t1 = time.perf_counter()
for i in range(n):
    vector = score_seq_7(seq, dict_enbedding)
t2 = time.perf_counter()

print('time per seq: %s \n\n' % ((t2-t1)/n))

print(vector)


In [None]:
# rebuild the lookup table with the Cython loader
dict_enbedding = load_data_alternate_4(filename)

# time repeated calls of the Cython scoring function on the example sequence
%timeit score_seq_7(seq, dict_enbedding)

# profile a single call to see where the time is spent
%prun score_seq_7(seq, dict_enbedding)

In [None]:
# show only the table size and a few keys; echoing the whole dict would dump
# every stored vector into the cell output
len(dict_enbedding), list(dict_enbedding)[:5]


In [None]:
# ratio of two hand-pasted numbers — presumably a slower and a faster per-sequence
# timing (in seconds) from the benchmark cells, i.e. the speed-up factor;
# TODO confirm which runs these values came from
0.013638761043548584 / 0.0005056767463684082