In [1]:
#Ben Habermeyer Second Final Project Script

#read in the parsed data of molecules, kinases, and their inhibitors
#NOTE: the Colab upload snippet below is disabled by wrapping it in a string literal.
#Because that string is the last expression of the cell, it is echoed back as the cell output.
'''
import pandas as pd
from google.colab import files
uploaded = files.upload()
'''

'\nimport pandas as pd\nfrom google.colab import files\nuploaded = files.upload()\n'

In [2]:
import pandas as pd
import io
#load the parsed table, keep only the rows that have a protein sequence,
#and renumber the remaining rows from 0
final_data = (
    pd.read_csv('all_data.csv')
    .dropna(subset = ['prot_sequence'], how = 'any')
    .reset_index(drop = True)
)
final_data.head()

Unnamed: 0,compound_id,target_id,standard_type,standard_value,standard_units,prot_sequence,smiles_sequence
0,CHEMBL135581,P00374,KD,0.5,NM,SLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGKQN...,CCCN1C(=CC=C2C1=NC(NC2=O)N)C
1,CHEMBL135581,P00374,KD,0.6,NM,SLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGKQN...,CCCN1C(=CC=C2C1=NC(NC2=O)N)C
2,CHEMBL135581,P00374,KD,450.0,NM,SLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGKQN...,CCCN1C(=CC=C2C1=NC(NC2=O)N)C
3,CHEMBL135581,P00374,KD,1000.0,NM,SLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGKQN...,CCCN1C(=CC=C2C1=NC(NC2=O)N)C
4,CHEMBL135581,P00374,KD,1200.0,NM,SLNCIVAVSQNMGIGKNGDLPWPPLRNEFRYFQRMTTTSSVEGKQN...,CCCN1C(=CC=C2C1=NC(NC2=O)N)C


In [3]:
#pull the three model inputs out of the frame as plain Python lists:
#the Kd values, the protein sequences, and the SMILES strings (in that order)
kd, protein, compound = (
    final_data[column].tolist()
    for column in ('standard_value', 'prot_sequence', 'smiles_sequence')
)
#the frame is not used again once the lists exist, so release it
del final_data
#del uploaded

In [4]:
#class wrapping the one-hot encoding, Kd conversion, train/test split and CNN training steps
import functools
import time
from math import log10

import numpy as np
import sklearn.model_selection as ms
from keras.layers import Conv2D, Dense, Flatten, Input, MaxPooling2D, concatenate
from keras.models import Model, Sequential


class KinaseBinding:
    """One-hot encodes compound/protein pairs and fits a CNN on their Kd values.

    Parameters
    ----------
    smiles_strings : sequence of str
        One SMILES string per pair.
    protein_strings : sequence of str
        One amino-acid sequence per pair (same length/order as ``smiles_strings``).
    kd_values : sequence of numbers (or numeric strings)
        One Kd value per pair; ``convertKd`` treats them as nanomolar.
    """

    def __init__(self, smiles_strings, protein_strings, kd_values):
        self.kd_values = kd_values
        self.protein_strings = protein_strings
        self.smiles_strings = smiles_strings
        self.num_pairs = len(smiles_strings)
        #every string is encoded into a fixed (encoding_length x encoding_features) grid
        self.encoding_length = 300
        self.encoding_features = 21

        #variables for one-hot encoding
        self.AA_mappings = {'G': 0, 'A': 1, 'L': 2, 'M': 3, 'F': 4, 'W': 5, 'K': 6, 'Q': 7, 'E': 8, 'S': 9,
          'P': 10, 'V': 11, 'I': 12, 'C': 13, 'Y': 14, 'H': 15, 'R': 16, 'N': 17, 'D': 18, 'T': 19, 'X': 20}

        self.compound_mappings = {'#': 15, '(': 2, ')': 3, '+': 17, '-': 18, '.': 16, '/': 13, '=': 1, '@': 6,
                         'C': 0, 'F': 10, 'H': 9, 'N': 4, 'O': 5, 'S': 11, '[': 7, '\\': 14, ']': 8,
                         'a': 19, 'l': 12, 'B': 20}

    #delete method
    #def __del__(self):


    #decorator to time each of the function operations
    def fun_timer(f):
        """Wrap ``f`` so that each call prints its wall-clock runtime and returns f's result."""
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            start = time.perf_counter()
            output = f(*args, **kwargs)
            end = time.perf_counter()
            runtime = end - start
            print("Finished running {} in {} seconds".format(f.__name__, runtime))
            return output
        return wrapper

    @fun_timer
    def oneHotCompound(self):
        """One-hot encode the SMILES strings.

        Returns a float array of shape (num_pairs, encoding_length, encoding_features).
        Characters found in ``compound_mappings`` set one position each. A digit ``d``
        is written as ``d`` consecutive 'H' positions. Any other character leaves its
        position all-zero. Encoding stops once ``encoding_length`` positions are used.
        """
        array = np.zeros((self.num_pairs, self.encoding_length, self.encoding_features))
        hydrogen = self.compound_mappings['H']
        ## iterate through the strings and encode their characters
        for i, string in enumerate(self.smiles_strings):
            counter = 0
            for c in string:
                if counter >= self.encoding_length:
                    break
                if c in self.compound_mappings:
                    array[i, counter, self.compound_mappings[c]] = 1
                    counter += 1
                    continue
                try:
                    #set integers to be the number of hydrogens
                    #NOTE(review): SMILES digits normally label ring closures - confirm this is intended
                    toint = int(c)
                except ValueError:
                    #unmapped, non-numeric character: skip one (all-zero) position
                    counter += 1
                    continue
                #clip the run of hydrogens at the end of the encoding window
                stop = min(counter + toint, self.encoding_length)
                array[i, counter:stop, hydrogen] = 1
                counter += toint
        return array

    @fun_timer
    def oneHotProtein(self):
        """One-hot encode the protein sequences.

        Returns a float array of shape (num_pairs, encoding_length, encoding_features).
        Sequences longer than ``encoding_length`` are truncated (matching
        ``oneHotCompound``) instead of raising IndexError, and residues missing from
        ``AA_mappings`` are encoded as 'X' instead of raising KeyError.
        """
        array = np.zeros((self.num_pairs, self.encoding_length, self.encoding_features))
        unknown = self.AA_mappings['X']
        #iterate through the proteins and encode their amino acids
        for i, string in enumerate(self.protein_strings):
            for j, c in enumerate(string[:self.encoding_length]):
                array[i, j, self.AA_mappings.get(c, unknown)] = 1
        return array

    @fun_timer
    def convertKd(self):
        """Convert the Kd values to -log10(Kd in molar).

        Values are treated as nanomolar, so each is scaled by 1e-9 before taking the
        logarithm (e.g. 1 nM -> 9.0, 1000 nM -> 6.0). Non-positive or NaN values have
        no logarithm and are replaced by a 0.01 nM placeholder (-> 11.0).

        Returns a 1-D float numpy array with one entry per Kd value.
        """
        values = np.asarray([float(x) for x in self.kd_values], dtype = float)
        values = np.where(values > 0, values, 0.01)
        return -np.log10(values * 1e-9)

    @fun_timer
    def split(self, compound, protein, Kd):
        """Split the three aligned arrays 80/20 into train and test with a fixed seed.

        Returns (Xtrain1, Xtest1, Xtrain2, Xtest2, ytrain, ytest), where the "1" arrays
        come from ``compound``, the "2" arrays from ``protein`` and y from ``Kd``.
        """
        Xtrain1, Xtest1, Xtrain2, Xtest2, ytrain, ytest = ms.train_test_split(compound, protein, Kd, test_size = 0.2, random_state = 42)
        return (Xtrain1, Xtest1, Xtrain2, Xtest2, ytrain, ytest)

    @fun_timer
    def cnnModel1(self):
        """Train the first CNN: both encodings are stacked as channels and convolved.

        Encodes the data, splits it, then fits a Conv2D -> Dense(100) -> Dense(20) ->
        Dense(1) network on the training portion for 3 epochs with MSE loss.
        The test portion is not used here.

        Returns the fitted Keras model.
        """
        (Xtrain1, Xtest1, Xtrain2, Xtest2, ytrain, ytest) = self.split(
            self.oneHotCompound(), self.oneHotProtein(), self.convertKd())
        input_shape = (self.encoding_length, self.encoding_features, 1)
        #each input gets its own trailing channel axis; the compound array must be
        #reshaped from Xtrain1 (not Xtrain2) or the model would see the protein twice
        Xtrain1 = Xtrain1.reshape((Xtrain1.shape[0],) + input_shape)
        Xtrain2 = Xtrain2.reshape((Xtrain2.shape[0],) + input_shape)

        main_input = Input(shape = input_shape, name = 'main_input')
        auxiliary_input = Input(shape = input_shape, name = 'aux_input')
        #concatenate joins on the last axis, giving a two-channel "image" per pair
        x = concatenate([main_input, auxiliary_input])

        x = Conv2D(32, kernel_size = 5, activation = 'relu')(x)
        x = Flatten()(x)
        x = Dense(100, activation='relu')(x)
        x = Dense(20, activation='relu')(x)
        main_output = Dense(1)(x)
        model = Model(inputs=[main_input, auxiliary_input], outputs = main_output)
        model.compile(optimizer = 'adam', loss = 'mean_squared_error')
        model.fit([Xtrain1, Xtrain2], ytrain, epochs = 3)
        return model

ModuleNotFoundError: No module named 'keras'

In [0]:
#build the wrapper from the raw lists, then encode, split and train the first CNN
k = KinaseBinding(smiles_strings = compound, protein_strings = protein, kd_values = kd)
k.cnnModel1()

Finished running oneHotCompound in 7.283366845000273 seconds
Finished running oneHotProtein in 14.574471961999734 seconds
Finished running convertKd in 0.039141035999818996 seconds


In [0]:
#encode everything once so the arrays can be inspected outside the class
#NOTE: this rebinds `compound` and `kd` from the raw lists to the encoded arrays,
#so running this cell a second time would pass arrays to the constructor
k = KinaseBinding(smiles_strings = compound, protein_strings = protein, kd_values = kd)
prot, compound, kd = k.oneHotProtein(), k.oneHotCompound(), k.convertKd()

Finished running oneHotProtein in 14.548467050999534 seconds
Finished running oneHotCompound in 8.05690279600094 seconds
Finished running convertKd in 0.052365761999681126 seconds


In [0]:
import numpy as np
import pandas as pd
import keras
import sklearn.model_selection as ms
from keras.models import Model
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, concatenate, Input

#smoke test of the two-input CNN on random arrays shaped like the one-hot encodings
x1 = np.random.rand(500, 300, 21)
x2 = np.random.rand(500, 300, 21)
y = np.random.rand(500, 1)

Xtrain1, Xtest1, Xtrain2, Xtest2, ytrain, ytest = ms.train_test_split(x1, x2, y, test_size = 0.2, random_state = 42)
#add the trailing channel axis to each input; the first input must be built from
#Xtrain1 (it was previously built from Xtrain2, feeding the same data to both inputs)
Xtrain1 = Xtrain1.reshape(Xtrain1.shape[0], 300, 21, 1)
Xtrain2 = Xtrain2.reshape(Xtrain2.shape[0], 300, 21, 1)

main_input = Input(shape = (300, 21, 1), name = 'main_input')
auxiliary_input = Input(shape = (300, 21, 1), name = 'aux_input')
#concatenate joins on the last axis, giving a (300, 21, 2) two-channel input
x = concatenate([main_input, auxiliary_input])

x = Conv2D(32, kernel_size = 5, activation = 'relu')(x)
x = Flatten()(x)
x = Dense(100, activation='relu')(x)
x = Dense(20, activation='relu')(x)
main_output = Dense(1)(x)
model = Model(inputs=[main_input, auxiliary_input], outputs = main_output)
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
model.fit([Xtrain1, Xtrain2], ytrain, epochs = 3)

Epoch 1/1


<keras.callbacks.History at 0x7f48cb7d3f60>