<img style="float: left;" src="https://cdn.pixabay.com/photo/2016/12/07/09/45/dna-1889085__340.jpg" width=10%> <h1> Application of AI to Discover Novel Binding of Small Molecules </h1>

---------
### Sample Dataset for Testing Purposes

##### Here we create a sample dataset for two reasons:
- to get a better understanding of the structure of the data
- test any sample code for validity

##### Structure of sample dataset:
1. A dataframe consisting of 50 genes and 1020 profiles [50 x 1020]
2. Columns are a combination of drug, replicate, time, concentration, probe_location, cell type. For the purposes of this project only drug and replicate matters in terms of training. So the column name will be structured as
"*drug + replicate id + unique characters that represent time, concentration, probe_location and cell type*"
3. 20 columns consist of control genes or 'control probes'. Columns are labelled control_x where x is a number from 1 to 20
3. Dataset consists of 25 drugs with 4 replicates and 10 combinations of time, concentration, probe_location and cell type

| Feature      | Quantity | Represented By |
| ----------- | ----------- | ----------- |
| Drug      | 25       | Alphabets A-Y |
| Replicate   | 4        | Numbers 1-4 |
| Other features   | 10        | Random String of length 3 |

***R_3_xcv*** represents a profile of drug 'R', of replicate 3, with other features coresponding to 'xcv'

##### Construction of Sample Dataset

In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
genes = ['gene'+str(a) for a in range(50)]
drugs = [chr(a) for a in range(65, 90)]
replicates = [str(a) for a in range(1, 5)]
other_features = set()

while len(other_features)!=10:
    rand_string = "". join([str(chr(int(random.random()*100)%26+97)) for a in range(3)])
    other_features.add(rand_string)

In [3]:
columns = ["_".join([a,b,c]) for a in drugs for b in replicates for c in other_features]
# columns = ["control_"+str(a+1) for a in range(20)] + columns

In [4]:
data = pd.DataFrame(2*np.random.rand(50, len(columns))-1, index=genes, columns=columns)
data.columns = columns
data.fillna(random.random(), inplace = True)
data.shape

(50, 1000)

In [5]:
data.head()

Unnamed: 0,A_1_gps,A_1_gvy,A_1_vis,A_1_ize,A_1_opp,A_1_jbq,A_1_jkc,A_1_eit,A_1_rtv,A_1_qcj,...,Y_4_gps,Y_4_gvy,Y_4_vis,Y_4_ize,Y_4_opp,Y_4_jbq,Y_4_jkc,Y_4_eit,Y_4_rtv,Y_4_qcj
gene0,0.324634,-0.512199,-0.422898,0.322265,-0.295296,-0.563205,0.741714,-0.219386,0.263429,0.744535,...,0.271809,0.096694,0.130743,0.676475,0.05399,0.466794,-0.158463,0.668328,-0.094773,0.852537
gene1,-0.261772,-0.12072,0.79416,-0.343175,0.52089,0.718406,0.892644,-0.876281,0.304717,-0.721386,...,-0.432493,-0.296607,0.739531,-0.426699,-0.595429,-0.041522,-0.463425,0.367083,-0.220199,0.098687
gene2,0.055482,-0.991596,-0.683812,0.0457,0.331425,0.899492,-0.769606,0.045896,0.710955,-0.489117,...,0.184052,-0.143825,0.820035,-0.866245,-0.104654,-0.585007,0.000547,-0.15808,0.488346,-0.175934
gene3,0.880006,-0.163318,0.864493,-0.922229,0.851741,-0.294169,0.432384,0.318402,-0.1196,-0.047593,...,0.103779,0.905737,-0.189061,0.832541,0.299131,0.005537,-0.31703,0.236083,-0.403237,0.324517
gene4,0.510791,-0.089568,-0.228124,0.145007,0.616179,0.808682,-0.276685,0.596809,-0.399864,-0.753906,...,0.551861,-0.001323,0.091154,-0.226857,0.147661,0.007921,0.953207,-0.570335,0.40874,-0.634798


##### Classifying Columns
A label needs to be assigned to each class. This can be done at the biological replicate level or the perturbagen level. We create classifications for each of these.

In [6]:
perturbagen_class = [int(a/25) for a in range(1000)]
replicate_class = [10*a+c for a in range(25) for b in range(4) for c in range(10)]

##### Creating the dataset

In [7]:
#transpose data
workingdata = data.transpose()
workingdata.head()

Unnamed: 0,gene0,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,...,gene40,gene41,gene42,gene43,gene44,gene45,gene46,gene47,gene48,gene49
A_1_gps,0.324634,-0.261772,0.055482,0.880006,0.510791,-0.394427,0.265938,-0.270761,-0.332389,0.990229,...,-0.084976,-0.485472,0.763649,0.294632,-0.839028,0.169186,0.587716,0.670123,0.50757,0.23537
A_1_gvy,-0.512199,-0.12072,-0.991596,-0.163318,-0.089568,-0.33072,0.266884,0.15143,-0.963103,-0.251559,...,-0.425571,0.829806,-0.030973,-0.63085,0.590557,0.764148,0.974986,-0.073809,0.62166,0.56361
A_1_vis,-0.422898,0.79416,-0.683812,0.864493,-0.228124,-0.054174,-0.444012,-0.043513,-0.696945,-0.274318,...,0.917263,0.574977,-0.024039,-0.120628,-0.432643,-0.334697,-0.604342,0.541312,0.543612,-0.75181
A_1_ize,0.322265,-0.343175,0.0457,-0.922229,0.145007,0.785617,-0.942278,0.100348,-0.217472,-0.4505,...,-0.786332,-0.660057,0.440792,-0.654501,0.639928,-0.266416,-0.491868,0.709543,-0.20861,0.670833
A_1_opp,-0.295296,0.52089,0.331425,0.851741,0.616179,0.267142,-0.134957,0.925613,0.870644,-0.073282,...,-0.992558,-0.244736,0.589236,-0.009589,-0.067329,0.789043,-0.702025,0.277981,0.689783,0.01713


In [8]:
X_train, X_test, y_train, y_test = train_test_split(workingdata, perturbagen_class, test_size=0.5)
X_test.shape

(500, 50)

##### Computation - Siamese

In [9]:
import keras
from keras.datasets import reuters
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Input
from keras.layers.noise import AlphaDropout
from keras.preprocessing.text import Tokenizer
from keras.layers import Layer
from tensorflow.python.keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
max_words = 50
batch_size = 16
epochs = 40

def create_network(n_dense=6,
                   dense_units=16,
                   activation='selu',
                   dropout=AlphaDropout,
                   dropout_rate=0.1,
                   kernel_initializer='lecun_normal',
                   optimizer='adam',
                   num_classes=1,
                   max_words=max_words):
    
    model = Sequential()
    model.add(Dense(dense_units, input_shape=(max_words,),
                    kernel_initializer=kernel_initializer))
    model.add(Activation(activation))
    model.add(dropout(dropout_rate))

    for i in range(n_dense - 1):
        model.add(Dense(dense_units, kernel_initializer=kernel_initializer))
        model.add(Activation(activation))
        model.add(dropout(dropout_rate))

    #model.add(Dense(num_classes))
    #model.add(Activation('softmax'))
    return model

In [11]:
network = {
    'n_dense': 10,
    'dense_units': 16,
    'activation': 'selu',
    'dropout': AlphaDropout,
    'dropout_rate': 0.1,
    'kernel_initializer': 'lecun_normal',
    'optimizer': 'sgd',
    'num_classes':40
}

In [12]:
model = create_network(**network)

In [13]:
len(model.layers)

30

In [14]:
class ManDist(Layer):
    
    # initialize the layer, No need to include inputs parameter!
    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # input_shape will automatic collect input shapes to build layer
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # This is where the layer's logic lives.
    def call(self, x, **kwargs):
        self.result = K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)
        return self.result

    # return output shape
    def compute_output_shape(self, input_shape):
        return K.int_shape(self.result)

In [15]:
left_input = Input(shape=(max_words,))
right_input = Input(shape=(max_words,))

In [16]:
# Model variables
shared_model = model

In [17]:
#TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'- embedding layer is required
#Node error -> from keras not from tf.python.keras
#Input 'b' of 'MatMul' Op has type float32 that does not match type int32 of argument 'a'. ->
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

model.compile(loss='mean_squared_error', optimizer="adam", metrics=['accuracy'])
model.summary()
shared_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 16)           3264        input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
man_dist_1 (ManDist)            (None, 1)            0           sequential_1[1][0]               
          

In [18]:
a = np.asarray(data.transpose().iloc[0:5,:])
b = np.asarray(data.transpose().iloc[6:11,:])

In [19]:
from keras import backend as K

# with a Sequential model
get_3rd_layer_output = K.function([model.layers[0].input,model.layers[1].input]
                                  ,[model.layers[2].get_output_at(1),model.layers[2].get_output_at(2),model.layers[3].output])

layer_output = get_3rd_layer_output([a,b])
layer_output

[array([[-0.19057459,  1.7253517 , -1.6672118 , -0.1841178 ,  0.57209563,
         -1.1599962 ,  0.5523666 ,  0.5079637 ,  0.24208523, -1.1146371 ,
         -1.111214  ,  0.8422288 ,  2.0185504 , -0.52523667,  0.45260412,
          0.33307612],
        [-0.39516768, -0.29591113, -1.5629207 ,  1.0116124 ,  0.17067975,
          0.30304193,  1.8597828 , -0.69617605,  0.8184404 , -0.5511421 ,
         -0.8471207 ,  0.44013354,  0.47050068, -0.13658464, -1.3789027 ,
          1.1954532 ],
        [ 0.44339842,  1.099096  , -1.5248781 , -0.8852858 ,  0.839443  ,
         -1.4595574 ,  0.32434022,  1.4775376 , -1.0955718 , -0.37081409,
         -1.2843187 ,  1.7918526 , -0.5683673 , -1.0145888 ,  0.21341445,
         -0.78439206],
        [-0.52113116, -0.6008608 ,  1.3506528 ,  0.2787183 , -1.4213215 ,
          1.9024509 ,  0.00892367, -1.4003485 ,  0.92487144, -0.45669106,
          1.0628775 , -1.3624649 ,  0.61842984,  0.926004  , -0.45385814,
          1.1992599 ],
        [-0.5786603 

In [20]:
one = layer_output[0][0]

two = layer_output[1][0]

result = sum(abs(one - two))
result

15.431805327534676

In [21]:
#ValueError: Error when checking target: expected man_dist_1 to have shape (1,) but got array with shape (46,)
#==> need to convert code to suit multi-class

malstm_trained = model.fit([X_train,X_test], y_train, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [22]:
prediction = model.predict([X_test,X_train],verbose=1)
print(prediction[0:5])

[[ 1.7313926]
 [14.990277 ]
 [35.39305  ]
 [38.89383  ]
 [12.070194 ]]


In [23]:
score = model.evaluate([X_test,X_train],y_train,verbose=1)
score



[275.4162053222656, 0.036]

In [24]:
model.layers[2].get_weights()[0].shape

(50, 16)

In [25]:
model.layers[2].get_weights()[0]

array([[ 1.33087402e-02,  1.50969541e-02,  1.83586195e-01,
        -1.21911585e-01,  2.19751149e-01, -1.70956686e-01,
        -1.65725246e-01,  2.01449186e-01, -6.53685778e-02,
        -3.84455286e-02,  1.99463703e-02, -3.30686532e-02,
         2.02107176e-01,  1.59298882e-01,  1.53938800e-01,
         9.80899855e-02],
       [ 2.41964743e-01, -5.12437522e-02,  5.88559918e-02,
         2.90473215e-02, -9.78905335e-02, -1.00420691e-01,
         4.85755205e-02,  4.00516056e-02, -2.23506838e-01,
        -6.63971454e-02,  1.73637360e-01,  9.34533179e-02,
         1.32114105e-02, -9.23417136e-02,  6.21325634e-02,
         1.78486127e-02],
       [ 4.36170429e-01,  8.08338821e-02, -1.50025627e-02,
         2.08037905e-03,  1.47101626e-01,  1.40257806e-01,
         2.66554505e-01,  2.35853195e-01,  1.80527627e-01,
         2.45074153e-01, -4.17984836e-02,  5.09905666e-02,
         1.54581219e-01, -2.36850426e-01,  2.13808715e-01,
         8.51523578e-02],
       [ 2.00655133e-01,  1.07609473e

In [26]:
for i in model.layers[2].get_weights():
    print(i.shape)

(50, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)


In [27]:
model.layers[2].get_output_at(0)

<tf.Tensor 'alpha_dropout_10/cond/Merge:0' shape=(?, 16) dtype=float32>

In [28]:
model.layers[2].get_output_at(2)

<tf.Tensor 'sequential_1_1/alpha_dropout_10/cond/Merge:0' shape=(?, 16) dtype=float32>

In [29]:
model.layers[2]

<keras.engine.sequential.Sequential at 0x1a297c9eb8>

In [30]:
from keras.models import Model

intermediate_layer_model = Model(inputs=[model.get_layer(index=0).get_input_at(0),
                                         model.get_layer(index=1).get_input_at(0)]
                                 ,outputs=[model.get_layer(index=1).get_output_at(0),
                                           model.get_layer(index=1).get_output_at(0)])

In [31]:
a = np.asarray(data.transpose().iloc[0:5,:])
b = np.asarray(data.transpose().iloc[6:11,:])

In [32]:
from keras import backend as K

# with a Sequential model
get_3rd_layer_output = K.function([model.layers[0].input,model.layers[1].input]
                                  ,[model.layers[2].get_output_at(1),model.layers[2].get_output_at(2),model.layers[3].output])

In [33]:
layer_output = get_3rd_layer_output([a,b])
layer_output

[array([[-1.4162121 ,  4.810144  , -1.6980445 ,  1.1788609 ,  2.8066883 ,
          0.80700696, -1.7158511 ,  2.5211751 ,  1.3627063 ,  3.1732688 ,
          0.7874196 ,  0.57305425,  0.14707136,  1.3254292 ,  2.5904388 ,
          0.47538853],
        [-0.37818435, -0.52372587, -0.8668064 , -0.22572114, -0.09839456,
          0.16721722,  1.0830376 , -0.9204821 ,  1.9335619 , -1.0867276 ,
         -0.7180021 ,  0.08871941, -1.0329604 ,  0.8129696 ,  1.5617628 ,
         -0.66331667],
        [-0.940143  ,  4.3024282 , -1.637966  ,  0.732653  ,  2.4457934 ,
          0.30171287, -1.7070627 ,  2.733851  ,  1.3159599 ,  2.6855285 ,
          0.51377   ,  0.33176097,  0.01913486,  1.062967  ,  2.3257005 ,
          0.55971825],
        [ 0.88518065, -1.0465701 ,  1.2154307 , -1.34556   , -1.5633222 ,
          1.0690651 ,  1.084691  , -1.4652343 ,  0.10886915, -1.2274033 ,
         -0.72411335, -1.0061591 ,  0.7194428 , -0.5950448 , -1.0028086 ,
         -0.26274145],
        [-1.3458608 

In [34]:
one = layer_output[0][0]

In [35]:
two = layer_output[1][0]

In [36]:
result = sum(abs(one - two))
result

37.524234503507614

In [37]:
y_train

[14,
 20,
 9,
 7,
 37,
 17,
 28,
 35,
 3,
 6,
 7,
 20,
 35,
 15,
 8,
 12,
 5,
 5,
 13,
 3,
 28,
 37,
 19,
 12,
 0,
 26,
 6,
 19,
 29,
 3,
 10,
 13,
 16,
 2,
 0,
 35,
 8,
 31,
 17,
 37,
 11,
 13,
 7,
 29,
 37,
 2,
 10,
 38,
 14,
 35,
 33,
 20,
 8,
 35,
 12,
 30,
 4,
 22,
 25,
 23,
 33,
 22,
 11,
 24,
 35,
 4,
 30,
 14,
 38,
 22,
 29,
 33,
 9,
 32,
 7,
 34,
 35,
 26,
 18,
 29,
 26,
 24,
 5,
 24,
 37,
 2,
 8,
 31,
 38,
 23,
 24,
 24,
 3,
 8,
 17,
 30,
 27,
 34,
 18,
 20,
 9,
 18,
 31,
 30,
 33,
 26,
 27,
 8,
 23,
 28,
 22,
 1,
 17,
 5,
 23,
 18,
 19,
 2,
 29,
 19,
 16,
 39,
 30,
 23,
 14,
 25,
 8,
 29,
 25,
 11,
 3,
 1,
 31,
 35,
 39,
 24,
 27,
 16,
 16,
 14,
 14,
 4,
 11,
 28,
 33,
 23,
 22,
 33,
 3,
 36,
 11,
 1,
 0,
 25,
 27,
 21,
 35,
 32,
 33,
 15,
 7,
 11,
 32,
 29,
 9,
 12,
 32,
 16,
 6,
 23,
 31,
 2,
 15,
 16,
 31,
 16,
 32,
 34,
 25,
 25,
 21,
 9,
 20,
 8,
 10,
 37,
 18,
 1,
 1,
 32,
 6,
 3,
 10,
 18,
 10,
 6,
 37,
 38,
 27,
 1,
 5,
 24,
 33,
 34,
 36,
 38,
 27,
 24,
 19,
 9,
 12