<img style="float: left;" src="https://cdn.pixabay.com/photo/2016/12/07/09/45/dna-1889085__340.jpg" width=10%> <h1> Application of AI to Discover Novel Binding of Small Molecules </h1>

---------
### Sample Dataset for Testing Purposes

##### Here we create a sample dataset for two reasons:
- to get a better understanding of the structure of the data
- test any sample code for validity

##### Structure of sample dataset:
1. A dataframe consisting of 50 genes and 1020 profiles [50 x 1020]
2. Columns are a combination of drug, replicate, time, concentration, probe_location, cell type. For the purposes of this project only drug and replicate matters in terms of training. So the column name will be structured as
"*drug + replicate id + unique characters that represent time, concentration, probe_location and cell type*"
3. 20 columns consist of control genes or 'control probes'. Columns are labelled control_x where x is a number from 1 to 20
3. Dataset consists of 25 drugs with 4 replicates and 10 combinations of time, concentration, probe_location and cell type

| Feature      | Quantity | Represented By |
| ----------- | ----------- | ----------- |
| Drug      | 25       | Alphabets A-Y |
| Replicate   | 4        | Numbers 1-4 |
| Other features   | 10        | Random String of length 3 |

***R_3_xcv*** represents a profile of drug 'R', of replicate 3, with other features coresponding to 'xcv'

##### Construction of Sample Dataset

In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
genes = ['gene'+str(a) for a in range(50)]
drugs = [chr(a) for a in range(65, 90)]
replicates = [str(a) for a in range(1, 5)]
other_features = set()

while len(other_features)!=10:
    rand_string = "". join([str(chr(int(random.random()*100)%26+97)) for a in range(3)])
    other_features.add(rand_string)

In [3]:
columns = ["_".join([a,b,c]) for a in drugs for b in replicates for c in other_features]
# columns = ["control_"+str(a+1) for a in range(20)] + columns

In [4]:
data = pd.DataFrame(2*np.random.rand(50, len(columns))-1, index=genes, columns=columns)
data.columns = columns
data.fillna(random.random(), inplace = True)
data.shape

(50, 1000)

In [5]:
data.head()

Unnamed: 0,A_1_iwy,A_1_fjp,A_1_hag,A_1_dgp,A_1_trm,A_1_rkm,A_1_fhu,A_1_iiv,A_1_bwy,A_1_kce,...,Y_4_iwy,Y_4_fjp,Y_4_hag,Y_4_dgp,Y_4_trm,Y_4_rkm,Y_4_fhu,Y_4_iiv,Y_4_bwy,Y_4_kce
gene0,-0.585731,-0.338173,-0.84895,-0.414005,-0.432149,-0.064648,-0.362521,-0.231909,-0.983928,0.980269,...,0.530974,-0.966098,0.074459,0.1824,-0.250329,-0.019644,0.634636,-0.377086,0.239632,0.013087
gene1,0.792248,0.317465,-0.785642,0.864102,0.270879,-0.438737,-0.124037,-0.548125,-0.664701,0.16301,...,0.402171,-0.102463,0.498658,-0.607392,0.508521,0.480039,0.857599,-0.139327,-0.995036,-0.427308
gene2,0.926341,0.533271,-0.173927,-0.541883,0.108371,0.488638,0.391089,-0.902297,0.393086,0.035895,...,0.507484,0.540233,-0.221214,-0.438883,-0.947816,-0.501642,0.817394,0.014788,-0.670321,-0.762475
gene3,-0.404645,0.078589,-0.320743,-0.281696,-0.426724,0.054142,-0.297484,-0.760635,0.53073,0.653217,...,0.633859,-0.958401,-0.985946,-0.982073,-0.87338,0.062543,0.446802,-0.613798,0.786497,0.974863
gene4,-0.77237,-0.850649,0.415149,0.929218,-0.826904,-0.310194,-0.424609,0.206385,0.86404,-0.637466,...,-0.685588,0.810977,0.634511,-0.535636,-0.761013,-0.597972,-0.21594,0.503529,0.886878,0.921023


##### Classifying Columns
A label needs to be assigned to each class. This can be done at the biological replicate level or the perturbagen level. We create classifications for each of these.

In [6]:
perturbagen_class = [int(a/25) for a in range(1000)]
replicate_class = [10*a+c for a in range(25) for b in range(4) for c in range(10)]

##### Creating the dataset

In [7]:
#transpose data
workingdata = data.transpose()
workingdata.head()

Unnamed: 0,gene0,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,...,gene40,gene41,gene42,gene43,gene44,gene45,gene46,gene47,gene48,gene49
A_1_iwy,-0.585731,0.792248,0.926341,-0.404645,-0.77237,0.235831,-0.261996,-0.817228,0.571489,0.680681,...,-0.863122,-0.098806,-0.43486,0.061512,-0.102001,-0.859851,0.127113,0.719368,0.270667,0.609866
A_1_fjp,-0.338173,0.317465,0.533271,0.078589,-0.850649,-0.740475,-0.340657,0.315695,0.497341,-0.376417,...,0.420833,-0.548909,0.438191,-0.181363,0.900857,0.833154,-0.04007,-0.062911,0.430353,0.874539
A_1_hag,-0.84895,-0.785642,-0.173927,-0.320743,0.415149,-0.001423,-0.477811,0.558635,0.510741,-0.812126,...,0.274374,0.769566,-0.765945,0.808187,0.450865,0.316246,0.434813,-0.559252,-0.53084,0.286062
A_1_dgp,-0.414005,0.864102,-0.541883,-0.281696,0.929218,-0.269313,0.176925,0.020684,0.873631,0.735225,...,-0.058102,0.96907,-0.02002,-0.328967,0.436305,0.418331,-0.055941,0.745828,-0.232475,-0.887516
A_1_trm,-0.432149,0.270879,0.108371,-0.426724,-0.826904,0.416359,0.333771,0.487096,-0.364629,0.456554,...,-0.933853,-0.352177,-0.321453,0.840804,0.375265,0.100698,-0.626591,0.36912,-0.805015,-0.879459


In [8]:
X_train, X_test, y_train, y_test = train_test_split(workingdata, perturbagen_class, test_size=0.5)
X_test.shape

(500, 50)

##### Computation - Siamese

In [9]:
import keras
from keras.datasets import reuters
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Input
from keras.layers.noise import AlphaDropout
from keras.preprocessing.text import Tokenizer
from keras.layers import Layer
from tensorflow.python.keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [10]:
max_words = 50
batch_size = 16
epochs = 40

def create_network(n_dense=6,
                   dense_units=16,
                   activation='selu',
                   dropout=AlphaDropout,
                   dropout_rate=0.1,
                   kernel_initializer='lecun_normal',
                   optimizer='adam',
                   num_classes=1,
                   max_words=max_words):
    
    model = Sequential()
    model.add(Dense(dense_units, input_shape=(max_words,),
                    kernel_initializer=kernel_initializer))
    model.add(Activation(activation))
    model.add(dropout(dropout_rate))

    for i in range(n_dense - 1):
        model.add(Dense(dense_units, kernel_initializer=kernel_initializer))
        model.add(Activation(activation))
        model.add(dropout(dropout_rate))

    #model.add(Dense(num_classes))
    #model.add(Activation('softmax'))
    return model

In [11]:
network = {
    'n_dense': 10,
    'dense_units': 16,
    'activation': 'selu',
    'dropout': AlphaDropout,
    'dropout_rate': 0.1,
    'kernel_initializer': 'lecun_normal',
    'optimizer': 'sgd',
    'num_classes':40
}

In [12]:
model = create_network(**network)

In [13]:
len(model.layers)

30

In [14]:
class ManDist(Layer):
    
    # initialize the layer, No need to include inputs parameter!
    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # input_shape will automatic collect input shapes to build layer
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # This is where the layer's logic lives.
    def call(self, x, **kwargs):
        self.result = K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)
        return self.result

    # return output shape
    def compute_output_shape(self, input_shape):
        return K.int_shape(self.result)

In [15]:
left_input = Input(shape=(max_words,))
right_input = Input(shape=(max_words,))

In [16]:
# Model variables
shared_model = model

In [17]:
#TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'- embedding layer is required
#Node error -> from keras not from tf.python.keras
#Input 'b' of 'MatMul' Op has type float32 that does not match type int32 of argument 'a'. ->
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

model.compile(loss='mean_squared_error', optimizer="adam", metrics=['accuracy'])
model.summary()
shared_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 16)           3264        input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
man_dist_1 (ManDist)            (None, 1)            0           sequential_1[1][0]               
          

In [18]:
a = np.asarray(data.transpose().iloc[0:5,:])
b = np.asarray(data.transpose().iloc[6:11,:])

In [19]:
from keras import backend as K

# with a Sequential model
get_3rd_layer_output = K.function([model.layers[0].input,model.layers[1].input]
                                  ,[model.layers[2].get_output_at(1),model.layers[2].get_output_at(2),model.layers[3].output])

layer_output = get_3rd_layer_output([a,b])
layer_output

[array([[-1.14393604e+00,  1.00285359e-01,  5.65997243e-01,
          1.89393342e+00, -1.26365149e+00,  7.54402161e-01,
          8.35867524e-01, -1.47565651e+00, -1.28676701e+00,
          6.77389055e-02, -4.17713046e-01, -1.60002601e+00,
          1.11427772e+00, -1.02718377e+00, -1.17027390e+00,
         -9.16135371e-01],
        [-1.09757984e+00,  7.96376765e-01, -1.41847044e-01,
          1.14815509e+00, -1.06273246e+00,  5.46619713e-01,
          5.09884238e-01, -1.43541610e+00, -4.63215023e-01,
          2.80170143e-01, -1.01332793e-04, -1.47560346e+00,
          1.02916729e+00, -9.34417903e-01, -3.93301696e-01,
         -4.31014150e-01],
        [-8.79728317e-01,  5.44172168e-01,  3.04633915e-01,
          2.63747096e-01, -1.07638562e+00,  9.43134055e-02,
          5.75842381e-01, -1.07977843e+00, -3.19184870e-01,
          6.93463147e-01,  4.46740448e-01, -1.15901625e+00,
          6.30568802e-01, -1.00124407e+00, -3.46650369e-02,
         -1.20436266e-01],
        [ 1.6120368

In [20]:
one = layer_output[0][0]

two = layer_output[1][0]

result = sum(abs(one - two))
result

20.079696476459503

In [21]:
#ValueError: Error when checking target: expected man_dist_1 to have shape (1,) but got array with shape (46,)
#==> need to convert code to suit multi-class
malstm_trained = model.fit([X_train,X_test], y_train, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [22]:
prediction = model.predict([X_test,X_train],verbose=1)
print(prediction[0:5])

[[19.641825]
 [32.142303]
 [10.714052]
 [27.809505]
 [30.87245 ]]


In [23]:
score = model.evaluate([X_test,X_train],y_train,verbose=1)
score



[204.50296545410157, 0.02800000002980232]

In [24]:
model.layers[2].get_weights()[0].shape

(50, 16)

In [25]:
model.layers[2].get_weights()[0]

array([[-3.98882665e-02,  1.56161025e-01, -1.99687451e-01,
        -1.33198693e-01, -1.47607997e-01, -2.09831446e-01,
        -1.65249035e-01,  1.58143174e-02,  6.42508268e-02,
         8.53428431e-03,  5.35389781e-02,  1.73134655e-01,
        -1.04796641e-01, -8.43193904e-02,  4.96364990e-03,
         1.46715567e-01],
       [-2.05321163e-02, -1.91518858e-01, -9.43870004e-03,
         6.85879728e-03, -1.11483023e-01, -4.46117967e-02,
         4.15540747e-02,  1.16432561e-02, -4.86577190e-02,
         7.36233294e-02,  2.34275490e-01, -3.25187407e-02,
         2.30401624e-02, -2.72202075e-01,  6.43976331e-02,
         1.43086523e-01],
       [-2.51227021e-01,  3.20253000e-02, -2.55489677e-01,
         9.63021163e-03,  2.93699093e-02, -7.69109577e-02,
        -9.06576738e-02,  3.09058372e-02, -1.84841067e-01,
        -2.14128513e-02, -1.84613883e-01,  1.34473041e-01,
         6.16061874e-02, -2.53114216e-02, -1.50752276e-01,
        -1.13795191e-01],
       [ 6.86898455e-02,  6.96950853e

In [26]:
for i in model.layers[2].get_weights():
    print(i.shape)

(50, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)
(16, 16)
(16,)


In [27]:
model.layers[2].get_output_at(0)

<tf.Tensor 'alpha_dropout_10/cond/Merge:0' shape=(?, 16) dtype=float32>

In [28]:
model.layers[2].get_output_at(2)

<tf.Tensor 'sequential_1_1/alpha_dropout_10/cond/Merge:0' shape=(?, 16) dtype=float32>

In [29]:
model.layers[2]

<keras.engine.sequential.Sequential at 0x1a247deeb8>

In [30]:
from keras.models import Model

intermediate_layer_model = Model(inputs=[model.get_layer(index=0).get_input_at(0),
                                         model.get_layer(index=1).get_input_at(0)]
                                 ,outputs=[model.get_layer(index=1).get_output_at(0),
                                           model.get_layer(index=1).get_output_at(0)])

In [31]:
a = np.asarray(data.transpose().iloc[0:5,:])
b = np.asarray(data.transpose().iloc[6:11,:])

In [32]:
from keras import backend as K

# with a Sequential model
get_3rd_layer_output = K.function([model.layers[0].input,model.layers[1].input]
                                  ,[model.layers[2].get_output_at(1),model.layers[2].get_output_at(2),model.layers[3].output])

In [33]:
layer_output = get_3rd_layer_output([a,b])
layer_output

[array([[ 1.2178053 , -0.67925286,  0.8479426 ,  1.3520645 ,  0.9147324 ,
          0.02230643,  1.0416962 ,  0.67125744,  1.3859992 ,  0.33018523,
          1.7640773 ,  2.313894  , -1.1387967 , -1.4419667 ,  1.359672  ,
          0.49491233],
        [-0.46968198,  1.8324721 ,  0.57460415, -0.03054466, -1.6573837 ,
          0.7466898 ,  0.11166595, -1.724245  , -1.593303  , -1.1651725 ,
         -1.5293831 , -1.6696548 ,  0.6350355 , -1.0289538 , -0.7192443 ,
         -0.9055498 ],
        [ 0.6086133 , -1.2438785 ,  0.57854325,  1.103818  ,  1.5043063 ,
         -0.7744901 ,  0.49987602,  1.8905902 ,  1.3587424 , -0.15930353,
          2.1934636 ,  2.3986504 , -0.42720562, -1.4964793 ,  0.621618  ,
          0.60395056],
        [ 0.7530304 , -0.86721796,  0.47572646,  0.9167299 ,  1.6337068 ,
         -0.20925544,  0.73408604,  1.0547231 ,  1.3113996 ,  0.03291405,
          1.7843642 ,  2.7690144 , -0.1605715 , -1.2548864 ,  0.77564466,
          0.6050003 ],
        [ 0.7247334 

In [34]:
one = layer_output[0][0]

In [35]:
two = layer_output[1][0]

In [36]:
result = sum(abs(one - two))
result

30.08117324113846

In [37]:
layer_output = get_3rd_layer_output([b,a])
layer_output

[array([[-0.5632467 ,  1.804676  ,  0.1686847 ,  0.04001371, -1.6628855 ,
          0.6959573 ,  0.12643532, -1.7249122 , -1.5779355 , -1.0692519 ,
         -1.5872409 , -1.6329705 ,  0.590402  , -1.0818702 , -0.6908595 ,
         -0.96589077],
        [-0.28702068,  1.6033876 ,  0.42751333, -0.03574187, -1.6378826 ,
          0.8348357 , -0.41826558, -1.704447  , -1.3783648 , -0.98635346,
         -1.3928478 , -1.6756785 ,  0.4065344 , -0.7133683 , -0.64820457,
         -0.5385731 ],
        [ 0.9274327 , -0.80831265,  0.718575  ,  1.3737953 ,  1.116997  ,
         -0.21497977,  0.8157609 ,  1.0136461 ,  1.4982288 ,  0.05290867,
          2.105413  ,  2.0475025 , -0.9676619 , -1.496131  ,  1.2694614 ,
          0.48783615],
        [-0.89601386,  1.7417355 ,  0.13494128,  0.355679  , -1.6296377 ,
          0.7322764 ,  0.3236893 , -1.728319  , -1.6389868 , -1.1572663 ,
         -1.6292186 , -1.6039343 ,  0.72593266, -1.1258519 , -0.7710033 ,
         -1.1109194 ],
        [ 1.0481032 

In [38]:
one = layer_output[0][0]

In [39]:
two = layer_output[1][0]

In [40]:
result = sum(abs(one - two))
result

30.08117324113846

In [None]:
def make_oneshot_task(N, s="val", language=None):
    """Create pairs of test image, support set for testing N way one-shot learning. """
    if s == 'train':
        X = Xtrain
        categories = train_classes
    else:
        X = Xval
        categories = val_classes
    n_classes, n_examples, w, h = X.shape
    
    indices = rng.randint(0, n_examples,size=(N,))
    if language is not None: # if language is specified, select characters for that language
        low, high = categories[language]
        if N > high - low:
            raise ValueError("This language ({}) has less than {} letters".format(language, N))
        categories = rng.choice(range(low,high),size=(N,),replace=False)
        else: # if no language specified just pick a bunch of random letters
        categories = rng.choice(range(n_classes),size=(N,),replace=False)            
    
    true_category = categories[0]
    ex1, ex2 = rng.choice(n_examples,replace=False,size=(2,))
    test_image = np.asarray([X[true_category,ex1,:,:]]*N).reshape(N, w, h,1)
    support_set = X[categories,indices,:,:]
    support_set[0,:,:] = X[true_category,ex2]
    support_set = support_set.reshape(N, w, h,1)
    targets = np.zeros((N,))
    targets[0] = 1
    targets, test_image, support_set = shuffle(targets, test_image, support_set)
    pairs = [test_image,support_set]
    return pairs, targets

  
def test_oneshot(model, N, k, s = "val", verbose = 0):
    """Test average N way oneshot learning accuracy of a siamese neural net over k one-shot tasks"""
    n_correct = 0
    if verbose:
        print("Evaluating model on {} random {} way one-shot learning tasks ... \n".format(k,N))
    for i in range(k):
        inputs, targets = make_oneshot_task(N,s)
        probs = model.predict(inputs)
        if np.argmax(probs) == np.argmax(targets):
            n_correct+=1
    percent_correct = (100.0 * n_correct / k)
    if verbose:
        print("Got an average of {}% {} way one-shot learning accuracy \n".format(percent_correct,N))
    return percent_correct