<img style="float: left;" src="https://cdn.pixabay.com/photo/2016/12/07/09/45/dna-1889085__340.jpg" width=10%> <h1> Application of AI to Discover Novel Binding of Small Molecules </h1>

---------
### Sample Dataset for Testing Purposes

##### Here we create a sample dataset for two reasons:
- to get a better understanding of the structure of the data
- test any sample code for validity

##### Structure of sample dataset:
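1. A dataframe of 50 genes (rows) and 1020 profiles (columns) [50 x 1020]: 1000 drug profiles plus 20 control columns. Note that the control columns are currently commented out in the construction code below, so the generated frame is [50 x 1000]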
2. Columns are a combination of drug, replicate, time, concentration, probe_location and cell type. For the purposes of this project only drug and replicate matter for training, so each column name is structured as
"*drug + replicate id + unique characters that represent time, concentration, probe_location and cell type*"
3. 20 columns consist of control genes or 'control probes'. Columns are labelled control_x where x is a number from 1 to 20
4. Dataset consists of 25 drugs, each with 4 replicates and 10 combinations of time, concentration, probe_location and cell type (25 x 4 x 10 = 1000 profiles)

| Feature      | Quantity | Represented By |
| ----------- | ----------- | ----------- |
| Drug      | 25       | Alphabets A-Y |
| Replicate   | 4        | Numbers 1-4 |
| Other features   | 10        | Random String of length 3 |

***R_3_xcv*** represents a profile of drug 'R', replicate 3, with the other features corresponding to 'xcv'

##### Construction of Sample Dataset

In [1]:
import random
import string
from itertools import permutations

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
genes = ['gene'+str(a) for a in range(50)]
drugs = [chr(a) for a in range(65, 90)]
replicates = [str(a) for a in range(1, 5)]
other_features = set()

while len(other_features)!=10:
    rand_string = "". join([str(chr(int(random.random()*100)%26+97)) for a in range(3)])
    other_features.add(rand_string)

In [3]:
columns = ["_".join([a,b,c]) for a in drugs for b in replicates for c in other_features]
# columns = ["control_"+str(a+1) for a in range(20)] + columns

In [4]:
data = pd.DataFrame(2*np.random.rand(50, len(columns))-1, index=genes, columns=columns)
data.columns = columns
data.fillna(random.random(), inplace = True)
data.shape

(50, 1000)

In [5]:
# Preview the first five genes (rows) across the profile columns.
data.head()

Unnamed: 0,A_1_jtw,A_1_mpr,A_1_kcd,A_1_vcy,A_1_dae,A_1_mni,A_1_sve,A_1_pdp,A_1_qtk,A_1_loa,...,Y_4_jtw,Y_4_mpr,Y_4_kcd,Y_4_vcy,Y_4_dae,Y_4_mni,Y_4_sve,Y_4_pdp,Y_4_qtk,Y_4_loa
gene0,-0.423236,0.276683,0.218765,-0.728761,0.182513,0.431289,0.290331,-0.497359,-0.758895,0.111809,...,0.631057,-0.054836,0.391481,0.005418,0.142443,0.959337,0.774885,0.701003,0.739636,-0.088119
gene1,-0.538742,-0.240416,-0.123649,0.831907,0.164863,0.986713,-0.189137,0.147324,0.339066,-0.117438,...,0.258726,0.403191,0.575844,0.426453,0.28915,0.575881,-0.915945,0.379131,0.332683,0.628461
gene2,0.249619,-0.132087,0.152258,-0.456218,0.497169,0.492773,0.51295,-0.380662,-0.56345,-0.213367,...,-0.074474,-0.980685,-0.947331,-0.328598,0.446323,0.140214,-0.232012,0.386385,-0.658091,-0.246022
gene3,0.715753,0.045646,-0.26904,0.804331,0.427472,0.177128,-0.205411,-0.841251,0.71351,0.59633,...,-0.511774,0.85887,-0.455747,0.765737,-0.06889,0.258469,0.465845,-0.41499,-0.12549,0.463348
gene4,0.432081,0.380906,-0.814176,0.947948,0.807224,-0.514462,-0.218778,0.855692,0.688977,0.326078,...,0.798882,-0.130771,-0.465193,-0.37451,0.879629,0.996358,-0.928168,0.250282,0.403479,0.341243


##### Classifying Columns
A label needs to be assigned to each class. This can be done at the biological replicate level or the perturbagen level. We create classifications for each of these.

In [6]:
perturbagen_class = [int(a/25) for a in range(1000)]
replicate_class = [10*a+c for a in range(25) for b in range(4) for c in range(10)]

##### Creating the dataset

In [7]:
#transpose data
workingdata = data.transpose()
workingdata.head()

Unnamed: 0,gene0,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,...,gene40,gene41,gene42,gene43,gene44,gene45,gene46,gene47,gene48,gene49
A_1_jtw,-0.423236,-0.538742,0.249619,0.715753,0.432081,0.293022,-0.440698,-0.683294,-0.663446,-0.610228,...,0.572349,0.938891,-0.156832,0.360899,0.995447,0.123469,0.667855,-0.32459,-0.064716,0.966608
A_1_mpr,0.276683,-0.240416,-0.132087,0.045646,0.380906,0.186072,0.319906,-0.316297,0.092981,-0.22197,...,0.157776,-0.880845,-0.895071,-0.064001,0.966296,-0.742365,-0.260878,0.759828,-0.573509,-0.984445
A_1_kcd,0.218765,-0.123649,0.152258,-0.26904,-0.814176,0.376498,-0.889192,-0.062045,-0.817688,-0.674385,...,0.512048,0.909508,-0.207874,-0.708978,0.578677,0.115878,-0.464261,0.694167,0.052453,-0.308925
A_1_vcy,-0.728761,0.831907,-0.456218,0.804331,0.947948,0.322989,-0.215181,-0.592987,-0.910618,0.058492,...,-0.899567,-0.521414,-0.652895,-0.877215,0.922347,-0.045799,0.691173,-0.738955,0.147985,0.84928
A_1_dae,0.182513,0.164863,0.497169,0.427472,0.807224,0.030435,-0.459536,-0.710242,-0.034296,-0.795936,...,-0.404793,0.685174,0.829054,-0.23162,0.284909,0.195248,-0.72367,0.665463,-0.108551,-0.236633


In [8]:
X_train, X_test, y_train, y_test = train_test_split(workingdata, perturbagen_class, test_size=0.5)
X_test.shape

(500, 50)

##### Computation - Siamese

In [16]:
import keras
from keras.datasets import reuters
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Input, concatenate
from keras.layers.noise import AlphaDropout
from keras.preprocessing.text import Tokenizer
from keras.layers import Layer
from tensorflow.python.keras import backend as K

In [22]:
def create_model(input_dim=50, n_dense=6, dense_units=16, dropout_rate=0.1):
    """Build and compile the triplet (Siamese) embedding model.

    Three inputs (anchor, positive, negative) are passed through one shared
    fully connected network; the three embeddings are concatenated along the
    last axis so that the loss function can split them apart again.

    The defaults reproduce the previously hard-coded configuration.

    Arguments:
    input_dim -- number of features per sample (genes per profile)
    n_dense -- number of Dense/SELU/AlphaDropout stages in the shared network
    dense_units -- width of every Dense layer, and therefore of one embedding
    dropout_rate -- rate passed to each AlphaDropout layer

    Returns:
    model -- compiled Keras Model whose output has width 3 * dense_units

    Note: `triplet_loss` must already be defined when this function is called.
    """

    def create_network():
        """Create the embedding network shared by all three inputs."""
        tower = Sequential()
        # The first stage fixes the input width; it is always present.
        tower.add(Dense(dense_units, input_shape=(input_dim,),
                        kernel_initializer='lecun_normal'))
        tower.add(Activation('selu'))
        tower.add(AlphaDropout(dropout_rate))

        for _ in range(n_dense - 1):
            tower.add(Dense(dense_units, kernel_initializer='lecun_normal'))
            tower.add(Activation('selu'))
            tower.add(AlphaDropout(dropout_rate))

        return tower

    anchor_input = Input((input_dim, ), name='anchor_input')
    positive_input = Input((input_dim, ), name='positive_input')
    negative_input = Input((input_dim, ), name='negative_input')

    # A single network instance is applied to all three inputs so that the
    # anchor, positive and negative embeddings share the same weights.
    Shared_DNN = create_network()

    encoded_anchor = Shared_DNN(anchor_input)
    encoded_positive = Shared_DNN(positive_input)
    encoded_negative = Shared_DNN(negative_input)

    # Order matters: triplet_loss slices the output into thirds as
    # [anchor | positive | negative].
    merged_vector = concatenate([encoded_anchor, encoded_positive, encoded_negative], axis=-1, name='merged_layer')

    model = Model(inputs=[anchor_input, positive_input, negative_input], outputs=merged_vector)
    model.compile(loss=triplet_loss, optimizer='adam')

    return model

In [24]:
# Build the triplet network and print its layer summary.
# NOTE: create_model() compiles with `triplet_loss`, which this notebook defines
# in a later cell; that cell must be executed before this one, otherwise the
# call fails with a NameError.
model = create_model()
model.summary()

y_pred.shape =  Tensor("merged_layer_3/concat:0", shape=(?, 48), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
anchor_input (InputLayer)       (None, 50)           0                                            
__________________________________________________________________________________________________
positive_input (InputLayer)     (None, 50)           0                                            
__________________________________________________________________________________________________
negative_input (InputLayer)     (None, 50)           0                                            
__________________________________________________________________________________________________
sequential_5 (Sequential)       (None, 16)           2176        anchor_input[0][0]               
                             

In [14]:
def generate_triplet(x,y,testsize=0.3,ap_pairs=10,an_pairs=10):
    """Sample (anchor, positive, negative) triplets for every class in `y`.

    For each class, up to `ap_pairs` ordered anchor/positive index pairs are
    drawn from that class and up to `an_pairs` negative indices from all other
    classes; every sampled pair is combined with every sampled negative. The
    sampled pairs are then divided into a train and a test portion.

    Arguments:
    x -- samples, indexable by row position after np.asarray (ndarray, list of
         rows or DataFrame)
    y -- class label of each row of `x` (ndarray, list or Series)
    testsize -- fraction of each class's anchor/positive pairs used for test
    ap_pairs -- anchor/positive pairs requested per class
    an_pairs -- negatives requested per class

    Returns:
    (train, test) -- two arrays of triplets; with 2-D `x` each has shape
    (n_triplets, 3, n_features). A portion with no triplets is an empty array.

    If a class offers fewer pairs or negatives than requested, all available
    ones are used instead of raising. The train and test triplets of a class
    share the same sampled negatives.
    """
    # Coerce to arrays: the element-wise label comparison below does not work
    # on plain lists, and positional row lookup does not work on a DataFrame.
    samples = np.asarray(x)
    labels = np.asarray(y)

    trainsize = 1-testsize

    def _expand(pairs, neg_idx):
        """Combine every anchor/positive pair with every negative index."""
        return [[samples[a], samples[p], samples[n]] for a, p in pairs for n in neg_idx]

    triplet_train_pairs = []
    triplet_test_pairs = []

    for data_class in np.unique(labels):

        same_class_idx = np.where(labels == data_class)[0]
        diff_class_idx = np.where(labels != data_class)[0]

        # Ordered pairs, so (i, j) and (j, i) are distinct candidates.
        candidate_pairs = list(permutations(same_class_idx,2))
        A_P_pairs = random.sample(candidate_pairs,k=min(ap_pairs, len(candidate_pairs)))
        Neg_idx = random.sample(list(diff_class_idx),k=min(an_pairs, len(diff_class_idx)))

        # The first `trainsize` share of the sampled pairs goes to train,
        # the remainder to test.
        split_at = int(len(A_P_pairs)*trainsize)
        triplet_train_pairs.extend(_expand(A_P_pairs[:split_at], Neg_idx))
        triplet_test_pairs.extend(_expand(A_P_pairs[split_at:], Neg_idx))

    return np.array(triplet_train_pairs), np.array(triplet_test_pairs)

In [15]:
def triplet_loss(y_true, y_pred, alpha = 0.4):
    """
    Implementation of the triplet loss function
    Arguments:
    y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
    y_pred -- 2-D tensor holding three equally wide embeddings concatenated along the last axis:
            anchor -- the encodings for the anchor data (first third)
            positive -- the encodings for the positive data (similar to anchor, middle third)
            negative -- the encodings for the negative data (different from anchor, last third)
    alpha -- margin by which the negative must be further from the anchor than the positive
    Returns:
    loss -- tensor with one non-negative loss value per sample: max(pos_dist - neg_dist + alpha, 0)
    Raises:
    ValueError -- if the last dimension of y_pred is unknown or not divisible by 3
    """
    total_length = y_pred.shape.as_list()[-1]
    if total_length is None or total_length % 3 != 0:
        raise ValueError(
            'triplet_loss expects the last dimension of y_pred to be a known multiple of 3, got %r'
            % (total_length,)
        )

    # Integer arithmetic keeps the slice bounds exact.
    embedding_dim = total_length // 3

    anchor = y_pred[:, :embedding_dim]
    positive = y_pred[:, embedding_dim:2*embedding_dim]
    negative = y_pred[:, 2*embedding_dim:]

    # squared Euclidean distance between the anchor and the positive
    pos_dist = K.sum(K.square(anchor-positive),axis=1)

    # squared Euclidean distance between the anchor and the negative
    neg_dist = K.sum(K.square(anchor-negative),axis=1)

    # hinge: only triplets that violate the margin contribute
    basic_loss = pos_dist-neg_dist+alpha
    loss = K.maximum(basic_loss,0.0)

    return loss