In [22]:
from cmapPy.pandasGEXpress.parse import parse
import numpy as np
import json
from collections import Counter

##### Create target perturbagen based on column name (target)

##### Prepare the dataset
- Transpose the dataset and add the class labels
- Sort the dataset based on perturbagen values

##### Create 2 dictionaries
- <b> pert2profile: </b> <u>perturbagen: number of profiles</u> for that particular perturbagen
- <b> location_pert: </b> <u>perturbagen: location </u>of 1st profile of perturbagen

##### Create training data and target values
- generates 'batch_size' number of pairs of input [a,b]
- half of the pairs are of the same class and half are of different classes

In [2]:
def get_batch(pert2profiles, location_pert, data, batch_size, dim=978):
    """Build one batch of profile pairs labelled same/different perturbagen.

    Parameters
    ----------
    pert2profiles : mapping
        perturbagen -> number of profiles (rows) it has in ``data``.
    location_pert : mapping
        perturbagen -> positional index of its first row in ``data``.
        Rows are addressed as ``location_pert[p] + offset``, so each
        perturbagen's rows must be contiguous (``data`` sorted by target).
    data : DataFrame
        Profiles in rows; the first ``dim`` columns are the features and
        ``data.target`` holds the perturbagen label of every row.
    batch_size : int
        Number of pairs to generate; cannot exceed the number of rows because
        anchors are drawn without replacement.
    dim : int, optional
        Number of feature columns read from each row (default 978).

    Returns
    -------
    pairs : ndarray, shape (2, batch_size, dim)
        ``pairs[0][i]`` and ``pairs[1][i]`` form the i-th pair.
    targets : ndarray, shape (batch_size,)
        0 for the first ``batch_size // 2`` pairs (different perturbagens),
        1 for the remaining pairs (same perturbagen).

    Raises
    ------
    ValueError
        If different-perturbagen pairs are required but the sampled batch
        contains a single perturbagen only.
    """
    rng = np.random

    # One label per row: perturbagens with more profiles are drawn more often.
    list_of_perturbagens = data.target
    batch_perturbagens = rng.choice(list_of_perturbagens, size=(batch_size,), replace=False)

    half = batch_size // 2
    if half > 0 and len(set(batch_perturbagens)) < 2:
        raise ValueError("need at least two distinct perturbagens to build different-class pairs")

    pairs = [np.zeros((batch_size, dim)) for _ in range(2)]
    targets = np.zeros((batch_size,))
    targets[half:] = 1

    for i in range(batch_size):
        pert1 = batch_perturbagens[i]
        idx_1 = rng.randint(0, pert2profiles[pert1])
        pairs[0][i, :] = data.iloc[location_pert[pert1] + idx_1, 0:dim]

        pert2 = pert1
        if i < half:
            # Label 0 means "different": re-draw until the partner really is
            # another perturbagen, otherwise the label would be wrong.
            pert2 = rng.choice(batch_perturbagens)
            while pert2 == pert1:
                pert2 = rng.choice(batch_perturbagens)
        idx_2 = rng.randint(0, pert2profiles[pert2])
        pairs[1][i, :] = data.iloc[location_pert[pert2] + idx_2, 0:dim]

    return np.asarray(pairs), np.asarray(targets)
    

In [23]:
def get_target_labels(working_data, mydict):
    """Return the perturbagen label(s) for each column of ``working_data``.

    Parameters
    ----------
    working_data : DataFrame
        Its column names are looked up in ``mydict``.
    mydict : dict
        perturbagen -> collection of column names belonging to it.

    Returns
    -------
    list
        Labels in column order. A column listed under several perturbagens
        contributes one label per perturbagen (in ``mydict`` order); a column
        listed under none contributes nothing, so the result is only aligned
        with the columns when every column maps to exactly one perturbagen.
    """
    print("creating target labels")

    # Invert the mapping once instead of scanning every list for every column.
    owners = {}
    for pert, collist in mydict.items():
        for col in collist:
            perts = owners.setdefault(col, [])
            # A name repeated inside one list still yields a single label.
            if not perts or perts[-1] != pert:
                perts.append(pert)

    target = []
    for col in working_data.columns:
        for pert in owners.get(col, ()):
            # Progress indicator: print the running count every 1000 labels.
            if len(target) % 1000 == 0:
                print(len(target), end=' ')
            target.append(pert)
    return target

def create_pert2profile(data):
    """Count how many rows of ``data`` carry each value of ``data.target``."""
    print("pert2profile")
    profile_counts = Counter()
    profile_counts.update(data.target)
    return profile_counts

def create_location_pert(data):
    """Map each perturbagen to the position of its first row in ``data``.

    Parameters
    ----------
    data : DataFrame
        Must have a ``target`` column holding the perturbagen of every row.

    Returns
    -------
    dict
        perturbagen -> zero-based positional index of the first row whose
        ``target`` equals that perturbagen.
    """
    print("creating location_pert")
    location_pert = {}
    # Single pass over the column; the first time a label is seen is its location.
    for position, pert in enumerate(data['target'].values):
        if pert in location_pert:
            continue
        # Progress indicator: print the running count every 100 perturbagens.
        if len(location_pert) % 100 == 0:
            print(len(location_pert), end=' ')
        location_pert[pert] = position
    return location_pert

def get_annotated_data(gctx_path="../Data/Sig Annotated Level 5 Data.gctx",
                       mapping_path='../Data/sig-pert mapping.json',
                       batch_size=10000):
    """Load the annotated profiles and draw one batch of labelled pairs.

    Parameters
    ----------
    gctx_path : str, optional
        GCTX file handed to ``parse``; its ``data_df`` is transposed so that
        the original columns become rows.
    mapping_path : str, optional
        JSON file mapping each perturbagen to the list of column names of the
        GCTX matrix that belong to it.
    batch_size : int, optional
        Number of pairs requested from ``get_batch`` (default 10000).

    Returns
    -------
    X, y
        The pair array and target vector produced by ``get_batch``.

    NOTE(review): the sorted frame and the two lookup dictionaries stay local
    to this function, while ``make_oneshot_task`` reads names ``data``,
    ``pert2profiles`` and ``location_pert`` from notebook scope - confirm where
    those are meant to be defined.
    """
    obj = parse(gctx_path)
    working_data = obj.data_df

    with open(mapping_path, 'r') as fp:
        mydict = json.load(fp)

    data = working_data.transpose()
    data['target'] = get_target_labels(working_data, mydict)

    # Sorting groups every perturbagen's rows together, which is what the
    # "first location + offset" indexing in get_batch relies on.
    data = data.sort_values('target')

    pert2profiles = create_pert2profile(data)
    location_pert = create_location_pert(data)

    X, y = get_batch(pert2profiles, location_pert, data, batch_size)

    return X, y

In [24]:
# Load the profiles and draw the labelled pairs; X and y are the arrays passed
# to model.fit further down.
X,y = get_annotated_data()

The given path to the gctx file cannot be found. full_path: ../Data/Sig Annotated Level 5 Data.gctx


Exception: The given path to the gctx file cannot be found. full_path: ../Data/Sig Annotated Level 5 Data.gctx

In [6]:
def generate(batch_size, s="train"):
    """Endless generator of ``(pairs, targets)`` batches for ``model.fit_generator``.

    Parameters
    ----------
    batch_size : int
        Number of pairs per yielded batch.
    s : str, optional
        Accepted for backward compatibility only; ``get_batch`` has no
        train/validation split, so the value is not used.

    Yields
    ------
    tuple
        ``(pairs, targets)`` exactly as returned by ``get_batch``.

    Notes
    -----
    Like ``make_oneshot_task``, this reads ``pert2profiles``, ``location_pert``
    and ``data`` from notebook scope; they must exist before the first batch is
    requested.
    NOTE(review): ``pairs`` is one array of shape (2, batch_size, dim); a
    two-input model probably needs ``[pairs[0], pairs[1]]`` - confirm before
    wiring this into training.
    """
    while True:
        # get_batch takes (pert2profiles, location_pert, data, batch_size).
        pairs, targets = get_batch(pert2profiles, location_pert, data, batch_size)
        yield (pairs, targets)

##### Creates a support set to evaluate the embeddings
- N-way indicates that it will compare the query sample with N candidates in the support set

In [7]:
def make_oneshot_task(N):
    """Create one N-way one-shot task from the notebook-level dataset.

    Reads ``data``, ``pert2profiles`` and ``location_pert`` from notebook
    scope. One perturbagen is the true class: one of its profiles becomes the
    query and another one its match in the support set; the other ``N - 1``
    support rows come from ``N - 1`` other perturbagens.

    Parameters
    ----------
    N : int
        Number of candidates in the support set; cannot exceed the number of
        distinct perturbagens because they are drawn without replacement.

    Returns
    -------
    pairs : list
        ``[test_image, support_set]``: the query profile repeated ``N`` times
        and the ``(N, 978)`` support set.
    targets : ndarray, shape (N,)
        1 at the row of the support set that shares the query's perturbagen,
        0 elsewhere.
    """
    rng = np.random
    dim = 978

    list_of_perturbagens = np.unique(data.target)
    perturbagens = rng.choice(list_of_perturbagens, size=(N,), replace=False)
    true_category = perturbagens[0]
    true_start = location_pert[true_category]

    # Query and matching support example must be different profiles whenever
    # the perturbagen has more than one; otherwise the task is trivially solved.
    n_true = pert2profiles[true_category]
    ex1, ex2 = rng.choice(n_true, size=(2,), replace=n_true < 2)
    test_image = np.asarray([data.iloc[true_start + ex1, 0:dim]] * N)

    support_set = np.zeros((N, dim))
    support_set[0, :] = data.iloc[true_start + ex2, 0:dim]
    for row, pert in enumerate(perturbagens[1:], start=1):
        idx = rng.randint(pert2profiles[pert])
        support_set[row, :] = np.asarray(data.iloc[location_pert[pert] + idx, 0:dim])

    targets = np.zeros((N,))
    targets[0] = 1

    # Shuffle the candidates so the true match is not always row 0; a model
    # with constant output would otherwise score 100% through argmax/argmin.
    order = rng.permutation(N)
    support_set = support_set[order]
    targets = targets[order]

    pairs = [test_image, support_set]
    return pairs, targets

In [8]:
def test_oneshot(model, N, k,verbose = 0):
    """Return the percentage of ``k`` random ``N``-way one-shot tasks the model solves.

    A task counts as solved when the arg-max of ``model.predict`` on the task
    inputs lands on the same position as the arg-max of the task targets.

    NOTE(review): arg-max assumes a larger output means "more similar"; the
    model compiled in this notebook ends in a distance layer (ManDist), for
    which the smallest output would be the best match - confirm.
    """
    if verbose:
        print("Evaluating model on {} random {} way one-shot learning tasks ... \n".format(k,N))

    hits = 0
    for _ in range(k):
        task_inputs, task_targets = make_oneshot_task(N)
        scores = model.predict(task_inputs)
        hits += int(np.argmax(scores) == np.argmax(task_targets))

    percent_correct = (100.0 * hits / k)
    if verbose:
        print("Got an average of {}% {} way one-shot learning accuracy \n".format(percent_correct,N))
    return percent_correct

In [9]:
def nearest_neighbour_correct(pairs,targets):
    """Return 1 if the nearest neighbour solves the one-shot task, else 0.

    Parameters
    ----------
    pairs : sequence of two array-likes
        ``pairs[0][i]`` is the query and ``pairs[1][i]`` the i-th candidate.
    targets : array-like
        Marks the correct candidate with its maximum value.

    Returns
    -------
    int
        1 when the candidate with the smallest Euclidean distance to its query
        is the one flagged by ``targets``, otherwise 0.
    """
    n = len(targets)
    queries = np.asarray(pairs[0], dtype=float)[:n]
    candidates = np.asarray(pairs[1], dtype=float)[:n]

    # Euclidean distance per pair: sqrt(sum((a - b)^2)). Always float, so the
    # distances are not truncated when ``targets`` has an integer dtype.
    diff = queries - candidates
    L2_distances = np.sqrt(np.sum(diff ** 2, axis=tuple(range(1, diff.ndim))))

    if np.argmin(L2_distances) == np.argmax(targets):
        return 1
    return 0

  
def test_nn_accuracy(N_ways,n_trials):
    """Return the accuracy (in percent) of the nearest-neighbour baseline.

    Parameters
    ----------
    N_ways : int
        Number of candidates per one-shot task.
    n_trials : int
        Number of tasks to evaluate; must be positive because the result is
        divided by it.

    Returns
    -------
    float
        Percentage of tasks for which ``nearest_neighbour_correct`` returned 1.
    """
    print("Evaluating nearest neighbour on {} unique {} way one-shot learning tasks ...".format(n_trials,N_ways))
    n_right = 0

    for _ in range(n_trials):
        # make_oneshot_task takes only the number of ways.
        pairs, targets = make_oneshot_task(N_ways)
        n_right += nearest_neighbour_correct(pairs, targets)
    return 100.0 * n_right / n_trials

In [10]:
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from itertools import chain

import keras
from keras.datasets import reuters
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Input
from keras.layers.noise import AlphaDropout
from keras.preprocessing.text import Tokenizer
from keras.layers import Layer
from tensorflow.python.keras import backend as K

Using TensorFlow backend.


In [11]:
# Width of one input vector; used as the input shape of the network and of the
# two Input layers below.
max_words = 978
# NOTE(review): batch_size and epochs are not referenced by the model.fit call
# in this notebook, which passes epochs=100 and no batch size - confirm intent.
batch_size = 16
epochs = 40

def create_network(n_dense=6,
                   dense_units=16,
                   activation='selu',
                   dropout=AlphaDropout,
                   dropout_rate=0.1,
                   kernel_initializer='lecun_normal',
                   optimizer='adam',
                   num_classes=1,
                   max_words=max_words):
    """Build a Sequential stack of Dense -> Activation -> dropout groups.

    The first group is always added and fixes the input shape to
    ``(max_words,)``; ``n_dense - 1`` further groups follow. Every Dense layer
    has ``dense_units`` units and uses ``kernel_initializer``; ``dropout`` is
    the layer class instantiated with ``dropout_rate``.

    ``optimizer`` and ``num_classes`` are accepted but not used: the model is
    returned uncompiled and without an output layer.
    """
    model = Sequential()

    def add_group(**dense_kwargs):
        # One Dense -> Activation -> dropout group appended to the stack.
        model.add(Dense(dense_units,
                        kernel_initializer=kernel_initializer,
                        **dense_kwargs))
        model.add(Activation(activation))
        model.add(dropout(dropout_rate))

    add_group(input_shape=(max_words,))
    for _ in range(n_dense - 1):
        add_group()

    return model

In [12]:
# Keyword arguments for create_network.
# NOTE(review): create_network accepts 'optimizer' and 'num_classes' but never
# reads them, so those two entries have no effect on the model that is built.
network = {
    'n_dense': 10,
    'dense_units': 16,
    'activation': 'selu',
    'dropout': AlphaDropout,
    'dropout_rate': 0.1,
    'kernel_initializer': 'lecun_normal',
    'optimizer': 'sgd',
    'num_classes':40
}

In [13]:
# Single network instance; it is applied to both inputs below, so the two
# branches share their weights.
shared_model = create_network(**network)

In [14]:
class ManDist(Layer):
    """Layer returning the Manhattan (L1) distance between its two inputs.

    Called on a list ``[a, b]`` of two tensors, it returns
    ``sum(|a - b|)`` over axis 1 with that axis kept as size 1.
    """

    # No constructor arguments of its own; keyword arguments go to Layer.
    def __init__(self, **kwargs):
        # Last tensor produced by call(); None until the layer has been called.
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # The layer has no weights, so building only delegates to the base class.
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # x is the pair of tensors to compare.
    def call(self, x, **kwargs):
        self.result = K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)
        return self.result

    def compute_output_shape(self, input_shape):
        """Return the output shape for the given list of two input shapes."""
        if self.result is not None:
            return K.int_shape(self.result)
        # call() has not run yet: derive the shape from the first input instead
        # of failing on the missing tensor. Axis 1 is reduced with keepdims.
        shape = list(input_shape[0])
        shape[1] = 1
        return tuple(shape)

In [15]:
def contrastive_loss(y_true, y_pred):
    """Contrastive loss with margin 1 on a predicted distance ``y_pred``.

    Where ``y_true`` is 1 the squared distance is penalised; where it is 0 the
    squared shortfall of the distance below the margin is penalised. The mean
    over all elements is returned.
    """
    margin = 1
    pull_together = y_true * K.square(y_pred)
    push_apart = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
    return K.mean(pull_together + push_apart)

In [16]:
# One input per side of a pair; each takes a vector of length max_words.
left_input = Input(shape=(max_words,))
right_input = Input(shape=(max_words,))

In [17]:
# Both inputs go through the same shared_model; the model output is the
# Manhattan distance between the two resulting vectors.
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])
# NOTE(review): the output is an unbounded distance, not a class probability,
# so the 'accuracy' metric may not be meaningful here - confirm.
model.compile(loss=contrastive_loss, optimizer="adam", metrics=['accuracy'])

In [18]:
import os
# NOTE(review): neither os nor model_path is used in the cells visible here.
model_path = './weights/'

In [19]:
# Only the last expression of a cell is displayed, so this shows y.shape only.
X.shape
y.shape

(10000,)

In [20]:
# X[0] and X[1] are the two sides of every pair; y holds the pair labels.
model.fit([X[0],X[1]],y,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x11d571748>

In [21]:
# Class balance check: how many pairs carry each label.
Counter(y)

Counter({0.0: 5000, 1.0: 5000})