In [1]:
import tensorflow as tf
import pickle
import sys
import matplotlib.pyplot as plt1 
import numpy as np
from IPython.display import Audio, display
import time
from sklearn.model_selection import train_test_split
from keras.layers import Input, Conv2D, Lambda, Dense, Flatten,MaxPooling2D, concatenate, Conv1D,Conv2D, Flatten, Reshape, Embedding, GRU, SpatialDropout1D, LSTM, Dropout, BatchNormalization
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras import backend as K
from keras.optimizers import SGD,Adam
from keras.losses import binary_crossentropy
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from itertools import permutations
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from scipy.stats import trim_mean
from collections import Counter

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
full = pd.read_csv("../data/drug_class_identification/all3.csv")
full = full.dropna()
full['atc'] = full['atc'].apply(lambda x : x[0])
full = full[full.atc.isin(['C','L','N'])]

In [3]:
X = full["smiles"]
y = full['atc']

In [4]:
def getVocabulary(sample):
    vocabulary = set()
    for word in sample:
        for character in word:
            vocabulary.add(character)
    return (vocabulary)

In [5]:
characters = getVocabulary(X)
token_index = dict(zip(characters, range(1, len(characters) + 1)))

samples = X.tolist()
max_length = 70
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        results[i, j, index] = 1.

In [6]:
X = np.asarray(results)
X.shape

(422, 70, 30)

In [7]:
# Define our own plot function
def scatter(x, y, subtitle=None):
    le = LabelEncoder()
    labels = le.fit_transform(y)

    # We choose a color palette with seaborn.
    palette = np.array(sns.color_palette("hls", 3))

    # We create a scatter plot.
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(x[:,0], x[:,1], lw=0, s=40,
                    c=palette[labels.astype(np.int)])
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('off')
    ax.axis('tight')

    # We add the labels for each digit.
    txts = []
    for i in range(3):
        # Position of each label.
        xtext, ytext = trim_mean(x[labels == i, :], axis=0, proportiontocut=0.2)
        letter = le.inverse_transform([i])[0]
        txt = ax.text(xtext, ytext, str(letter), fontsize=24)
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal()])
        txts.append(txt)
        
    if subtitle != None:
        plt.suptitle(subtitle)
        
    plt.savefig(subtitle)

In [8]:
x_train, x_test,y_train,y_test = train_test_split(X,y)
print(x_train.shape)
x_train_flat = x_train.reshape(-1,70*30)
x_test_flat = x_test.reshape(-1,70*30)

tsne = TSNE()
train_tsne_embeds = tsne.fit_transform(x_train_flat)
scatter(train_tsne_embeds, y_train, "Samples from Training Data")

eval_tsne_embeds = tsne.fit_transform(x_test_flat)
scatter(eval_tsne_embeds, y_test, "Samples from Validation Data")

(316, 70, 30)


  return np.mean(atmp[sl], axis=axis)
  return np.mean(atmp[sl], axis=axis)


In [9]:
def triplet_loss(y_true, y_pred, alpha = 0.4):
    """
    Implementation of the triplet loss function
    Arguments:
    y_true -- true labels, required when you define a loss in Keras, you don't need it in this function.
    y_pred -- python list containing three objects:
            anchor -- the encodings for the anchor data
            positive -- the encodings for the positive data (similar to anchor)
            negative -- the encodings for the negative data (different from anchor)
    Returns:
    loss -- real number, value of the loss
    """
    print('y_pred.shape = ',y_pred)
    
    total_lenght = y_pred.shape.as_list()[-1]
    
    anchor = y_pred[:,0:int(total_lenght*1/3)]
    positive = y_pred[:,int(total_lenght*1/3):int(total_lenght*2/3)]
    negative = y_pred[:,int(total_lenght*2/3):int(total_lenght*3/3)]

    # distance between the anchor and the positive
    pos_dist = K.sum(K.square(anchor-positive),axis=1)

    # distance between the anchor and the negative
    neg_dist = K.sum(K.square(anchor-negative),axis=1)

    # compute loss
    basic_loss = pos_dist-neg_dist+alpha
    loss = K.maximum(basic_loss,0.0)
 
    return loss

def baseNetwork():    
        model = Sequential()
        model.add(Reshape((70, 30), input_shape=(1004,None, None)))
        model.add(Conv1D(20,10,activation='relu'))
        model.add(BatchNormalization())
        model.add(Conv1D(20,5,activation='relu'))
        model.add(Conv1D(20,3,activation='relu'))
        model.add(Flatten())
        model.add(Dense(30, activation='relu'))
        model.add(Dropout(0.4))
        model.add(Dense(3, activation='softmax'))
        return model
    
anchor_input = Input((70,30,1, ), name='anchor_input')
positive_input = Input((70,30,1, ), name='positive_input')
negative_input = Input((70,30,1, ), name='negative_input')

# Shared embedding layer for positive and negative items
Shared_DNN = baseNetwork()


encoded_anchor = Shared_DNN(anchor_input)
encoded_positive = Shared_DNN(positive_input)
encoded_negative = Shared_DNN(negative_input)


merged_vector = concatenate([encoded_anchor, encoded_positive, encoded_negative], axis=-1, name='merged_layer')

model = Model(inputs=[anchor_input,positive_input, negative_input], outputs=merged_vector)
model.compile(loss=triplet_loss, optimizer='adam')
model.summary()

y_pred.shape =  Tensor("merged_layer/concat:0", shape=(?, 9), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
anchor_input (InputLayer)       (None, 70, 30, 1)    0                                            
__________________________________________________________________________________________________
positive_input (InputLayer)     (None, 70, 30, 1)    0                                            
__________________________________________________________________________________________________
negative_input (InputLayer)     (None, 70, 30, 1)    0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 3)            42463       anchor_input[0][0]               
                                

In [10]:
def generateTriplet(x,y,testsize=0.2,ap_pairs=10,an_pairs=10):
    data_xy = tuple([x,y])

    trainsize = 1-testsize

    triplet_train_pairs = []
    triplet_test_pairs = []
    for data_class in sorted(set(data_xy[1])):

        same_class_idx = np.where((data_xy[1] == data_class))[0]
        diff_class_idx = np.where(data_xy[1] != data_class)[0]
        A_P_pairs = random.sample(list(permutations(same_class_idx,2)),k=ap_pairs) #Generating Anchor-Positive pairs
        Neg_idx = random.sample(list(diff_class_idx),k=an_pairs)
        

        #train
        A_P_len = len(A_P_pairs)
        Neg_len = len(Neg_idx)
        for ap in A_P_pairs[:int(A_P_len*trainsize)]:
            Anchor = data_xy[0][ap[0]]
            Positive = data_xy[0][ap[1]]
            for n in Neg_idx:
                Negative = data_xy[0][n]
                triplet_train_pairs.append([Anchor,Positive,Negative])               
        #test
        for ap in A_P_pairs[int(A_P_len*trainsize):]:
            Anchor = data_xy[0][ap[0]]
            Positive = data_xy[0][ap[1]]
            for n in Neg_idx:
                Negative = data_xy[0][n]
                triplet_test_pairs.append([Anchor,Positive,Negative])    
                
    return np.array(triplet_train_pairs), np.array(triplet_test_pairs)

In [11]:
X_train, X_test = generateTriplet(X,y)

In [12]:
Anchor = X_train[:,0,:].reshape(-1,70,30,1)
Positive = X_train[:,1,:].reshape(-1,70,30,1)
Negative = X_train[:,2,:].reshape(-1,70,30,1)
Anchor_test = X_test[:,0,:].reshape(-1,70,30,1)
Positive_test = X_test[:,1,:].reshape(-1,70,30,1)
Negative_test = X_test[:,2,:].reshape(-1,70,30,1)

Y_dummy = np.empty((Anchor.shape[0],300))
Y_dummy2 = np.empty((Anchor_test.shape[0],1))

model.fit([Anchor,Positive,Negative],y=Y_dummy,validation_data=([Anchor_test,Positive_test,Negative_test],Y_dummy2), epochs=50)

Train on 240 samples, validate on 60 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a266256d8>

In [None]:
trained_model = Model(inputs=anchor_input, outputs=encoded_anchor)

In [None]:
tsne = TSNE()
X_train_trm = trained_model.predict(x_train.reshape(-1,70,30,1))
X_test_trm = trained_model.predict(x_test.reshape(-1,70,30,1))
train_tsne_embeds = tsne.fit_transform(X_train_trm)
eval_tsne_embeds = tsne.fit_transform(X_test_trm)

In [None]:
scatter(train_tsne_embeds, y_train, "Training Data After TNN")
scatter(eval_tsne_embeds, y_test, "Validation Data After TNN")

In [None]:
score = trained_model.evaluate(x_train.reshape(-1,70,30,1),y_train)