# Evaluation using the SNPPhenA corpus  

# (improved by Transfer Learning)

SNPPhenA is a corpus for extracting ranked associations of single-nucleotide polymorphisms and phenotypes from the literature.

We selected the SNP-phenotype dataset for transferring knowledge from the gene-disease domain. The rich features transferred from the base model can help to train the new model with SNP-phenotype sequences. 

#  -------------------------------------------------------------------------------------------

# imports

In [1]:
import tensorflow as tf
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras_tqdm import TQDMNotebookCallback
import numpy as np
np.random.seed(1337)
from keras_tqdm import TQDMNotebookCallback
import nltk
import xml.etree.ElementTree as ET
import pandas as pd
import os
import string
from nltk.tokenize import TreebankWordTokenizer
from numpy.random import random_sample
import re
import pickle
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from keras.layers import Embedding, Flatten,LSTM
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation,  Input, merge,Conv1D,MaxPooling1D,GlobalMaxPooling1D,Convolution1D
from keras import regularizers
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from keras.layers import Concatenate, concatenate
from keras import backend as K
from keras.layers import multiply
from keras.layers import merge
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *
import keras.optimizers
random_seed=137

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Define the F1 metric function passed to `model.compile(metrics=...)`

In [2]:
from keras import backend as K

def f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true`` as a backend tensor.

    Both inputs are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive once it rounds to 1. ``K.epsilon()`` is
    added to every denominator to avoid division by zero.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values, same shape as ``y_true``.

    Returns:
        Scalar tensor ``2 * (precision * recall) / (precision + recall + eps)``.
    """
    eps = K.epsilon()

    # Entries that are positive in both the target and the rounded prediction.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))

    prec = hits / (predicted_count + eps)
    rec = hits / (actual_count + eps)
    return 2 * ((prec * rec) / (prec + rec + eps))


# Experiments to reproduce the results of Table 9 

## Load our Integrated corpus to use for training the Base Model 

In [3]:
# NOTE: pickle can execute arbitrary code while loading; only open files you
# produced yourself or otherwise trust.
with open('data/my_corpus_integrated.pickle', 'rb') as handle:
    # The file holds thirteen objects dumped back to back. They are read
    # strictly in the order listed here: train, validation and test triples,
    # then the embedding data, then the label map and sentence length.
    (
        W_train_my_corpus, Y_train_my_corpus, Tr_word_list_my_corpus,
        W_val_my_corpus, Y_val_my_corpus, V_word_list_my_corpus,
        W_test_my_corpus, Y_test_my_corpus, Te_word_list_my_corpus,
        word_vectors, word_dict,
        label_dict_my_corpus, sentMax_my_corpus,
    ) = [pickle.load(handle) for _ in range(13)]

# Stack the three splits along the first axis; the base model is trained on
# the whole integrated corpus.
W = np.concatenate(
    (W_train_my_corpus, W_val_my_corpus, W_test_my_corpus), axis=0
)
Y = np.concatenate(
    (Y_train_my_corpus, Y_val_my_corpus, Y_test_my_corpus), axis=0
)

### Load the preprocessed SNPPhenA data

In [4]:
# NOTE: pickle can execute arbitrary code while loading; only open files you
# produced yourself or otherwise trust.
with open('data/train_and_test_data_sentences_snp_2class_transfer.pickle', 'rb') as handle:
    # Eight objects were dumped one after another; read them back in the
    # same order: the train triple, the test triple, the label map and the
    # sequence length.
    (
        W_train, Y_train, Tr_word_list,
        W_test, Y_test, Te_word_list,
        label_dict, MAX_SEQUENCE_LENGTH,
    ) = [pickle.load(handle) for _ in range(8)]


In [5]:
MAX_SEQUENCE_LENGTH

100

### Prepare Word Embedding Layer

In [6]:
# The embedding width is the second dimension of the loaded vector table.
EMBEDDING_DIM = word_vectors.shape[1]
embedding_matrix = word_vectors


def create_embedding_layer(l2_reg=0.1, use_pretrained=True, is_trainable=False):
    """Build the Keras ``Embedding`` layer for the token-id input.

    Args:
        l2_reg: L2 penalty applied to the embedding weights. Only used when
            ``use_pretrained`` is true.
        use_pretrained: If true, initialise the layer from
            ``embedding_matrix``; otherwise return a default-initialised
            layer with no regulariser.
        is_trainable: Whether the pretrained weights may be updated during
            training. Only used when ``use_pretrained`` is true.

    Returns:
        An ``Embedding`` layer with ``len(word_dict)`` rows,
        ``EMBEDDING_DIM`` columns and ``input_length=MAX_SEQUENCE_LENGTH``.
    """
    vocab_size = len(word_dict)

    if not use_pretrained:
        return Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)

    return Embedding(
        vocab_size,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=is_trainable,
        embeddings_regularizer=regularizers.l2(l2_reg),
    )
            

## Create the Base Model

In [7]:
# only positive
def build_model():
    """Build and compile the base model.

    Architecture, in creation order: token-id input of length
    ``MAX_SEQUENCE_LENGTH`` -> frozen pretrained embedding -> one forward and
    one backward LSTM (``sentMax_my_corpus`` units each) whose outputs are
    concatenated -> Dense(64, relu, L2) -> Dropout(0.9) -> Dense(2, softmax).

    The TensorFlow graph seed is reset on every call, and the model summary
    is printed as a side effect.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam,
        metrics ``acc`` and ``f1``).
    """
    tf.set_random_seed(1337)

    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embed = create_embedding_layer(use_pretrained=True, is_trainable=False)
    embedded = embed(token_ids)

    # Two independent LSTMs read the same embedded sequence, one of them in
    # reverse; their outputs are joined into a single vector.
    lstm_units = sentMax_my_corpus
    fwd = LSTM(lstm_units, recurrent_dropout=0.05)(embedded)
    bwd = LSTM(lstm_units, go_backwards=True, recurrent_dropout=0.05)(embedded)
    both_directions = concatenate([fwd, bwd])

    hidden = Dense(
        64,
        activation='relu',
        kernel_regularizer=regularizers.l2(0.05),
    )(both_directions)
    hidden = Dropout(0.9)(hidden)
    class_probs = Dense(2, activation='softmax')(hidden)

    model = Model(token_ids, class_probs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc', f1],
    )
    model.summary()
    return model
    

# Train the Base Model

In [8]:
# Averaging mode handed to precision_recall_fscore_support below.
param='macro'
model = None # Clearing the NN.
model = build_model()
# Fit the base model for a single epoch on W/Y (the concatenated integrated
# corpus). The SNPPhenA test arrays are passed as validation data.
history=model.fit(W, Y,epochs=1,validation_data=(W_test,Y_test), batch_size=64,verbose=1,callbacks=[TQDMNotebookCallback()])        

# Reduce the two-column softmax output and the label matrix (presumably
# one-hot; confirm against the preprocessing code) to class indices.
predicted = np.argmax(model.predict(W_test), axis=1)
y_test_to_label = np.argmax(Y_test, axis=1)
prec, reca, fscore, sup = precision_recall_fscore_support(y_test_to_label, predicted, average=param)
print(prec,' - ', reca,'-',fscore)        


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 200)     3468000     input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 100)          120400      embedding_1[0][0]                
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 100)          120400      embedding_1[0][0]                
__________________________________________________________________________________________________
concatenat

Epoch 1/1



0.6454092603211009  -  0.6223981900452489 - 0.6141986346549682


# Extract some layers from pretrained model to use as Feature Extractor

In [9]:
# Feature extractor: the trained base model truncated at its fourth-from-last
# layer (the concatenated LSTM outputs, given the layer order in
# build_model). It is wired from the base model's own input and intermediate
# output tensors, so it runs the same, already trained, layer objects.
# No weight copy is needed: the former
# `new_model.set_weights(model.get_weights())` handed over more weight arrays
# than this sub-model owns, which stricter Keras versions reject.
new_model = Model(model.inputs, model.layers[-4].output)

## Extract features using the pretrained model

In [10]:
# Run the training split, then the test split, through the feature extractor.
Features, Features_test = (
    new_model.predict(split) for split in (W_train, W_test)
)

In [11]:
# Insert a length-1 axis at position 2 so the features can be fed to the
# Conv1D branches of the target model, which take a
# (Features.shape[1], Features.shape[2]) input.
Features = Features[:, :, np.newaxis]
Features_test = Features_test[:, :, np.newaxis]

In [12]:
Features_test.shape

(365, 200, 1)

In [21]:
# Averaging mode passed as `average=` to precision_recall_fscore_support.
param='macro'
# Provides optimizers.Adam, used when compiling the target model.
from keras import optimizers


# Make use of Transfer Learning

# Create the Target Model

In [42]:
def build_model_transfer():
    """Build and compile the target model that consumes transferred features.

    The model has two inputs, in this order: the token-id sequence and the
    feature tensor extracted by the base model, shaped
    ``(Features.shape[1], Features.shape[2])``. Three parts are concatenated
    before the classifier head:

    * a two-stage Conv1D stack over the embedded tokens, global-max-pooled;
    * a forward and a backward LSTM (100 units each) over the embedded tokens;
    * three Conv1D branches (kernel sizes 5, 3 and 2) over the transferred
      features, each max-pooled, dropped out and flattened.

    The TensorFlow graph seed is reset on every call.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam with
        learning rate 1e-4, metrics ``acc`` and ``f1``).
    """
    tf.set_random_seed(1337)

    feature_input = Input(shape=(Features.shape[1], Features.shape[2]), dtype='float')
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embed = create_embedding_layer(use_pretrained=True, is_trainable=False)
    embedded = embed(sequence_input)

    # Convolve the base-model features with several kernel widths. Branches
    # are created in the order 5, 3, 2 and concatenated in that same order.
    feature_branches = []
    for kernel_width in (5, 3, 2):
        branch = Conv1D(32, kernel_width, activation='relu')(feature_input)
        branch = MaxPooling1D(3)(branch)
        branch = Dropout(0.5)(branch)
        feature_branches.append(Flatten()(branch))
    feature_output = concatenate(feature_branches)

    # Convolutional view of the embedded token sequence.
    conv = Conv1D(256, 7, activation='relu')(embedded)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.5)(conv)
    conv = Conv1D(128, 5, activation='relu')(conv)
    conv = MaxPooling1D(3)(conv)
    conv = Dropout(0.5)(conv)
    conv_sequence = GlobalMaxPooling1D()(conv)

    # Recurrent view of the same sequence, read in both directions.
    fwd = LSTM(100, recurrent_dropout=0.05)(embedded)
    bwd = LSTM(100, go_backwards=True, recurrent_dropout=0.05)(embedded)
    lstm_sequence = concatenate([fwd, bwd])

    merged = concatenate([conv_sequence, lstm_sequence, feature_output])
    hidden = Dense(
        256,
        activation='relu',
        kernel_regularizer=regularizers.l2(0.05),
    )(merged)
    hidden = Dropout(0.5)(hidden)
    class_probs = Dense(2, activation='softmax')(hidden)

    model = Model([sequence_input, feature_input], class_probs)

    adam = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['acc', f1])
    return model


# Train the Target Model using the Extracted Features from the Base Model

In [None]:
# Keep only the first element of each loaded input container, so that
# W_train / W_test can be paired with the feature arrays in a two-input list.
# NOTE(review): presumably W_train / W_test are one-element lists of arrays;
# confirm against the preprocessing code. This cell is not idempotent:
# running it a second time would index into the array itself.
W_train=W_train[0]
W_test=W_test[0]


In [43]:

def train_and_evaluate_model_lstm():
    """Train a fresh target model once and score it on the test split.

    A new model is built with ``build_model_transfer()`` and fitted for a
    fixed 20 epochs on the module-level training data (``W_train`` plus the
    transferred ``Features``), with the test arrays passed as validation
    data. Precision, recall and F-score are then computed on the test split
    using the module-level averaging mode ``param``, and printed as
    percentages.

    Returns:
        Tuple ``(history, prec, reca, fscore)``: the object returned by
        ``fit()`` followed by the three scores from
        ``precision_recall_fscore_support``.
    """
    modelFinal = build_model_transfer()

    # No EarlyStopping callback is attached. An instance used to be built
    # here but was never passed to fit(), so training has always run the
    # full 20 epochs; the dead object is dropped to match what executes.
    history = modelFinal.fit(
        [W_train, Features],
        Y_train,
        epochs=20,
        validation_data=([W_test, Features_test], Y_test),
        batch_size=32,
        verbose=0,
        callbacks=[TQDMNotebookCallback()],
    )

    # Reduce the softmax output and the label matrix to class indices.
    predicted = np.argmax(modelFinal.predict([W_test, Features_test]), axis=1)
    y_test_to_label = np.argmax(Y_test, axis=1)
    prec, reca, fscore, sup = precision_recall_fscore_support(
        y_test_to_label, predicted, average=param
    )
    print("Precision:{:.2f}% Recall:{:.2f}% Fscore:{:.2f}% ".format(prec*100, reca*100, fscore*100))
    return history, prec, reca, fscore


# Train and evaluate ten independent target models, collecting every run's
# history and scores, then report the mean of each score as a percentage.
hists, precission, recall, fscores = [], [], [], []
for i in range(10):
    hist, prec, reca, fscore = train_and_evaluate_model_lstm()
    hists += [hist]
    precission += [prec]
    recall += [reca]
    fscores += [fscore]
print(
    "Final Precision:{:.2f}% Recall:{:.2f}% Fscore:{:.2f}% ".format(
        *(np.average(scores) * 100 for scores in (precission, recall, fscores))
    )
)

Precision:78.74% Recall:77.45% Fscore:77.61% 


Precision:80.91% Recall:78.29% Fscore:78.45% 


Precision:80.72% Recall:79.03% Fscore:79.22% 


Precision:81.36% Recall:76.26% Fscore:76.18% 


Precision:77.98% Recall:77.98% Fscore:77.98% 


Precision:79.35% Recall:78.37% Fscore:78.53% 


Precision:81.30% Recall:79.58% Fscore:79.78% 


Precision:80.48% Recall:77.37% Fscore:77.48% 


Precision:77.93% Recall:76.61% Fscore:76.75% 


Precision:80.12% Recall:78.85% Fscore:79.03% 
Final Precision:79.89% Recall:77.98% Fscore:78.10% 
