In [1]:
# http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin

In [2]:
# Importing the required packages
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from gensim.models import KeyedVectors

In [22]:
import keras
print(keras.__version__)

2.1.2


In [23]:
import tensorflow
print(tensorflow.__version__)

1.3.0


In [4]:
EMBEDDING_FILE = 'wikipedia-pubmed-and-PMC-w2v.bin'
print('Indexing word vectors')
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE,binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

Indexing word vectors
Found 5443656 word vectors of word2vec


In [5]:
import copy
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [6]:
# Reading the text file 'DRUG-AE.rel' which provides relations between drugs and adverse effects.
TEXT_FILE = 'DRUG-AE.rel'

In [7]:
# Creating lists for the input fields and corresponding labels
input_data_ae = []
op_labels_ae = []

sentences = []

In [8]:
f = open(TEXT_FILE, 'r')

for each_line in f.readlines():
    sent_list = np.zeros([0,200])
    labels = np.zeros([0,3])
    tokens = each_line.split("|")
    sent = tokens[1]
    if sent in sentences:
        continue
    sentences.append(sent)
    begin_offset = int(tokens[3])
    end_offset = int(tokens[4])
    mid_offset = range(begin_offset+1, end_offset)
    word_tokens = nltk.word_tokenize(sent)
    offset = 0
    for each_token in word_tokens:
        offset = sent.find(each_token, offset)
        offset1 = copy.deepcopy(offset)
        offset += len(each_token)
        if each_token in punctuation or re.search(r'\d', each_token):
            continue
        each_token = each_token.lower()
        each_token = re.sub("[^A-Za-z\-]+","", each_token)
        if each_token in word2vec.vocab:
            new_word = word2vec.word_vec(each_token)
        if offset1 == begin_offset:
            sent_list = np.append(sent_list, np.array([new_word]), axis=0)
            labels = np.append(labels, np.array([[0,0,1]]), axis=0)
        elif offset == end_offset or offset in mid_offset:
            sent_list = np.append(sent_list, np.array([new_word]), axis=0)
            labels = np.append(labels, np.array([[0,1,0]]), axis=0)
        else:
            sent_list = np.append(sent_list, np.array([new_word]), axis=0)
            labels = np.append(labels, np.array([[1,0,0]]), axis=0)

    input_data_ae.append(sent_list)
    op_labels_ae.append(labels)
input_data_ae = np.array(input_data_ae)
op_labels_ae  = np.array(op_labels_ae)

In [9]:
input_data_ae = pad_sequences(input_data_ae, maxlen=30, dtype='float64', padding='post')
op_labels_ae = pad_sequences(op_labels_ae, maxlen=30, dtype='float64', padding='post')

In [21]:
print(len(input_data_ae))
print(len(op_labels_ae))

4271
4271


In [10]:
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional, TimeDistributed
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [12]:
# Creating Train and Validation datasets, for 4271 entries, 4000 in train dataset, and 271 in validation dataset
x_train= input_data_ae[:4000]
x_test = input_data_ae[4000:]
y_train = op_labels_ae[:4000]
y_test =op_labels_ae[4000:]

In [13]:
batch = 1      # Making the batch size as 1, as showing model each of the instances one-by-one
# Adding Bidirectional LSTM with Dropout, and Time Distributed layer with Dropout
# Finally using Adam optimizer for training purpose
xin = Input(batch_shape=(batch,30,200), dtype='float')
seq = Bidirectional(LSTM(300, return_sequences=True),merge_mode='concat')(xin)
mlp1 = Dropout(0.2)(seq)
mlp2 = TimeDistributed(Dense(60, activation='softmax'))(mlp1)
mlp3 = Dropout(0.2)(mlp2)
mlp4 = TimeDistributed(Dense(3, activation='softmax'))(mlp3)
model = Model(inputs=xin, outputs=mlp4)
model.compile(optimizer='Adam', loss='categorical_crossentropy')

In [14]:
model.fit(x_train, y_train,
          batch_size=batch,
          epochs=50,
          validation_data=(x_test, y_test))

Train on 4000 samples, validate on 271 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f48213a3b38>

In [33]:
val_pred = model.predict(x_test,batch_size=batch)
labels = []
for i in range(len(val_pred)):
    b = np.zeros_like(val_pred[i])
    b[np.arange(len(val_pred[i])), val_pred[i].argmax(1)] = 1
    labels.append(b)

In [36]:
print(val_pred.shape)

(271, 30, 3)


In [16]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score  

In [17]:
score =[]
f1 = []
precision =[]
recall =[]
point = []

In [18]:
for i in range(len(y_test)):
    if(f1_score(labels[i],y_test[i],average='weighted')>.6):
        point.append(i)
    score.append(f1_score(labels[i],y_test[i],average='weighted'))
    precision.append(precision_score(labels[i],y_test[i],average='weighted'))
    recall.append(recall_score(labels[i],y_test[i],average='weighted'))

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [41]:
print(len(point)/len(labels)*100)

69.37269372693727


In [42]:
print(np.mean(score))

0.686425174879


In [43]:
print(np.mean(precision))

0.975002548625


In [44]:
print(np.mean(recall))

0.576137761378


In [20]:
print(score)
print("\n------x------\n")
print(precision)
print("\n------x------\n")
print(recall)

[0.82352941176470584, 0.82352941176470584, 0.32727272727272727, 0.88888888888888895, 0.45833333333333331, 0.60465116279069764, 0.88888888888888895, 0.72340425531914887, 0.12444444444444444, 0.98305084745762705, 0.80000000000000004, 1.0, 0.72340425531914887, 0.42727272727272719, 0.86792452830188693, 0.72340425531914887, 0.5864864864864866, 1.0, 0.80000000000000004, 0.60465116279069764, 0.80000000000000004, 0.94736842105263164, 0.20413793103448274, 1.0, 0.38274509803921569, 0.96551724137931039, 0.74999999999999989, 0.39298245614035082, 0.77551020408163263, 0.4210526315789474, 0.80000000000000004, 1.0, 0.66666666666666663, 0.39629629629629631, 0.12473118279569892, 0.74999999999999989, 0.4210526315789474, 0.46153846153846151, 0.69565217391304357, 0.59078590785907859, 0.78974358974358982, 0.86792452830188693, 0.51239316239316246, 0.57142857142857151, 0.61515151515151512, 0.29999999999999999, 0.5, 0.51869918699187001, 0.66666666666666663, 0.71449275362318843, 0.77551020408163263, 0.675757575