## Testing the Neural Entity Detector trained using Pubmed Word Embeddings

### Step 1

#### Copy the Embeddings from source location to destination location

In [None]:
# Data_Preparation opens "Word2Vec_Model.p" relative to the working directory,
# so the pickled embeddings are copied here. The quoted source path looks like a
# placeholder to be replaced with the real location of the file.
!cp "Location of Word2Vec_Model.p" .

#### Copy the Training Data, Testing Data and Evaluation Script to the destination location, and download the pre-trained model

In [1]:
# Stage the data files and the evaluation script in ./Drugs. Each quoted source
# path is a placeholder: replace it with the real location before running.
# -p keeps the cell re-runnable when the directory already exists.
!mkdir -p Drugs
!cp "Location of train_drugs.txt" Drugs
!cp "Location of test_drugs.txt" Drugs
!cp "Location of evaluation script" Drugs
# The pre-trained model is loaded from the working directory by the evaluation
# cell; -nc skips the download when the file is already present.
!wget -nc https://wcds2017summernlp.blob.core.windows.net/entityrecognition/NERmodel_Drugs.model
# The evaluation script only has to be executable, not world-writable.
!chmod 755 Drugs/evaldrugs.pl

mkdir: cannot create directory ‘Drugs_and_Diseases’: File exists


In [1]:
%%writefile Data_Preparation.py
from keras.preprocessing import sequence
import numpy as np
import cPickle as cpickle

class Data_Preparation:
    """Encode tab-separated word/tag files as padded index and one-hot arrays.

    On construction the class loads the pickled word-vector dictionary from
    "Word2Vec_Model.p" (relative to the working directory), reads the training
    and test files, and builds:

    * ``wordvecs``        - embedding matrix: vocabulary rows, then one zero row
                            used for padding, then (if needed) one "UNK" row;
    * ``word_to_ix_map``  - word -> row index in ``wordvecs``;
    * ``tag_vector_map``  - bidirectional map tag <-> one-hot tuple;
    * ``all_X_*``         - per sentence, left-padded lists of word indices;
    * ``all_Y_*``         - per sentence, left-padded lists of one-hot tags.

    Input files hold one ``word<TAB>tag`` pair per line; a blank line ends a
    sentence.
    """

    def __init__ (self, classes, seq_length, train_file=None, test_file=None, vector_size = 100):
        """Store the configuration and immediately parse both data files.

        classes     -- length of the one-hot tag vectors (must leave room for
                       the extra 'NONE' padding tag).
        seq_length  -- stored as DEFAULT_MAX_SEQ_LENGTH; padding itself uses the
                       longest sentence found in the two files.
        train_file  -- path of the training file.
        test_file   -- path of the test file.
        vector_size -- dimensionality of the word vectors.
        """
        # Some constants
        self.DEFAULT_N_CLASSES = classes
        self.DEFAULT_N_FEATURES = vector_size
        self.DEFAULT_MAX_SEQ_LENGTH = seq_length

        # Other stuff
        self.wordvecs = None
        self.word_to_ix_map = {}
        self.n_features = 0
        self.n_tag_classes = 0
        self.n_sentences_all = 0
        self.tag_vector_map = {}

        self.max_sentence_len_train = 0
        self.max_sentence_len_test = 0
        self.max_sentence_len = 0

        self.all_X_train = []
        self.all_Y_train = []
        self.all_X_test = []
        self.all_Y_test = []
        self.unk_words = []

        self.read_and_parse_data(train_file, test_file)

    def get_data (self):
        """Return (X_train, Y_train, X_test, Y_test, wordvecs)."""
        return (self.all_X_train, self.all_Y_train, self.all_X_test, self.all_Y_test, self.wordvecs)

    def decode_prediction_sequence (self, pred_seq):
        """Map each row of class scores to the tag of its arg-max class.

        Rows whose arg-max class has no registered tag are printed and skipped,
        so the result can be shorter than ``pred_seq``.
        """
        pred_tags = []
        for class_prs in pred_seq:
            class_vec = np.zeros(self.DEFAULT_N_CLASSES, dtype=np.int32)
            class_vec[np.argmax(class_prs)] = 1
            key = tuple(class_vec.tolist())
            if key in self.tag_vector_map:
                pred_tags.append(self.tag_vector_map[key])
            else:
                print(key)
        return pred_tags

    def _register_tag (self, tag, class_id):
        """Record the tag <-> one-hot mapping for ``class_id`` in both directions."""
        if class_id >= self.DEFAULT_N_CLASSES:
            raise IndexError("tag %r needs class id %d but only %d classes are configured"
                             % (tag, class_id, self.DEFAULT_N_CLASSES))
        one_hot_vec = np.zeros(self.DEFAULT_N_CLASSES, dtype=np.int32)
        one_hot_vec[class_id] = 1
        self.tag_vector_map[tag] = tuple(one_hot_vec)
        self.tag_vector_map[tuple(one_hot_vec)] = tag

    def _read_tagged_file (self, path, next_class_id, announce_new_tags = False):
        """Parse a word<TAB>tag file into a list of (words, tags) tuple pairs.

        Tags not seen before are registered with consecutive class ids starting
        at ``next_class_id``. Returns (sentences, next unused class id).
        """
        sentences = []
        words, tags = [], []
        with open(path, 'r') as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    # A blank line closes the current sentence (even an empty
                    # one, which keeps sentence counts aligned with the file).
                    sentences.append((tuple(words), tuple(tags)))
                    words, tags = [], []
                    continue

                word, tag = line.split('\t')

                if tag not in self.tag_vector_map:
                    if announce_new_tags:
                        print("added")
                    self._register_tag(tag, next_class_id)
                    next_class_id += 1

                words.append(word)
                tags.append(tag)

        # Keep the final sentence when the file does not end with a blank line.
        if words:
            sentences.append((tuple(words), tuple(tags)))
        return sentences, next_class_id

    def _longest_sentence (self, sentences):
        """Return the largest word count among ``sentences`` (0 if there are none)."""
        return max([0] + [len(words) for words, _ in sentences])

    def _encode_sentences (self, raw_data, zero_vec_pos, is_test):
        """Convert sentences to left-padded index / one-hot arrays.

        Returns (X, Y, number of words found in the vocabulary). Unknown words
        share the single "UNK" index; in the test pass they are also appended to
        ``self.unk_words``.
        """
        all_X, all_Y = [], []
        found = 0
        nil_Y = np.array(self.tag_vector_map['NONE'])

        for word_seq, tag_seq in raw_data:
            elem_wordvecs, elem_tags = [], []
            for w, t in zip(word_seq, tag_seq):
                w = w.lower()
                if w in self.word_to_ix_map:
                    found += 1
                    word_ix = self.word_to_ix_map[w]
                else:
                    if is_test:
                        self.unk_words.append(w)
                    if "UNK" not in self.word_to_ix_map:
                        if not is_test:
                            # First unknown training word: add one random,
                            # unit-length vector that every unknown word shares.
                            new_wv = 2 * np.random.randn(self.DEFAULT_N_FEATURES) - 1
                            new_wv /= np.linalg.norm(new_wv)
                            self.wordvecs = np.vstack((self.wordvecs, new_wv))
                        # NOTE(review): in the test pass no vector is added, so
                        # "UNK" aliases the current last row (the zero padding
                        # row when training produced no UNK). Kept unchanged so
                        # indices stay compatible with existing models; confirm
                        # that this is intended.
                        self.word_to_ix_map["UNK"] = self.wordvecs.shape[0] - 1
                    word_ix = self.word_to_ix_map["UNK"]
                elem_wordvecs.append(word_ix)
                elem_tags.append(self.tag_vector_map[t])

            # Pad on the left so that every sequence has the same length
            pad_length = self.max_sentence_len - len(elem_wordvecs)
            all_X.append((pad_length * [zero_vec_pos]) + elem_wordvecs)
            all_Y.append((pad_length * [nil_Y]) + elem_tags)

        return np.array(all_X), np.array(all_Y), found

    def read_and_parse_data (self, train_file, test_file, skip_unknown_words = False):
        """Load the embeddings, parse both files and build all arrays.

        ``skip_unknown_words`` is accepted for compatibility but is not used.
        Returns (X_train, Y_train, X_test, Y_test, wordvecs), the same tuple as
        ``get_data``.
        """
        ###Load the Word2Vec Model###
        print("Loading W2V model")
        # NOTE: unpickling can execute arbitrary code; only load a trusted file.
        with open("Word2Vec_Model.p", "rb") as model_file:
            W2V_model = cpickle.load(model_file)

        vocab = list(W2V_model.keys())

        self.word_to_ix_map = {}
        self.wordvecs = []

        ###Create LookUp Table for words and their word vectors###
        print("Creating LookUp table")
        for index, word in enumerate(vocab):
            self.word_to_ix_map[word] = index
            self.wordvecs.append(W2V_model[word])

        self.wordvecs = np.array(self.wordvecs)
        print(len(self.wordvecs))
        self.n_features = len(self.wordvecs[0])
        print(self.n_features)

        # Add a zero vector for the Paddings
        self.wordvecs = np.vstack((self.wordvecs, np.zeros(self.DEFAULT_N_FEATURES)))
        zero_vec_pos = self.wordvecs.shape[0] - 1

        ##########################  READ TRAINING DATA  #########################
        self.n_tag_classes = self.DEFAULT_N_CLASSES
        self.tag_vector_map = {}    # one-hot notation for each tag, both directions
        raw_data_train, tag_class_id = self._read_tagged_file(train_file, 0)
        print("raw_nd = " + str(len(raw_data_train)))

        # The 'NONE' tag labels padding positions; it takes the next class id.
        self._register_tag('NONE', tag_class_id)
        tag_class_id += 1

        self.n_sentences_all = len(raw_data_train)
        self.max_sentence_len_train = self._longest_sentence(raw_data_train)

        ##########################  READ TEST DATA  #########################
        # Class ids continue after the training tags and 'NONE', so a tag that
        # only occurs in the test file can no longer overwrite class 0.
        raw_data_test, tag_class_id = self._read_tagged_file(
            test_file, tag_class_id, announce_new_tags = True)
        print("raw_nd = " + str(len(raw_data_test)))
        # NOTE(review): despite its name this ends up holding the test-set
        # sentence count only, as in the original code.
        self.n_sentences_all = len(raw_data_test)
        self.max_sentence_len_test = self._longest_sentence(raw_data_test)

        # Both sets are padded to the longest sentence found in either file
        self.max_sentence_len = max(self.max_sentence_len_train, self.max_sentence_len_test)

        ############## Create Train and Test Vectors ################
        self.unk_words = []
        self.all_X_train, self.all_Y_train, found_train = self._encode_sentences(
            raw_data_train, zero_vec_pos, False)
        self.all_X_test, self.all_Y_test, found_test = self._encode_sentences(
            raw_data_test, zero_vec_pos, True)
        count = found_train + found_test

        print("UNK WORD COUNT " + str(len(self.unk_words)))
        print("Found WORDS COUNT " + str(count))
        print("TOTAL WORDS " + str(count+len(self.unk_words)))

        return (self.all_X_train, self.all_Y_train, self.all_X_test, self.all_Y_test, self.wordvecs)
 

Overwriting Data_Preparation.py


In [7]:
from Data_Preparation import Data_Preparation
from NER_Model import NER_Model
import cPickle as cp
from keras.models import load_model
import numpy as np

# Input files staged in ./Drugs by the setup cell
TRAIN_FILEPATH = "Drugs//train_drugs.txt"
TEST_FILEPATH = "Drugs//test_drugs.txt"

# Settings handed to Data_Preparation (layer_arg / ep_arg are not used in this cell)
vector_size = 50
classes = 7 + 1
seq_length = 613
layer_arg = 2
ep_arg = 10

if __name__ == "__main__":

    reader = Data_Preparation(classes, seq_length, TRAIN_FILEPATH, TEST_FILEPATH, vector_size)

    X_train, Y_train, X_test, Y_test, wordvecs = reader.get_data()

    nermodel = load_model("NERmodel_Drugs.model")

    # Evaluate the model
    print("Evaluating model...")
    predicted_tags = []
    test_data_tags = []
    ind = 0
    # The with-block guarantees the output file is flushed and closed even if a
    # prediction raises part-way through.
    with open("Pubmed_Output.txt", 'w') as target:
        for ind, (x, y) in enumerate(zip(X_test, Y_test), 1):
            tags = nermodel.predict(np.array([x]), batch_size=1)[0]
            pred_tags = reader.decode_prediction_sequence(tags)
            test_tags = reader.decode_prediction_sequence(y)
            ### To see Progress ###
            if ind % 500 == 0:
                print("Sentence" + str(ind))

            # Drop padding positions: keep only the predictions at positions
            # whose gold tag is not the "NONE" padding tag.
            pred_tag_wo_none = []
            test_tags_wo_none = []
            for index, test_tag in enumerate(test_tags):
                if test_tag != "NONE":
                    test_tags_wo_none.append(test_tag)
                    pred_tag_wo_none.append(pred_tags[index])

            # One predicted tag per line, blank line after every sentence
            for wo in pred_tag_wo_none:
                target.write(str(wo) + "\n")
            target.write("\n")

            # Flat tag lists over positions where neither side is "NONE"
            for i, j in zip(pred_tags, test_tags):
                if i != "NONE" and j != "NONE":
                    test_data_tags.append(j)
                    predicted_tags.append(i)

    predicted_tags = np.array(predicted_tags)
    test_data_tags = np.array(test_data_tags)

    print("Done.")

Loading W2V model
Creating LookUp table
84064
50
raw_nd = 5916
raw_nd = 233
UNK WORD COUNT 847
Found WORDS COUNT 123864
TOTAL WORDS 124711
making predictions
Evaluating model...
Done.


In [8]:
# Merge the predicted tags with the words of the test file into the
# "word<TAB>tag" layout that is passed to the evaluation script.
# Line i of the prediction file corresponds to line i of the test file:
# one tag per token and a blank line after every sentence.
with open("Pubmed_Output.txt") as file1:
    list1 = file1.readlines()

with open("Drugs//test_drugs.txt") as file2:
    list2 = file2.readlines()

with open("Drugs//eval2.txt", "w") as target:
    for ind, line in enumerate(list2):
        x = line.split("\t")
        if len(x) == 1:
            # Sentence separator (no tab on the line)
            target.write("\n")
        else:
            # Strip the newline before comparing; otherwise "NONE\n" never
            # equals "NONE" and the padding tag leaks into the output.
            predicted = list1[ind].strip()
            if predicted == "NONE":
                predicted = "O"
            target.write(x[0] + "\t" + predicted + "\n")

In [9]:
# Run the evaluation script on the merged prediction file (eval2.txt, written
# by the previous cell) and the test file.
!./Drugs/evaldrugs.pl Drugs/eval2.txt Drugs/test_drugs.txt

                              Biomedical Entity Recognition Performance (Genaral)                                         
                                                                                         number(recall/precision/f-score) 
+------------------+---------------------------------+---------------------------------+---------------------------------+
|                  |          complete match         |       right boundary match      |       left boundary match       |
+------------------+---------------------------------+---------------------------------+---------------------------------+
|   Drug    ( 450) |  358 (79.56% / 93.23% / 85.85%) |  358 (79.56% / 93.23% / 85.85%) |  362 (80.44% / 94.27% / 86.81%) |
+------------------+---------------------------------+---------------------------------+---------------------------------+
|  [-ALL-]  ( 450) |  358 (79.56% / 93.23% / 85.85%) |  358 (79.56% / 93.23% / 85.85%) |  362 (80.44% / 94.27% / 86.81%) |
+-------