# Drug Name Entity Classifier
## AHLT - MIRI 2018



## Initialization

Load needed modules and specify the working directory

In [1]:
# Load needed packages
from lxml import etree # XML file parsing
from os import listdir
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV # Parameter selection
import time # Execution time of some blocks
from nltk.tag import StanfordPOSTagger
import statistics
import scipy.stats # for RandomizedSearchCV

# Import our defined functions
from drug_functions import *

In [2]:
# Set the data directories
train_dirs_whereto_parse = ['data/small_train_DrugBank']
test_dirs_whereto_parse = ['data/small_test_DrugBank']

## Reading the train and test data from the XML files
Accessing to all the files of the directory and storing id's and text's in two arrays.
We have also added the token 'STOP' at the end of each sentence.

In [3]:
## TRAINING DATA

# Initialise the different lists with the data
entities=[]
texts=[]
train_texts_entities = []

# Iterate over all the different .xml files located in the specified directories
for directory in train_dirs_whereto_parse:
    
    # Get the names of all the files in the directory and create a 'xml.root' object for
    # each xml file
    roots = [etree.parse(directory+'/'+a).getroot() for a in listdir(directory) if a.endswith('.xml')]
    
    # Iterate over all the different 'xml.root' objects to extract the needed information
    for root in roots:
        for sentence in root.findall('sentence'):
            for entity in sentence.findall('entity'):
                entities = entities+[entity.get('text')]
            # we do not add to the train set those sentences with no entities
            if entities:
                train_texts_entities = train_texts_entities + [('START '+sentence.get('text')+' STOP', entities)]
                entities =[]

# train_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there
# Example: 
# [('I love Ibuprofeno and Frenadol', ['Ibuprofeno', 'Frenadol']), ('Give me a Fluimucil', ['Fluimucil'])]

train_texts_entities[0:2]

[('START Formal drug interaction studies have not been conducted with ORENCIA. STOP',
  ['ORENCIA']),
 ('START Population pharmacokinetic analyses revealed that MTX, NSAIDs, corticosteroids, and TNF blocking agents did not influence abatacept clearance. STOP',
  ['MTX', 'NSAIDs', 'corticosteroids', 'TNF blocking agents', 'abatacept'])]

In [4]:
## TESTING DATA

# Same process as with the training data
# In the testing data, for each sentance we have two related files:
# - A file with a sentence to be parsed, in which we may encounter drug names (ending with 'text.txt')
# - A file with the drug entities recognised in the sentence (ending with 'entities.txt')

test_texts = []
test_entities = []

for directory in test_dirs_whereto_parse:
    
    # Si no poso el sorted, em llegeix els files amb un ordre aleatori.
    # Amb el sorted m'asseguro que els corresponents files text.txt i entities.txt estan en la mateixa posicio
    
    # Read the pairs of files in alphabetical order
    text_file_names = sorted([directory + '/' + file for file in listdir(directory) if file.endswith('text.txt')])
    entities_file_names = sorted([directory + '/' + file for file in listdir(directory) if file.endswith('entities.txt')])
    
    for file in text_file_names:
        file = open(file,'r')
        test_texts = test_texts + ['START '+file.read()[:-1]+' STOP'] # each file.read() string ends with a \n I do not want
        
    for file in entities_file_names:
        read_entities = []
        with open(file,'r') as f:
            for line in f:
                read_entities = read_entities+[' '.join(line.split()[0:-1])] # separo en words, el.limino la ultima i torno a unir
                
        test_entities.append(read_entities)


test_texts_entities=list(zip(test_texts,test_entities))


# test_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there.
print(test_texts_entities[4])

('START Laboratory Tests Response to Plenaxis should be monitored by measuring serum total testosterone concentrations just prior to administration on Day 29 and every 8 weeks thereafter. STOP', ['testosterone', 'Plenaxis'])


# Creating the features for the classifier

## BIO Tagger and Feature Creation

In this section we will tag each sentence with the BIO format. For this, we have created a function called 'BIOTagger' which will perform the following actions:

Given a sentence 'text' and a set of drugs 'drugs', this function returns a list of str that
contains a tag for each of the tokens in text. The tags can be either 'B', 'I' or 'O'. 'B' means
the token is the first part of a drug entity, 'I' means the token is the continuation of a drug entity,
and 'O' means that the token does not belong to a drug entity.

Apart from that, we have also downloaded the DrugBank database (ref: https://www.drugbank.ca/) from we will extract all the named entities. We will create a list out of these set of entities and for each token processed, we will check if the token is already in the database, meaning that has a very high probability of being a NE.

In [5]:
# Load the DrugBank list of entities (it has already been processed for the extraction of the NE).
# Each line of the file contains a different named entity.
with(open('data/DrugBank_names_DB.txt', 'r')) as f:
    drugbank_db = f.read().splitlines()
        
# Initialise the needed lists
tokens = []
tags = []
removed_columns = []
features = pd.DataFrame()

# Creating StanfordPOStagger. We will need it as a createFeatureVector function parameter
jar='Stanford_POStagger/stanford-postagger.jar'
model='Stanford_POStagger/models/english-bidirectional-distsim.tagger'
st = StanfordPOSTagger(model,jar, encoding='utf-8')

#
# Iterate over all the train entities (tuples of (sentence, drugs)) and apply the BIOTagger function
for text,drugs in train_texts_entities:
    tokenized_sentence = nltk.word_tokenize(text)
    features = pd.concat([features,createFeatureVector(tokenized_sentence, drugbank_db,st)])
    tuples = BOTagger(text, drugs)
    tokens = tokens + [word[0] for word in tuples]
    tags = tags + [word[1] for word in tuples]

# computing one-hot coding for 'Aa1-' feature.
training_dummies = pd.get_dummies(features['Aa1-'])
features = features.drop('Aa1-',axis=1)
# joining both data frames
for name in training_dummies.columns:
    features[name]=training_dummies[name]
'''
# Adding the lessFrequentTokensFeature. We will needed first the list with all the tokens.
less_frequent_tokens = lessFrequentTokens(tokens)
for token in tokens:
    if token in less_frequent_tokens:
        features['less frequent token'] +=[1]
    else:
        features['less frequent token'] +=[0]
'''


# Create a training set with the features,tokens and the BIO tags
train_df = features
train_df['token'] = tokens
train_df['output'] = tags


The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


In [6]:
# How many tokens have we tagged with the DrugBank?
print(sum(train_df['is_token_in_DrugBank_db'] == 1))

# How many tokens are actually tagged with a 'B' or a 'I'?
print(sum(train_df['output'] == 'B'))
print(sum(train_df['output'] == 'I'))
#train_df[train_df['is_token_in_DrugBank_db'] == 1]

# Which are our unique values?
print(train_df.output.unique())


train_df.head()

198
202
0
['O' 'B']


Unnamed: 0,token_length,prefix_feature,suffix_feature,all_uppercase_letters,all_lowercase_letters,initial_capital_letter,contains_slash,all_letters,all_digits,contains_digit,...,-A1,-a,-a1,-aA,1,A,a,aA,token,output
0,5,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,START,O
1,6,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,Formal,O
2,4,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,drug,O
3,11,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,interaction,O
4,7,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,studies,O


# Building the classifier
## Support Vector Machines

The advantages of support vector machines are:

- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

- If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
- SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

In [7]:
# Name of the target variable
target_name = 'output'
token_name = 'token'

# Create the appropiate data structure to pass it to the SVM.
# X columns should be all but target_name and token_name
X_train = train_df.loc[:, [all(x) for x in list(zip(train_df.columns!=target_name,train_df.columns!=token_name))]]
Y_train = train_df[target_name]

### Tunning SVM in python

In [8]:
# Create a SVM object with the corresponding tunned parameters
svc = svm.SVC()

# Look for the best parameters of the SVM model with GridSearchCV
start = time.time()
clf = RandomizedSearchCV(svc,{'C': scipy.stats.norm(15,5), 'gamma': scipy.stats.expon(scale=.1),
                              'kernel': ['rbf'], 'class_weight':['balanced', None]},20) # 40 iterations
end = time.time()
print('Execution time for GridSearchCV: ', str(end - start))

Execution time for GridSearchCV:  0.002115011215209961


In [9]:
# Train the SVM model with the parameters selected before
start = time.time()
clf.fit(X_train,Y_train)
end = time.time()
print('Training time of the SVM: ', str(end - start))

Training time of the SVM:  3.3476500511169434


In [10]:
# Computing training error. If there is a significant drop from training error to test error, we will be suffering 
# from overfitting

train_predictions = []
for text,entities in train_texts_entities:
    #print('text: ', text)
    #print('real entities: ',entities,'\n')
    
    # tokenize text
    tokens = nltk.word_tokenize(text)
    # computing predictions
    features = createFeatureVector(tokens, drugbank_db,st)
    
    # computing one-hot coding for 'Aa1-' feature.
    dummies = pd.get_dummies(features['Aa1-'])
    features = features.drop('Aa1-',axis=1)
    # joining both data frames
    for name in dummies.columns:
        features[name]=dummies[name]

    #print(tokens)
    
    # adding those columns related to Aa1- that we cannot see with the sentence in question
    for name in training_dummies.columns:
        if name not in dummies.columns:
            features[name]=[0]*len(dummies[dummies.columns.values[0]])
    
        
    predicted_tags = clf.predict(features)
    
    
    train_predictions.append((list(predicted_tags),entities,text)) 
    #print('predicted bio tags: ',predicted_tags,'\n')
    pred_entities = BOTagsToEntities(tokens = tokens, bo_tags = predicted_tags)
    #print('predicted entities: ', pred_entities, '\n')
    
# predictions is a list of tupples comprised of predicted tags and the true drugs we should extract from there
#print('predictions of text 1: ',predictions[1])

text:  START Formal drug interaction studies have not been conducted with ORENCIA. STOP
real entities:  ['ORENCIA'] 

['START', 'Formal', 'drug', 'interaction', 'studies', 'have', 'not', 'been', 'conducted', 'with', 'ORENCIA', '.', 'STOP']
predicted bio tags:  ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B' 'O' 'O'] 

predicted entities:  ['ORENCIA'] 

text:  START Population pharmacokinetic analyses revealed that MTX, NSAIDs, corticosteroids, and TNF blocking agents did not influence abatacept clearance. STOP
real entities:  ['MTX', 'NSAIDs', 'corticosteroids', 'TNF blocking agents', 'abatacept'] 

['START', 'Population', 'pharmacokinetic', 'analyses', 'revealed', 'that', 'MTX', ',', 'NSAIDs', ',', 'corticosteroids', ',', 'and', 'TNF', 'blocking', 'agents', 'did', 'not', 'influence', 'abatacept', 'clearance', '.', 'STOP']
predicted bio tags:  ['O' 'O' 'O' 'O' 'O' 'O' 'B' 'O' 'B' 'O' 'B' 'O' 'O' 'B' 'O' 'O' 'O' 'O'
 'O' 'B' 'O' 'O' 'O'] 

predicted entities:  ['MTX', 'NSAIDs', 'corticoste

['START', 'Formal', 'drug', 'interaction', 'studies', 'with', 'Abciximab', 'have', 'not', 'been', 'conducted', '.', 'STOP']
predicted bio tags:  ['O' 'O' 'O' 'O' 'O' 'O' 'B' 'O' 'O' 'O' 'O' 'O' 'O'] 

predicted entities:  ['Abciximab'] 

text:  START Abciximab has been administered to patients with ischemic heart disease treated concomitantly with a broad range of medications used in the treatment of angina myocardial infarction and hypertension. STOP
real entities:  ['Abciximab'] 

['START', 'Abciximab', 'has', 'been', 'administered', 'to', 'patients', 'with', 'ischemic', 'heart', 'disease', 'treated', 'concomitantly', 'with', 'a', 'broad', 'range', 'of', 'medications', 'used', 'in', 'the', 'treatment', 'of', 'angina', 'myocardial', 'infarction', 'and', 'hypertension', '.', 'STOP']
predicted bio tags:  ['O' 'B' 'O' 'O' 'O' 'O' 'O' 'O' 'B' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'B' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'] 

predicted entities:  ['Abciximab', 'ischemic', 'medicati

['START', 'Acarbose', 'may', 'affect', 'digoxin', 'bioavailabillty', 'and', 'may', 'require', 'dose', 'adjustment', 'of', 'digoxin', 'by', '16', '%', '(', '90', '%', 'confidence', 'interval', ':', '8-23', '%', ')', ',', 'decrease', 'mean', 'C', 'max', 'digoxin', 'by', '26', '%', '(', '90', '%', 'confidence', 'interval', ':', '16-34', '%', ')', 'and', 'decrease', 'mean', 'trough', 'concentrations', 'of', 'digoxin', 'by', '9', '%', '(', '90', '%', 'confidence', 'limit', ':', '19', '%', 'decrease', 'to', '2', '%', 'increase', ')', '.', 'STOP']
predicted bio tags:  ['O' 'B' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'] 

predicted entities:  ['Acarbose', 'digoxin', 'digoxin', 'digoxin'] 

text:  START The amount of metformin absorbed while taking Acarbose was bioequiv

['START', 'Blunting', 'of', 'the', 'antihypertensive', 'effect', 'of', 'beta-adrenoceptor', 'blocking', 'agents', 'by', 'nonsteroidal', 'anti-inflammatory', 'drugs', 'has', 'been', 'reported', '.', 'STOP']
predicted bio tags:  ['O' 'O' 'O' 'O' 'B' 'O' 'O' 'B' 'B' 'O' 'O' 'B' 'B' 'B' 'O' 'O' 'O' 'O'
 'O'] 

predicted entities:  ['antihypertensive', 'beta-adrenoceptor blocking', 'nonsteroidal anti-inflammatory drugs'] 

text:  START No significant interactions with digoxin, hydrochlorothiazide, hydralazine, sulfinpyrazone, oral contraceptives, tolbutamide, or warfarin have been observed. STOP
real entities:  ['digoxin', 'hydrochlorothiazide', 'hydralazine', 'sulfinpyrazone', 'contraceptives', 'tolbutamide', 'warfarin'] 

['START', 'No', 'significant', 'interactions', 'with', 'digoxin', ',', 'hydrochlorothiazide', ',', 'hydralazine', ',', 'sulfinpyrazone', ',', 'oral', 'contraceptives', ',', 'tolbutamide', ',', 'or', 'warfarin', 'have', 'been', 'observed', '.', 'STOP']
predicted bio tags:

In [11]:
import statistics
train_precision = []
train_recall = []
for tags, true_entities, text in train_predictions:
    # I need the tokens for the bioTagsToEntities function
    tokens = nltk.word_tokenize(text)
    predicted_entities = BOTagsToEntities(tokens,tags)
    train_precision = train_precision + [compute_precision(predicted_entities,true_entities)]
    train_recall = train_recall + [compute_recall(predicted_entities,true_entities)]

    
avg_precision = statistics.mean(train_precision)
avg_recall = statistics.mean(train_recall)
print('train precision: ',avg_precision)
print('train recall: ',avg_recall)

# F1 metric
F1_train = round((2*avg_precision*avg_recall) / (avg_precision + avg_recall),2)
print('F1 train: ', F1_train)

train precision:  74.85454545454546
train recall:  84.34545454545454
F1 train:  79.32


### Making predictions

In [12]:
predictions = []
for text,entities in test_texts_entities:
    # print('original text: ', BOTagger(text = text, drugs = entities), '\n')
    
    # tokenize text
    tokens = nltk.word_tokenize(text)
    # computing predictions
    features = createFeatureVector(tokens, drugbank_db,st)
    
    # computing one-hot coding for 'Aa1-' feature.
    dummies = pd.get_dummies(features['Aa1-'])
    features = features.drop('Aa1-',axis=1)
    # joining both data frames
    for name in dummies.columns:
        features[name]=dummies[name]
    
    # adding those columns related to Aa1- that we cannot see with the sentence in question
    for name in training_dummies.columns:
        if name not in dummies.columns:
            features[name]=[0]*len(dummies[dummies.columns.values[0]])
   
    
    predicted_tags = clf.predict(features)
    
    predictions.append((list(predicted_tags),entities,text)) 
    # print('predicted bio tags: ',str(list(zip(tokens, predicted_tags))),'\n')
    pred_entities = BOTagsToEntities(tokens = tokens, bo_tags = predicted_tags)
    # print('predicted entities: ', pred_entities, '\n')
    
    # TODO: Something is wrong with the BIOTagsToEntities()
    
# predictions is a list of tupples comprised of predicted tags and the true drugs we should extract from there
#print('predictions of text 1: ',predictions[1])

Then, let's define a function that recover's the whole drug name from BIO taggs

### Evaluation

Evaluation will be based on $$F1=\frac{2*precision*recall}{precision+recall}$$

Let's recover all the words from the predicted bio_tags and try to compute F1 for each sentence

In [13]:
precision = []
recall = []
for tags, true_entities, text in predictions:
    # I need the tokens for the bioTagsToEntities function
    tokens = nltk.word_tokenize(text)
    predicted_entities = BOTagsToEntities(tokens,tags)
    precision = precision + [compute_precision(predicted_entities,true_entities)]
    recall = recall + [compute_recall(predicted_entities,true_entities)]

    
avg_precision = statistics.mean(precision)
avg_recall = statistics.mean(recall)
print('precision: ',avg_precision)
print('recall: ',avg_recall)

# F1 metric
F1 = round((2*avg_precision*avg_recall) / (avg_precision + avg_recall),2)
print('F1: ', F1)

precision:  63.5
recall:  74.61111111111111
F1:  68.61


In [14]:
'''
## Log of results
date, precision, recall, F1, features, test
14-May, 46.2, 52.1, 48.99, Token length; Prefixes/Suffixes; POS tag; Binary features (+-2); Token position; DrugBank DB; Shape, yes
'''

'\n## Log of results\ndate, precision, recall, F1, features, test\n14-May, 46.2, 52.1, 48.99, Token length; Prefixes/Suffixes; POS tag; Binary features (+-2); Token position; DrugBank DB; Shape, yes\n'

In [15]:
from operator import itemgetter
a = [('h',1),('a',2),('d',3)]
print(sorted(a,key=itemgetter(1),reverse=True))

print(round(1.23,0))

[('d', 3), ('a', 2), ('h', 1)]
1.0
