# Drug Name Entity Classifier
## AHLT - MIRI 2018



## Initialization

Load needed modules and specify the working directory

In [1]:
# Load needed packages
from lxml import etree # XML file parsing
from os import listdir
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV # Parameter selection
import time # Execution time of some blocks

# Import our defined functions
from drug_functions import *

In [2]:
# Directories with the corpus: '.xml' files are read from the training directories,
# '*text.txt' / '*entities.txt' file pairs from the testing directories
train_dirs_whereto_parse = [
    'data/small_train_DrugBank',
]
test_dirs_whereto_parse = [
    'data/small_test_DrugBank',
]

## Reading the train and test data from the XML files
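We go through all the XML files of each training directory and store, for every sentence, its text together with the list of drug entities it contains.
We also add the token 'START' at the beginning and the token 'STOP' at the end of each training sentence.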

In [3]:
## TRAINING DATA

# Kept defined (and empty) exactly as before, in case other cells rely on these names
entities = []
texts = []

# train_texts_entities is a list of tuples, one per <sentence> element. Each one is comprised
# of the sentence (wrapped in the 'START' / 'STOP' boundary tokens) and the drugs in there.
# Example:
# [('START I love Ibuprofeno and Frenadol STOP', ['Ibuprofeno', 'Frenadol']),
#  ('START Give me a Fluimucil STOP', ['Fluimucil'])]
train_texts_entities = []

# Iterate over all the different .xml files located in the specified directories
for directory in train_dirs_whereto_parse:
    for file_name in listdir(directory):
        if not file_name.endswith('.xml'):
            continue

        # Parse one file at a time, so that only a single XML tree is alive at once
        root = etree.parse(directory + '/' + file_name).getroot()

        for sentence in root.findall('sentence'):
            # Drug mentions annotated in this sentence, in document order
            sentence_drugs = [entity.get('text') for entity in sentence.findall('entity')]
            # append() instead of `list = list + [...]`, which copied the whole list for every sentence
            train_texts_entities.append(('START ' + sentence.get('text') + ' STOP', sentence_drugs))

train_texts_entities[0:2]

[('START Formal drug interaction studies have not been conducted with ORENCIA. STOP',
  ['ORENCIA']),
 ('START Population pharmacokinetic analyses revealed that MTX, NSAIDs, corticosteroids, and TNF blocking agents did not influence abatacept clearance. STOP',
  ['MTX', 'NSAIDs', 'corticosteroids', 'TNF blocking agents', 'abatacept'])]

In [4]:
## TESTING DATA

# Same process as with the training data.
# In the testing data, for each sentence we have two related files:
# - A file with a sentence to be parsed, in which we may encounter drug names (ending with 'text.txt')
# - A file with the drug entities recognised in the sentence (ending with 'entities.txt')

test_texts = []
test_entities = []

for directory in test_dirs_whereto_parse:

    # Without sorting, the files are read in arbitrary order. Sorting both lists is what
    # puts the corresponding 'text.txt' and 'entities.txt' files at the same position.
    file_names = listdir(directory)
    text_file_names = sorted(directory + '/' + name for name in file_names if name.endswith('text.txt'))
    entities_file_names = sorted(directory + '/' + name for name in file_names if name.endswith('entities.txt'))

    # Texts and entities are paired by position below: a missing file would silently
    # shift every following pair, so fail early instead
    assert len(text_file_names) == len(entities_file_names), \
        'Unpaired text/entities files in ' + directory

    for path in text_file_names:
        # Context manager so that the handle is closed (it was previously left open)
        with open(path, 'r') as f:
            test_texts.append(f.read())

    for path in entities_file_names:
        read_entities = []
        with open(path, 'r') as f:
            for line in f:
                # Split the line into words, drop the last one and join the rest back together
                read_entities.append(' '.join(line.split()[0:-1]))

        test_entities.append(read_entities)


# test_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there.
test_texts_entities = list(zip(test_texts, test_entities))

print(test_texts_entities[4])

('Laboratory Tests Response to Plenaxis should be monitored by measuring serum total testosterone concentrations just prior to administration on Day 29 and every 8 weeks thereafter.\n', ['testosterone', 'Plenaxis'])


# Creating the features for the classifier

## BIO Tagger and Feature Creation

In this section we will tag each sentence with the BIO format. For this, we have created a function called 'bioTagger' which will perform the following actions:

Given a sentence 'text' and a set of drugs 'drugs', this function returns a list of str that
contains a tag for each of the tokens in text. The tags can be either 'B', 'I' or 'O'. 'B' means
the token is the first part of a drug entity, 'I' means the token is the continuation of a drug entity,
and 'O' means that the token does not belong to a drug entity.

Apart from that, we have also downloaded the DrugBank database (ref: https://www.drugbank.ca/), from which we extract all the named entities. We build a list out of this set of entities and, for each token processed, we check whether the token is already in the database, in which case it has a very high probability of being a NE.

In [11]:
# (Leftover interactive help lookup `?str.` removed: it is not part of the pipeline.)

In [22]:
# Load the DrugBank list of entities (it has already been processed for the extraction of the NE).
# Each line of the file contains a different named entity.
with open('data/DrugBank_names_DB2.txt', 'r') as f:
    drugbank_db = f.read().splitlines()

# Per-sentence results are collected in plain lists and combined once at the end:
# growing a DataFrame with pd.concat inside the loop re-copies it on every iteration.
feature_frames = []
tokens = []
tags = []

# Iterate over all the train entities (tuples of (sentence, drugs)): build the feature
# vectors of the sentence and apply the bioTagger function to get its (token, tag) pairs
for text, drugs in train_texts_entities:
    feature_frames.append(createFeatureVector(text, drugbank_db))
    tuples = bioTagger(text, drugs)
    tokens.extend(word[0] for word in tuples)
    tags.extend(word[1] for word in tuples)

# The original per-sentence row index is kept (no ignore_index)
features = pd.concat(feature_frames) if feature_frames else pd.DataFrame()

# Remove the feature columns whose values sum to 0. We will find many in those indicating pos_tags.
# skipna=False keeps a column that contains NaN, as the built-in sum() did.
# The removed names are remembered so the same columns can be dropped from the test features.
removed_columns = [column for column in features.columns.values
                   if features[column].sum(skipna=False) == 0]
features = features.drop(removed_columns, axis=1)

# Create a training set with the features, tokens and the BIO tags.
# assign() returns a new frame, so `features` itself keeps only the feature columns.
train_df = features.assign(token=tokens, output=tags)

In [27]:
# Rows flagged by the 'is_token_in_DrugBank_db' feature: print how many there are, then show them
in_drugbank = train_df['is_token_in_DrugBank_db'] == 1
print(sum(in_drugbank))
train_df[in_drugbank]

53


Unnamed: 0,token_length,prefix_feature,suffix_feature,all_uppercase_letters,all_lowercase_letters,initial_capital_letter,contains_slash,all_letters,all_digits,contains_digit,...,contains_slash_context2,all_letters_context2,all_digits_context2,contains_digit_context2,contains_letters_context2,contains_uppercase_context2,contains_dash_context2,is_token_in_DrugBank_db,token,output
5,7,0,1,0,1,0,0,0,0,0,...,0,1,0,0,1,1,0,1,alcohol,B
7,11,0,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,Acamprosate,B
15,7,0,1,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,alcohol,B
5,11,0,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,Acamprosate,B
15,11,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,Acamprosate,B
16,11,0,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,Acamprosate,B
3,11,0,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,Acamprosate,B
6,9,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,Abciximab,B
1,9,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,Abciximab,B
10,7,0,0,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,1,disease,O


# Building the classifier
## Support Vector Machines

The advantages of support vector machines are:

- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

- If the number of features is much greater than the number of samples, avoiding over-fitting when choosing the kernel function and the regularization term is crucial.
- SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

In [None]:
# Columns that are not model inputs: the target variable and the token text
target_name = 'output'
token_name = 'token'

# Data structures passed to the SVM.
# X holds every column except target_name and token_name; Y holds the target column.
is_feature_column = ~train_df.columns.isin([target_name, token_name])
X = train_df.loc[:, is_feature_column]
Y = train_df[target_name]

### Tuning SVM in python

In [None]:
# Hyper-parameter grid explored by the search: two kernels x two values of C
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()

# Build the grid-search object. Nothing is trained here: the search over `parameters`
# only runs when clf.fit() is called. Timing this constructor (as was done before)
# reported a meaningless, near-zero "execution time", so that timer has been removed.
clf = GridSearchCV(svc, parameters)

In [None]:
# Run the grid search: every parameter combination is cross-validated and, with the
# default refit=True, the best one is refitted on the whole training set.
# The measured time therefore covers parameter selection AND the final training.
# perf_counter() is a monotonic clock, which is the appropriate one for measuring intervals.
start = time.perf_counter()
clf.fit(X,Y)
end = time.perf_counter()
print('Training time of the SVM: ', str(end - start))
print('Best parameters: ', clf.best_params_)

For each test text, we create its feature vector and predict the BIO tag of every token:

##### Making predictions

In [None]:
# predictions is a list of tuples comprised of the predicted BIO tags, the true drugs we
# should extract from the sentence, and the sentence text itself
predictions = []
for text, true_drugs in test_texts_entities:
    # Compute the features of the sentence
    features = createFeatureVector(text, drugbank_db)

    # Remove the columns deleted when training the classifier, in a single call.
    # errors='ignore' mirrors the previous per-column check: names that are absent are skipped.
    features = features.drop(removed_columns, axis=1, errors='ignore')
    predicted_tags = clf.predict(features)

    # The text is stored as well: the evaluation cell tokenizes it and rebuilds the predicted
    # entities from the tags, so that work is not done (and thrown away) here any more
    predictions.append((list(predicted_tags), true_drugs, text))

Then, let's recover the whole drug names from the BIO tags.

#####  Evaluation

Evaluation will be based on $$F_1=\frac{2 \cdot precision \cdot recall}{precision+recall}$$

In [None]:
def compute_precision(pred_ent,true_ent):
    """Return the precision of the predicted entities of one sentence, as a percentage.

    Precision is the share of predicted entities that are correct. Matching is done on
    multisets, so each true mention can validate at most one prediction (a drug predicted
    twice but present once counts as one hit and one false positive).

    Parameters
    ----------
    pred_ent : list of str
        Entities predicted for the sentence.
    true_ent : list of str
        True entities of the same sentence.

    Returns
    -------
    int or float
        0 when either list is empty; otherwise the precision rounded to a whole
        percent, as a float between 0.0 and 100.0.
    """
    # Local import: the notebook's import cell does not bring Counter into scope
    from collections import Counter

    if len(pred_ent) == 0 or len(true_ent) == 0:
        return 0

    # Size of the multiset intersection = number of true positives
    true_positives = sum((Counter(pred_ent) & Counter(true_ent)).values())
    # Scale before rounding: `round(x, 2) * 100` produced float noise such as 56.99999999999999
    return float(round(100 * true_positives / len(pred_ent)))

In [None]:
def compute_recall(pred_ent,true_ent):
    """Return the recall of the predicted entities of one sentence, as a percentage.

    Recall is the share of true entities that were predicted. Matching is done on
    multisets, so a true mention is counted at most once however many times it is
    predicted. The previous version counted predictions, which let recall exceed 100%.

    Parameters
    ----------
    pred_ent : list of str
        Entities predicted for the sentence.
    true_ent : list of str
        True entities of the same sentence.

    Returns
    -------
    int or float
        0 when either list is empty; otherwise the recall rounded to a whole
        percent, as a float between 0.0 and 100.0.
    """
    # Local import: the notebook's import cell does not bring Counter into scope
    from collections import Counter

    if len(pred_ent) == 0 or len(true_ent) == 0:
        return 0

    # Size of the multiset intersection = number of true entities that were found
    true_positives = sum((Counter(pred_ent) & Counter(true_ent)).values())
    # Scale before rounding: `round(x, 2) * 100` produced float noise such as 56.99999999999999
    return float(round(100 * true_positives / len(true_ent)))

Let's recover all the entities from the predicted BIO tags, compute precision and recall for each sentence, and derive F1 from their averages

In [None]:
import statistics

# Per-sentence scores
precision = []
recall = []
for tags, true_entities, text in predictions:
    # The tokens are needed by the bioTagsToEntities function
    tokens = nltk.word_tokenize(text)
    predicted_entities = bioTagsToEntities(tokens,tags)
    precision.append(compute_precision(predicted_entities,true_entities))
    recall.append(compute_recall(predicted_entities,true_entities))

# Average of the per-sentence scores
avg_precision = statistics.mean(precision)
avg_recall = statistics.mean(recall)
print('precision: ',avg_precision)
print('recall: ',avg_recall)

# F1 metric. When both averages are 0 the formula divides by zero, so report 0 in
# that case instead of raising ZeroDivisionError.
if avg_precision + avg_recall == 0:
    F1 = 0
else:
    F1 = round((2*avg_precision*avg_recall) / (avg_precision + avg_recall),2)
print('F1: ', F1)