# Drug Name Entity Classifier
## AHLT - MIRI 2018



## Initialization

Load needed modules and specify the working directory

In [97]:
# Load needed packages
from lxml import etree
from os import listdir
import pandas as pd
import numpy as np
import re
from drug_functions import *
from sklearn import *

In [98]:
# Set the data directories
# Maik, he afegit les dades dins el mateix repositori de forma que tinguem els mateixos paths i 
# no haguem de canviarlos cada cop que entrem a aquest file.

train_dirs_whereto_parse = ['data/small_train_DrugBank']
test_dirs_whereto_parse = ['data/small_test_DrugBank']

## Reading the train and test data from the XML files
Accessing to all the files of the directory and storing id's and text's in two arrays.
We have also added the token 'STOP' at the end of each sentence.

In [99]:
## TRAINING DATA

# Initialise the different lists with the data
entities=[]
texts=[]
train_texts_entities = []

# Iterate over all the different .xml files located in the specified directories
for directory in train_dirs_whereto_parse:
    
    # Get the names of all the files in the directory and create a 'xml.root' object for
    # each xml file
    roots = [etree.parse(directory+'/'+a).getroot() for a in listdir(directory) if a.endswith('.xml')]
    
    # Iterate over all the different 'xml.root' objects to extract the needed information
    for root in roots:
        for sentence in root.findall('sentence'):
            for entity in sentence.findall('entity'):
                entities = entities+[entity.get('text')]
            train_texts_entities = train_texts_entities + [('START ' + sentence.get('text') + ' STOP', entities)]
            entities =[]

# train_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there
# Example: 
# [('I love Ibuprofeno and Frenadol', ['Ibuprofeno', 'Frenadol']), ('Give me a Fluimucil', ['Fluimucil'])]

train_texts_entities[0:2]

[('START Formal drug interaction studies have not been conducted with ORENCIA. STOP',
  ['ORENCIA']),
 ('START Population pharmacokinetic analyses revealed that MTX, NSAIDs, corticosteroids, and TNF blocking agents did not influence abatacept clearance. STOP',
  ['MTX', 'NSAIDs', 'corticosteroids', 'TNF blocking agents', 'abatacept'])]

In [100]:
## TESTING DATA

# Same process as with the training data
# In the testing data, for each sentance we have two related files:
# - A file with a sentence to be parsed, in which we may encounter drug names (ending with 'text.txt')
# - A file with the drug entities recognised in the sentence (ending with 'entities.txt')

test_texts = []
test_entities = []

for directory in test_dirs_whereto_parse:
    
    # Si no poso el sorted, em llegeix els files amb un ordre aleatori.
    # Amb el sorted m'asseguro que els corresponents files text.txt i entities.txt estan en la mateixa posicio
    
    # Read the pairs of files in alphabetical order
    text_file_names = sorted([directory + '/' + file for file in listdir(directory) if file.endswith('text.txt')])
    entities_file_names = sorted([directory + '/' + file for file in listdir(directory) if file.endswith('entities.txt')])
    
    for file in text_file_names:
        file = open(file,'r')
        test_texts = test_texts + [file.read()]
        
    for file in entities_file_names:
        read_entities = []
        with open(file,'r') as f:
            for line in f:
                read_entities = read_entities+[' '.join(line.split()[0:-1])] # separo en words, el.limino la ultima i torno a unir
                
        test_entities.append(read_entities)


test_texts_entities=list(zip(test_texts,test_entities))


# test_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there.
# print(test_texts_entities[4])

## BIO Tagger

In this section we will tag each sentence with the BIO format. For this, we have created a function called 'bio_tagger' which will perform the following actions:

Given a sentence 'text' and a set of drugs 'drugs', this function returns a list of str that
contains a tag for each of the tokens in text. The tags can be either 'B', 'I' or 'O'. 'B' means
the token is the first part of a drug entity, 'I' means the token is the continuation of a drug entity,
and 'O' means that the token does not belong to a drug entity.

In [101]:
# Initialise the needed lists
tokens = []
tags = []

# Iterate over all the train entities (tuples of (sentence, drugs)) and apply the bio_tagger function
for text,drugs in train_texts_entities:
    tuples = bio_tagger(text, drugs)
    tokens = tokens + [word[0] for word in tuples]
    tags = tags + [word[1] for word in tuples]

    
# Create a training set with the tokens and the BIO tags
train_set = {'token':tokens,'output':tags}
train_df = pd.DataFrame(train_set)
# train_df.head()

# Creating the features for the classifier

In [102]:
# Define some functions that will be used in order to create the features
def hasNumbers(string):
    return any(char.isdigit() for char in string)

def hasLetters(string):
    return any(char.isalpha() for char in string)

def hasUpperCase(string):
    return any(char.isupper() for char in string)

In [103]:
# Define a function for the automatized creation of features given a tokenized sentence

def feature_vector(tokenized_sentence):
    '''
    Description:
    
    Examples/Tests:
    
    '''
    
    # Feature: Initialise the feature_vector dictionary, in which we will create the features of each token
    feature_vector = {}
    
    # Feature: Length of the token
    feature_vector['token_length'] = [len(token) for token in tokenized_sentence]
    
    # Feature: Prefixes and Suffixes

    prefix_feature = []
    suffix_feature = []

    prefixes = r'^meth|^eth|^prop|^but|^pent|^hex|^hept|^oct|^non|^dec'
    suffixes = r'ane$|ene$|yne$|ol$|al$|amine$|cid$|ium$|ether$|ate$|one$'

    for token in tokenized_sentence:

            if re.search(prefixes,token):
                prefix_feature=prefix_feature+[1]
            else:
                prefix_feature = prefix_feature+[0]

            if re.search(suffixes,token):
                suffix_feature=suffix_feature+[1]
            else:
                suffix_feature = suffix_feature+[0]

    feature_vector['prefix_feature']=prefix_feature
    feature_vector['suffix_feature']=suffix_feature

    # Feature: Check if the token is already in the DrugBank database
    
    
    # Feature: Binary token type features
        # contains_hyphen, all_lowercase_letters, 
        # contains_slash, all_letters, contains_period, all_digits, contains_uppercase,
        # contains_digit, contains_letters
    
    all_uppercase_letters = [1 if token.isupper() else 0 for token in tokenized_sentence]
    all_lowercase_letters = [1 if token.islower() else 0 for token in tokenized_sentence]
    initial_capital_letter = [1 if token[0].isupper() else 0 for token in tokenized_sentence]
    contains_slash = [1 if '/' in token else 0 for token in tokenized_sentence]
    all_letters = [1 if token.isalpha() else 0 for token in tokenized_sentence]
    all_digits = [1 if token.isdigit() else 0 for token in tokenized_sentence]
    contains_digit = [1 if hasNumbers(token) else 0 for token in tokenized_sentence]
    contains_letters = [1 if hasLetters(token) else 0 for token in tokenized_sentence]
    contains_uppercase = [1 if hasUpperCase(token) else 0 for token in tokenized_sentence]
    contains_dash = [1 if '_' in token else 0 for token in tokenized_sentence]
    
    feature_vector['all_uppercase_letters']=all_uppercase_letters
    feature_vector['all_lowercase_letters']=all_lowercase_letters
    feature_vector['initial_capital_letter']=initial_capital_letter
    feature_vector['contains_slash']=contains_slash
    feature_vector['all_letters']=all_uppercase_letters
    feature_vector['all_digits']=all_digits
    feature_vector['contains_digit']=contains_digit
    feature_vector['contains_letters']=contains_letters
    feature_vector['contains_uppercase']=contains_uppercase  
    feature_vector['contains_dash']=contains_dash  
    
    
    # Feature: Position of the token in the sentence (distance from the 'START' token)
    idx_position = []
    current_position = -1
    for token in tokenized_sentence:
        if token == 'STOP':
            current_position = -1
            idx_position.append(current_position)
        else:
            current_position += 1
            idx_position.append(current_position)
    feature_vector['idx_position'] = idx_position
    
    
    # Feature: Binary token type features of the +-2 previous/following tokens
    all_uppercase_letters_prev2 = 
    all_uppercase_letters_prev2 = 
    all_lowercase_letters
    all_lowercase_letters
    
    contains_digit_prev2 = checkPreviousTokenCondition(tokens = tokenized_sentence, pos = -2, condition = hasNumbers)
    contains_digit_prev1 = checkPreviousTokenCondition(tokens = tokenized_sentence, pos = -1, condition = hasNumbers)
    
    feature_vector['contains_digit_prev2'] = contains_digit_prev2
    
    return feature_vector

    



In [104]:
t = checkPreviousTokenCondition(tokens = train_set['token'], pos = -2, fun = hasNumbers)
print(len(t))

print(len(train_set['token']))


22117
22117


In [106]:
# feature vector
features = feature_vector(train_set['token'])

# joining two dictionaries
train_set = {**train_set,**features}

# creating the data frame
train_df = pd.DataFrame(train_set)

22117
22117
22117
22117
22117
22117
22117
22117
22117
22117
22117
22117
22117
22117
22117
17


NameError: name 'train' is not defined

In [108]:
train_df

Unnamed: 0,all_digits,all_letters,all_lowercase_letters,all_uppercase_letters,contains_dash,contains_digit,contains_digit_prev2,contains_letters,contains_slash,contains_uppercase,idx_position,initial_capital_letter,output,prefix_feature,suffix_feature,token,token_length
0,0,1,0,1,0,0,0,1,0,1,0,1,O,0,0,START,5
1,0,0,0,0,0,0,0,1,0,1,1,1,O,0,1,Formal,6
2,0,0,1,0,0,0,0,1,0,0,2,0,O,0,0,drug,4
3,0,0,1,0,0,0,0,1,0,0,3,0,O,0,0,interaction,11
4,0,0,1,0,0,0,0,1,0,0,4,0,O,0,0,studies,7
5,0,0,1,0,0,0,0,1,0,0,5,0,O,0,0,have,4
6,0,0,1,0,0,0,0,1,0,0,6,0,O,0,0,not,3
7,0,0,1,0,0,0,0,1,0,0,7,0,O,0,0,been,4
8,0,0,1,0,0,0,0,1,0,0,8,0,O,0,0,conducted,9
9,0,0,1,0,0,0,0,1,0,0,9,0,O,0,0,with,4


# Building the classifier
## Support Vector Machines

The advantages of support vector machines are:

- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

- If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
- SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

In [37]:
# Name of the target variable
target_name = 'output'
token_name = 'token'

# Create the appropiate data structure to pass it to the SVM.
# X columns should be all but target_name and token_name
X = train_df.loc[:, [all(x) for x in list(zip(train_df.columns!=target_name,train_df.columns!=token_name))]]

In [38]:
X.head()

Unnamed: 0,all_digits,all_letters,all_lowercase_letters,all_uppercase_letters,contains_dash,contains_digit,contains_letters,contains_slash,contains_uppercase,initial_capital_letter,prefix_feature,suffix_feature,token_length
0,0,1,0,1,0,0,1,0,1,1,0,0,5
1,0,0,0,0,0,0,1,0,1,1,0,1,6
2,0,0,1,0,0,0,1,0,0,0,0,0,4
3,0,0,1,0,0,0,1,0,0,0,0,0,11
4,0,0,1,0,0,0,1,0,0,0,0,0,7


One hot encoding for Y

In [39]:
Y = train_df[target_name]
Y.head()

0    O
1    O
2    O
3    O
4    O
Name: output, dtype: object

### Tunning SVM in python

In [40]:
# Create a SVM object with the corresponding tunned parameters
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X,Y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

Predicting with just one test text. Le'ts tokenize it, and create its feature vector:

In [41]:
token_test_text = nltk.word_tokenize(test_texts[6])
print(token_test_text)

features = pd.DataFrame(feature_vector(token_test_text))

['Periodic', 'measurement', 'of', 'serum', 'PSA', 'levels', 'may', 'also', 'be', 'considered', '.']


In [42]:
features.head()

Unnamed: 0,all_digits,all_letters,all_lowercase_letters,all_uppercase_letters,contains_dash,contains_digit,contains_letters,contains_slash,contains_uppercase,initial_capital_letter,prefix_feature,suffix_feature,token_length
0,0,0,0,0,0,0,1,0,1,1,0,0,8
1,0,0,1,0,0,0,1,0,0,0,0,0,11
2,0,0,1,0,0,0,1,0,0,0,0,0,2
3,0,0,1,0,0,0,1,0,0,0,0,0,5
4,0,1,0,1,0,0,1,0,1,1,0,0,3


In [43]:
clf.predict(features)

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

##### Making predictions

In [50]:
predictions = []
for text,entities in test_texts_entities:
    print('text: ', text)
    print('real entities: ',entities,'\n')
    
    # tokenize text
    tokens = nltk.word_tokenize(text)
    predicted_tags = clf.predict(pd.DataFrame(feature_vector(tokens)))
    predictions.append((list(predicted_tags),entities,text)) 
    print('predicted bio tags: ',predicted_tags,'\n')
    
# predictions is a list of tupples comprised of predicted tags and the true drugs we should extract from there
# print('predictions of text 1: ',predictions[1])

text:  No drug, nutritional supplement, food or herb interactions have yet been reported.

real entities:  [] 

predicted bio tags:  ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'] 

text:  No formal drug/drug interaction studies with Plenaxis were performed.

real entities:  ['Plenaxis'] 

predicted bio tags:  ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'] 

text:  Cytochrome P-450 is not known to be involved in the metabolism of Plenaxis.

real entities:  ['Plenaxis'] 

predicted bio tags:  ['B' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'] 

text:  Plenaxis is highly bound to plasma proteins (96 to 99%).

real entities:  ['Plenaxis'] 

predicted bio tags:  ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'] 

text:  Laboratory Tests Response to Plenaxis should be monitored by measuring serum total testosterone concentrations just prior to administration on Day 29 and every 8 weeks thereafter.

real entities:  ['testosterone', 'Plenaxis'] 

predicted bio tags:  

Then, le'ts define a function that recover's the whole drug name from BIO taggs

In [45]:
def bio_tags_to_entities(tokens,bio_tags):
    entities = []
    prev_tag = 'O'
    word = ''
    for idx in range(0,len(bio_tags)-1):
        tag = bio_tags[idx]
        if tag=='B':
            if prev_tag in ['B','I']:
                # si trobo una nova B i la previa era B o I, envio la word previa
                entities = entities + [word]
            word = tokens[idx]
            prev_tag='B'
        elif tag =='I':
            # si trobo una I, actualitzo la word
            word == word + tokens[idx]
            prev_tag='I'
        elif tag == 'O' and prev_tag in ['B','I']:
            # si em trobo una O pero abans tenia una B o una I, envio la word previa
            entities = entities + [word]
            prev_tag='O'
        else:
            continue
    
    # print(tokens)
    # print(bio_tags)
    # print(entities)
    
    return entities



# Un exemple aprofitant l'exemple de prediccio d'abans
bio_tags_to_entities(token_test_text,clf.predict(pd.DataFrame(feature_vector(token_test_text))))

[]

#####  Evaluation

Evaluation will be based on $$F1=\frac{2*precision*recall}{precision+recall}$$

Aquest exemple m'ha ajudat a entendre com calcular la precision i la recall:

In [46]:
true = ['hola','que','ca','bo']
pred = ['hola','que','pet']

print(round(len([word for word in pred if word in true])/len(pred),2))
print(round(len([word for word in pred if word in true])/len(true),2))

0.67
0.5


In [47]:
def compute_precision(pred_ent,true_ent):
    if len(pred_ent) == 0 or len(true_ent) == 0:
        return 0
    else:
        return round(len([word for word in pred_ent if word in true_ent])/len(pred_ent),2)*100     

In [48]:
def compute_recall(pred_ent,true_ent):
    if len(pred_ent) == 0 or len(true_ent) == 0:
        return 0
    else:
        return round(len([word for word in pred_ent if word in true_ent])/len(true_ent),2)*100

Let's recover all the words from the predicted bio_tags and try to compute F1 for each sentence

In [49]:
import statistics
precision = []
recall = []
for tags, true_entities, text in predictions:
    # I need the tokens for the bio_tags_to_entities function
    tokens = nltk.word_tokenize(text)
    predicted_entities = bio_tags_to_entities(tokens,tags)
    precision = precision + [compute_precision(predicted_entities,true_entities)]
    recall = recall + [compute_recall(predicted_entities,true_entities)]

    
avg_precision = statistics.mean(precision)
avg_recall = statistics.mean(recall)
print('precision: ',avg_precision)
print('recall: ',avg_recall)

precision:  27.77777777777778
recall:  21.72222222222222
