# Drug Name Entity Classifier
## AHLT - MIRI 2018



In [1]:
#import xml.etree.ElementTree as ET
from lxml import etree
from os import listdir
import pandas as pd
import numpy as np
import re
from bio_tagger import *

## Directories

In [3]:
# Cesc
'''
train_dir = '../LaboCase/Train/'
dirs_whereto_parse = [train_dir+'/test_DrugBank']
'''

train_dir = '../LaboCase/'
train_dirs_whereto_parse = [train_dir+'/small_train_DrugBank']

test_dir = '../LaboCase/'
test_dirs_whereto_parse = [test_dir+'/small_test_DrugBank']

# Miki
'''
train_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/Train'
train_dirs_whereto_parse = [train_dir+'/DrugBank',train_dir+'/MedLine']
test_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/Test'
test_dirs_whereto_parse = [test_dir+'/DrugBankOutput',test_dir+'/MedLineOutput']

train_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/'
train_dirs_whereto_parse = [train_dir+'/Small Train']

test_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/'
test_dirs_whereto_parse = [test_dir+'/Small Test']
'''


"\ntrain_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/Train'\ntrain_dirs_whereto_parse = [train_dir+'/DrugBank',train_dir+'/MedLine']\ntest_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/Test'\ntest_dirs_whereto_parse = [test_dir+'/DrugBankOutput',test_dir+'/MedLineOutput']\n\ntrain_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/'\ntrain_dirs_whereto_parse = [train_dir+'/Small Train']\n\ntest_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/'\ntest_dirs_whereto_parse = [test_dir+'/Small Test']\n"

## Reading train data
Accessing to all the files of the directory and storing id's and text's in two arrays.
We have also added the tokens 'START' and 'STOP' at the beginning and end of the sentences.

In [4]:
entities=[]
texts=[]
train_texts_entities = []

for directory in train_dirs_whereto_parse:
    name_files=listdir(directory)   # querying all the files that are in that directory
    # Parse all these xml files
    roots = [etree.parse(directory+'/'+a).getroot() for a in name_files if a.endswith('.xml')]
    for root in roots:
        for sentence in root.findall('sentence'):
            for entity in sentence.findall('entity'):
                entities = entities+[entity.get('text')]
            train_texts_entities = train_texts_entities + [('START ' + sentence.get('text') + ' STOP',entities)]
            entities =[]

# train_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there
# print(texts_entities[0])


## Reading test data

In [5]:
test_texts = []
test_entities = []
for directory in test_dirs_whereto_parse:
    name_files = listdir(directory)
    # Si no poso el sorted, em llegeix els files amb un ordre aleatori.
    # Amb el sorted m'asseguro que els corresponents files text.txt i entities.txt estan en la mateixa posicio
    text_file_names = sorted([directory+'/'+a for a in name_files if a.endswith('text.txt')])
    entities_file_names = sorted([directory+'/'+a for a in name_files if a.endswith('entities.txt')])
    for file in text_file_names:
        file = open(file,'r')
        test_texts = test_texts + [file.read()]
    for file in entities_file_names:
        read_entities = []
        with open(file,'r') as f:
            for line in f:
                read_entities = read_entities+[' '.join(line.split()[0:-1])] # separo en words, el.limino la ultima i torno a unir
        test_entities.append(read_entities)
        
# test_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there
test_texts_entities=list(zip(test_texts,test_entities))
print(test_texts_entities[4])

('Laboratory Tests Response to Plenaxis should be monitored by measuring serum total testosterone concentrations just prior to administration on Day 29 and every 8 weeks thereafter.\n', ['testosterone', 'Plenaxis'])


#### BIO TAGGER

Let's try to tag each sentence with the BIO format

In [6]:
tokens = []
tags = []
for text,drugs in train_texts_entities:
    tuples = bio_tagger(text,drugs)
    tokens = tokens + [word[0] for word in tuples]
    tags = tags + [word[1] for word in tuples]

train_set = {'token':tokens,'output':tags}
train_df = pd.DataFrame(train_set)

In [7]:
train_df.head()

Unnamed: 0,output,token
0,O,START
1,O,Formal
2,O,drug
3,O,interaction
4,O,studies


# Creating the features for the classifier

In [8]:
def feature_vector(tokenized_sentence):
    feature_vector = {}
    # Feature 1: Length of the token
    feature_vector['token_length'] = [len(token) for token in tokenized_sentence]

    # Feature 2: Is the the first letter of the word capitalized?
    is_capitalized = [1 if row[0].isupper() else 0 for row in tokenized_sentence]
    feature_vector['is_capitalized'] = is_capitalized

    # Feature 3: Is the token completely capitalized?
    is_total_capitalized = [1 if row.isupper() else 0 for row in tokenized_sentence]
    feature_vector['is_total_capitalized'] = is_total_capitalized
    
    # Feature 4 & 5: Prefixes and Suffixes

    prefix_feature = []
    suffix_feature = []

    prefixes = r'^meth|^eth|^prop|^but|^pent|^hex|^hept|^oct|^non|^dec'
    suffixes = r'ane$|ene$|yne$|ol$|al$|amine$|cid$|ium$|ether$|ate$|one$'

    for token in tokenized_sentence:

            if re.search(prefixes,token):
                prefix_feature=prefix_feature+[1]
            else:
                prefix_feature = prefix_feature+[0]

            if re.search(suffixes,token):
                suffix_feature=suffix_feature+[1]
            else:
                suffix_feature = suffix_feature+[0]

    feature_vector['prefix_feature']=prefix_feature
    feature_vector['suffix_feature']=suffix_feature

    # Feature 6: Check if the token is already in the DrugBank database
    
    
    # Feature 
    
    return feature_vector

# feature vector
features = feature_vector(train_set['token'])

# joining two dictionaries
train_set = {**train_set,**features}
# creating the data frame
train_df = pd.DataFrame(train_set)
train_df.head()

Unnamed: 0,is_capitalized,is_total_capitalized,output,prefix_feature,suffix_feature,token,token_length
0,1,1,O,0,0,START,5
1,1,0,O,0,1,Formal,6
2,0,0,O,0,0,drug,4
3,0,0,O,0,0,interaction,11
4,0,0,O,0,0,studies,7


# Building the classifier
## Support Vector Machines

The advantages of support vector machines are:

- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

- If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
- SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

In [9]:
from sklearn import *



In [10]:
# Name of the target variable
target_name = 'output'
token_name = 'token'

# Create the appropiate data structure to pass it to the SVM.
# X columns should be all but target_name and token_name
X = train_df.loc[:, [all(x) for x in list(zip(train_df.columns!=target_name,train_df.columns!=token_name))]]
# X = train_df.loc[:,train_df.columns!=target_name]

In [11]:
X.head()

Unnamed: 0,is_capitalized,is_total_capitalized,prefix_feature,suffix_feature,token_length
0,1,1,0,0,5
1,1,0,0,1,6
2,0,0,0,0,4
3,0,0,0,0,11
4,0,0,0,0,7


One hot encoding for Y

In [12]:
Y = train_df[target_name]
Y.head()

0    O
1    O
2    O
3    O
4    O
Name: output, dtype: object

### Tunning SVM in python

In [13]:
from sklearn.model_selection import GridSearchCV

# Create a SVM object with the corresponding tunned parameters
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X,Y)



GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

Predicting with just one test text. Le'ts tokenize it, and create its feature vector:

In [14]:
token_test_text = nltk.word_tokenize(test_texts[6])
print(token_test_text)

features = pd.DataFrame(feature_vector(token_test_text))

['Periodic', 'measurement', 'of', 'serum', 'PSA', 'levels', 'may', 'also', 'be', 'considered', '.']


In [15]:
features.head()

Unnamed: 0,is_capitalized,is_total_capitalized,prefix_feature,suffix_feature,token_length
0,1,0,0,0,8
1,0,0,0,0,11
2,0,0,0,0,2
3,0,0,0,0,5
4,1,1,0,0,3


In [16]:
clf.predict(features)

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

##### Making predictions

In [17]:
predictions = []
for text,entities in test_texts_entities:
    # print('text: ', text)
    # print('real entities: ',entities,'\n')
    
    # tokenize text
    tokens = nltk.word_tokenize(text)
    predicted_tags = clf.predict(pd.DataFrame(feature_vector(tokens)))
    predictions.append((list(predicted_tags),entities,text)) 
    # print('predicted bio tags: ',predicted_tags,'\n')
    
# predictions is a list of tupples comprised of predicted tags and the true drugs we should extract from there
# print('predictions of text 1: ',predictions[1])

Then, le'ts define a function that recover's the whole drug name from BIO taggs

In [18]:
def bio_tags_to_entities(tokens,bio_tags):
    entities = []
    prev_tag = 'O'
    word = ''
    for idx in range(0,len(bio_tags)-1):
        tag = bio_tags[idx]
        if tag=='B':
            if prev_tag in ['B','I']:
                # si trobo una nova B i la previa era B o I, envio la word previa
                entities = entities + [word]
            word = tokens[idx]
            prev_tag='B'
        elif tag =='I':
            # si trobo una I, actualitzo la word
            word == word + tokens[idx]
            prev_tag='I'
        elif tag == 'O' and prev_tag in ['B','I']:
            # si em trobo una O pero abans tenia una B o una I, envio la word previa
            entities = entities + [word]
            prev_tag='O'
        else:
            continue
    
    # print(tokens)
    # print(bio_tags)
    # print(entities)
    
    return entities



# Un exemple aprofitant l'exemple de prediccio d'abans
bio_tags_to_entities(token_test_text,clf.predict(pd.DataFrame(feature_vector(token_test_text))))

[]

#####  Evaluation

Evaluation will be based on $$F1=\frac{2*precision*recall}{precision+recall}$$

Aquest exemple m'ha ajudat a entendre com calcular la precision i la recall:

In [19]:
true = ['hola','que','ca','bo']
pred = ['hola','que','pet']

print(round(len([word for word in pred if word in true])/len(pred),2))
print(round(len([word for word in pred if word in true])/len(true),2))

0.67
0.5


In [20]:
def compute_precision(pred_ent,true_ent):
    if len(pred_ent) == 0 or len(true_ent) == 0:
        return 0
    else:
        return round(len([word for word in pred_ent if word in true_ent])/len(pred_ent),2)*100     

In [21]:
def compute_recall(pred_ent,true_ent):
    if len(pred_ent) == 0 or len(true_ent) == 0:
        return 0
    else:
        return round(len([word for word in pred_ent if word in true_ent])/len(true_ent),2)*100

Let's recover all the words from the predicted bio_tags and try to compute F1 for each sentence

In [22]:
import statistics
precision = []
recall = []
for tags, true_entities, text in predictions:
    # I need the tokens for the bio_tags_to_entities function
    tokens = nltk.word_tokenize(text)
    predicted_entities = bio_tags_to_entities(tokens,tags)
    precision = precision + [compute_precision(predicted_entities,true_entities)]
    recall = recall + [compute_recall(predicted_entities,true_entities)]

    
avg_precision = statistics.mean(precision)
avg_recall = statistics.mean(recall)
print('precision: ',avg_precision)
print('recall: ',avg_recall)

precision:  25.0
recall:  18.944444444444443
