# Drug Name Entity Classifier
## AHLT - MIRI 2018



In [231]:
#import xml.etree.ElementTree as ET
from lxml import etree
from os import listdir
import pandas as pd
import numpy as np
import re

Defining the directory where to parse:

### Directories for  CESC

In [232]:
'''
train_dir = '../LaboCase/Train/'
dirs_whereto_parse = [train_dir+'/test_DrugBank']
'''

"\ntrain_dir = '../LaboCase/Train/'\ndirs_whereto_parse = [train_dir+'/test_DrugBank']\n"

## Directories for Miki

In [233]:
'''
train_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/Train'
train_dirs_whereto_parse = [train_dir+'/DrugBank',train_dir+'/MedLine']
test_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/Test'
'''
train_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/'
train_dirs_whereto_parse = [train_dir+'/Small Train']

test_dir = '/Users/miqueltubaupires/Documents/Master/3r QUATRIMESTRE/AHLT/Lab/ddi/'
test_dirs_whereto_parse = [test_dir+'/Small Test']


##### Reading train data
Accessing to all the files of the directory and storing id's and text's in two arrays.
We have also added the tokens 'START' and 'STOP' at the beginning and end of the sentences.

In [234]:
entities=[]
texts=[]
train_texts_entities = []

for directory in train_dirs_whereto_parse:
    name_files=listdir(directory)   # querying all the files that are in that directory
    # Parse all these xml files
    roots = [etree.parse(directory+'/'+a).getroot() for a in name_files if a.endswith('.xml')]
    for root in roots:
        for sentence in root.findall('sentence'):
            for entity in sentence.findall('entity'):
                entities = entities+[entity.get('text')]
            train_texts_entities = train_texts_entities + [('START ' + sentence.get('text') + ' STOP',entities)]
            entities =[]

# train_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there
# print(texts_entities[0])


##### Reading test data

In [235]:
test_texts = []
test_entities = []
for directory in test_dirs_whereto_parse:
    name_files = listdir(directory)
    # Si no poso el sorted, em llegeix els files amb un ordre aleatori.
    # Amb el sorted m'asseguro que els corresponents files text.txt i entities.txt estan en la mateixa posicio
    text_file_names = sorted([directory+'/'+a for a in name_files if a.endswith('text.txt')])
    entities_file_names = sorted([directory+'/'+a for a in name_files if a.endswith('entities.txt')])
    for file in text_file_names:
        file = open(file,'r')
        test_texts = test_texts + [file.read()]
    for file in entities_file_names:
        read_entities = []
        with open(file,'r') as f:
            for line in f:
                read_entities = read_entities+[' '.join(line.split()[0:-1])] # separo en words, el.limino la ultima i torno a unir
        test_entities.append(read_entities)
        
# test_texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there
test_texts_entities=list(zip(test_texts,test_entities))
# print(test_texts_entities[8])

#### BIO TAGGER

Let's try to tag each sentence with the BIO format

In [236]:
# -*- coding: utf-8 -*-

import nltk


def bio_tagger(text,drugs):
    
        # Some Preprocessing. I split each word and those ones joined with -
        tokens = nltk.word_tokenize(text)
        tokens = sum([word.split('-') for word in tokens if word[0] != '-' and word[-1] != '-'],[])
        # print(tokens)
        drugs = sum([word.split() for word in drugs],[])
        #print(drugs)
        
        
        bio_tagged = []
        prev_tag = "O"
        for token in tokens:
            if prev_tag == "O" : # Begin NE or continue O
                
                if token in drugs:
                    bio_tagged.append((token,'B'))
                    prev_tag = 'B'
                else:
                    bio_tagged.append((token,'O'))
                    prev_tag = 'O'
                
            elif prev_tag == "B": # Inside NE
                
                if token in drugs:
                    bio_tagged.append((token,'I'))
                    prev_tag = 'I'
                else: 
                    bio_tagged.append((token,'O'))
                    prev_tag = 'O'
                    
            elif  prev_tag == "I": # Inside NE
                if token in drugs:
                    bio_tagged.append((token,'I'))
                    prev_tag = 'I'
                else: 
                    bio_tagged.append((token,'O'))
                    prev_tag = 'O'
        return bio_tagged
    

In [237]:
tokens = []
tags = []
for text,drugs in train_texts_entities:
    tuples = bio_tagger(text,drugs)
    tokens = tokens + [word[0] for word in tuples]
    tags = tags + [word[1] for word in tuples]

train_set = {'token':tokens,'output':tags}
train_df = pd.DataFrame(train_set)

In [238]:
train_df.head()

Unnamed: 0,output,token
0,O,START
1,O,No
2,O,formal
3,O,assessments
4,O,of


# Creating the features for the classifier

In [239]:
def feature_vector(tokenized_sentence):
    feature_vector = {}
    # Feature 1: Length of the token
    feature_vector['token_length'] = [len(token) for token in tokenized_sentence]

    # Feature 2: Is the the first letter of the word capitalized?
    is_capitalized = [1 if row[0].isupper() else 0 for row in tokenized_sentence]
    feature_vector['is_capitalized'] = is_capitalized

    # Feature 3: Is the token completely capitalized?
    is_total_capitalized = [1 if row.isupper() else 0 for row in tokenized_sentence]
    feature_vector['is_total_capitalized'] = is_total_capitalized
    
    # Feature 4 & 5: Prefixes and Suffixes

    prefix_feature = []
    suffix_feature = []

    prefixes = r'^meth|^eth|^prop|^but|^pent|^hex|^hept|^oct|^non|^dec'
    suffixes = r'ane$|ene$|yne$|ol$|al$|amine$|cid$|ium$|ether$|ate$|one$'

    for token in tokenized_sentence:

            if re.search(prefixes,token):
                prefix_feature=prefix_feature+[1]
            else:
                prefix_feature = prefix_feature+[0]

            if re.search(suffixes,token):
                suffix_feature=suffix_feature+[1]
            else:
                suffix_feature = suffix_feature+[0]

    feature_vector['prefix_feature']=prefix_feature
    feature_vector['suffix_feature']=suffix_feature
    
    return feature_vector

# feature vector
features = feature_vector(train_set['token'])

# joining two dictionaries
train_set = {**train_set,**features}
# creating the data frame
train_df = pd.DataFrame(train_set)
train_df.head()

Unnamed: 0,is_capitalized,is_total_capitalized,output,prefix_feature,suffix_feature,token,token_length
0,1,1,O,0,0,START,5
1,1,0,O,0,0,No,2
2,0,0,O,0,1,formal,6
3,0,0,O,0,0,assessments,11
4,0,0,O,0,0,of,2


# Building the classifier
## Support Vector Machines

The advantages of support vector machines are:

- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

- If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
- SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

In [240]:
from sklearn import datasets
from sklearn import svm

In [241]:
# Name of the target variable
target_name = 'output'
token_name = 'token'

# Create a SVM object with the corresponding parameters
clf = svm.SVC(gamma=0.001, cache_size = 200, class_weight = None, coef0 = 0.0, 
              decision_function_shape = None, degree = 3, kernel = 'rbf', 
              max_iter = -1, probability = False, random_state = None, shrinking = True, 
              tol = 0.001, C=100.0, verbose = True)

# Create the appropiate data structure to pass it to the SVM.
# X columns should be all but target_name and token_name
X = train_df.loc[:, [all(x) for x in list(zip(train_df.columns!=target_name,train_df.columns!=token_name))]]
# X = train_df.loc[:,train_df.columns!=target_name]

In [242]:
X.head()

Unnamed: 0,is_capitalized,is_total_capitalized,prefix_feature,suffix_feature,token_length
0,1,1,0,0,5
1,1,0,0,0,2
2,0,0,0,1,6
3,0,0,0,0,11
4,0,0,0,0,2


One hot encoding for Y

In [243]:
Y=pd.get_dummies(train_df[target_name])
Y = train_df[target_name]

In [244]:
train_df[token_name].head()

0          START
1             No
2         formal
3    assessments
4             of
Name: token, dtype: object

In [245]:
clf.fit(X,Y)

[LibSVM]

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=True)

Predicting with just one test text. Le'ts tokenize it, and create its feature vector:

In [246]:
token_test_text = nltk.word_tokenize(test_texts[6])
print(token_test_text)

features = pd.DataFrame(feature_vector(token_test_text))

['There', 'have', 'been', 'no', 'studies', 'of', 'the', 'interaction', 'of', 'methyl', 'aminolevulinate', 'cream', 'with', 'any', 'other', 'drugs', ',', 'including', 'local', 'anesthetics', '.']


In [247]:
features.head()

Unnamed: 0,is_capitalized,is_total_capitalized,prefix_feature,suffix_feature,token_length
0,1,0,0,0,5
1,0,0,0,0,4
2,0,0,0,0,4
3,0,0,0,0,2
4,0,0,0,0,7


In [248]:
clf.predict(features)

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype=object)

##### Making predictions

In [249]:
predictions = []
for text,entities in test_texts_entities:
    # print('text: ', text)
    # print('real entities: ',entities,'\n')
    
    # tokenize text
    tokens = nltk.word_tokenize(text)
    predicted_tags = clf.predict(pd.DataFrame(feature_vector(tokens)))
    predictions.append((list(predicted_tags),entities,text)) 
    # print('predicted bio tags: ',predicted_tags,'\n')
    
# predictions is a list of tupples comprised of predicted tags and the true drugs we should extract from there
# print('predictions of text 1: ',predictions[1])

text:  If you are taking these medicines together or you have further questions about drug interactions talk to your doctor or pharmacist. 

real entities:  [] 

predicted bio tags:  ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O'] 

text:  Consult your doctor or pharmacist if you are taking any of the following: seizure medications antibiotics warfarin medications to help you sleep

real entities:  ['warfarin', 'antibiotics'] 

predicted bio tags:  ['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O'] 

text:  Infergen should be used cautiously in patients who are receiving agents that are known to cause myelosuppression or with agents known to be metabolized via the cytochrome P-450 pathway.9 Patients taking drugs that are metabolized by this pathway should be monitored closely for changes in the therapeutic and/or toxic levels of concomitant drugs.

real entities:  ['Infergen'] 

predicted bio tags:  

Then, le'ts define a function that recover's the whole drug name from BIO taggs

In [250]:
def bio_tags_to_entities(tokens,bio_tags):
    entities = []
    prev_tag = 'O'
    word = ''
    for idx in range(0,len(bio_tags)-1):
        tag = bio_tags[idx]
        if tag=='B':
            if prev_tag in ['B','I']:
                # si trobo una nova B i la previa era B o I, envio la word previa
                entities = entities + [word]
            word = tokens[idx]
            prev_tag='B'
        elif tag =='I':
            # si trobo una I, actualitzo la word
            word == word + tokens[idx]
            prev_tag='I'
        elif tag == 'O' and prev_tag in ['B','I']:
            # si em trobo una O pero abans tenia una B o una I, envio la word previa
            entities = entities + [word]
            prev_tag='O'
        else:
            continue
    
    # print(tokens)
    # print(entities)
    
    return entities



# Un exemple aprofitant l'exemple de prediccio d'abans
# bio_tags_to_entities(token_test_text,clf.predict(pd.DataFrame(feature_vector(token_test_text))))

#####  Evaluation

Evaluation will be based on $$F1=\frac{2*precision*recall}{precision+recall}$$

Aquest exemple m'ha ajudat a entendre com calcular la precision i la recall:

In [251]:
true = ['hola','que','ca','bo']
pred = ['hola','que','pet']

print(round(len([word for word in pred if word in true])/len(pred),2))
print(round(len([word for word in pred if word in true])/len(true),2))

0.67
0.5


In [252]:
def compute_precision(pred_ent,true_ent):
    if len(pred_ent) == 0 or len(true_ent) == 0:
        return 0
    else:
        return round(len([word for word in pred_ent if word in true_ent])/len(pred_ent),2)        

In [253]:
def compute_recall(pred_ent,true_ent):
    if len(pred_ent) == 0 or len(true_ent) == 0:
        return 0
    else:
        return round(len([word for word in pred_ent if word in true_ent])/len(true_ent),2)

Let's recover all the words from the predicted bio_tags and try to compute F1 for each sentence

In [254]:
import statistics
precision = []
recall = []
for tags, true_entities, text in predictions:
    # I need the tokens for the bio_tags_to_entities function
    tokens = nltk.word_tokenize(text)
    predicted_entities = bio_tags_to_entities(tokens,tags)
    precision = precision + [compute_precision(predicted_entities,true_entities)]
    recall = recall + [compute_recall(predicted_entities,true_entities)]

    
avg_precision = statistics.mean(precision)
avg_recall = statistics.mean(recall)
print(avg_precision)
print(avg_recall)

0.125
0.125
