# Drug Name Entity Classifier
## AHLT - MIRI 2018



## Initialization

Load needed modules and specify the working directory

In [1]:
# Load needed packages
import statistics
import time # Execution time of some blocks

import nltk
import numpy as np
import pandas as pd
import scipy.stats # for RandomizedSearchCV
from nltk.tag import StanfordPOSTagger
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV # Parameter selection


# Import our defined functions
from drug_functions import *
from makingPredictions import *
from datasetBuilder import *

In [2]:
# Set the data directories.
# Each list is handed as-is to createTrainSet / createTestSet in the next cell,
# so more directories can be added here without touching the loading code.
train_dirs_whereto_parse = ['data/medium_train_DrugBank']
test_dirs_whereto_parse = ['data/medium_test_DrugBank']

## Reading the train and test data from the XML files
We access all the files in the directory and store the ids and texts in two arrays.
We have also added the token 'STOP' at the end of each sentence.

In [3]:
# Parse the configured train and test directories.
# The training result is later iterated as (sentence, drugs) tuples when building features.
# NOTE(review): the test result is presumably in the same shape -- confirm in datasetBuilder.
train_texts_entities = createTrainSet(train_dirs_whereto_parse)
test_texts_entities = createTestSet(test_dirs_whereto_parse)

# Creating the features for the classifier

## BIO Tagger and Feature Creation

In this section we will tag each sentence with the BIO format. For this, we have created a function called 'BIOTagger' which will perform the following actions:

Given a sentence 'text' and a set of drugs 'drugs', this function returns a list of str that
contains a tag for each of the tokens in text. The tags can be either 'B', 'I' or 'O'. 'B' means
the token is the first part of a drug entity, 'I' means the token is the continuation of a drug entity,
and 'O' means that the token does not belong to a drug entity.

Apart from that, we have also downloaded the DrugBank database (ref: https://www.drugbank.ca/), from which we extract all the named entities. We build a list from this set of entities and, for each token processed, check whether the token is already in the database, in which case it has a very high probability of being a named entity (NE).

In [4]:
# Load the DrugBank list of entities (it has already been processed for the extraction of the NE).
# Each line of the file contains a different named entity.
with open('data/DrugBank_names_DB.txt', 'r') as f:
    drugbank_db = f.read().splitlines()

# Kept so the name stays defined as in the original cell; nothing in this cell fills it.
removed_columns = []

# Create the Stanford POS tagger. createFeatureVector takes it as a parameter.
jar = 'Stanford_POStagger/stanford-postagger.jar'
model = 'Stanford_POStagger/models/english-bidirectional-distsim.tagger'
st = StanfordPOSTagger(model, jar, encoding='utf-8')

# Iterate over all the train entities (tuples of (sentence, drugs)), building one
# feature frame per sentence and collecting the (token, tag) pairs from the tagger.
# Per-sentence results are accumulated in lists and combined once after the loop:
# concatenating frames / lists inside the loop re-copies everything on every
# iteration and makes the cell quadratic in the number of sentences.
sentence_features = []
tokens = []
tags = []
for text, drugs in train_texts_entities:
    tokenized_sentence = nltk.word_tokenize(text)
    sentence_frame = createFeatureVector(tokenized_sentence, drugbank_db, st)
    tuples = BOTagger(text, drugs)
    # Features and tags come from two separate passes over the same text. If they
    # disagree on the number of tokens, every later row would be paired with the
    # wrong tag, so fail here with the offending sentence instead.
    if len(tuples) != len(sentence_frame):
        raise ValueError(
            'Feature rows (%d) and tagged tokens (%d) differ for sentence: %r'
            % (len(sentence_frame), len(tuples), text)
        )
    sentence_features.append(sentence_frame)
    tokens.extend(word[0] for word in tuples)
    tags.extend(word[1] for word in tuples)

# pd.concat raises on an empty list, so fall back to an empty frame when there is no data.
features = pd.concat(sentence_features) if sentence_features else pd.DataFrame()

# One-hot encode the categorical 'Aa1-' feature.
# training_dummies is kept as its own variable because makingPredictions receives it later.
training_dummies = pd.get_dummies(features['Aa1-'])
features = features.drop('Aa1-', axis=1)
# Join both data frames, one indicator column at a time.
for name in training_dummies.columns:
    features[name] = training_dummies[name]

# Disabled experiment: token-frequency feature (needs the full token list first).
# training_frequencies = frequencyTokens(tokens)
# features['frequencies'] = training_frequencies

# Create the training set with the features, the tokens and the BIO tags.
# assign() returns a new frame, so `features` keeps only model inputs and is not
# silently modified through an alias.
train_df = features.assign(token=tokens, output=tags)


The StanfordTokenizer will be deprecated in version 3.2.5.
Please use nltk.tag.corenlp.CoreNLPPOSTagger or nltk.tag.corenlp.CoreNLPNERTagger instead.
  super(StanfordPOSTagger, self).__init__(*args, **kwargs)


# Building the classifier
## Support Vector Machines

The advantages of support vector machines are:

- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

- If the number of features is much greater than the number of samples, avoiding over-fitting when choosing the kernel function and the regularization term is crucial.
- SVMs do not directly provide probability estimates; these are calculated using an expensive five-fold cross-validation (see the "Scores and probabilities" section of the scikit-learn SVM documentation).

In [5]:
# Name of the target variable (the BIO tag column of train_df)
target_name = 'output'
# Name of the column holding the raw token text; it is not a model input
token_name = 'token'

# Create the appropriate data structure to pass to the SVM.
# X is every column except the target and the token; Y is the target column.
X_train = train_df.drop([target_name, token_name], axis=1)
Y_train = train_df[target_name]

### Tuning the SVM in Python

In [6]:
# Base SVM estimator. Its hyper-parameters are left at their defaults here;
# the randomized search below supplies the values that are actually tried.
svc = svm.SVC()

N_SEARCH_ITER = 30   # number of parameter settings sampled by the search
SEARCH_SEED = 42     # fixed seed so the sampled settings are reproducible
C_MEAN, C_STD = 15, 5
C_MIN = 1e-3         # SVC requires C > 0

# A plain normal(15, 5) occasionally yields C <= 0, which SVC rejects.
# Truncate the same distribution at C_MIN; truncnorm takes its bounds in
# standard-deviation units relative to loc.
c_distribution = scipy.stats.truncnorm(
    (C_MIN - C_MEAN) / C_STD, np.inf, loc=C_MEAN, scale=C_STD
)

# Only the search object is built here; the search itself runs when clf.fit() is
# called, so the time printed below is just the (negligible) set-up cost.
start = time.time()
clf = RandomizedSearchCV(
    svc,
    {'C': c_distribution, 'gamma': scipy.stats.expon(scale=.1),
     'kernel': ['rbf'], 'class_weight': ['balanced', None]},
    n_iter=N_SEARCH_ITER,      # keyword: positional use is rejected by current scikit-learn
    random_state=SEARCH_SEED,
)
end = time.time()
print('Execution time for setting up RandomizedSearchCV: ', str(end - start))

Execution time for GridSearchCV:  0.0018301010131835938


In [None]:
# Fit the search object on the training data. clf is the RandomizedSearchCV built
# earlier, so this call is where the parameter search and the training take place.
start = time.time()
clf.fit(X_train, Y_train)
end = time.time()
elapsed = end - start
print('Training time of the SVM: ', str(elapsed))

Computing training error:

In [None]:
# Evaluate the fitted model on the training sentences.
# The result is unpacked in the order (F1, precision, recall).
train_scores = makingPredictions(train_texts_entities, clf, drugbank_db, st, training_dummies)
F1_train, train_precision, train_recall = train_scores
print(
    'Results of the training evaluation: ',
    '\nPrecision: ', train_precision,
    '\nRecall: ', train_recall,
    '\nF1: ', F1_train,
)

### Making predictions and evaluations

In [None]:
# Evaluate the fitted model on the held-out test sentences.
# The result is unpacked in the order (F1, precision, recall).
test_scores = makingPredictions(test_texts_entities, clf, drugbank_db, st, training_dummies)
F1_test, test_precision, test_recall = test_scores
print(
    'Results of the evaluation of the test part: ',
    '\nPrecision: ', test_precision,
    '\nRecall: ', test_recall,
    '\nF1: ', F1_test,
)

### Evaluation

Evaluation will be based on $$F1=\frac{2*precision*recall}{precision+recall}$$

In [None]:
# Running log of evaluation results, kept as a bare string literal: nothing here
# is executed. One row per experiment, with the columns named in the header line.
'''
## Log of results
date, precision, recall, F1, features, test
14-May, 46.2, 52.1, 48.99, Token length; Prefixes/Suffixes; POS tag; Binary features (+-2); Token position; DrugBank DB; Shape, yes
'''