# Drug Name Entity Classifier
## AHLT - MIRI 2018



In [1]:
#import xml.etree.ElementTree as ET
from lxml import etree
from os import listdir
import pandas as pd
import numpy as np

Defining the directory where to parse:

In [2]:
train_dir = '../LaboCase/Train/'
dirs_whereto_parse = [train_dir+'/test_DrugBank']

Accessing to all the files of the directory and storing id's and text's in two arrays.
We have also added the tokens 'START' and 'STOP' at the beginning and end of the sentences.

In [3]:
entities=[]
texts=[]
texts_entities = []

for directory in dirs_whereto_parse:
    name_files=listdir(directory)   # querying all the files that are in that directory
    # Parse all these xml files
    roots = [etree.parse(directory+'/'+a).getroot() for a in name_files if a.endswith('.xml')]
    for root in roots:
        for sentence in root.findall('sentence'):
            for entity in sentence.findall('entity'):
                entities = entities+[entity.get('text')]
            texts_entities = texts_entities + [('START ' + sentence.get('text') + ' STOP',entities)]
            entities =[]

# texts_entities is a list of tuples. Each one is comprised of the sentence and the drugs in there
# print(texts_entities[0])



#### BIO TAGGER

Let's try to tag each sentence with the BIO format

In [11]:
# -*- coding: utf-8 -*-

import nltk


def bio_tagger(text,drugs):
    
        # Some Preprocessing. I split each word and those ones joined with -
        tokens = nltk.word_tokenize(text)
        tokens = sum([word.split('-') for word in tokens if word[0] != '-' and word[-1] != '-'],[])
        # print(tokens)
        drugs = sum([word.split() for word in drugs],[])
        #print(drugs)
        
        
        bio_tagged = []
        prev_tag = "O"
        for token in tokens:
            if prev_tag == "O" : # Begin NE or continue O
                
                if token in drugs:
                    bio_tagged.append((token,'B'))
                    prev_tag = 'B'
                else:
                    bio_tagged.append((token,'O'))
                    prev_tag = 'O'
                
            elif prev_tag == "B": # Inside NE
                
                if token in drugs:
                    bio_tagged.append((token,'I'))
                    prev_tag = 'I'
                else: 
                    bio_tagged.append((token,'O'))
                    prev_tag = 'O'
                    
            elif  prev_tag == "I": # Inside NE
                if token in drugs:
                    bio_tagged.append((token,'I'))
                    prev_tag = 'I'
                else: 
                    bio_tagged.append((token,'O'))
                    prev_tag = 'O'
        return bio_tagged
    

In [12]:
import pandas as pd
tokens = []
tags = []
for text,drugs in texts_entities:
    tuples = bio_tagger(text,drugs)
    tokens = tokens + [word[0] for word in tuples]
    tags = tags + [word[1] for word in tuples]

train_set = {'token':tokens,'output':tags}

0


In [9]:
train_set['token'][len(train_set['token'])-1]

'STOP'

In [None]:
train_df.head()

# Creating the features for the classifier

## Length and capitalization of words

In [13]:
# Feature 1: Length of the token
train_df['token_length'] = [len(token) for token in train_set['token']]

# We have realised that some tokens have length 0 (empty string), so we have decided to remove those rows
train_df = train_df[train_df['token_length'] > 0]

# Feature 2: Is the the first letter of the word capitalized?
is_capitalized = [row[0].isupper() for row in train_df['token']]
train_df['is_capitalized'] = is_capitalized

# Feature 3: Is the token completely capitalized?
is_total_capitalized = [row.isupper() for row in train_df['token']]
train_df['is_total_capitalized'] = is_capitalized

NameError: name 'train_df' is not defined

In [None]:
'hello'.isupper()

## Sufixes and prefixes

In [None]:
import re
# this feature indicates whether the word has a usual prefix/suffix of a drug

prefix_feature = []
suffix_feature = []

prefixes = r'^meth|^eth|^prop|^but|^pent|^hex|^hept|^oct|^non|^dec'
suffixes = r'ane$|ene$|yne$|ol$|al$|amine$|cid$|ium$|ether$|ate$|one'

for token in train_set['token']:
    
        if re.search(prefixes,token):
            prefix_feature=prefix_feature+[1]
        else:
            prefix_feature = prefix_feature+[0]
            
        if re.search(suffixes,token):
            suffix_feature=suffix_feature+[1]
        else:
            suffix_feature = suffix_feature+[0]
            
train_df['prefix_feature']=prefix_feature
train_df['suffix_feature']=suffix_feature
print(train_df)

# Building the classifier
## Support Vector Machines

The advantages of support vector machines are:

- Effective in high dimensional spaces.
- Still effective in cases where number of dimensions is greater than the number of samples.
- Uses a subset of training points in the decision function (called support vectors), so it is also memory efficient.
- Versatile: different Kernel functions can be specified for the decision function. Common kernels are provided, but it is also possible to specify custom kernels.

The disadvantages of support vector machines include:

- If the number of features is much greater than the number of samples, avoid over-fitting in choosing Kernel functions and regularization term is crucial.
- SVMs do not directly provide probability estimates, these are calculated using an expensive five-fold cross-validation (see Scores and probabilities, below).

In [None]:
from sklearn import datasets
from sklearn import svm

In [None]:
# Name of the target variable
target_name = 'output'
token_name = 'token'

# Create a SVM object with the corresponding parameters
clf = svm.SVC(gamma=0.001, cache_size = 200, class_weight = None, coef0 = 0.0, 
              decision_function_shape = None, degree = 3, kernel = 'rbf', 
              max_iter = -1, probability = False, random_state = None, shrinking = True, 
              tol = 0.001, C=100.0, verbose = True)

# Create the appropiate data structure to pass it to the SVM
X = train_df.loc[:, train_df.columns != target_name]
Y = train_df[target_name].values

In [None]:
X

In [None]:
train_set.loc[:, train_set.columns != target_name]
train_set[target_name]


In [None]:
train_set['target_name'].values

In [None]:
train_set