# Drug Name Entity Classifier
## AHLT - MIRI 2018



## Initialization

Load needed modules and specify the working directory

In [1]:
# Load needed packages
import pandas as pd
import numpy as np
import scipy


import time # Execution time of some blocks
from nltk.tag import StanfordPOSTagger

# sklearn package
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


# Import our defined functions
from NER_functions import *
from datasetBuilder import *
from feature_creation_ner import *



In [2]:
# init time
first_init = time.time()

In [3]:
train_dirs_whereto_parse = ['data/Train/DrugBank','data/Train/MedLine']
test_dirs_whereto_parse = ['data/Test/Test for DrugNER task/DrugBank','data/Test/Test for DrugNER task/MedLine']

## Reading the train and test data from the XML files
Accessing to all the files of the directory and storing id's and text's in two arrays.
We have also added the token 'STOP' at the end of each sentence.

In [4]:
train_texts_entities = createSet(train_dirs_whereto_parse)
test_texts_entities = createSet(test_dirs_whereto_parse)

Before computing features, I want the input data to have a special format:

In [5]:
# we want each dataset with the following format: 
# for ['have','Ibuprofeno'] ---- [('hola','V','O'),('Ibuprofeno','NN','B')]

def buildSet(text_entities):
    dataset = []
    for text,drugs in text_entities:
        # tokenizing
        tokenized_sentence = nltk.word_tokenize(text)
        # BIO tagging
        tokens_tags = BIOTagger(text, drugs)
        # POS tagging
        tokens_pos = nltk.pos_tag(tokenized_sentence)

        text_triples = []
        for idx,token in enumerate(tokenized_sentence):
            text_triples.append((token,tokens_pos[idx][1],tokens_tags[idx][1]))
        dataset.append(text_triples)
    
    return dataset

# train_set and test_set are list of list of triples; each list of triples refers to a different sentence
train_set = buildSet(train_texts_entities) 
test_set = buildSet(test_texts_entities)

In [6]:
# computing the Distribution of frequencies that we will need later for a feature

list_of_tokens = [tuples[0] for row in train_set for tuples in row]
freqDistribution = nltk.FreqDist(list_of_tokens)

In [7]:
frequencies = [v for k,v in freqDistribution.items()]
frequencies = list(set(sorted(frequencies)))
max_pos = round(len(frequencies)*20/100) # we only take the 20 small frequencies
more_rare_freq = frequencies[0:max_pos]

## FEATURES 

In [8]:
with(open('data/DrugBank_names_DB.txt', 'r')) as f:
    drugbank_db = f.read().splitlines()

# remember s is a triple defined as follows: (token, POS tag, BIO tag)
X_train = [sent2features(s,drugbank_db,freqDistribution,more_rare_freq) for s in train_set]
y_train = [sent2labels(s) for s in train_set]

X_test = [sent2features(s,drugbank_db,freqDistribution,more_rare_freq) for s in test_set]
y_test = [sent2labels(s) for s in test_set]


In [9]:
X_train[0]

[{'+1:bigrams': [],
  '+1:postag': ',',
  '+1:postag[:2]': ',',
  '+1:trigrams': [],
  '+1:unigrams': [','],
  '+2:bigrams': ['mi', 'il', 'lk'],
  '+2:postag': 'NN',
  '+2:postag[:2]': 'NN',
  '+2:trigrams': ['mil', 'ilk'],
  '+2:unigrams': ['m', 'i', 'l', 'k'],
  'BOS': True,
  'all_uppercase_letters': 0,
  'caps_mix': 1,
  'contains_capital_letter': 1,
  'contains_drug_prefix': 0,
  'contains_drug_sufix': 0,
  'contains_real_numbers': 0,
  'freq': 1,
  'has_digit': 0,
  'initial_capital_letter': 1,
  'initial_digit': 0,
  'intermediate_dash': 0,
  'isInDB': False,
  'is_Dash': 0,
  'is_end_punctuation': 0,
  'is_roman_letter': 0,
  'letter_and_num': 0,
  'many_numbers': 0,
  'postag': 'NN',
  'postag[:2]': 'NN',
  'punctuation': 0,
  'rare_word': True,
  'single_capital_letter': 0,
  'single_digit': 0,
  'word[-2:]': 'lk',
  'word[-3:]': 'ilk',
  'word[-4:]': 'Milk',
  'word[-5:]': 'Milk'},
 {'+1:bigrams': ['mi', 'il', 'lk'],
  '+1:postag': 'NN',
  '+1:postag[:2]': 'NN',
  '+1:trigra

In [10]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
'''
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}
'''
# use the same metric for evaluation
labels = list(set(sum(y_train,[])))
labels.remove('O')
'''
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=30,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)
'''

"\nf1_scorer = make_scorer(metrics.flat_f1_score,\n                        average='weighted', labels=labels)\n\nrs = RandomizedSearchCV(crf, params_space,\n                        cv=3,\n                        verbose=1,\n                        n_jobs=-1,\n                        n_iter=30,\n                        scoring=f1_scorer)\nrs.fit(X_train, y_train)\n"

####  Predictions

In [11]:
#model = rs.best_estimator_
model = crf
model.fit(X_train,y_train)
pred_tags = model.predict(X_test)

####  Evaluation

####  Evaluation with scickit learn

In [12]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, pred_tags, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

          I      0.762     0.581     0.660       160
    B-brand      0.579     0.569     0.574        58
     B-drug      0.774     0.813     0.793       337
   B-drug_n      0.462     0.054     0.097       111
    B-group      0.625     0.604     0.614       149

avg / total      0.688     0.609     0.624       815



In [13]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(model.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(model.transition_features_).most_common()[-20:])

Top likely transitions:
B-group -> I       2.128699
O      -> O       1.238841
B-drug_n -> I       1.201980
I      -> I       0.582975
B-brand -> O       0.092441
B-drug -> O       0.089084
O      -> B-brand 0.081909
O      -> B-group 0.076379
B-drug -> I       0.016120
O      -> B-drug  0.001743
O      -> B-drug_n -0.016265
I      -> O       -0.210648
B-drug_n -> O       -0.275595
B-group -> O       -1.120307
B-brand -> I       -1.169511
B-drug_n -> B-drug_n -1.445257
B-brand -> B-drug_n -1.629979
B-group -> B-drug_n -2.082389
B-drug_n -> B-group -2.435606
I      -> B-group -2.436011

Top unlikely transitions:
B-brand -> B-drug_n -1.629979
B-group -> B-drug_n -2.082389
B-drug_n -> B-group -2.435606
I      -> B-group -2.436011
B-drug_n -> B-brand -2.446214
I      -> B-drug_n -2.581685
B-group -> B-brand -3.219718
B-brand -> B-group -3.582766
B-drug -> B-drug_n -3.704547
I      -> B-brand -4.166828
I      -> B-drug  -4.172962
B-drug -> B-group -4.427842
B-group -> B-drug  -4.618653
B-dr

In [14]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(model.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(model.state_features_).most_common()[-30:])

Top positive:
4.423663 B-drug_n word[-5:]:gaine
4.355241 B-drug_n word[-5:]:atrol
4.301883 B-brand  word[-5:]:pirin
4.155148 B-group  word[-5:]:tacid
3.905020 B-group  word[-5:]:piate
3.861939 B-group  word[-2:]:Is
3.836142 B-group  word[-5:]:ylate
3.828551 B-drug   word[-5:]:goxin
3.814664 O        word[-5:]:state
3.777820 B-group  word[-2:]:Ds
3.763490 B-drug_n word[-5:]:alins
3.682693 B-drug   -1:trigrams:suv
3.634564 B-group  word[-5:]:ators
3.633064 B-drug_n word[-2:]:MC
3.475499 B-group  word[-5:]:marin
3.428247 B-group  -2:trigrams:uty
3.419080 B-drug_n word[-5:]:orate
3.362971 B-drug_n word[-5:]:enate
3.352667 B-group  word[-4:]:tins
3.344006 I        -2:trigrams:spa
3.320270 B-brand  word[-2:]:AN
3.304385 B-brand  word[-2:]:EX
3.271317 B-group  word[-5:]:ogens
3.239331 B-group  word[-5:]:ourea
3.171444 B-brand  word[-2:]:AR
3.124903 O        word[-2:]:ch
3.122558 B-group  -1:trigrams:Com
3.106535 B-drug   -2:trigrams:rtr
3.072319 O        -1:bigrams:1-
3.059925 O        word[-

In [15]:
print('Total execution time: ',(time.time() - first_init)/60, ' minutes')

Total execution time:  1.8266138315200806  minutes


In [16]:

'''
## Log of results
date, precision, recall, F1, features, test
14-May, 46.2, 52.1, 48.99, Token length; Prefixes/Suffixes; POS tag; Binary features (+-2); Token position; DrugBank DB; Shape, yes
'''

'\n## Log of results\ndate, precision, recall, F1, features, test\n14-May, 46.2, 52.1, 48.99, Token length; Prefixes/Suffixes; POS tag; Binary features (+-2); Token position; DrugBank DB; Shape, yes\n'