In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

## Module importing

In [2]:
# Data processing libraries
import pandas as pd
import numpy as np

# NLP libraries
import nltk
from nltk.tag import StanfordPOSTagger

# Machine Learning Libraries
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split # Parameter selection
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

# Other libraries
import time # Execution time of some blocks
import statistics
from IPython.display import display # For displaying DataFrames correctly in Jupyter
from itertools import chain
import collections

# Import our own defined functions
from xlm_parsers_functions import *
from drug_interaction_functions import *
from drug_functions import *
from NER_functions import *
from ortographic_features import *
from context_features import *



## Objectives of this part
In this second part of the project, we will focus on two different things: 
1. Detection of interactions between drugs
2. Classification of each drug-drug interaction according to one of the following types:
    - Advice: 'Interactions may be expected, and Uroxatral should not be used in combination with other alpha-blockers.'
    - Effect: 'In uninfected volunteers, 46% developed rash while receiving Sustiva and Clarithromycin.'
    - Mechanism: 'Grepafloxacin is a competitive inhibitor of the metabolism of theophylline'.
    - Int: The interaction of omeprazole and ketoconazole has been stablished.

## Reading the XML data

In [3]:
%%time
'''
data_dir1 = 'data/Train/DrugBank/'
data_dir2 = 'data/Train/MedLine/'
'''

data_dir1 = 'data/small_train_DrugBank/'
# TODO: Read test data from the correct files

def readXMLData(data_dir):

    # Use xlm_element.tag to get the name of the xlm element
    # Use xlm_element.attrib to get all the attributes of the xlm element as a string

    # Parse the DrugBank Files
    drugs_dataset = []
    #parent_directory = '../LaboCase/small_train_DrugBank/'
    for filename in os.listdir(data_dir):
        if filename.endswith(".xml"):
            # Parse the file
            tree = ET.parse(data_dir + filename)
            # Create a list of lists with the interactions of the file
            drugs_dataset = drugs_dataset + listDDIFromXML(tree.getroot())

    return(drugs_dataset)

# Create a list of lists with the interactions of the file
XMLdata = readXMLData(data_dir1)
#XMLdata_MedLine = readXMLData(data_dir2)

#XMLdata = XMLdata_DrugBank #+ XMLdata_MedLine

CPU times: user 926 ms, sys: 40.1 ms, total: 966 ms
Wall time: 967 ms


In [4]:
print('Number of total sentences in the training dataset:', len(XMLdata))

Number of total sentences in the training dataset: 308


In [5]:
XMLdata[1]

('Population pharmacokinetic analyses revealed that MTX, NSAIDs, corticosteroids, and TNF blocking agents did not influence abatacept clearance.',
 'MTX',
 'corticosteroids',
 ['MTX', 'NSAIDs', 'corticosteroids', 'TNF blocking agents', 'abatacept'],
 ['NNP',
  'JJ',
  'NNS',
  'VBD',
  'IN',
  'NNP',
  ',',
  'NNP',
  ',',
  'NNS',
  ',',
  'CC',
  'NNP',
  'VBG',
  'NNS',
  'VBD',
  'RB',
  'VB',
  'JJ',
  'NN',
  '.'],
 ['Population',
  'pharmacokinetic',
  'analyses',
  'revealed',
  'that',
  'MTX',
  'NSAIDs',
  ',',
  'corticosteroids',
  ',',
  'and',
  'TNF',
  'blocking',
  'agents',
  'did',
  'not',
  'influence',
  'abatacept',
  'clearance',
  '.'],
 'false',
 'none')

## Creation of features
Before training our model, we need to come up with features to help us determine whether there is a relationship between the two drugs or not.

Some ideas for features are the following:
- Does the sentence contain a modal verb (should, must,...) between the two entities?
- Word bigrams: This is a binary feature for all word bigrams that appeared more than once in the corpus, indicating the presence or absence of each such bigram in the sentence
- Number of words between a pair of drugs
- Number of drugs between a pair of drugs
- POS of words between a pair of drugs: This is a binary feature for word POS tags obtained from POS tagging, and indicates the presence or absence of each POS between the two main drugs.
- Path between a pair of drugs: Path between two main drugs in the parse tree is another feature in our system. Because syntactic paths are in general a sparse feature, we reduced the sparsity by collapsing identical adjacent non-terminal labels. E.g., NP-S-VP-VP-NP is converted to NP-S-VP-NP. This technique decreased the number of paths by 24.8%.

In [17]:
with(open('data/DrugBank_names_DB.txt', 'r')) as f:
    drugbank_db = f.read().splitlines()
    
def sent2features(tupl, i, database):
    
    if len(tupl) != 8:
        raise ValueError('The introduced tuple does not have the correct length')
    sent = tupl[0]
    ent1 = tupl[1]
    ent2 = tupl[2]
    ent_list = tupl[3]
    pos_tags = tupl[4]
    tok_sent = tupl[5] # Tokenized sentence
    
    prefixes = r'^alk|^meth|^eth|^prop|^but|^pent|^hex|^hept|^oct|^non|^dec|^undec|^dodec|^eifcos|^di|^tri|^tetra|^penta|^hexa|^hepta'
    suffixes = r'ane$|ene$|yne$|yl$|ol$|al$|oic$|one$|ate$|amine$|amide$'

    features = {
        
    'ent1': ent1,
    'ent2': ent2,
    # Orthographic features
        
    # Entity 1
    'ent1_all_uppercase_letters' : allCaps(ent1), 
    'ent1_initial_capital_letter': initCap(ent1), 
    'ent1_contains_capital_letter' : hasCap(ent1),
    'ent1_single_capital_letter' : singleCap(ent1),
    'ent1_punctuation' : punctuation(ent1),
    'ent1_initial_digit' : initDigit(ent1),
    'ent1_single_digit' : singleDigit(ent1),
    'ent1_letter_and_num' : alphaNum(ent1),
    'ent1_many_numbers' : manyNum(ent1),
    'ent1_contains_real_numbers' : realNum(ent1),
    'ent1_intermediate_dash' : inDash(ent1),
    'ent1_has_digit' : hasDigit(ent1),
    'ent1_is_Dash' : isDash(ent1),
    'ent1_is_roman_letter' : roman(ent1),
    'ent1_is_end_punctuation' : endPunctuation(ent1),
    'ent1_caps_mix' : capsMix(ent1),

    # Entity 2
    'ent2_all_uppercase_letters' : allCaps(ent2), 
    'ent2_initial_capital_letter': initCap(ent2), 
    'ent2_contains_capital_letter' : hasCap(ent2),
    'ent2_single_capital_letter' : singleCap(ent2),
    'ent2_punctuation' : punctuation(ent2),
    'ent2_initial_digit' : initDigit(ent2),
    'ent2_single_digit' : singleDigit(ent2),
    'ent2_letter_and_num' : alphaNum(ent2),
    'ent2_many_numbers' : manyNum(ent2),
    'ent2_contains_real_numbers' : realNum(ent2),
    'ent2_intermediate_dash' : inDash(ent2),
    'ent2_has_digit' : hasDigit(ent2),
    'ent2_is_Dash' : isDash(ent2),
    'ent2_is_roman_letter' : roman(ent2),
    'ent2_is_end_punctuation' : endPunctuation(ent2),
    'ent2_caps_mix' : capsMix(ent2),
        
    # Morphological information: prefixes/suffixes of lengths from 2 to 5 and word shapes of tokens. 
    # Entity 1
    'ent1_word[-5:]': ent1[-5:],
    'ent1_word[-4:]': ent1[-4:],
    'ent1_word[-3:]': ent1[-3:],
    'ent1_word[-2:]': ent1[-2:],

    # Entity 2
    'ent2_word[-5:]': ent2[-5:],
    'ent2_word[-4:]': ent2[-4:],
    'ent2_word[-3:]': ent2[-3:],
    'ent2_word[-2:]': ent2[-2:],
    
    # Domain knowledge
    # Entity 1
    'ent1_drug_sufix': getSuffix(ent1, suffixes),
    'ent1_drug_prefix': getPrefix(ent1, prefixes),

    # Entity 2
    'ent2_drug_sufix': getSuffix(ent2, suffixes),
    'ent2_drug_prefix': getPrefix(ent2, prefixes),

        
    # Is in DrugBank dataset
    'ent1_isInDB':isTokenInDB(ent1,database),
    'ent2_isInDB':isTokenInDB(ent2,database),
    
        
    # Context features
    'n_tokens_bw_entities': countTokensBetweenEntities(tok_sent, ent1, ent2),
    'n_entities_bw_entities': countEntitiesBetweenEntities(tok_sent, ent1, ent2, ent_list),
    'n_modal_verbs_bw_entities': countModalVerbsBetweenEntities(tok_sent, ent1, ent2),
    'sentence_contains_neg': sentenceContainsNegation(tok_sent),
    'keywords_bw_entities': keyWordsBetweenEntities(tok_sent, ent1, ent2),
    'first_modal_sentence': getFirstModalVerb(tok_sent),
    'POS_tags_sentence_simpl': createSimplifiedPOSPath(pos_tags),
    '2_grams_bw_entities': getNgramsBetweenEntities(tok_sent, ent1, ent2, 2),
    '3_grams_bw_entities': getNgramsBetweenEntities(tok_sent, ent1, ent2, 3)
        
    }

    return features


def text2features(text,database):
    for i in range(len(text)):
        return(sent2features(text, i, drugbank_db))

def text2labels(text):
    return text[]

In [18]:
%%time
X = [[text2features(s, drugbank_db)] for s in XMLdata]
y = [[text2labels(s)] for s in XMLdata]

CPU times: user 143 ms, sys: 4.75 ms, total: 147 ms
Wall time: 145 ms


## Train/Test/Validation split

For this project, we will split the original dataset in the following proportions:
 - Training data: 42%
 - Testing data: 40%
 - Validation data: 18%

In [19]:
seed = 16273
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=seed, shuffle = True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=seed, shuffle = True)
print('Number of training sentences: ', len(X_train))
print('Number of testing sentences: ', len(X_test))

Number of training sentences:  128
Number of testing sentences:  124


In [21]:
print(y_train[10])

[['DT', 'NN', 'IN', 'NNS', 'IN', 'NNP', 'JJ', 'NNS', 'VBD', 'CD', 'CC', 'JJR', 'IN', 'DT', 'JJ', 'JJ', 'NNS', 'IN', 'NNP', ':', 'NNP', ',', 'NNP', ',', 'NNS', ',', 'NNP', 'VBG', 'NNS', ',', 'NN', ',', 'NN', ',', 'NN', ',', 'NN', ',', 'RB', ',', 'NN', ',', 'CC', 'NN', '.']]


#### Hyperparameter Optimization


Using validation data

In [15]:
%%time
# We don't want to include the 'none' label for the optimization
labels = ['mechanism', 'effect', 'int', 'advise']

# Define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)


# search
'''

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)


params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=20,
                        scoring=f1_scorer)


'''

crf.fit(X_val, y_val)

TypeError: expected bytes, list found

In [None]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

## Model training
Using training data and the parameters obtained in the previous step

In [None]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=rs.best_params_['c1'],
    c2=rs.best_params_['c2'],
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train, y_train)

## Evaluation

Using test data

In [None]:
y_pred = crf.predict(X_test)

In [None]:
for i in range(len(y_pred)):
    if y_pred[i][0] is None:
        print('y_pred index: ', i)
        y_pred[i][0] = 'none'
    
    if y_test[i][0] is None:
        print('y_test index:' , i)
        y_test[i][0] = 'none'

In [None]:
print(sklearn.metrics.recall_score(y_true = y_test, 
                             y_pred = y_pred, 
                             labels=labels, 
                             pos_label=1, 
                             average='weighted',
                             sample_weight=None)
      )

print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))

In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(collections.Counter(crf.state_features_).most_common(15))

print("\nTop negative:")
print_state_features(collections.Counter(crf.state_features_).most_common()[-15:])

In [None]:
'''
def transformStrCategoriesIntoInts(vector):
    res = []
    for el in vector:
        if el == 'none' or el == 'None':
            res.append(0)
        elif el == 'mechanism':
            res.append(1)
        elif el == 'effect':
            res.append(2)
        elif el == 'int':
            res.append(3)
        elif el == 'advise':
            res.append(4)
        else:
            print(el)
            print(type(el))
            print(vector.index(el))
    return(res)
'''