In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

## Module importing

In [2]:
# Data processing libraries
import pandas as pd
import numpy as np

# NLP libraries
import nltk
from nltk.tag import StanfordPOSTagger

# Machine Learning Libraries
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split # Parameter selection
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

# Other libraries
import time # Execution time of some blocks
import statistics
from IPython.display import display # For displaying DataFrames correctly in Jupyter
from itertools import chain
import collections

# Import our own defined functions
from xlm_parsers_functions import *
from drug_interaction_functions import *
from drug_functions import *
from NER_functions import *
from ortographic_features import *
from context_features import *
from feature_creation_interaction import *



## Objectives of this part
In this second part of the project, we will focus on two different things: 
1. Detection of interactions between drugs
2. Classification of each drug-drug interaction according to one of the following types:
    - Advice: 'Interactions may be expected, and Uroxatral should not be used in combination with other alpha-blockers.'
    - Effect: 'In uninfected volunteers, 46% developed rash while receiving Sustiva and Clarithromycin.'
    - Mechanism: 'Grepafloxacin is a competitive inhibitor of the metabolism of theophylline'.
    - Int: The interaction of omeprazole and ketoconazole has been stablished.

## Reading the XML data

In [3]:
%%time
'''
data_dir1 = 'data/Train/DrugBank/'
data_dir2 = 'data/Train/MedLine/'
'''

data_dir1 = 'data/Train/DrugBank/'
test_data_dir = 'data/Test/Test_data_DDI/DrugBank/'

def readXMLData(data_dir):

    # Use xlm_element.tag to get the name of the xlm element
    # Use xlm_element.attrib to get all the attributes of the xlm element as a string

    # Parse the DrugBank Files
    drugs_dataset = []
    #parent_directory = '../LaboCase/small_train_DrugBank/'
    for filename in os.listdir(data_dir):
        if filename.endswith(".xml"):
            # Parse the file
            tree = ET.parse(data_dir + filename)
            # Create a list of lists with the interactions of the file
            drugs_dataset = drugs_dataset + listDDIFromXML(tree.getroot())

    return(drugs_dataset)

# Create a list of lists with the interactions of the file
XMLdata = readXMLData(data_dir1)
XMLdata_test = readXMLData(test_data_dir)

CPU times: user 2min 7s, sys: 1.46 s, total: 2min 8s
Wall time: 2min 9s


## Creation of features
Before training our model, we need to come up with features to help us determine whether there is a relationship between the two drugs or not.

Some ideas for features are the following:
- Does the sentence contain a modal verb (should, must,...) between the two entities?
- Word bigrams: This is a binary feature for all word bigrams that appeared more than once in the corpus, indicating the presence or absence of each such bigram in the sentence
- Number of words between a pair of drugs
- Number of drugs between a pair of drugs
- POS of words between a pair of drugs: This is a binary feature for word POS tags obtained from POS tagging, and indicates the presence or absence of each POS between the two main drugs.
- Path between a pair of drugs: Path between two main drugs in the parse tree is another feature in our system. Because syntactic paths are in general a sparse feature, we reduced the sparsity by collapsing identical adjacent non-terminal labels. E.g., NP-S-VP-VP-NP is converted to NP-S-VP-NP. This technique decreased the number of paths by 24.8%.

In [4]:
# Read the database
with(open('data/DrugBank_names_DB.txt', 'r')) as f:
    drugbank_db = f.read().splitlines()

In [5]:
%%time

# Create the train and tests dataset
# Train
X_train = [[text2features(s, drugbank_db)] for s in XMLdata]
y_train_int = [[text2interaction(s)] for s in XMLdata]
y_train_type = [[text2interactionType(s)] for s in XMLdata]

# Test
X_test = [[text2features(s, drugbank_db)] for s in XMLdata_test]
y_test_int = [[text2interaction(s)] for s in XMLdata_test]
y_test_type = [[text2interactionType(s)] for s in XMLdata_test]

print('Number of training sentences: ', len(X_train))
print('Number of testing sentences: ', len(X_test))

Number of training sentences:  26005
Number of testing sentences:  5265
CPU times: user 14 s, sys: 277 ms, total: 14.3 s
Wall time: 14.3 s


# Model training

## First model: Classifying interactions between true/false

## Model training
Using training data and the parameters obtained in the previous step

In [9]:
%%time

#c1 = rs.best_params_['c1']
#c2 = rs.best_params_['c2']

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train, y_train_int)

CPU times: user 24.7 s, sys: 301 ms, total: 25 s
Wall time: 25.6 s


In [17]:
y_pred_int = crf.predict(X_test)

## Second model: Classifying between types of interactions

### Filtering the data

In [18]:
X_train_filtered = []
y_train_filtered = []
for idx,val in enumerate(y_train_int):
    if val==['true']:
        X_train_filtered.append(X_train[idx])
        y_train_filtered.append(y_train_type[idx]) # no we don't want true or false rather than int, mechanism...


X_test_filtered = []
y_test_filtered = []
for idx,val in enumerate(y_pred_int):
    if val==['true']:
        X_test_filtered.append(X_test[idx])
        y_test_filtered.append(y_test_type[idx]) # no we don't want true or false rather than int, mechanism...

In [26]:
for i in range(len(y_train_filtered)):
    if y_train_filtered[i][0] is None:
        print('y_pred index: ', i)
        y_train_filtered[i][0] = 'none'

y_pred index:  568


In [27]:
%%time

#c1 = rs.best_params_['c1']
#c2 = rs.best_params_['c2']

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train_filtered, y_train_filtered)

CPU times: user 5.61 s, sys: 61.2 ms, total: 5.67 s
Wall time: 5.75 s


In [28]:
y_pred_type = crf.predict(X_test_filtered)

## Evaluation

Using test data

In [11]:
for i in range(len(y_pred)):
    if y_pred[i][0] is None:
        print('y_pred index: ', i)
        y_pred[i][0] = 'none'
    
    if y_test[i][0] is None:
        print('y_test index:' , i)
        y_test[i][0] = 'none'

In [29]:
labels = ['mechanism', 'int', 'advise', 'effect']
print(metrics.flat_classification_report(
    y_test_filtered, y_pred_type, labels = labels, digits=3
))

             precision    recall  f1-score   support

  mechanism      0.353     0.856     0.500       139
        int      0.355     0.196     0.253        56
     advise      0.360     0.754     0.488       130
     effect      0.354     0.868     0.503       159

avg / total      0.355     0.756     0.469       484



In [13]:
'''
def transformStrCategoriesIntoInts(vector):
    res = []
    for el in vector:
        if el == 'none' or el == 'None':
            res.append(0)
        elif el == 'mechanism':
            res.append(1)
        elif el == 'effect':
            res.append(2)
        elif el == 'int':
            res.append(3)
        elif el == 'advise':
            res.append(4)
        else:
            print(el)
            print(type(el))
            print(vector.index(el))
    return(res)
'''

"\ndef transformStrCategoriesIntoInts(vector):\n    res = []\n    for el in vector:\n        if el == 'none' or el == 'None':\n            res.append(0)\n        elif el == 'mechanism':\n            res.append(1)\n        elif el == 'effect':\n            res.append(2)\n        elif el == 'int':\n            res.append(3)\n        elif el == 'advise':\n            res.append(4)\n        else:\n            print(el)\n            print(type(el))\n            print(vector.index(el))\n    return(res)\n"

In [None]:
'''

%%time
# Define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# Parameter search

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=1,
                        scoring=f1_scorer)

rs.fit(X_train, y_int_train)
'''

In [None]:
'''
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
'''



In [None]:
'''
_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))
'''

In [14]:
'''
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(collections.Counter(crf.state_features_).most_common(15))

print("\nTop negative:")
print_state_features(collections.Counter(crf.state_features_).most_common()[-15:])
'''

'\ndef print_state_features(state_features):\n    for (attr, label), weight in state_features:\n        print("%0.6f %-8s %s" % (weight, label, attr))\n\nprint("Top positive:")\nprint_state_features(collections.Counter(crf.state_features_).most_common(15))\n\nprint("\nTop negative:")\nprint_state_features(collections.Counter(crf.state_features_).most_common()[-15:])\n'