In [1]:
# Plotting setup for the notebook: inline rendering plus the 'ggplot' style sheet.
%matplotlib inline
import matplotlib.pyplot as plt
# Applies to every figure created after this point.
plt.style.use('ggplot')

## Module importing

In [2]:
# Data processing libraries
import pandas as pd
import numpy as np

# NLP libraries
import nltk
from nltk.tag import StanfordPOSTagger

# Machine Learning Libraries
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split # Parameter selection
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

# Other libraries
import time # Execution time of some blocks
import statistics
from IPython.display import display # For displaying DataFrames correctly in Jupyter
from itertools import chain
import collections
import pickle

# Import our own defined functions
from xlm_parsers_functions import *
from drug_interaction_functions import *
from drug_functions import *
from NER_functions import *
from ortographic_features import *
from context_features import *
from feature_creation_interaction import *
from crf_functions import *



## Objectives of this part
In this second part of the project, we will focus on two different things: 
1. Detection of interactions between drugs
2. Classification of each drug-drug interaction according to one of the following types:
    - Advice (label `advise` in the data): 'Interactions may be expected, and Uroxatral should not be used in combination with other alpha-blockers.'
    - Effect: 'In uninfected volunteers, 46% developed rash while receiving Sustiva and Clarithromycin.'
    - Mechanism: 'Grepafloxacin is a competitive inhibitor of the metabolism of theophylline'.
    - Int: 'The interaction of omeprazole and ketoconazole has been established.'

## Reading the XML data

In [None]:
# Disabled cell: everything below is a triple-quoted string, so none of it runs.
# It holds the one-off code that parses the DrugBank and MedLine training
# directories with readXMLData and concatenates the results into XMLdata_train.
'''
%%time

# Training data

train_data_dir_DrugBank = 'data/Train/DrugBank/'
train_data_dir_MedLine = 'data/Train/MedLine/'
train_data_dirs = [train_data_dir_DrugBank, train_data_dir_MedLine]


XMLdata_train = []
for train_data_dir in train_data_dirs:
    XMLdata_train = XMLdata_train + readXMLData(train_data_dir)
'''

In [None]:
# Disabled cell: everything below is a triple-quoted string, so none of it runs.
# It holds the one-off code that parses the DrugBank and MedLine test
# directories with readXMLData and concatenates the results into XMLdata_test.
'''
%%time

# Testing data

test_data_dir_DrugBank = 'data/Test/Test_data_DDI/DrugBank/'
test_data_dir_MedLine = 'data/Test/Test_data_DDI/MedLine/'
test_data_dirs = [test_data_dir_DrugBank, test_data_dir_MedLine]

XMLdata_test = []
for test_data_dir in test_data_dirs:
    XMLdata_test = XMLdata_test + readXMLData(test_data_dir)
'''

## Saving the parsed XML data for later use
As we don't want to read and parse the XML files each time, we will save them into a pickle object that we can quickly read when needed 


In [None]:
# Disabled cell (string literal, never executed): pickles the parsed
# XMLdata_train / XMLdata_test lists to parsed_train.txt / parsed_test.txt.
# Despite the .txt extension the files are opened in binary mode for pickle.
'''
with open("parsed_train.txt", "wb") as f:   #Pickling
    pickle.dump(XMLdata_train, f)

with open("parsed_test.txt", "wb") as f:   #Pickling
    pickle.dump(XMLdata_test, f)
'''

## Read the saved parsed XML data quickly

In [None]:
# Disabled cell (string literal, never executed): reloads the pickled
# XMLdata_train / XMLdata_test lists and prints how many entries each holds.
'''
with open("parsed_train.txt", "rb") as f:   # Unpickling
    XMLdata_train = pickle.load(f)
    
with open("parsed_test.txt", "rb") as f:   # Unpickling
    XMLdata_test = pickle.load(f)

print('Number of training sentences readed: ', len(XMLdata_train))
print('Number of testing sentences readed: ', len(XMLdata_test))
'''

## Creation of features
Before training our model, we need to come up with features to help us determine whether there is a relationship between the two drugs or not.

Some ideas for features are the following:
- Does the sentence contain a modal verb (should, must,...) between the two entities?
- Word bigrams: This is a binary feature for all word bigrams that appeared more than once in the corpus, indicating the presence or absence of each such bigram in the sentence
- Number of words between a pair of drugs
- Number of drugs between a pair of drugs
- POS of words between a pair of drugs: This is a binary feature for word POS tags obtained from POS tagging, and indicates the presence or absence of each POS between the two main drugs.
- Path between a pair of drugs: Path between two main drugs in the parse tree is another feature in our system. Because syntactic paths are in general a sparse feature, we reduced the sparsity by collapsing identical adjacent non-terminal labels. E.g., NP-S-VP-VP-NP is converted to NP-S-VP-NP. This technique decreased the number of paths by 24.8%.

In [None]:
# Disabled cell (string literal, never executed): builds the feature and label
# lists from the parsed XML data. Each sample is wrapped in a one-element list
# ([[...] for s in ...]), i.e. every drug pair is a sequence of length one.
# The next cell loads X_train / y_* from pickle files instead; presumably those
# files were produced from this code -- TODO confirm.
'''
%%time
# Read the database
with(open('data/DrugBank_names_DB.txt', 'r')) as f:
    drugbank_db = f.read().splitlines()

# Create the train and tests datasets

# Train
X_train = [[text2features(s, drugbank_db)] for s in XMLdata_train]
y_train_int = [[text2interaction(s)] for s in XMLdata_train]
y_train_type = [[text2interactionType(s)] for s in XMLdata_train]

# Test
X_test = [[text2features(s, drugbank_db)] for s in XMLdata_test]
y_test_int = [[text2interaction(s)] for s in XMLdata_test]
y_test_type = [[text2interactionType(s)] for s in XMLdata_test]

print('Number of training sentences: ', len(X_train))
print('Number of testing sentences: ', len(X_test))
'''

In [3]:
%%time

with open("X_train.txt", "rb") as f:   # Unpickling
    X_train = pickle.load(f)
    
with open("y_train_int.txt", "rb") as f:   # Unpickling
    y_train_int = pickle.load(f)
    
with open("y_train_type.txt", "rb") as f:   # Unpickling
    y_train_type = pickle.load(f)

with open("X_test.txt", "rb") as f:   # Unpickling
    X_test = pickle.load(f)
    
with open("y_test_int.txt", "rb") as f:   # Unpickling
    y_test_int = pickle.load(f)
    
with open("y_test_type.txt", "rb") as f:   # Unpickling
    y_test_type = pickle.load(f)

print('Number of training sentences readed: ', len(X_train))
print('Number of testing sentences readed: ', len(X_test))

Number of training sentences readed:  27792
Number of testing sentences readed:  5716
CPU times: user 2.22 s, sys: 434 ms, total: 2.65 s
Wall time: 2.9 s


# Model training

## First model: Classifying drug pairs as interacting (true) or not (false)

## Model training
Using training data and the parameters obtained in the previous step

In [5]:
%%time

mod1 = trainCRFAndEvaluate(
            X_train = X_train, 
            y_train = y_train_int,
            X_test = X_test,
            y_test = y_test_int,
            labels = ['true', 'false'],
            hyperparam_optim = True)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 13.2min finished


Hyperparameter optimization took 822.69 seconds to complete
             precision    recall  f1-score   support

       true      0.515     0.532     0.523       979
      false      0.903     0.896     0.899      4737

avg / total      0.836     0.834     0.835      5716

CPU times: user 4min 35s, sys: 22.4 s, total: 4min 57s
Wall time: 14min 11s


In [6]:
# Stage-1 predictions on the test split; used further down to decide which
# test samples are passed on to the interaction-type model.
y_pred_int = mod1.predict(X_test)

In [12]:
# Peek at the first training sample to inspect the generated feature dict.
# The original cell put a `for i in range(len(X_train))` header (no colon, no
# indented body, unused loop variable) in front of this print, which is a
# SyntaxError; the recorded output shows a single print, so the loop is dropped.
print(X_train[0:1])

[[{'ent1_has_digit': 0, '3_grams_bw_entities': ['cal', 'alc', 'lci', 'ciu', 'ium', 'um ', 'm r', ' ri', 'ric', 'ich', 'ch ', 'h f', ' fo', 'foo', 'ood', 'ods', 'ds ', 's o', ' or', 'or ', 'r d', ' dr', 'dru', 'rug', 'ugs', 'gs ', 's m', ' ma', 'may', 'ay ', 'y i', ' im', 'imp', 'mpa', 'pai', 'air', 'ir ', 'r t', ' th', 'the', 'he ', 'e a', ' ab', 'abs', 'bso', 'sor', 'orp', 'rpt', 'pti', 'tio', 'ion', 'on ', 'n o', ' of', 'of ', 'f E', ' EM', 'EMC', 'MCY', 'CYT'], 'ent1_initial_digit': 0, 'ent2_word[-5:]': 'EMCYT', 'ent1_pos_tag_following_word3': 'NN', 'ent2_isInDB': False, 'ent2': 'EMCYT', 'ent2_pos_tag_prev_word3': 'NN', 'ent2_contains_real_numbers': 0, 'ent2_pos_tag_following_word3': '', 'ent1_word[-4:]': 'cium', 'ent2_word[-4:]': 'MCYT', 'ent2_word[-2:]': 'YT', 'ent1_contains_real_numbers': 0, 'ent1_isInDB': True, 'ent2_many_numbers': 0, 'ent2_initial_capital_letter': 1, 'ent2_initial_digit': 0, 'ent2_is_roman_letter': 0, 'ent1_is_end_punctuation': 0, 'ent2_letter_and_num': 0, 'sen

## Second model: Classifying between types of interactions

### Filtering the data

In [7]:
# The second model only deals with pairs flagged as interacting, so both splits
# are reduced to the samples whose interaction label equals ['true']. From here
# on the target is the interaction type (y_*_type), not the true/false flag.

# Training split: select on the gold interaction labels.
train_hits = [pos for pos, flag in enumerate(y_train_int) if flag == ['true']]
X_train_filtered = [X_train[pos] for pos in train_hits]
y_train_filtered = [y_train_type[pos] for pos in train_hits]

# Test split: select on the first model's predictions (y_pred_int), not on the
# gold labels, so the second model only sees what stage 1 let through.
test_hits = [pos for pos, flag in enumerate(y_pred_int) if flag == ['true']]
X_test_filtered = [X_test[pos] for pos in test_hits]
y_test_filtered = [y_test_type[pos] for pos in test_hits]

print('Number of training sentences: ', len(X_train_filtered))
print('Number of testing sentences: ', len(X_test_filtered))

Number of training sentences:  4021
Number of testing sentences:  1012


In [8]:
# Stage 2: classify the type of each pair kept by the filtering step.
# The label strings are passed through to trainCRFAndEvaluate as-is; note the
# spelling 'advise', which is also what the recorded report shows.
mod2 = trainCRFAndEvaluate(
    X_train=X_train_filtered,
    y_train=y_train_filtered,
    X_test=X_test_filtered,
    y_test=y_test_filtered,
    labels=['mechanism', 'advise', 'effect', 'int'],
    hyperparam_optim=True,
)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for

Hyperparameter optimization took 161.4 seconds to complete
             precision    recall  f1-score   support

  mechanism      0.374     0.839     0.517       155
     advise      0.453     0.764     0.569       127
     effect      0.390     0.911     0.547       180
        int      0.467     0.237     0.315        59

avg / total      0.409     0.777     0.517       521



In [None]:
# Disabled cell (string literal, never executed): a stand-alone randomized
# hyperparameter search over the CRF regularisation parameters c1 and c2,
# scored with a weighted flat F1.
# NOTE(review): the snippet fits on `y_int_train`, while the rest of the
# notebook names that variable `y_train_int` -- fix before re-enabling.
'''

%%time
# Define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

# Parameter search

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=1,
                        scoring=f1_scorer)

rs.fit(X_train, y_int_train)
'''

In [None]:
# Disabled cell (string literal, never executed): prints the best parameters,
# best CV score and model size of the search object `rs`, which is only
# created by the disabled search cell.
'''
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
'''



In [None]:
# Disabled cell (string literal, never executed): log-log scatter plot of the
# sampled (c1, c2) pairs coloured by mean validation score.
# NOTE(review): it reads `rs.grid_scores_`, the result format of the legacy
# sklearn.grid_search API; newer scikit-learn releases expose `cv_results_`
# instead -- verify against the installed version before re-enabling.
'''
_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))
'''

In [None]:
# Disabled cell (string literal, never executed): prints the 15 highest- and
# 15 lowest-weighted state features of `crf`.
# NOTE(review): `crf` is the estimator handed to the search in the disabled
# search cell, and the line that would assign rs.best_estimator_ to it is
# commented out -- confirm which fitted model is meant before re-enabling.
'''
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(collections.Counter(crf.state_features_).most_common(15))

print("\nTop negative:")
print_state_features(collections.Counter(crf.state_features_).most_common()[-15:])
'''