# AHLT - MIRI
# Drugs Interaction Classifier

In [1]:
# Data processing libraries
import pandas as pd
import numpy as np

# NLP libraries
import nltk
from nltk.tag import StanfordPOSTagger

# Machine Learning Libraries
from IPython.display import display # For displaying DataFrames correctly in Jupyter
from sklearn import svm
import scipy.stats # for RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, train_test_split # Parameter selection

# Other libraries
import time # Execution time of some blocks
import statistics

# Import our own defined functions
from xlm_parsers_functions import *
from drug_interaction_functions import *
from drug_functions import *

## Objectives of this part
In this second part of the project, we will focus on two different things: 
1. Detection of interactions between drugs
2. Classification of each drug-drug interaction according to one of the following types:
    - Advice: 'Interactions may be expected, and Uroxatral should not be used in combination with other alpha-blockers.'
    - Effect: 'In uninfected volunteers, 46% developed rash while receiving Sustiva and Clarithromycin.'
    - Mechanism: 'Grepafloxacin is a competitive inhibitor of the metabolism of theophylline'.
    - Int: 'The interaction of omeprazole and ketoconazole has been established.'

## Parsing the XML Files

### DrugBank and MedLine files

In [2]:
# Directories holding the DrugBank training and test corpora
train_data_dir = 'data/medium_train_DrugBank/'
test_data_dir = 'data/medium_test_DrugBank/'

# Read the training data from the specified directory
# (readTrainingData is a project helper brought in by the star imports above)
DrugBank_df = readTrainingData(train_data_dir)

# Select the initial columns from which we will compute the features for each row.
# .copy() makes train_df an independent frame rather than a slice of DrugBank_df,
# so later column additions / replacements cannot raise SettingWithCopyWarning
# or leak back into the raw frame.
feature_source_columns = ['sentence_text', 'e1_name', 'e2_name', 'list_entities', 'interaction']
train_df = DrugBank_df[feature_source_columns].copy()

## Creation of features
Before training our model, we need to come up with features to help us determine whether there is a relationship between the two drugs or not.

Some ideas for features are the following:
- Does the sentence contain a modal verb (should, must,...) between the two entities?
- Word bigrams: This is a binary feature for all word bigrams that appeared more than once in the corpus, indicating the presence or absence of each such bigram in the sentence
- Number of words between a pair of drugs
- Number of drugs between a pair of drugs
- POS of words between a pair of drugs: This is a binary feature for word POS tags obtained from POS tagging, and indicates the presence or absence of each POS between the two main drugs.
- Path between a pair of drugs: Path between two main drugs in the parse tree is another feature in our system. Because syntactic paths are in general a sparse feature, we reduced the sparsity by collapsing identical adjacent non-terminal labels. E.g., NP-S-VP-VP-NP is converted to NP-S-VP-NP. This technique decreased the number of paths by 24.8%.

In [3]:
# Add the engineered feature columns to every candidate drug pair
# (createFeatures is a project helper; see the preview below for the columns it adds)
train_df = createFeatures(train_df)

# Preview the first five rows, then show the dtype of every column
display(train_df.head(n=5))
train_df.dtypes


Unnamed: 0,sentence_text,e1_name,e2_name,list_entities,interaction,n_modal_verbs_bw_entities,n_tokens_bw_entities,n_entities_bw_entities
0,"Milk, milk products, and calcium-rich foods or...",calcium,EMCYT,"[calcium, EMCYT]",True,1,11,0
1,Interaction with central nervous system depres...,central nervous system depressants,benzodiazepines,"[central nervous system depressants, benzodiaz...",False,0,4,0
2,"however, no deleterious interactions were seen...",ROMAZICON,narcotics,"[ROMAZICON, narcotics, anesthetics, muscle rel...",False,0,5,0
3,"however, no deleterious interactions were seen...",ROMAZICON,anesthetics,"[ROMAZICON, narcotics, anesthetics, muscle rel...",False,0,8,1
4,"however, no deleterious interactions were seen...",ROMAZICON,muscle relaxants,"[ROMAZICON, narcotics, anesthetics, muscle rel...",False,0,10,2


sentence_text                object
e1_name                      object
e2_name                      object
list_entities                object
interaction                  object
n_modal_verbs_bw_entities     int64
n_tokens_bw_entities          int64
n_entities_bw_entities        int64
dtype: object

In [4]:
# Sanity check: list every row that has at least one missing value
# (an empty result means the frame is complete)
has_missing_value = train_df.isna().any(axis='columns')
train_df.loc[has_missing_value]

Unnamed: 0,sentence_text,e1_name,e2_name,list_entities,interaction,n_modal_verbs_bw_entities,n_tokens_bw_entities,n_entities_bw_entities


### Categorical variables preprocessing
As we are working with scikit-learn's `sklearn.svm` module, the output variable ('interaction') must be a binary variable encoded as 0s and 1s. For this purpose, we will use the pandas `DataFrame.replace` method.

In [5]:
# Map the textual labels of the target column to the 0/1 encoding used for training
new_encoding = {'interaction': {'true': 1, 'false': 0}}

# Rebind instead of using inplace=True: the replacement happens on a fresh frame,
# which avoids SettingWithCopyWarning if train_df is derived from a slice and
# keeps this cell safe to re-run.
train_df = train_df.replace(new_encoding)
train_df.head()

Unnamed: 0,sentence_text,e1_name,e2_name,list_entities,interaction,n_modal_verbs_bw_entities,n_tokens_bw_entities,n_entities_bw_entities
0,"Milk, milk products, and calcium-rich foods or...",calcium,EMCYT,"[calcium, EMCYT]",1,1,11,0
1,Interaction with central nervous system depres...,central nervous system depressants,benzodiazepines,"[central nervous system depressants, benzodiaz...",0,0,4,0
2,"however, no deleterious interactions were seen...",ROMAZICON,narcotics,"[ROMAZICON, narcotics, anesthetics, muscle rel...",0,0,5,0
3,"however, no deleterious interactions were seen...",ROMAZICON,anesthetics,"[ROMAZICON, narcotics, anesthetics, muscle rel...",0,0,8,1
4,"however, no deleterious interactions were seen...",ROMAZICON,muscle relaxants,"[ROMAZICON, narcotics, anesthetics, muscle rel...",0,0,10,2


## Building the classifier - SVM

### Creation of the training, validation and testing datasets

In [6]:
# Column names used throughout the modelling cells
target_name = 'interaction'
sentence_name = 'sentence_text'
list_entities_name = 'list_entities'
ent_1_name = 'e1_name'
ent_2_name = 'e2_name'
# Text columns kept in X for inspection but dropped before fitting/predicting
var_not_incl = ['sentence_text', 'e1_name', 'e2_name']

# Build the structures handed to the SVM from 100% of the data; the split into
# training, validation and testing sets happens in the following steps.
# X keeps every column except the target and the list of entities.
excluded_columns = [target_name, list_entities_name]
keep_column = ~train_df.columns.isin(excluded_columns)
X = train_df.loc[:, keep_column]
Y = train_df[target_name]

Once we have built the dataset containing 100% of the data, we split it into training, validation and testing sets. We first hold out 20% of the rows for testing and then take 20% of the remaining rows for validation, which yields proportions of roughly (64, 16, 20).

In [7]:
# Fixed seed so both splits are reproducible
seed = 16273

# First split: hold out 20% of all rows as the test set
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
)

# Second split: take 20% of the remaining rows (16% of the total) for validation
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
)

In [8]:
# Sizes of the three partitions, in (rows, columns) form
print(X_train.shape, X_val.shape, X_test.shape)

# Preview of the full feature frame and of the target series
display(X.head(), Y.head())

(8318, 6) (2080, 6) (2600, 6)


Unnamed: 0,sentence_text,e1_name,e2_name,n_modal_verbs_bw_entities,n_tokens_bw_entities,n_entities_bw_entities
0,"Milk, milk products, and calcium-rich foods or...",calcium,EMCYT,1,11,0
1,Interaction with central nervous system depres...,central nervous system depressants,benzodiazepines,0,4,0
2,"however, no deleterious interactions were seen...",ROMAZICON,narcotics,0,5,0
3,"however, no deleterious interactions were seen...",ROMAZICON,anesthetics,0,8,1
4,"however, no deleterious interactions were seen...",ROMAZICON,muscle relaxants,0,10,2


0    1
1    0
2    0
3    0
4    0
Name: interaction, dtype: int64

## Model selection

In [9]:
# Base SVM estimator whose hyper-parameters will be tuned
svc = svm.SVC()

# Search space for RandomizedSearchCV: C and gamma are drawn from exponential
# distributions, the kernel is fixed to RBF, and class weighting is toggled.
param_distributions = {
    'C': scipy.stats.expon(scale=100),
    'gamma': scipy.stats.expon(scale=.1),
    'kernel': ['rbf'],
    'class_weight': ['balanced', None],
}

# Look for the best parameters of the SVM model with a randomized search
# (40 sampled candidates, all CPU cores). random_state pins the sampled
# candidates so that a Restart & Run All selects the same estimator.
# `seed` is defined in the train/validation/test split cell.
# NOTE(review): the search uses the estimator's default score while the model is
# later judged on precision/recall/F1 -- consider passing scoring='f1'.
start = time.time()
clf = RandomizedSearchCV(
    svc,
    param_distributions,
    n_iter=40,
    n_jobs=-1,
    random_state=seed,
)
# Only the numeric feature columns are used; the text columns are dropped
clf.fit(X_val.drop(var_not_incl, axis = 1), Y_val)
end = time.time()
print('Validating time of the SVM: ', str(end - start),'\n')

print('Best estimator: ', clf.best_estimator_)

Validating time of the SVM:  16.717604875564575 

Best estimator:  SVC(C=86.786214441268555, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.016986834489730408,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)


In [10]:
# Fit the estimator chosen by the hyper-parameter search on the training split
start = time.time()
model = clf.best_estimator_
# The text columns listed in var_not_incl are not features, so drop them first
model.fit(X_train.drop(columns=var_not_incl), Y_train)
end = time.time()

elapsed_seconds = end - start
print('Training time of the SVM: ', str(elapsed_seconds))

Training time of the SVM:  25.020905017852783


In [11]:
# Predict on the held-out test split, using the numeric feature columns only
X_test_features = X_test.drop(columns=var_not_incl)
pred = model.predict(X_test_features)

# Ground-truth labels as a NumPy array, to compare against the predictions
true = np.array(Y_test)
print(pred, true)

[0 0 0 ..., 0 0 0] [0 1 0 ..., 0 1 0]


In [18]:
# Report each metric as a percentage rounded to one decimal place
# (computePrecision / computeRecall / computeF1 are project helpers)
for metric_label, metric_fn in (
    ('Precision: ', computePrecision),
    ('Recall: ', computeRecall),
    ('F1: ', computeF1),
):
    print(metric_label, round(metric_fn(true=true, pred=pred)*100, 1))

Precision:  65.5
Recall:  18.8
F1:  29.2


In [19]:
# Attach the ground truth and the predictions to the test rows for inspection.
# assign() builds a new frame and X_test is rebound to it, instead of writing
# columns into the frame returned by train_test_split, which can trigger
# pandas' SettingWithCopyWarning.
# NOTE: once this cell has run, X_test carries two extra columns, so the
# prediction cell must not be re-run without re-creating the splits first.
X_test = X_test.assign(real_interaction=true, pred_interaction=pred)

In [21]:
# Show the test rows that the model labelled as an interaction
predicted_positive = X_test['pred_interaction'].eq(1)
X_test.loc[predicted_positive]

Unnamed: 0,sentence_text,e1_name,e2_name,n_modal_verbs_bw_entities,n_tokens_bw_entities,n_entities_bw_entities,real_interaction,pred_interaction
11541,"As with other antihypertensive agents, the ant...",antihypertensive agents,non-steroidal anti-inflammatory drug,1,12,1,0,1
6023,"FLEXERIL may enhance the effects of alcohol, b...",FLEXERIL,alcohol,1,7,0,1,1
10730,"Furosemide: Clinical studies, as well as rando...",ibuprofen,thiazides,1,10,2,1,1
2323,Nicotine: Nicotine may provoke vasoconstrictio...,Nicotine,ergot,1,17,1,1,1
981,"Hepatic Enzyme Inducers, Inhibitors and Substr...",carbamazepine,corticosteroid,1,9,2,0,1
4225,Aspirin: Concurrent administration of aspirin ...,Aspirin,meclofenamate sodium,1,8,1,0,1
5057,"When taken orally , imidazole compounds like k...",imidazole compounds,coumarin,1,10,1,1,1
7008,Ethoxzolamide may increase the action of tricy...,Ethoxzolamide,quinidine,1,14,3,1,1
7516,Non-nucleoside reverse transcriptase inhibitor...,NNRTIs,combination hormonal contraceptives,1,9,1,0,1
4457,The use of dextromethorphan hydrobromide may r...,dextromethorphan hydrobromide,alcohol,1,12,0,1,1
