# AHLT - MIRI
# Drug Interaction Classifier

In [1]:
# Data processing libraries
import pandas as pd
import numpy as np

# NLP libraries
import nltk
from nltk.tag import StanfordPOSTagger

# Machine Learning Libraries
from IPython.display import display # For displaying DataFrames correctly in Jupyter
from sklearn import svm
import scipy.stats # for RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV,KFold # Parameter selection

# Other libraries
import time # Execution time of some blocks
import statistics

# Import our own defined functions
from xlm_parsers_functions import *
from drug_interaction_functions import *

## Objectives of this part
In this second part of the project, we will focus on two different things: 
1. Detection of interactions between drugs
2. Classification of each drug-drug interaction according to one of the following types:
    - Advice: 'Interactions may be expected, and Uroxatral should not be used in combination with other alpha-blockers.'
    - Effect: 'In uninfected volunteers, 46% developed rash while receiving Sustiva and Clarithromycin.'
    - Mechanism: 'Grepafloxacin is a competitive inhibitor of the metabolism of theophylline.'
    - Int: 'The interaction of omeprazole and ketoconazole has been established.'

## Parsing the XML Files

### DrugBank and MedLine files

In [2]:
# Directories holding the DrugBank training and test files
train_data_dir = 'data/small_train_DrugBank/'
test_data_dir = 'data/small_test_DrugBank/'

# Parse the training directory with the project helper readTrainingData
# (presumably returns a pandas DataFrame with one row per entity pair -- confirm)
DrugBank_df = readTrainingData(train_data_dir)

# Keep only the columns the features are computed from, plus the target.
# The explicit .copy() makes train_df an independent frame, so columns added to
# it later cannot raise pandas' SettingWithCopyWarning or be mistaken for
# writes into DrugBank_df.
train_df = DrugBank_df[['sentence_text', 'e1_name', 'e2_name', 'list_entities', 'interaction']].copy()

## Creation of features
Before training our model, we need to come up with features to help us determine whether there is a relationship between the two drugs or not.

Some ideas for features are the following:
- Does the sentence contain a modal verb (should, must,...) between the two entities?
- Word bigrams: This is a binary feature for all word bigrams that appeared more than once in the corpus, indicating the presence or absence of each such bigram in the sentence
- Number of words between a pair of drugs
- Number of drugs between a pair of drugs
- POS of words between a pair of drugs: This is a binary feature for word POS tags obtained from POS tagging, and indicates the presence or absence of each POS between the two main drugs.
- Path between a pair of drugs: The path between the two main drugs in the parse tree is another feature in our system. Because syntactic paths are in general a sparse feature, we reduce the sparsity by collapsing identical adjacent non-terminal labels; e.g., NP-S-VP-VP-NP is converted to NP-S-VP-NP. This technique decreased the number of paths by 24.8%.

In [3]:
# Replace train_df with the frame returned by the project helper createFeatures
# (expected to carry the engineered feature columns).
train_df = createFeatures(train_df)

# Preview the first five rows of the resulting frame
train_df.head(n=5)

Unnamed: 0,sentence_text,e1_name,e2_name,list_entities,interaction,n_modal_verbs_bw_entities,n_tokens_bw_entities,n_entities_bw_entities
0,Population pharmacokinetic analyses revealed t...,MTX,NSAIDs,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",False,0,2,0
1,Population pharmacokinetic analyses revealed t...,MTX,corticosteroids,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",False,0,4,1
2,Population pharmacokinetic analyses revealed t...,MTX,TNF blocking agents,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",False,0,7,2
3,Population pharmacokinetic analyses revealed t...,MTX,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",False,0,13,3
4,Population pharmacokinetic analyses revealed t...,NSAIDs,corticosteroids,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",False,0,2,0


## Building the classifier - SVM

### Creation of the training, validation and testing datasets

In [None]:
# Names of the target column and of the raw (non-feature) columns
target_name = 'interaction'
sentence_name = 'sentence_text'
list_entities_name = 'list_entities'
ent_1_name = 'e1_name'
ent_2_name = 'e2_name'

# Columns that must not be passed to the SVM: the target itself plus the raw
# sentence/entity columns.
non_feature_columns = [sentence_name, target_name, list_entities_name,
                       ent_1_name, ent_2_name]

# X keeps every remaining column. A boolean mask over the column index is used
# (instead of DataFrame.drop) so that a name missing from the frame is simply
# ignored, exactly as the previous element-wise comparison did.
X_train = train_df.loc[:, ~train_df.columns.isin(non_feature_columns)]
Y_train = train_df[target_name]

In [None]:
# Show the first rows of the feature matrix, then of the target vector
for preview in (X_train, Y_train):
    display(preview.head())

We will create a validation dataset by randomly sampling 30% of the training data; it is used below to select the hyperparameters of the SVM.

In [None]:
# Draw a 30% random sample of the training rows. random_state pins the sample
# so that the hyperparameter search sees the same rows on every run.
# NOTE: sampling does not remove these rows from train_df, so they are also
# part of the data the final model is trained on.
validation_df = train_df.sample(frac=0.3, random_state=42)

# Same feature/target split as for the training data: every column except the
# target and the raw sentence/entity columns is a feature.
is_feature_column = ~validation_df.columns.isin(
    [sentence_name, target_name, list_entities_name, ent_1_name, ent_2_name])
X_val = validation_df.loc[:, is_feature_column]
Y_val = validation_df[target_name]

## Model selection

In [None]:
# Base SVM with default settings; its hyperparameters are chosen by the search below
svc = svm.SVC()

# Search space: C and gamma are drawn from exponential distributions, the
# kernel is fixed to RBF, and class weighting is either balanced or disabled.
param_distributions = {
    'C': scipy.stats.expon(scale=100),
    'gamma': scipy.stats.expon(scale=.1),
    'kernel': ['rbf'],
    'class_weight': ['balanced', None],
}

# Look for the best parameters with a randomized search (40 sampled candidates,
# all available cores). random_state fixes the sampled candidates so that
# reruns evaluate the same configurations.
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
clf = RandomizedSearchCV(svc, param_distributions, n_iter=40, n_jobs=-1,
                         random_state=42)
clf.fit(X_val, Y_val)
end = time.perf_counter()
print('Validating time of the SVM: ', str(end - start),'\n')

print('Best estimator: ', clf.best_estimator_)

In [None]:
from sklearn.base import clone

# Train the SVM on the full training set with the parameters selected before.
# clone() builds a fresh, unfitted estimator with the same parameters, so the
# fit below does not overwrite the fitted clf.best_estimator_ in place.
start = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
model = clone(clf.best_estimator_)
model.fit(X_train, Y_train)
end = time.perf_counter()
print('Training time of the SVM: ', str(end - start))