# AHLT - MIRI
# Drugs Interaction Classifier

In [1]:
import nltk # NLTK Library
import xml.etree.ElementTree as ET # ElementTree Library
import os
import pandas as pd
import numpy as np
from xlm_parsers_functions import *
from drug_interaction_functions import *

## TODO list
# Train the first ML model that will identify the drug names from a sentence
# Build the necessary data structure that will hold the predictors and the response variable
# Train ML models with that data


## Objectives of this part
In this second part of the project, we will focus on two different things: 
1. Detection of interactions between drugs
2. Classification of each drug-drug interaction according to one of the following types:
    - Advice: 'Interactions may be expected, and Uroxatral should not be used in combination with other alpha-blockers.'
    - Effect: 'In uninfected volunteers, 46% developed rash while receiving Sustiva and Clarithromycin.'
    - Mechanism: 'Grepafloxacin is a competitive inhibitor of the metabolism of theophylline'.
    - Int: The interaction of omeprazole and ketoconazole has been stablished.

## Parsing the XML Files

### DrugBank and MedLine files

In [2]:
# Use xlm_element.tag to get the name of the xlm element
# Use xlm_element.attrib to get all the attributes of the xlm element as a string

# Give the headers name for the final dataset
headers = ['sentence_id', 'sentence_text', 'e1_id', 'e1_name', 'e1_type', 'e2_id', 'e2_name', 'e2_type', 'list_entities', 'interaction']

# Parse the DrugBank Files
drugs_dataset = []
parent_directory = '../LaboCase/small_train_DrugBank/'
for filename in os.listdir(parent_directory):
    if filename.endswith(".xml"):
        # Parse the file
        tree = ET.parse(parent_directory + filename)
        # Create a list of lists with the interactions of the file
        drugs_dataset = drugs_dataset + listDDIFromXML(tree.getroot())


DrugBank_df = pd.DataFrame(drugs_dataset, columns=headers)

In [10]:
DrugBank_df[DrugBank_df['e1_name'] == 'TNF blocking agents']

Unnamed: 0,sentence_id,sentence_text,e1_id,e1_name,e1_type,e2_id,e2_name,e2_type,list_entities,interaction
9,DDI-DrugBank.d297.s1,Population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.e3,TNF blocking agents,group,DDI-DrugBank.d297.s1.e4,abatacept,drug,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",False
48,DDI-DrugBank.d297.s2,The majority of patients in RA clinical studie...,DDI-DrugBank.d297.s2.e4,TNF blocking agents,group,DDI-DrugBank.d297.s2.e5,azathioprine,drug,"[ORENCIA, MTX, NSAIDs, corticosteroids, TNF bl...",False
49,DDI-DrugBank.d297.s2,The majority of patients in RA clinical studie...,DDI-DrugBank.d297.s2.e4,TNF blocking agents,group,DDI-DrugBank.d297.s2.e6,chloroquine,drug,"[ORENCIA, MTX, NSAIDs, corticosteroids, TNF bl...",False
50,DDI-DrugBank.d297.s2,The majority of patients in RA clinical studie...,DDI-DrugBank.d297.s2.e4,TNF blocking agents,group,DDI-DrugBank.d297.s2.e7,gold,drug,"[ORENCIA, MTX, NSAIDs, corticosteroids, TNF bl...",False
51,DDI-DrugBank.d297.s2,The majority of patients in RA clinical studie...,DDI-DrugBank.d297.s2.e4,TNF blocking agents,group,DDI-DrugBank.d297.s2.e8,hydroxychloroquine,drug,"[ORENCIA, MTX, NSAIDs, corticosteroids, TNF bl...",False
52,DDI-DrugBank.d297.s2,The majority of patients in RA clinical studie...,DDI-DrugBank.d297.s2.e4,TNF blocking agents,group,DDI-DrugBank.d297.s2.e9,leflunomide,drug,"[ORENCIA, MTX, NSAIDs, corticosteroids, TNF bl...",False
53,DDI-DrugBank.d297.s2,The majority of patients in RA clinical studie...,DDI-DrugBank.d297.s2.e4,TNF blocking agents,group,DDI-DrugBank.d297.s2.e10,sulfasalazine,drug,"[ORENCIA, MTX, NSAIDs, corticosteroids, TNF bl...",False
54,DDI-DrugBank.d297.s2,The majority of patients in RA clinical studie...,DDI-DrugBank.d297.s2.e4,TNF blocking agents,group,DDI-DrugBank.d297.s2.e11,anakinra,drug,"[ORENCIA, MTX, NSAIDs, corticosteroids, TNF bl...",False


In [14]:
train_df = DrugBank_df[['sentence_text', 'e1_name', 'e2_name', 'list_entities', 'interaction']]

In [16]:
train_df

Unnamed: 0,sentence_text,e1_name,e2_name,list_entities,interaction,n_modal_verbs_bw_entities,n_tokens_bw_entities
0,Population pharmacokinetic analyses revealed t...,MTX,NSAIDs,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,2
1,Population pharmacokinetic analyses revealed t...,MTX,corticosteroids,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,4
2,Population pharmacokinetic analyses revealed t...,MTX,TNF blocking agents,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,7
3,Population pharmacokinetic analyses revealed t...,MTX,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,13
4,Population pharmacokinetic analyses revealed t...,NSAIDs,corticosteroids,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,2
5,Population pharmacokinetic analyses revealed t...,NSAIDs,TNF blocking agents,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,5
6,Population pharmacokinetic analyses revealed t...,NSAIDs,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,11
7,Population pharmacokinetic analyses revealed t...,corticosteroids,TNF blocking agents,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,3
8,Population pharmacokinetic analyses revealed t...,corticosteroids,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,9
9,Population pharmacokinetic analyses revealed t...,TNF blocking agents,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,5


## Feature creation

In [17]:
train_df['n_modal_verbs_bw_entities'] = train_df.apply(
    lambda row: countModalVerbsBetweenEntities(
        sentence=row['sentence_text'],
        ent1=row['e1_name'],
        ent2=row['e2_name']),
    axis=1)

train_df['n_tokens_bw_entities'] = train_df.apply(
    lambda row: countTokensBetweenEntities(
        sentence=row['sentence_text'],
        ent1=row['e1_name'],
        ent2=row['e2_name']),
    axis = 1)

train_df['n_entities_bw_entities'] = train_df.apply(
    lambda row: countEntitiesBetweenEntities(
        sentence=row['sentence_text'],
        ent1=row['e1_name'],
        ent2=row['e2_name'],
        entities_list = row['list_entities']),
    axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
train_df

Unnamed: 0,sentence_text,e1_name,e2_name,list_entities,interaction,n_modal_verbs_bw_entities,n_tokens_bw_entities,n_entities_bw_entities
0,Population pharmacokinetic analyses revealed t...,MTX,NSAIDs,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,2,0
1,Population pharmacokinetic analyses revealed t...,MTX,corticosteroids,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,4,1
2,Population pharmacokinetic analyses revealed t...,MTX,TNF blocking agents,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,7,2
3,Population pharmacokinetic analyses revealed t...,MTX,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,13,3
4,Population pharmacokinetic analyses revealed t...,NSAIDs,corticosteroids,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,2,0
5,Population pharmacokinetic analyses revealed t...,NSAIDs,TNF blocking agents,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,5,1
6,Population pharmacokinetic analyses revealed t...,NSAIDs,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,11,2
7,Population pharmacokinetic analyses revealed t...,corticosteroids,TNF blocking agents,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,3,0
8,Population pharmacokinetic analyses revealed t...,corticosteroids,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,9,1
9,Population pharmacokinetic analyses revealed t...,TNF blocking agents,abatacept,"[MTX, NSAIDs, corticosteroids, TNF blocking ag...",false,0,5,0


In [19]:
train_df['sentence_text'][1]

'Population pharmacokinetic analyses revealed that MTX, NSAIDs, corticosteroids, and TNF blocking agents did not influence abatacept clearance.'

## Creation of features
Before training our model, we need to come up with features to help us determine whether there is a relationship between the two drugs or not.

Some ideas for features are the following:
- Does the sentence contain a modal verb (should, must,...) between the two entities?
- Word bigrams: This is a binary feature for all word bigrams that appeared more than once in the corpus, indicating the presence or absence of each such bigram in the sentence
- Number of words between a pair of drugs
- Number of drugs between a pair of drugs
- POS of words between a pair of drugs: This is a binary feature for word POS tags obtained from POS tagging, and indicates the presence or absence of each POS between the two main drugs.
- Path between a pair of drugs: Path between two main drugs in the parse tree is another feature in our system. Because syntactic paths are in general a sparse feature, we reduced the sparsity by collapsing identical adjacent non-terminal labels. E.g., NP-S-VP-VP-NP is converted to NP-S-VP-NP. This technique decreased the number of paths by 24.8%.

In [None]:
?pd.DataFrame.apply

In [None]:
'hola' in 'hola que'

In [None]:
a = ['Hola', 'que', 'TAL']
a[0].lower()

In [None]:
a

In [None]:
a = 'HOLA'

In [None]:
a.lower()

In [None]:
a