In [1]:


import xml.etree.ElementTree as ET
import nltk
from nltk.parse.corenlp import CoreNLPDependencyParser

import os
import sys
import re


In [2]:
#### FUNCTIONS ####

def analyze(s):
    mytree = my_parser.raw_parse(s)
    last_offset_end = 0
    for head_node in mytree:
        print(list(head_node.nodes.viewkeys())[1:])
        for key in sorted(head_node.nodes, key=lambda key: int(key)):
            # first key is not first word (is root)
            if key == 0:
                continue
            # find first occurrence of substring token in sentence
            word = head_node.nodes[key]['word']
            offset_start = s.find(word, last_offset_end)
            offset_end = offset_start + len(word) - 1  # -1 as length 1 is same start and end
            # store last offsets
            last_offset_end = offset_end
            # add start and end to the token
            head_node.nodes[key]['start'] = offset_start
            head_node.nodes[key]['end'] = offset_end

    return head_node

# Returns the key corresponding to the entity in the analysis dependency graph
def get_entity_node_key(entity, analysis):
    for key in sorted(analysis.nodes, key=lambda key: int(key)):
        try:
            if analysis.nodes[key]['start'] == int(entity[0]):
                return key
        except KeyError:
            pass
    return 0

# Checks if the input word (belonging to a node) is a parent of the input parent
def isNodeInParent(parent, word):
    for node in parent:
        if type(node) is nltk.Tree:
            if node.label() == word:
                return True
            if isNodeInParent(node, word):
                return True
        else:
            if node == word:
                return True
    return False


# receives DependencyGraph with all sentence, list of entities and the ids of the 2 entities to be checked
def check_interaction(analysis, entities, e1, e2):
    #### RULE VARIABLES ####
    effect_clue_words = {"administer", "potentiate", "prevent", "antagonize", "antagonized"}
    mechanism_clue_words = {"reduce", "increase", "decrease"}
    int_clue_words = {"interact", "interaction"}
    advise_clue_words = {"may", "might", "should"}

    # results and interaction type, 0 and null unless we find some
    result = "0"
    interaction = "null"

    # key of the node of entity 1 and entity 2
    e1_node_key = get_entity_node_key(entities[e1], analysis)
    e2_node_key = get_entity_node_key(entities[e2], analysis)
    # get the corresponding word (token) of each entity from the dependency graph
    e1_word = analysis.nodes[e1_node_key]['word']
    e2_word = analysis.nodes[e2_node_key]['word']

    # if not found assume no interaction
    if e1_node_key == 0 or e2_node_key == 0:
        return result, interaction

    # get tree of dependency graph, will be used to check hierarchy (words under some others)
    tree = analysis.tree()

    # Iterate through all nodes in the graph
    for key in sorted(analysis.nodes, key=lambda key: int(key)):
        try:
            # get current word of current node
            current_word = analysis.nodes[key]['word']

            # Check if interaction is advise
            if current_word in advise_clue_words:
                next_word = analysis.nodes[key + 1]['word']
                if next_word == "not" and analysis.nodes[key + 2]['word'] == "be" \
                        and analysis.nodes[key + 3]['tag'] == "VBN" \
                        or next_word == "be" and analysis.nodes[key + 2]['tag'] == "VBN":
                    result = "1"
                    interaction = "advise"

            # Check if interaction is int
            elif current_word in int_clue_words:
                for subtree in tree.subtrees():
                    if subtree.label() in int_clue_words:
                        if isNodeInParent(subtree, e1_word) and isNodeInParent(subtree, e2_word):
                            result = "1"
                            interaction = "int"

            # Check if interaction is effect
            elif current_word in effect_clue_words:
                next_word = analysis.nodes[key + 1]['word']
                # Explicit observed structure
                if current_word == "antagonize" and next_word == "the":
                    if analysis.nodes[key+2]['word'] == e1_word or analysis.nodes[key+2]['word'] == e2_word:
                        result = "1"
                        interaction = "effect"
                elif current_word == "antagonized" and next_word == "by":
                    if analysis.nodes[key + 2]['word'] == e1_word or analysis.nodes[key + 2]['word'] == e2_word:
                        result = "1"
                        interaction = "effect"
                # Generic structure (entities are childs of clue word)
                else:
                    for subtree in tree.subtrees():
                        if subtree.label() in effect_clue_words:
                            if isNodeInParent(subtree, e1_word) and isNodeInParent(subtree, e2_word):
                                result = "1"
                                interaction = "effect"

            # Check if interaction is mechanism
            elif current_word in mechanism_clue_words:
                for subtree in tree.subtrees():
                    if subtree.label() in mechanism_clue_words:
                        if isNodeInParent(subtree, e1_word) and isNodeInParent(subtree, e2_word):
                            result = "1"
                            interaction = "mechanism"

        # If there's a KeyError, which can happen if there's extra non-numeric keys at the end of the tree (we are not \
        # interested in them, pass
        except KeyError:
            pass

    return result, interaction


# receives data dir and filename for the results to evaluate
def evaluate(inputdir, outputfile):
    os.system("java -jar eval/evaluateDDI.jar " + inputdir + " " + outputfile)

In [None]:

#### VARIABLES ####
inputdir = "data/Devel"
outputfilename = "./task9.2_TrainGianMarc_1.txt"
outputfile = open(outputfilename, "w")

# connect to CoreNLP server
my_parser = CoreNLPDependencyParser(url="http://localhost:9000")

#### MAIN ####

# TODO mirar ordre!

# process each file in directory
for filename in os.listdir(inputdir):
    # parse XML file, obtaining a DOM tree
    file_path = os.path.join(inputdir, filename)
    tree = ET.parse(file_path)
    sentences = tree.findall("sentence")

    for sentence in sentences:
        (sid, stext) = (sentence.attrib["id"], sentence.attrib["text"])

        # load sentence entities into a dictionary
        entities = {}
        ents = sentence.findall("entity")
        for e in ents:
            ent_id = e.attrib["id"]
            offs = e.attrib["charOffset"].split("-")
            entities[ent_id] = offs

        # Tokenize, tag, and parse sentence
        if not stext:
            continue
        analysis = analyze(stext)
        # for each pair in the sentence, decide whether it is DDI and its type
        pairs = sentence.findall("pair")
        for pair in pairs:
            id_e1 = pair.attrib["e1"]
            id_e2 = pair.attrib["e2"]
            (is_ddi, ddi_type) = check_interaction(analysis, entities, id_e1, id_e2)
            line = "|".join([sid, id_e1, id_e2, is_ddi, ddi_type])
            outputfile.write(line + "\n")

evaluate(inputdir, outputfilename)
outputfile.close()


In [23]:
my_parser = CoreNLPDependencyParser(url="http://localhost:9000")

s  = "Hello how are you paracetamol?"
mytree,  = my_parser.raw_parse(s)

In [24]:
tree = mytree.tree()

In [1]:
import os
import pycrfsuite
import argparse
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer


In [2]:
train_filename = "megam.dat"
test_filename = "megam.dat"

In [3]:
output_filename = "predicted.txt"
model_filename = "model.crfsuite"

In [4]:
train_samples = open(train_filename, "r").read().split("\n")
test_samples = open(test_filename, "r").read().split("\n")

In [49]:
def split_data(features):
    """
    Receive a list of featured sentences splitted by words and split it into samples and labels.

    Parameters:
    word_features(list): List of feature words. There is an empty element between sentences in order to split each one.

    Returns:
    X_samples(list): List of feature words missing the label of the word.
    Y_labels(list): List of labels for each word.
    """

    X_samples = []
    Y_labels = []

    for feat in features:
        feat = feat.split(" ")
        Y_labels.append(feat[0])
        X_samples.append(feat[1:])

    return X_samples, Y_labels

In [50]:
X_train, Y_train = split_data(train_samples)
X_test, Y_test = split_data(test_samples)

In [51]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, Y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.05,  # coefficient for L1 penalty
    'c2': 0.1,  # coefficient for L2 penalty 1e-1 0.61
    'max_iterations': 10000,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

trainer.train(model_filename)

tagger = pycrfsuite.Tagger()
tagger.open(model_filename)

Y_pred = [tagger.tag(xseq) for xseq in X_test]


ValueError: The numbers of items and labels differ: |x| = 1, |y| = 4

In [55]:
class Entity:                                                                                                                                                                           
                                                                                                                                                                                         
    def __init__(self, **kwargs):                                                                                                                                                       
        self.word = kwargs["text"]                                                                                                                                                      
        self.offset_from, self.offset_to = self.parse_offset(kwargs["charOffset"])                                                                                                      
        self.id = kwargs["id"]                                                                                                                                                          

    def parse_offset(self, offset):                                                                                                                                                     

        # offset can be given in two ways                                                                                                                                               
        # e.g.:                                                                                                                                                                         
        #       * 9-23                                                                                                                                                                  
        #       * 9-11;12-20;21-23                                                                                                                                                      
        #                                                                                                                                                                               
        # We differenciate both cases and always save the first one and the last one                                                                                                    

        if ";" in offset:                                                                                                                                                               
            offset = offset.split(";")                                                                                                                                                  
            offset_from = offset[0].split('-')[0]                                                                                                                                       
            offset_to = offset[-1].split('-')[1]                                                                                                                                        
        else:                                                                                                                                                                           
            offset_from, offset_to = offset.split('-')                                                                                                                                  

        return int(offset_from), int(offset_to)           

In [61]:
my_parser = CoreNLPDependencyParser(url="http://localhost:9000")
mytree,  = my_parser.raw_parse(s)

In [65]:
analysis = analyze(s)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]


In [None]:
Caution should be exercised when combining resorcinol or salicylic acid with DIFFERIN Gel

In [74]:
s = "Caution should be exercised when combining resorcinol or salicylic acid with DIFFERIN Gel"
e1 = "resorcinol"
e2 = "salicylic acid"
entities = [Entity(**{"text": e1, "charOffset": "21-30", "id": "DDI-DrugBank.d200.s0.e0"}), Entity(**{"text": e2, "charOffset": "37-45", "id": "DDI-DrugBank.d200.s0.e1"})]

In [75]:
analysis = analyze(s)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


In [78]:
analysis.nodes

defaultdict(<function nltk.parse.dependencygraph.<lambda>>,
            {0: {u'address': 0,
              u'ctag': u'TOP',
              u'deps': defaultdict(list, {u'ROOT': [4]}),
              u'feats': None,
              u'head': None,
              u'lemma': None,
              u'rel': None,
              u'tag': u'TOP',
              u'word': None},
             1: {u'address': 1,
              u'ctag': u'NN',
              u'deps': defaultdict(list, {}),
              'end': 6,
              u'feats': u'_',
              u'head': 4,
              u'lemma': u'caution',
              u'rel': u'nsubjpass',
              'start': 0,
              u'tag': u'NN',
              u'word': u'Caution'},
             2: {u'address': 2,
              u'ctag': u'MD',
              u'deps': defaultdict(list, {}),
              'end': 13,
              u'feats': u'_',
              u'head': 4,
              u'lemma': u'should',
              u'rel': u'aux',
              'start': 8,
           

In [71]:
print(analysis.to_conll(4))

Co-administration	NN	8	nsubjpass
of	IN	3	case
probenecid	NN	1	nmod
with	IN	5	case
acyclovir	NN	1	nmod
has	VBZ	8	aux
been	VBN	8	auxpass
shown	VBN	0	ROOT
to	TO	10	mark
increase	VB	8	xcomp
the	DT	13	det
mean	NN	13	compound
half-life	NN	10	dobj
and	CC	13	cc
the	DT	16	det
area	NN	13	conj
under	IN	20	case
the	DT	20	det
concentration-time	JJ	20	amod
curve	NN	10	nmod
.	.	8	punct

