In [3]:
import xml.etree.ElementTree as ET
import re
import spacy
import scispacy
import spacy_transformers
import numpy as np
import networkx as nx

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, precision_score, recall_score
from nltk.tag import pos_tag
from spacy import displacy

In [4]:
def stringReplace(object):
    temp = object.group(0)
    temp = temp[0] + " " + temp[1]
    return temp

In [5]:
def dataPreprocess(root):
    docs = []
    entities = []
    vocabulary = []
    sentences = []
    entVals = []
    yLabel = []
    for x in root.findall("./document/sentence"):
        count = 0
        yCount = 0
        sentence = x.attrib["text"]
        sentence = re.sub('[a-z]\(', stringReplace, sentence)
        sentence = re.sub('[a-z]\.', stringReplace, sentence)
        sentence = re.sub('[a-z]\-', stringReplace, sentence)
        sentence = re.sub('\-[a-z]', stringReplace, sentence)
        sentence = re.sub('[A-Z]\-', stringReplace, sentence)
        sentence = re.sub('[A-Z]\(', stringReplace, sentence)
        sentence = re.sub('\-[A-Z]', stringReplace, sentence)
        sentence = re.sub('[0-9]\-', stringReplace, sentence)
        sentence = re.sub('\-[0-9]', stringReplace, sentence)
        sentence = re.sub('[0-9]\(', stringReplace, sentence)
        temp = sentence
        ents = {}
        vals = {}
        yTemp = []
        for i in x.findall("./entity"):
            if i.attrib["text"] not in ents:
                tempEnt = i.attrib["text"]
                tempEnt = re.sub('[a-z]\(', stringReplace, tempEnt)
                tempEnt = re.sub('[a-z]\.', stringReplace, tempEnt)
                tempEnt = re.sub('[a-z]\-', stringReplace, tempEnt)
                tempEnt = re.sub('\-[a-z]', stringReplace, tempEnt)
                tempEnt = re.sub('[A-Z]\-', stringReplace, tempEnt)
                tempEnt = re.sub('[A-Z]\(', stringReplace, tempEnt)
                tempEnt = re.sub('\-[A-Z]', stringReplace, tempEnt)
                tempEnt = re.sub('[0-9]\-', stringReplace, tempEnt)
                tempEnt = re.sub('\-[0-9]', stringReplace, tempEnt)
                tempEnt = re.sub('[0-9]\(', stringReplace, tempEnt) 
                ents[tempEnt] = ["BRAIN_REGION"]
            tempSen = sentence
            sentence = sentence.replace(tempEnt, "ENTITY" + str(count))
            # if tempSen != sentence:
            vals["ENTITY" + str(count)] = tempEnt
            count += 1
            if count == 3:
                break
        for i in x.findall("./pair"):
            yValTemp = i.attrib["interaction"]
            if yValTemp == "True":
                yTemp.append(1)
            else:
                yTemp.append(0)
            yCount += 1
            if yCount == 2:
                break
        if (count == 3 or count < 2) and (yCount == 2 or yCount < 1):
            continue
        keyList = list(ents.keys())
        if (len(keyList) == 1):
            continue
        if (keyList[0] in keyList[1]) or (keyList[1] in keyList[0]):
            continue
        for i in ents:
            vocabulary.append(i)
        for i in sentence.split():
            if i != "ENTITY":
                if i not in vocabulary:
                    vocabulary.append(i)
        yLabel.append(yTemp[0])
        docs.append(sentence)
        entVals.append(vals)
        sentences.append(temp)
        entities.append(ents)
    return yLabel, docs, entVals, sentences, entities

In [6]:
def shortestPathsCalculator(yLabel, docs, entVals, sentences, entities):
    shortestPaths = []
    dependecies = []
    nums = []
    _docs = docs
    for sr, i in enumerate(_docs):
        # print(i)
        document = nlp(i)
        edges = []
        deps = []
        for token in document:
            for child in token.children:
                edges.append(('{0}'.format(token.lower_), '{0}'.format(child.lower_)))
                deps.append(('{0}'.format(token.dep_), '{0}'.format(child.dep_)))
        graph = nx.Graph(edges)
        entity1 = 'entity0'
        entity2 = 'entity1'
        # print(sr)
        # print(edges)
        try:
            path = nx.shortest_path(graph, source=entity1, target=entity2)
            directions = []
            dirDeps = []
            for xi, x in enumerate(edges):
                for y in range(len(path) - 1):
                    if path[y] in x:
                        if path[y] == x[0]:
                            if path[y + 1] == x[1]:
                                directions.append("->")
                                dirDeps.append([deps[xi][0], deps[xi][1]])
                            else:
                                continue
                        else:
                            if path[y + 1] == x[0]:
                                directions.append("<-")
                                dirDeps.append([deps[xi][1], deps[xi][0]])
                            else:
                                continue
                    else:
                        continue
            finalPath = []
            for x in range(len(path)):
                finalPath.append(path[x])
                if len(directions) > x:
                    finalPath.append(directions[x])
        except nx.NetworkXNoPath:
            del yLabel[sr]
            del docs[sr]
            del sentences[sr]
            del entVals[sr]
            del entities[sr]
            continue   
        # print(path)
        nums.append(sr)
        shortestPaths.append(finalPath)
        dependecies.append(dirDeps)
    print(len(nums), len(shortestPaths), len(dependecies), len(yLabel), len(docs), len(sentences), len(entVals), len(entities))
    return shortestPaths, dependecies

In [7]:
def entityLabels(sentences, entities):
    for sr, x in enumerate(sentences):
        document = nlp(x)
        for d in document.ents:
            for i in entities[sr].keys():
                if d.text in i:
                    entities[sr][i].append(d.label_)

In [8]:
class Tree:
    def __init__ (self):
        self.nodes = []
    
    def addNodes(self, nodeList, n, entVals, entities, dependencies):
        if len(nodeList) == 3:
            flag = 0
            relCount = 0
            for i in range(3):
                if nodeList[i] == "<-" or nodeList[i] == "->":
                    self.nodes.append([nodeList[i], dependencies[n][relCount][0], dependencies[n][relCount][1]])
                    relCount += 1
                else:
                    ent = entVals[n]["ENTITY" + str(flag)]
                    self.nodes.append([ent, "NN", "NOUN", entities[n][ent]])
                    flag = 1
        else:
            relCount = 0
            for i in range(len(nodeList)):
                # if (nodeList[i] == "<-"):
                #     self.nodes.append([nodeList[i], dependencies[n][relCount][0]])
                #     relCount += 1
                # elif ( nodeList[i] == "->"):
                #     self.nodes.append([nodeList[i], dependencies[n][relCount][1]])

                # if (nodeList[i] == "<-" or nodeList[i] == "->"):
                #     self.nodes.append([nodeList[i]])

                if (nodeList[i] == "<-" or nodeList[i] == "->"):
                    continue
                
                elif (nodeList[i] == "entity0"):
                    ent = entVals[n]["ENTITY0"]
                    self.nodes.append([ent, "NN", "NOUN", entities[n][ent]])
                elif (nodeList[i] == "entity1"):
                    ent = entVals[n]["ENTITY1"]
                    self.nodes.append([ent, "NN", "NOUN", entities[n][ent]])
                else:
                    test = nlp(nodeList[i])
                    for token in test:
                        self.nodes.append([token.text, token.tag_, token.pos_])
            
    def displayNodes(self):
        print(self.nodes)

In [9]:
def similarityFunction(x, y):
    if len(x) != len(y):
        return 0
    else:
        simScore = 1
        for (xVal, yVal) in zip(x, y):
            tempSimScore = 0
            if len(xVal) == len(yVal):
                for i in range(len(xVal)):
                    if isinstance(xVal[i], list):
                        for j in xVal[i]:
                            if j in yVal[i]:
                                tempSimScore += 1
                    elif xVal[i] == yVal[i]:
                        tempSimScore += 1
            simScore *= tempSimScore
        return simScore

def treeKernel(X1, X2):
    gram_matrix = np.zeros((X1.shape[0], X2.shape[0]))
    for i, x1 in enumerate(X1):
        for j, x2 in enumerate(X2):
            gram_matrix[i, j] = similarityFunction(x1.nodes, x2.nodes)
    return gram_matrix

In [10]:
tree = ET.parse('data/train/train.xml')
root = tree.getroot()
print(root.tag)

corpus


In [11]:
yLabel, docs, entVals, sentences, entities = dataPreprocess(root)

In [12]:
print(len(docs), len(yLabel))

2001 2001


In [13]:
nlp = spacy.load("en_ner_bionlp13cg_md")

In [14]:
entityLabels(sentences, entities)

In [15]:
for i in range(2):
    shortestPaths, dependencies = shortestPathsCalculator(yLabel, docs, entVals, sentences, entities)

1999 1999 1999 2000 2000 2000 2000 2000
2000 2000 2000 2000 2000 2000 2000 2000


In [16]:
xObjects = []
for i in range(len(shortestPaths)):
    obj = Tree()
    obj.addNodes(shortestPaths[i], i, entVals, entities, dependencies)
    xObjects.append(obj)

In [17]:
xObjects = np.array(xObjects)
yLabel = np.array(yLabel)

# VALIDATION AND OPTIMIZATION

In [32]:
def SVM(count, C):
    classifier = SVC(kernel = "precomputed", C = C)
    model = classifier.fit(treeKernel(xObjectsTrain, xObjectsTrain), yLabelTrain)
    pred = model.predict(treeKernel(xObjectsValid, xObjectsTrain))
    results[count][0] = C
    results[count][1] = model.score(treeKernel(xObjectsValid, xObjectsTrain), yLabelValid)
    results[count][2] = f1_score(yLabelValid, pred)
    results[count][3] = precision_score(yLabelValid, pred)
    results[count][4] = recall_score(yLabelValid, pred)

In [37]:
# xObjectsTrain = xObjects[:1800]
# yLabelTrain = yLabel[:1800]
# xObjectsValid = xObjects[1800:]
# yLabelValid = yLabel[1800:]
xObjectsArray = np.array_split(xObjects, 10)
yLabelArray = np.array_split(yLabel, 10)
fold = 0
finalC = 0
finalF1 = 0
for z in range(10):
    xObjectsValid = xObjectsArray[z]
    yLabelValid = yLabelArray[z]
    xObjectsTrain = np.array([])
    yLabelTrain = np.array([])
    for j in range(10):
        if j == z:
            continue
        else:
            xObjectsTrain = np.concatenate((xObjectsTrain, xObjectsArray[j]))
            yLabelTrain = np.concatenate((yLabelTrain, yLabelArray[j]))
    c = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1]
    results = np.zeros((7, 5))
    count = 0
    for i in c:
        SVM(count, i)
        count += 1
    bestC = 0
    bestF1 = 0
    for i in range(7):
        if results[i][2] > bestF1:
            bestC = results[i][0]
            bestF1 = results[i][2]
    if bestF1 > finalF1:
        finalC = bestC
        fold = z
        finalF1 = bestF1
    print("Fold: {}\t\tC: {}\t\tF1: {}".format(z, bestC, bestF1))
print("Best Fold: {}\t\tC: {}\t\tF1: {}".format(fold, finalC, finalF1))

Fold: 0		C: 0.25		F1: 0.5217391304347826
Fold: 1		C: 0.1		F1: 0.5289256198347108
Fold: 2		C: 0.05		F1: 0.48648648648648646
Fold: 3		C: 0.05		F1: 0.5238095238095237
Fold: 4		C: 0.1		F1: 0.4473684210526316
Fold: 5		C: 0.75		F1: 0.42696629213483145
Fold: 6		C: 0.1		F1: 0.4090909090909091
Fold: 7		C: 0.01		F1: 0.48275862068965514
Fold: 8		C: 0.1		F1: 0.4848484848484848
Fold: 9		C: 0.1		F1: 0.5176470588235295
Best Fold: 1		C: 0.1		F1: 0.5289256198347108


# Train Using Best Parameters

In [38]:
xObjectsValid = xObjectsArray[fold]
yLabelValid = yLabelArray[fold]
xObjectsTrain = np.array([])
yLabelTrain = np.array([])
for j in range(10):
    if j == fold:
        continue
    else:
        xObjectsTrain = np.concatenate((xObjectsTrain, xObjectsArray[j]))
        yLabelTrain = np.concatenate((yLabelTrain, yLabelArray[j]))

In [39]:
classifier = SVC(kernel = "precomputed", C = finalC, gamma = "scale")
model = classifier.fit(treeKernel(xObjectsTrain, xObjectsTrain), yLabelTrain)

# Testing

In [40]:
treeTest = ET.parse('data/test/WhiteTextUnseenEval.xml')
rootTest = treeTest.getroot()
print(rootTest.tag)

corpus


In [41]:
yLabelTest, docsTest, entValsTest, sentencesTest, entitiesTest = dataPreprocess(rootTest)

In [42]:
print(len(docsTest), len(yLabelTest))

1028 1028


In [43]:
entityLabels(sentencesTest, entitiesTest)

In [44]:
for i in range(2):
    shortestPathsTest, dependenciesTest = shortestPathsCalculator(yLabelTest, docsTest, entValsTest, sentencesTest, entitiesTest)

1028 1028 1028 1028 1028 1028 1028 1028
1028 1028 1028 1028 1028 1028 1028 1028


In [45]:
xObjectsTest = []
for i in range(len(shortestPathsTest)):
    obj = Tree()
    obj.addNodes(shortestPathsTest[i], i, entValsTest, entitiesTest, dependenciesTest)
    xObjectsTest.append(obj)

In [46]:
xObjectsTest = np.array(xObjectsTest)
yLabelTest = np.array(yLabelTest)

In [47]:
pred = model.predict(treeKernel(xObjectsTest, xObjectsTrain))
accuracy = model.score(treeKernel(xObjectsTest, xObjectsTrain), yLabelTest)
f1 = f1_score(yLabelTest, pred)
precision = precision_score(yLabelTest, pred)
recall = recall_score(yLabelTest, pred)

In [48]:
print("    Accuracy     F1     Precision    Recall")
print("{:10.4f} {:10.4f} {:10.4f} {:10.4f}".format(accuracy, f1, precision, recall))

    Accuracy     F1     Precision    Recall
    0.7354     0.5090     0.6157     0.4338
