In [1]:
import os
import re
import sys
import json
import glob
from datetime import datetime

import subprocess

import numpy as np
import pandas as pd

from sklearn import metrics

from pynlpl.formats import folia

In [2]:
inputfolder = "data/EIFD-FlatData"
foliaset = 'https://raw.githubusercontent.com/ErkanBasar/mebasarcom/master/eifd.foliaset.xml'

In [3]:
total = len([xfile for xfile in glob.glob(inputfolder + "/*.folia.xml")])
annotdocs = [xfile for xfile in glob.glob(inputfolder +"/*.folia.xml") if folia.Document(file=xfile).declared(folia.Entity, foliaset)]

count = len(annotdocs)
print(str(int((100*count)/total)) + '% of the documents are annotated (' + str(count) + '/' + str(total - count) + ')')

13% of the documents are annotated (37/243)


In [100]:
stopwords = []

## Feature Extraction

In [101]:
class Token(object):
    
    def __init__(self, folia_obj):
        self.obj = folia_obj
        
        self.cls = "O"
        self.ner = "---"
        self.previous = None
        self.next = None
        
        if(self.obj is None):
            self.foliaId = ""
            self.text = ""
            self.pos = ""
            self.isTitleCase = ""
            self.wordClass = ""
        else:
            self.foliaId = self.obj.id
            self.text = self.obj.text().lower()
            self.pos = self.obj.pos()
            self.isTitleCase = str(self.obj.text().istitle())
            self.wordClass = self.obj.cls

    def getClass(self):
        if(self.obj is not None):
            for entity in self.obj.findspans(folia.Entity):
                if(entity.annotatortype == folia.AnnotatorType.MANUAL):
                    self.cls = "B." + entity.cls
                    if(self.previous == None):
                        self.cls = "B." + entity.cls 
                    else:
                        for prevEntitiy in self.previous.findspans(folia.Entity):
                            if(entity.id == prevEntitiy.id and prevEntitiy.annotatortype == folia.AnnotatorType.MANUAL):
                                self.cls = "I." + entity.cls
        return self.cls
    
    def getNer(self):
        if(self.obj is not None):
            for entity in self.obj.findspans(folia.Entity):
                if(entity.annotator == 'nltk-stanford'):
                    self.ner = entity.cls
        return self.ner
    
    def getPrevious(self):
        if self.obj is not None and self.obj.previous(folia.Word, [folia.Sentence]) is not None:
            self.previous = self.obj.previous(folia.Word, [folia.Sentence])
            while True:
                if self.previous is not None:
                    if self.previous.text().lower() in stopwords:
                        self.previous = self.previous.previous(folia.Word, [folia.Sentence])
                    else:
                        break
                else:
                    break
        return self.previous
    
    def getNext(self):
        if self.obj is not None and self.obj.next(folia.Word, [folia.Sentence]) is not None:
            self.next = self.obj.next(folia.Word, [folia.Sentence])
            while True:
                if self.next is not None:
                    if self.next.text().lower() in stopwords:
                        self.next = self.next.next(folia.Word, [folia.Sentence])
                    else:
                        break
                else:
                    break
        return self.next
    
    def kill(self):
        del self

In [105]:
def createLine(token):
    token.getPrevious()
    line = str(token.text + ' ' + token.pos + ' ' + token.getNer() + ' ' + token.getClass() + '\n')
    return line

In [106]:
def createMBTFile(filelist, outfile, testrun=False):
    
    excludedclasses = []#["B.dmg.ppl.miss","B.dmg.bus","I.dmg.bus","I.dmg.ppl.miss","B.loc.supp","I.loc.supp","B.loc.other","I.loc.other","B.resp.supply","I.resp.supply","B.dmg.econ","I.dmg.econ"]
    
    filelist = [filelist] if(type(filelist) == str) else filelist

    if(type(filelist) == list and len(filelist) >= 1):

        for filepath in filelist:
            for doc in folia.Document(file=filepath):
                for sentence in doc.sentences():
                    for word in sentence.words():

                            token = Token(word)
                            
                            tokenClass = token.getClass()
                            
                            if(tokenClass in excludedclasses):
                                tokenClass = "O"
                            
                            if(testrun is True):
                                if(tokenClass != "O"):
                                    outfile.write(createLine(token))
                            else:
                                outfile.write(createLine(token))

                    outfile.write("<utt>\n")
                    
            #print(filepath)

    else:
        print('Input may be wrong. It should be a list of file paths or a single string file path.')
        print('Input type you tried: ' + str(type(filelist)))
        sys.exit()

In [107]:
splt = int((80 * len(annotdocs)) / 100.0)
splt

29

In [108]:
trainlist = annotdocs[:splt]
testlist = annotdocs[splt:]

In [109]:
trainingData = open('data/MBTData/MBTTrain.data', "w")

createMBTFile(trainlist, trainingData)

trainingData.close()

In [110]:
testData = open('data/MBTData/MBTTest.data', "w")

createMBTFile(testlist, testData)

testData.close()

## Run MBT

In [111]:
p = subprocess.Popen(["mbtg","-E","MBTTrain.data","-p","ddfaa","-P","ddfaapsss"], cwd="data/MBTData")
p.wait()

0

In [112]:
testOut = open('data/MBTData/test.out', 'w') 
p = subprocess.Popen(["mbt","-s","MBTTrain.data.settings","-E","MBTTest.data"], stdout=testOut, cwd="data/MBTData")
p.wait()

0

In [67]:
# %100 train
testOut = open('data/MBTData/test.out', 'w') 
p = subprocess.Popen(["mbt","-s","MBTTrain.data.settings","-E","MBTTrain.data"], stdout=testOut, cwd="data/MBTData")
p.wait()

0

## Evaluation

In [113]:
testOut = open('data/MBTData/test.out', "r")

lines = testOut.readlines()

Y = []
predictions = []

for line in lines[7:]:
    
    spacesplt = line.split(" ")
    
    if(len(spacesplt) > 1):
        
        tabsplt = spacesplt[1].split("\t")

        y = tabsplt[1]
        Y.append(y)
        
        prediction = tabsplt[2].split("\n")[0]
        predictions.append(prediction)

In [114]:
labels = sorted([label for label in list(set(Y)) if label not in ['O']])
labels

['B.dmg.ppl.evac',
 'B.dmg.ppl.kill',
 'B.dmg.prop',
 'B.event.flood',
 'B.loc.focus',
 'B.loc.other',
 'B.loc.supp',
 'B.resp.donat',
 'B.resp.supply',
 'B.time',
 'I.dmg.ppl.evac',
 'I.dmg.ppl.kill',
 'I.dmg.prop',
 'I.event.flood',
 'I.loc.focus',
 'I.loc.other',
 'I.loc.supp',
 'I.resp.donat',
 'I.resp.supply',
 'I.time']

In [70]:
# 100% train
print(metrics.classification_report(Y, predictions, labels=labels))

                precision    recall  f1-score   support

     B.dmg.bus       1.00      1.00      1.00         5
    B.dmg.econ       1.00      1.00      1.00        36
B.dmg.ppl.evac       1.00      0.88      0.94        83
B.dmg.ppl.kill       0.95      0.85      0.90       148
B.dmg.ppl.miss       1.00      1.00      1.00         1
    B.dmg.prop       0.99      0.72      0.83       142
 B.event.flood       0.89      0.64      0.74       368
   B.loc.focus       0.94      0.53      0.68       225
   B.loc.other       1.00      0.82      0.90        71
    B.loc.supp       1.00      0.77      0.87        47
  B.resp.donat       1.00      1.00      1.00        72
 B.resp.supply       0.98      0.75      0.85       113
        B.time       0.94      0.62      0.75       154

   avg / total       0.95      0.71      0.81      1465



In [90]:
# %90 train 
print(metrics.classification_report(Y, predictions, labels=labels))

                precision    recall  f1-score   support

B.dmg.ppl.evac       0.00      0.00      0.00         7
B.dmg.ppl.kill       0.34      0.44      0.39        25
    B.dmg.prop       0.40      0.14      0.21        43
 B.event.flood       0.55      0.49      0.52        90
   B.loc.focus       0.41      0.17      0.24        77
   B.loc.other       0.09      0.17      0.12         6
    B.loc.supp       0.00      0.00      0.00        29
  B.resp.donat       1.00      0.25      0.40        28
 B.resp.supply       0.00      0.00      0.00        57
        B.time       0.31      0.16      0.21        25

   avg / total       0.37      0.22      0.26       387



In [115]:
# %80 train
print(metrics.classification_report(Y, predictions, labels=labels))

                precision    recall  f1-score   support

B.dmg.ppl.evac       0.00      0.00      0.00         3
B.dmg.ppl.kill       0.27      0.29      0.28        14
    B.dmg.prop       0.38      0.23      0.29        22
 B.event.flood       0.49      0.37      0.42        63
   B.loc.focus       0.33      0.15      0.21        52
   B.loc.other       0.20      0.25      0.22         4
    B.loc.supp       0.00      0.00      0.00        13
  B.resp.donat       1.00      0.22      0.36         9
 B.resp.supply       0.00      0.00      0.00        23
        B.time       0.15      0.15      0.15        13
I.dmg.ppl.evac       0.00      0.00      0.00         4
I.dmg.ppl.kill       0.30      0.55      0.39        11
    I.dmg.prop       0.00      0.00      0.00        21
 I.event.flood       0.48      0.41      0.44        27
   I.loc.focus       0.43      0.24      0.31        25
   I.loc.other       0.00      0.00      0.00         2
    I.loc.supp       0.00      0.00      0.00  

In [80]:
# %50 train
print(metrics.classification_report(Y, predictions, labels=labels))

                precision    recall  f1-score   support

     B.dmg.bus       0.00      0.00      0.00         3
    B.dmg.econ       0.00      0.00      0.00         6
B.dmg.ppl.evac       0.40      0.12      0.18        50
B.dmg.ppl.kill       0.36      0.32      0.34        66
    B.dmg.prop       0.16      0.06      0.09        69
 B.event.flood       0.43      0.36      0.39       193
   B.loc.focus       0.32      0.15      0.20       143
   B.loc.other       0.29      0.16      0.21        37
    B.loc.supp       0.00      0.00      0.00        45
  B.resp.donat       0.00      0.00      0.00        66
 B.resp.supply       0.00      0.00      0.00        64
        B.time       0.20      0.18      0.19        65

   avg / total       0.26      0.17      0.20       807



  'precision', 'predicted', average, warn_for)
