<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Parse-XML-Data" data-toc-modified-id="Parse-XML-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Parse XML Data</a></span></li><li><span><a href="#Tokenize-Text" data-toc-modified-id="Tokenize-Text-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Tokenize Text</a></span></li><li><span><a href="#Create-Word-Embeddings-with-Wiki-Extvec" data-toc-modified-id="Create-Word-Embeddings-with-Wiki-Extvec-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Create Word Embeddings with Wiki-Extvec</a></span></li><li><span><a href="#Create-Contextual-String-Embeddings-with-Flair" data-toc-modified-id="Create-Contextual-String-Embeddings-with-Flair-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create Contextual String Embeddings with Flair</a></span></li><li><span><a href="#Create-BERT-Embeddings-with-flair" data-toc-modified-id="Create-BERT-Embeddings-with-flair-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Create BERT Embeddings with flair</a></span></li><li><span><a href="#Create-ELMo-Embeddings-with-flair" data-toc-modified-id="Create-ELMo-Embeddings-with-flair-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Create ELMo Embeddings with flair</a></span></li><li><span><a href="#Build-Dataset" data-toc-modified-id="Build-Dataset-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Build Dataset</a></span><ul class="toc-item"><li><span><a href="#Tagging" data-toc-modified-id="Tagging-7.1"><span class="toc-item-num">7.1&nbsp;&nbsp;</span>Tagging</a></span></li><li><span><a href="#Split-Dataset" data-toc-modified-id="Split-Dataset-7.2"><span class="toc-item-num">7.2&nbsp;&nbsp;</span>Split Dataset</a></span></li></ul></li><li><span><a href="#Data-compress" data-toc-modified-id="Data-compress-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>Data compress</a></span></li></ul></div>

# Parse XML Data

In [None]:
from xml.dom.minidom import parse
import xml.dom.minidom
from collections import Counter


def parse(n):
    """
    Read an XML corpus and return its sentences and labels.

    :param n: file name (or file object) passed to xml.dom.minidom.parse
    :return: tuple (sentence, label) of two parallel lists with one entry per
             <item> element: the text of the first child node of the item's
             first <sentence> element, and the item's 'label' attribute
    """
    root = xml.dom.minidom.parse(n).documentElement
    # One (text, label) pair per <item>, in document order.
    pairs = [
        (node.getElementsByTagName('sentence')[0].childNodes[0].data,
         node.getAttribute('label'))
        for node in root.getElementsByTagName('item')
    ]
    sentence = [text for text, _ in pairs]
    label = [tag for _, tag in pairs]
    return sentence, label


# Load the training and test corpora; each call returns (sentences, labels)
# as two parallel lists.
trainSent, trainLabel = parse('corpus/train-corpus.xml')
testSent, testLabel = parse('corpus/test-corpus.xml')

# Tokenize Text

In [None]:
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
import pickle
import re
# Padding sizes used when the arrays are built further down:
# sentences are padded/truncated to MAX_WLEN tokens, words to MAX_CLEN characters.
MAX_WLEN = 58
MAX_CLEN = 23


def delete(s):
    """
    Delete parentheses

    Removes every parenthesised span (the brackets and the text between them)
    from ``s``. Matching is non-greedy, so each "(" is paired with the nearest
    following ")" on the same line.

    :param s: input string
    :return: ``s`` with all "(...)" groups removed
    """
    # Raw string: in a plain literal '\(' is an invalid escape sequence, which
    # newer Python versions warn about at compile time.
    return re.sub(r'\((.*?)\)', '', s)


def cut(s):
    """
    Word segmentation

    Splits ``s`` on spaces with Keras' text_to_word_sequence (case is kept,
    the characters listed in ``filters`` are stripped), then detaches the
    punctuation marks , : ; " from the words they are glued to.

    :param s: sentence string
    :return: list of tokens
    """
    tokens = text_to_word_sequence(
        s,
        filters='!#$&*+.%/<=>?@[\\]^_`{|}~\t\n',
        lower=False,
        split=' ',
    )
    # sep_pm edits the token list in place, one punctuation mark at a time.
    for mark in (',', ':', ';', '"'):
        sep_pm(tokens, mark)
    return tokens


def sep_pm(ws, s):
    """
    Separate punctuation mark: , : ; "

    Splits the mark ``s`` off every token of ``ws`` it is attached to, turning
    e.g. ``'word,'`` into ``'word', ','``. For the double quote a leading mark
    is detached as well. The list is modified in place.

    :param ws: list of tokens (mutated)
    :param s: the punctuation mark to detach; any other value is a no-op
    :return: None
    """
    if s in [',', ':', ';', '"']:
        # The list grows while it is scanned, so the bound must be re-read on
        # every step; a fixed range(len(ws)) never reaches the tail tokens.
        i = 0
        while i < len(ws):
            if ws[i].endswith(s) and ws[i] != s:
                ws[i] = ws[i][:-1]
                ws.insert(i+1, s)
                # Stay on i: the shortened token may still end with the mark.
            else:
                i += 1
    if s in ['"']:
        i = 0
        while i < len(ws):
            if ws[i].startswith(s) and ws[i] != s:
                ws[i] = ws[i][1:]
                ws.insert(i, s)
            # Either move on, or step past the inserted mark onto the
            # remainder of the token so it is examined again.
            i += 1


def find_cp_idx(ws):
    """
    Find index of causality phrases

    For every entity number k in 1..11 whose marker token 'e<k>' occurs in
    ``ws``, returns the list of positions the phrase occupies once all marker
    tokens have been removed from the sentence.

    NOTE(review): the offsets 2*(k-1) and 2*k assume that every lower-numbered
    entity contributes exactly two marker tokens earlier in the sentence
    (markers in ascending order, not nested) - confirm against the corpus.

    :param ws: token list still containing the 'e<k>' marker tokens
    :return: list of index lists, one per entity found, in ascending k
    """
    spans = []
    for k in range(1, 12):
        tag = 'e' + str(k)
        if tag not in ws:
            continue
        first = ws.index(tag)
        # Start of the phrase after dropping the markers of entities 1..k-1.
        bounds = [first - 2*(k-1)]
        for pos in range(first+1, len(ws)):
            # A marker two tokens after the opening one encloses a single
            # word; the start index alone already describes that phrase.
            if ws[pos] == tag and pos-2 != first:
                bounds.append(pos - 2*k)
        spans.append(list(range(bounds[0], bounds[-1]+1)))
    return spans


def format_sentence(ws):
    """
    Remove entity tags

    :param ws: list of tokens
    :return: new list without the marker tokens 'e1' ... 'e11'
    """
    markers = ['e' + str(k) for k in range(1, 12)]
    return [tok for tok in ws if tok not in markers]


# --- Training set: strip parenthesised text, tokenise, locate the phrases,
# then drop the entity marker tokens. Order matters: find_cp_idx needs the
# markers that format_sentence removes.
train_index = []
trainSent = [delete(i) for i in trainSent]
trainWords = [cut(i) for i in trainSent]
for w in trainWords:
    train_index.append(find_cp_idx(w))
trainWords = [format_sentence(i) for i in trainWords]
# Each word as a list of its characters.
trainChars = [[list(w) for w in s] for s in trainWords]

# --- Test set: same pipeline as above.
test_index = []
testSent = [delete(i) for i in testSent]
testWords = [cut(i) for i in testSent]
for w in testWords:
    test_index.append(find_cp_idx(w))
testWords = [format_sentence(i) for i in testWords]
testChars = [[list(w) for w in s] for s in testWords]

# Word vocabulary, counted over train AND test sentences.
counts = Counter()
for sw in trainWords+testWords:
    counts.update(sw)

# Most frequent word gets index 1; index 0 is never assigned
# (presumably reserved for padding - confirm with the model code).
vocab = sorted(counts, key=counts.get, reverse=True)
word2index = {w: i for i, w in enumerate(vocab, 1)}
index2word = {i: w for w, i in word2index.items()}

# Character vocabulary, built the same way (counts is re-bound here).
counts = Counter()
for sc in trainChars+testChars:
    counts.update(sum(sc, []))

c_vocab = sorted(counts, key=counts.get, reverse=True)
char2index = {w: i for i, w in enumerate(c_vocab, 1)}
index2char = {i: w for w, i in char2index.items()}

# Persist both lookup tables; protocol -1 selects the highest pickle protocol.
# The target directory must already exist.
w_filePath = 'data/index/index_w.pkl'
with open(w_filePath, 'wb') as fp:
    pickle.dump((word2index, index2word), fp, -1)

c_filePath = 'data/index/index_c.pkl'
with open(c_filePath, 'wb') as fp:
    pickle.dump((char2index, index2char), fp, -1)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [None]:
causalLabel = [l for l in testLabel if l != 'Non-Causal']
# Number of 'e' characters after the first 12 characters of each label,
# halved; computed once instead of once per printed row.
# NOTE(review): the 12-character offset presumably skips a fixed label prefix
# that itself contains the letter 'e' - confirm against the corpus format.
tripletsPerSentence = [i[12:].count('e')/2 for i in causalLabel]
sentencesPerCount = Counter(tripletsPerSentence)
print('\n------ Causal Sentence Analysis ------\n')
print('{:<40}{:>30}\n'.format('', 'number of causality triplet'))
for l in ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']:
    # Row l: how many causal test sentences yield exactly l.
    print('{:<40}{:>30d}'.format(l+':', sentencesPerCount[int(l)]))
print('{:<40}{:>30d}'.format('total', int(sum(tripletsPerSentence))))


------ Causal Sentence Analysis ------

                                           number of causality triplet

1:                                                                 126
2:                                                                  47
3:                                                                   7
4:                                                                   8
5:                                                                   1
6:                                                                   1
7:                                                                   0
8:                                                                   0
9:                                                                   0
10:                                                                  0
11:                                                                  0
12:                                                                  1
total                              

# Create Word Embeddings with Wiki-Extvec

In [None]:
from pynlp import StanfordCoreNLP
from nltk import WordNetLemmatizer, PorterStemmer, LancasterStemmer
import spacy
import math
import h5py

In [None]:
# Lemmatisers / stemmers used as successive fallbacks by replace() when a
# vocabulary word is missing from the pre-trained embedding dictionary.
annotators = 'lemma'
core_nlp = StanfordCoreNLP(annotators=annotators)
# NOTE(review): 'en_3' is not a stock spaCy model name; presumably a locally
# linked model - confirm it exists in the target environment.
nlp = spacy.load('en_3')
wnl = WordNetLemmatizer()
porter = PorterStemmer()
lancaster = LancasterStemmer()

# +1 because word indices start at 1, leaving row 0 unused by any word.
VOCAB_SIZE = len(word2index)+1
EXTVEC_DIM = 300

In [None]:
def replace(w, d):
    """
    Replace the words that are not in the dictionary

    Looks ``w`` up in ``d``; on a miss, retries with progressively more
    aggressive normalisations of the word and returns the first hit.

    :param w: the word to look up
    :param d: mapping from word to value (here: row index of an embedding)
    :return: the value found for the first matching variant, or None
    """
    # Ordered, lazily evaluated candidates: a later (and possibly expensive)
    # normaliser is only invoked when every earlier variant missed.
    variants = (
        lambda: w,
        lambda: w.lower(),
        lambda: [i[0].lemma for i in core_nlp(w)][0],
        lambda: [i.lemma_ for i in nlp(w)][0],
        lambda: wnl.lemmatize(w),
        lambda: porter.stem(w),
        lambda: lancaster.stem(w),
        lambda: w[:-1],
        lambda: w[:-2],
    )
    for make_variant in variants:
        hit = d.get(make_variant())
        if hit is not None:
            return hit
    return None

In [None]:
# Number of rows pre-allocated for the pre-trained vectors. np.empty does not
# initialise memory: if the file has fewer lines the remaining rows hold
# arbitrary values, and more lines would raise an IndexError below.
extvec_n_symbols = 1476022
extvec_index_dict = {}
extvec_embedding_weights = np.empty((extvec_n_symbols, EXTVEC_DIM))
# Placeholder: must be replaced with the local path of the wiki_extvec file.
filePath = 'your path to /dependency-based_word_embeddings/wiki_extvec'
with open(filePath, encoding='utf-8') as fp:
    index = 0
    for l in fp:
        # Each line: the word followed by its space-separated vector components.
        l = l.split(' ')
        word = l[0]
        extvec_index_dict[word] = index
        extvec_embedding_weights[index, :] = np.asarray(l[1:], dtype='float32')
        index += 1

In [None]:
# Generate random embedding with same scale as extvec
# Every row (including row 0, which no word maps to) starts as uniform noise
# in [-scale, scale]; rows of known words are overwritten in the next cell.
SEED = 666
np.random.seed(SEED)
shape = (VOCAB_SIZE, EXTVEC_DIM)
# Uniform on [-sqrt(3/d), sqrt(3/d)] has variance 1/d.
scale = math.sqrt(3.0 / EXTVEC_DIM)
extvec_embedding = np.random.uniform(low=-scale, high=scale, size=shape)

In [None]:
# Copy from extvec weights of words that appear in index2word
count = 0
for i in range(1, VOCAB_SIZE):
    w = index2word[i]
    # Exact match first; replace() then retries with normalised variants
    # (it repeats the exact lookup as its own first step).
    g = extvec_index_dict.get(w)
    if g is None:
        g = replace(w, extvec_index_dict)
    if g is not None:
        extvec_embedding[i, :] = extvec_embedding_weights[g, :]
        count += 1
# The percentage is taken over VOCAB_SIZE, which counts the unused row 0 too.
print('{num_tokens}-{per:.3f}% tokens in vocab found in Wiki-Extvec and copied to embedding.'.format(
    num_tokens=count, per=count/float(VOCAB_SIZE)*100))

15129-97.361% tokens in vocab found in Wiki-Extvec and copied to embedding.


In [None]:
filePath = 'data/embedding/extvec_embedding.npy'
# Context manager so the handle is flushed and closed deterministically
# instead of being left open until garbage collection.
with open(filePath, 'wb') as fp:
    np.save(fp, extvec_embedding)

# Create Contextual String Embeddings with Flair

In [None]:
from flair.embeddings import FlairEmbeddings, StackedEmbeddings
from tqdm import tqdm
from flair.data import Sentence
FLAIR_DIM = 4096


def flair_cse(sw):
    """
    Convert sentence to contextual string embeddings with flair

    Each sentence is embedded with the stacked forward/backward character
    language models and its token matrix is extended with zero rows up to
    MAX_WLEN rows.

    :param sw: list of sentences, each a list of word strings
    :return: numpy array stacking one padded matrix per sentence; expected to
             be (len(sw), MAX_WLEN, FLAIR_DIM) - TODO confirm FLAIR_DIM matches
             the stacked embedding width and no sentence exceeds MAX_WLEN
    """
    forward_lm = FlairEmbeddings('news-forward')
    backward_lm = FlairEmbeddings('your path to /news-backward-0.4.1.pt')
    stacked_embeddings = StackedEmbeddings(embeddings=[forward_lm, backward_lm])
    sentences = [Sentence(' '.join(words)) for words in sw]
    padded = []
    for sentence in tqdm(sentences):
        stacked_embeddings.embed(sentence)
        token_matrix = np.array([np.array(token.embedding) for token in sentence])
        # Zero rows filling the sentence up to MAX_WLEN tokens.
        filler = np.zeros((MAX_WLEN-len(sentence), FLAIR_DIM))
        padded.append(np.concatenate((token_matrix, filler), axis=0))
    return np.array(padded)

In [None]:
# Embed the training sentences (slow: ~41 min for 4450 sentences in the recorded run).
trainFlair = flair_cse(trainWords)

100%|██████████| 4450/4450 [40:58<00:00,  1.78it/s] 


In [None]:
# Embed the test sentences; flair_cse loads the language models again on each call.
testFlair = flair_cse(testWords)

100%|██████████| 786/786 [07:30<00:00,  1.33it/s]


# Create BERT Embeddings with flair

In [None]:
from flair.embeddings import BertEmbeddings
from tqdm import tqdm
from flair.data import Sentence
BERT_DIM = 4096


def flair_bert(sw):
    """
    Convert sentence to bert embeddings with flair

    Each sentence is embedded with 'bert-large-cased' and its token matrix is
    extended with zero rows up to MAX_WLEN rows.

    :param sw: list of sentences, each a list of word strings
    :return: numpy array stacking one padded matrix per sentence; expected to
             be (len(sw), MAX_WLEN, BERT_DIM) - TODO confirm BERT_DIM matches
             the embedding width and no sentence exceeds MAX_WLEN
    """
    bert_embedding = BertEmbeddings('bert-large-cased')
    sentences = [Sentence(' '.join(words)) for words in sw]
    padded = []
    for sentence in tqdm(sentences):
        bert_embedding.embed(sentence)
        token_matrix = np.array([np.array(token.embedding) for token in sentence])
        # Zero rows filling the sentence up to MAX_WLEN tokens.
        filler = np.zeros((MAX_WLEN-len(sentence), BERT_DIM))
        padded.append(np.concatenate((token_matrix, filler), axis=0))
    return np.array(padded)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [None]:
# Embed the training sentences with BERT (~25 min in the recorded run).
trainBERT = flair_bert(trainWords)

100%|██████████| 4450/4450 [25:15<00:00,  2.72it/s]


In [None]:
# Embed the test sentences with BERT; the model is loaded again on each call.
testBERT = flair_bert(testWords)

100%|██████████| 786/786 [04:18<00:00,  2.72it/s]


# Create ELMo Embeddings with flair

In [None]:
from flair.embeddings import ELMoEmbeddings
from tqdm import tqdm
from flair.data import Sentence
ELMO_DIM = 3072


def flair_elmo(sw):
    """
    Convert sentence to ELMo embeddings with flair

    Each sentence is embedded with the 'original' ELMo model and its token
    matrix is extended with zero rows up to MAX_WLEN rows.

    :param sw: list of sentences, each a list of word strings
    :return: numpy array stacking one padded matrix per sentence; expected to
             be (len(sw), MAX_WLEN, ELMO_DIM) - TODO confirm ELMO_DIM matches
             the embedding width and no sentence exceeds MAX_WLEN
    """
    elmo_embedding = ELMoEmbeddings('original')
    sentences = [Sentence(' '.join(words)) for words in sw]
    padded = []
    for sentence in tqdm(sentences):
        elmo_embedding.embed(sentence)
        token_matrix = np.array([np.array(token.embedding) for token in sentence])
        # Zero rows filling the sentence up to MAX_WLEN tokens.
        filler = np.zeros((MAX_WLEN-len(sentence), ELMO_DIM))
        padded.append(np.concatenate((token_matrix, filler), axis=0))
    return np.array(padded)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [None]:
# Embed the training sentences with ELMo (~27 min in the recorded run).
trainELMo = flair_elmo(trainWords)

100%|██████████| 4450/4450 [26:45<00:00,  2.58it/s]


In [None]:
# Embed the test sentences with ELMo; the model is loaded again on each call.
testELMo = flair_elmo(testWords)

100%|██████████| 786/786 [04:34<00:00,  2.14it/s]


# Build Dataset

## Tagging

In [None]:
def find_cp(l):
    """
    Find index of cause and effect from label

    Extracts every parenthesised group that starts with 'e' from the label
    string, splits it on commas, and treats the first item of each group as
    the cause and the last item as the effect.

    :param l: label string
    :return: tuple (c, e) of two parallel lists: cause names and effect names
    """
    # Raw string: in a plain literal '\(' is an invalid escape sequence, which
    # newer Python versions warn about at compile time.
    p = r'\((e.*?)\)'
    t = re.findall(p, l)
    t = [i.split(',') for i in t]
    c = [i[0] for i in t]
    e = [i[-1] for i in t]
    return c, e


def tagging(s, n, w):
    """
    Tagging

    Writes tag ``n`` at the first position listed in ``w`` and ``n+1`` at
    every following position. ``s`` is modified in place and also returned.

    :param s: label sequence to update
    :param n: tag value for the first position of the phrase
    :param w: positions of the phrase inside ``s``
    :return: the same sequence object ``s``
    """
    for order, position in enumerate(w):
        s[position] = n if order == 0 else n+1
    return s


def word2label(s, idx, l):
    """
    Convert sentence to label sequence

    :param s: the sentence tokens (only its length is used)
    :param idx: per-entity position lists; entity 'e<k>' uses idx[k-1]
    :param l: label string; 'Non-Causal' yields an all-zero sequence
    :return: list of integer tags, one per token of ``s``

    :Tagging Scheme
    ---7 Tag---
    0: O = Other
    1: B-C = Cause Begin
    2: I-C = Cause Inside
    3: B-E = Effect Begin
    4: I-E = Effect Inside
    5: B-CE = Cause|Effect Begin
    6: I-CE = Cause|Effect Inside
    """
    labels = [0]*len(s)
    if l == 'Non-Causal':
        return labels
    causes, effects = find_cp(l)
    for entity in set(causes+effects):
        # Every entity falls in exactly one role: cause only, both, or
        # effect only; the role decides the Begin tag handed to tagging().
        if entity not in effects:
            begin_tag = 1
        elif entity in causes:
            begin_tag = 5
        else:
            begin_tag = 3
        labels = tagging(labels, begin_tag, idx[int(entity[1:])-1])
    return labels


# One tag sequence per sentence; words, phrase indices and labels are
# parallel lists and are combined position by position.
train_labelSeq = [word2label(
    trainWords[i], train_index[i], trainLabel[i]) for i in range(len(trainWords))]
test_labelSeq = [word2label(testWords[i], test_index[i], testLabel[i])
                 for i in range(len(testWords))]

In [None]:
# Tag distribution over the training and test sets combined.
# Flatten with a comprehension: sum(list_of_lists, []) copies the growing
# accumulator once per sentence.
labelSeq = [tag for seq in train_labelSeq+test_labelSeq for tag in seq]
tagAll = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
# Count once; the counts and the total are reused for every printed row.
tagCount = Counter(labelSeq)
tagTotal = sum(tagCount[i] for i in range(len(tagAll)))
print('\n------ Corpous Tag Analysis ------\n')
print('{:<20}{:>30}{:>30}\n'.format('', 'number of tag', 'percentage'))
for i in range(len(tagAll)):
    # The percentage is truncated (not rounded) to five characters.
    print('{:<20}{:>30d}{:>30}'.format(
        tagAll[i]+':', tagCount[i], str(tagCount[i]/tagTotal*100)[:5]+'%'))
print('{:<20}{:>30d}{:>30d}'.format('total', tagTotal, 1))


------ Corpous Tag Analysis ------

                                     number of tag                    percentage

O:                                           78440                        92.57%
B-C:                                          1544                        1.822%
I-C:                                          1650                        1.947%
B-E:                                          1506                        1.777%
I-E:                                          1460                        1.723%
B-CE:                                           64                        0.075%
I-CE:                                           71                        0.083%
total                                        84735                             1


In [None]:
# Tag distribution over the training set only.
# Flatten with a comprehension: sum(list_of_lists, []) copies the growing
# accumulator once per sentence.
labelSeq = [tag for seq in train_labelSeq for tag in seq]
tagAll = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
# Count once; the counts and the total are reused for every printed row.
tagCount = Counter(labelSeq)
tagTotal = sum(tagCount[i] for i in range(len(tagAll)))
print('\n------ Corpous Tag Analysis ------\n')
print('{:<20}{:>30}{:>30}\n'.format('', 'number of tag', 'percentage'))
for i in range(len(tagAll)):
    # The percentage is truncated (not rounded) to five characters.
    print('{:<20}{:>30d}{:>30}'.format(
        tagAll[i]+':', tagCount[i], str(tagCount[i]/tagTotal*100)[:5]+'%'))
print('{:<20}{:>30d}{:>30d}'.format('total', tagTotal, 1))


------ Corpous Tag Analysis ------

                                     number of tag                    percentage

O:                                           66614                        92.58%
B-C:                                          1308                        1.817%
I-C:                                          1421                        1.974%
B-E:                                          1268                        1.762%
I-E:                                          1230                        1.709%
B-CE:                                           55                        0.076%
I-CE:                                           55                        0.076%
total                                        71951                             1


In [None]:
# Tag distribution over the test set only.
# Flatten with a comprehension: sum(list_of_lists, []) copies the growing
# accumulator once per sentence.
labelSeq = [tag for seq in test_labelSeq for tag in seq]
tagAll = ['O', 'B-C', 'I-C', 'B-E', 'I-E', 'B-CE', 'I-CE']
# Count once; the counts and the total are reused for every printed row.
tagCount = Counter(labelSeq)
tagTotal = sum(tagCount[i] for i in range(len(tagAll)))
print('\n------ Corpous Tag Analysis ------\n')
print('{:<20}{:>30}{:>30}\n'.format('', 'number of tag', 'percentage'))
for i in range(len(tagAll)):
    # The percentage is truncated (not rounded) to five characters.
    print('{:<20}{:>30d}{:>30}'.format(
        tagAll[i]+':', tagCount[i], str(tagCount[i]/tagTotal*100)[:5]+'%'))
print('{:<20}{:>30d}{:>30d}'.format('total', tagTotal, 1))


------ Corpous Tag Analysis ------

                                     number of tag                    percentage

O:                                           11826                        92.50%
B-C:                                           236                        1.846%
I-C:                                           229                        1.791%
B-E:                                           238                        1.861%
I-E:                                           230                        1.799%
B-CE:                                            9                        0.070%
I-CE:                                           16                        0.125%
total                                        12784                             1


## Split Dataset

In [None]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import h5py
SEED = 666


def char_data(cw):
    """
    Convert character-split sentences to a padded array of character indices.

    Every word becomes a row of char2index values padded/truncated at the end
    to MAX_CLEN entries; zero rows are then appended so that each sentence
    has MAX_WLEN rows.

    :param cw: list of sentences, each a list of words, each a list of chars
    :return: numpy array stacking one padded matrix per sentence; expected to
             be (len(cw), MAX_WLEN, MAX_CLEN) when no sentence exceeds
             MAX_WLEN words - TODO confirm
    """
    encoded = []
    for sent in cw:
        char_ids = [[char2index[ch] for ch in word] for word in sent]
        word_rows = pad_sequences(
            char_ids, maxlen=MAX_CLEN, padding='post', truncating='post')
        # Zero rows filling the sentence up to MAX_WLEN words.
        filler = np.zeros((MAX_WLEN-len(sent), MAX_CLEN))
        encoded.append(np.concatenate((word_rows, filler), axis=0))
    return np.array(encoded)


# Character-level inputs.
trainCrray = char_data(trainChars)

testCrray = char_data(testChars)

# Word-level inputs: map words to indices, then pad/truncate at the end of
# each sentence to MAX_WLEN entries.
trainSeq = [[word2index[cw] for cw in s] for s in trainWords]
trainWrray = pad_sequences(trainSeq, maxlen=MAX_WLEN,
                           padding='post', truncating='post')

testSeq = [[word2index[cw] for cw in s] for s in testWords]
testWrray = pad_sequences(testSeq, maxlen=MAX_WLEN,
                          padding='post', truncating='post')

# Labels: train_labelSeq / test_labelSeq are re-bound from lists of lists to
# padded arrays here, so this cell is not idempotent and cells that expect
# the list form must run before it.
train_labelSeq = pad_sequences(
    train_labelSeq, maxlen=MAX_WLEN, padding='post', truncating='post')
# Trailing axis of size 1 added to the label array.
train_labelData = train_labelSeq.reshape(len(trainWords), MAX_WLEN, 1)

test_labelSeq = pad_sequences(
    test_labelSeq, maxlen=MAX_WLEN, padding='post', truncating='post')
test_labelData = test_labelSeq.reshape(len(testWords), MAX_WLEN, 1)

def _seeded_shuffle(*arrays):
    """
    Reorder arrays along their first axis with one shared, SEED-derived order.

    :param arrays: one or more numpy arrays whose first axes have equal length
    :return: tuple of reordered copies, in the order given
    """
    order = np.random.RandomState(SEED).permutation(len(arrays[0]))
    return tuple(a[order] for a in arrays)


# The original code called train_test_split(..., test_size=0.) purely as a
# seeded shuffle; current scikit-learn releases reject a test size of 0.0.
# An explicit permutation does the same job on any version.
# NOTE(review): this is expected to reproduce the ordering the old call gave
# (a RandomState(SEED) permutation of the row count) - compare against a
# previously generated train.h5 if exact reproducibility matters.
xTrain, yTrain = _seeded_shuffle(trainWrray, train_labelData)
xTest, yTest = _seeded_shuffle(testWrray, test_labelData)
# Separate calls use the same seed, so arrays with the same number of rows
# receive the same order and stay aligned with xTrain / xTest.
xTrain_c = _seeded_shuffle(trainCrray)[0]
xTest_c = _seeded_shuffle(testCrray)[0]
xTrain_flair = _seeded_shuffle(trainFlair)[0]
xTest_flair = _seeded_shuffle(testFlair)[0]
xTrain_bert = _seeded_shuffle(trainBERT)[0]
xTest_bert = _seeded_shuffle(testBERT)[0]
xTrain_elmo = _seeded_shuffle(trainELMo)[0]
xTest_elmo = _seeded_shuffle(testELMo)[0]

In [None]:
# Sanity check of the array shapes.
# NOTE(review): the recorded output below lists 4501 training rows while the
# embedding progress bars above report 4450 sentences - the saved outputs
# appear to come from different runs; re-run the notebook top to bottom.
xTrain.shape, yTrain.shape, xTest.shape, yTest.shape, xTrain_c.shape, xTest_c.shape, xTrain_flair.shape, xTest_flair.shape

((4501, 58),
 (4501, 58, 1),
 (786, 58),
 (786, 58, 1),
 (4501, 58, 23),
 (786, 58, 23),
 (4501, 58, 4096),
 (786, 58, 4096))

In [None]:
# Context managers close each file even if a create_dataset call fails.
# Existing files are overwritten ('w' mode).
with h5py.File('data/train/train.h5', 'w') as h5f:
    h5f.create_dataset('xTrain', data=xTrain)
    h5f.create_dataset('xTrain_c', data=xTrain_c)
    h5f.create_dataset('yTrain', data=yTrain)

with h5py.File('data/test/test.h5', 'w') as h5f:
    h5f.create_dataset('xTest', data=xTest)
    h5f.create_dataset('xTest_c', data=xTest_c)
    h5f.create_dataset('yTest', data=yTest)

with h5py.File('data/embedding/flair.h5', 'w') as h5f:
    h5f.create_dataset('xTrain_flair', data=xTrain_flair)
    h5f.create_dataset('xTest_flair', data=xTest_flair)

In [None]:
# Context manager closes the file even if a create_dataset call fails.
with h5py.File('data/embedding/bert.h5', 'w') as h5f:
    h5f.create_dataset('xTrain_bert', data=xTrain_bert)
    h5f.create_dataset('xTest_bert', data=xTest_bert)

In [None]:
# Context manager closes the file even if a create_dataset call fails.
with h5py.File('data/embedding/elmo.h5', 'w') as h5f:
    h5f.create_dataset('xTrain_elmo', data=xTrain_elmo)
    h5f.create_dataset('xTest_elmo', data=xTest_elmo)

# Data compress

In [None]:
import zlib

def compress(infile, dst, level=9):
    """
    Write a zlib-compressed copy of a file.

    The input is streamed in 1024-byte chunks, so it is never held in memory
    as a whole. An existing ``dst`` is overwritten.

    :param infile: path of the file to read
    :param dst: path of the compressed file to write
    :param level: zlib compression level passed to zlib.compressobj
    :return: None
    """
    compressor = zlib.compressobj(level)
    # Both handles are closed (and the output flushed) on exit, also on error.
    with open(infile, 'rb') as src, open(dst, 'wb') as out:
        for chunk in iter(lambda: src.read(1024), b''):
            out.write(compressor.compress(chunk))
        # Emit whatever the compressor is still buffering.
        out.write(compressor.flush())


def decompress(infile, dst):
    """
    Write the decompressed content of a zlib-compressed file.

    The input is streamed in 1024-byte chunks. An existing ``dst`` is
    overwritten.

    :param infile: path of the zlib-compressed file to read
    :param dst: path of the decompressed file to write
    :return: None
    """
    decompressor = zlib.decompressobj()
    # Both handles are closed (and the output flushed) on exit, also on error.
    with open(infile, 'rb') as src, open(dst, 'wb') as out:
        for chunk in iter(lambda: src.read(1024), b''):
            out.write(decompressor.decompress(chunk))
        # Emit whatever the decompressor is still buffering.
        out.write(decompressor.flush())
    
#compress('data/embedding/bert.h5',
#         'data/embedding/c_bert.h5')

In [None]:
# Restore bert.h5 from its compressed copy; an existing bert.h5 is overwritten
# and the call fails if c_bert.h5 has not been produced with compress() first.
decompress('data/embedding/c_bert.h5', 'data/embedding/bert.h5')

In [None]:
# Restore elmo.h5 from its compressed copy; an existing elmo.h5 is overwritten
# and the call fails if c_elmo.h5 does not exist.
decompress('data/embedding/c_elmo.h5', 'data/embedding/elmo.h5')

In [None]:
# Restore flair.h5 from its compressed copy; an existing flair.h5 is
# overwritten and the call fails if c_flair.h5 does not exist.
decompress('data/embedding/c_flair.h5', 'data/embedding/flair.h5')