### This notebook converts data from xml format to tsv format

In [1]:
import xml.etree.ElementTree as ET

In [2]:
# Locations of the NTCIR-ECA13-3000 English emotion-cause XML files,
# given relative to the notebook's working directory.
train_path = './NTCIR-ECA13-3000/emotion_cause_english_train.xml'
test_path = './NTCIR-ECA13-3000/emotion_cause_english_test.xml'

In [38]:
# Parse the XML file at test_path and keep its root element for the traversal cell below.
tree = ET.parse(test_path)
root = tree.getroot()
# Namespace in ElementTree's '{uri}' form; it is prepended to each tag name
# when searching, e.g. prefix + 'emotion'.
prefix = '{http://www.w3.org/2009/10/emotionml}'

In [None]:
# Walk every <emotion> document under the root and flatten its clauses into
# c_list. For each clause six items are appended, in this order:
#   clause id, keyword text or 'null', cause text or 'null', clause text,
#   (keyword begin, keyword length) or 'null', (cause begin, cause length) or 'null'.
# NOTE(review): c_list is rebuilt for every document and is never written out
# or stored in this cell, so only the last document's values remain afterwards.
for document in root.findall(prefix + 'emotion'):
    docID = document.get('id')
    category = document.find(prefix + 'category')
    emotion = category.get('name')
    e_value = category.get('value')
    emotion_clauseID = cause_clauseID = -1
    c_list = []
    num_clauses = 0
    for num_clauses, clause in enumerate(document.iter(prefix + 'clause'), start=1):
        # -1 is the "not present in this clause" sentinel for the span fields.
        emotion_begin = emotion_length = cause_begin = cause_length = -1
        keyword_text = cause_text = 'null'

        if clause.get('keywords') == 'Y':
            emotion_clauseID = clause.get('id')
            keywords_elem = clause.find(prefix + 'keywords')
            keyword_text = keywords_elem.text
            emotion_begin = keywords_elem.get('keywords-begin')
            # 'lenth' (sic) is the attribute name used in the XML.
            emotion_length = keywords_elem.get('keywords-lenth')

        if clause.get('cause') == 'Y':
            cause_clauseID = clause.get('id')
            cause_elem = clause.find(prefix + 'cause')
            cause_text = cause_elem.text
            cause_begin = cause_elem.get('begin')
            cause_length = cause_elem.get('lenth')

        c_list.extend([
            clause.get('id'),
            keyword_text,
            cause_text,
            clause.find(prefix + 'text').text,
            'null' if emotion_begin == -1 else (emotion_begin, emotion_length),
            'null' if cause_begin == -1 else (cause_begin, cause_length),
        ])

### Convert JSON format into a tab-separated TSV file

In [61]:
import json
import csv

In [51]:
# Load the test split from eca_test.json (looked up in the working directory)
# into test_data; the TSV-generation cell for the test file reads it.
with open('eca_test.json', 'r') as test:
    test_data = json.load(test)

In [54]:
# Load the train split from eca_train.json (looked up in the working directory)
# into train_data; the TSV-generation cell for the train file reads it.
with open('eca_train.json', 'r') as train:
    train_data = json.load(train)

In [66]:
# Inspect the 'clause' entry of the first training document to see how the
# JSON is structured before generating labels.
(train_data['emotionml']['emotion'][0]['clause'])

list

### Add token-level labels to the TSV files for both the train and test sets

In [None]:
# Generate the token-labelled TSV for the train file.
#
# One row is written per document that has a 'clause' entry:
#   document      - space-joined tokens of all clauses that have a 'text' field
#   token_label   - space-joined labels, one per token:
#                   'O', 'B-CAU', 'I-CAU', 'B-EMO', 'I-EMO' (or '[SEP]')
#   emotion-label - the document's category name
#
# newline='' is what the csv module requires for files handed to csv.writer;
# without it every row is followed by a blank line on Windows.
with open('./eca-train.tsv', 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['document', 'token_label', 'emotion-label'])

    # (clause flag key, span element key, begin key, length key, tag suffix).
    # Cause comes first so that an emotion span overwrites a cause span on
    # tokens where the two overlap.
    span_specs = [
        ('_cause', 'cause', '_begin', '_lenth', 'CAU'),
        ('_keywords', 'keywords', '_keywords-begin', '_keywords-lenth', 'EMO'),
    ]

    for document in train_data['emotionml']['emotion']:
        emotion = document['category']['_name']
        doc = []
        doc_label = []
        if 'clause' not in document:
            continue
        for clause in document['clause']:
            if 'text' not in clause:
                continue
            clause_text = clause['text']
            # NOTE(review): tokens come from split(' ') but the word indices
            # below come from split(); runs of consecutive spaces would make
            # the two disagree - confirm the clause texts are single-spaced.
            text = clause_text.split(' ')  # a list of tokens

            # Every token starts as 'O'; only span tokens are overwritten, so a
            # raw token that happens to equal a tag string cannot leak into the
            # labels.
            token_label = ['O'] * len(text)
            for flag_key, span_key, begin_key, length_key, tag in span_specs:
                if clause[flag_key] != 'Y':
                    continue
                begin_charID = int(clause[span_key][begin_key])  # character index
                span_length = int(clause[span_key][length_key])
                # Word index of the span start = number of words before it.
                begin_wordID = len(clause_text[:begin_charID - 1].split())
                # NOTE(review): the slice ends at begin + length, one character
                # past a 1-based span of `length` characters - confirm against
                # the dataset's offset convention before changing it.
                span_words = clause_text[begin_charID - 1:begin_charID + span_length].split()
                end_wordID = begin_wordID + len(span_words) - 1
                for word_idx in range(begin_wordID, min(end_wordID + 1, len(text))):
                    token_label[word_idx] = ('B-' if word_idx == begin_wordID else 'I-') + tag

            # A trailing separator token keeps its own label.
            if text[-1] == '[SEP]':
                token_label[-1] = '[SEP]'

            doc += text
            doc_label += token_label

        assert len(doc) == len(doc_label)

        tsv_writer.writerow([' '.join(doc), ' '.join(doc_label), emotion])

In [96]:
# Generate the token-labelled TSV for the test file.
#
# One row is written per document that has a 'clause' entry:
#   document      - space-joined tokens of all clauses that have a 'text' field,
#                   with a '[SEP]' token after every clause except the last
#   token_label   - space-joined labels, one per token:
#                   'O', 'B-CAU', 'I-CAU', 'B-EMO', 'I-EMO', '[SEP]'
#   emotion-label - the document's category name
#
# newline='' is what the csv module requires for files handed to csv.writer;
# without it every row is followed by a blank line on Windows.
with open('./eca-test.tsv', 'wt', newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    tsv_writer.writerow(['document', 'token_label', 'emotion-label'])

    # (clause flag key, span element key, begin key, length key, tag suffix).
    # Cause comes first so that an emotion span overwrites a cause span on
    # tokens where the two overlap.
    span_specs = [
        ('_cause', 'cause', '_begin', '_lenth', 'CAU'),
        ('_keywords', 'keywords', '_keywords-begin', '_keywords-lenth', 'EMO'),
    ]

    for document in test_data['emotionml']['emotion']:
        emotion = document['category']['_name']
        doc = []
        doc_label = []
        if 'clause' not in document:
            continue
        clauses = document['clause']
        for clause_idx, clause in enumerate(clauses):
            if 'text' not in clause:
                continue
            clause_text = clause['text']
            # NOTE(review): tokens come from split(' ') but the word indices
            # below come from split(); runs of consecutive spaces would make
            # the two disagree - confirm the clause texts are single-spaced.
            text = clause_text.split(' ')  # a list of tokens
            # Separate clauses with '[SEP]'; the last clause gets none.
            if clause_idx + 1 != len(clauses):
                text += ['[SEP]']

            # Every token starts as 'O'; only span tokens are overwritten, so a
            # raw token that happens to equal a tag string cannot leak into the
            # labels.
            token_label = ['O'] * len(text)
            for flag_key, span_key, begin_key, length_key, tag in span_specs:
                if clause[flag_key] != 'Y':
                    continue
                begin_charID = int(clause[span_key][begin_key])  # character index
                span_length = int(clause[span_key][length_key])
                # Word index of the span start = number of words before it.
                begin_wordID = len(clause_text[:begin_charID - 1].split())
                # NOTE(review): the slice ends at begin + length, one character
                # past a 1-based span of `length` characters - confirm against
                # the dataset's offset convention before changing it.
                span_words = clause_text[begin_charID - 1:begin_charID + span_length].split()
                end_wordID = begin_wordID + len(span_words) - 1
                for word_idx in range(begin_wordID, min(end_wordID + 1, len(text))):
                    token_label[word_idx] = ('B-' if word_idx == begin_wordID else 'I-') + tag

            # The separator always carries its own label, even if a span's word
            # range ran onto it.
            if text[-1] == '[SEP]':
                token_label[-1] = '[SEP]'

            doc += text
            doc_label += token_label

        assert len(doc) == len(doc_label)

        # Join tokens and labels with spaces, matching the train TSV; writing
        # the lists directly would store their Python repr
        # ("['tok', ...]") in the cell.
        tsv_writer.writerow([' '.join(doc), ' '.join(doc_label), emotion])
