In [16]:
import xml.etree.ElementTree as ET
import nltk
import re
from dataclasses import dataclass
from typing import List

In [45]:
@dataclass
class metaphorSentance:
    sent: str
    metaIdx: int
    punc: str = ".,!?;“”"

    def posTag(self) -> str:
        def puncSwap(char: str, pos: str):
            if char in self.punc:
                return 'PUN'
            else:
                return pos

        return list(map(lambda x: (x[0], puncSwap(x[0], x[1])), \
            nltk.pos_tag(nltk.word_tokenize(self.sent))))

    def lemmatize(self) -> str:
        lmn = nltk.stem.WordNetLemmatizer()

        wordnetConv = {
            'J': 'a',
            'V': 'v',
            'N': 'n',
            'R': 'r'
        }

        return map(lambda x: lmn.lemmatize(x[0], wordnetConv.get(x[1][0], 'n')), self.posTag())

    def split(self, string: str, delim = ' '):
        return re.findall(rf"[\w']+|[{self.punc}]", string)

    def __iter__(self):
        posTags = self.posTag()
        lemmas = self.lemmatize()
        for idx, ((org, tag), lemma) in enumerate(zip(posTags, lemmas)):
            yield idx in self.metaIdx, org, tag, lemma



In [50]:
dir = './DeepMet/data/VUA/raw/'

data = {
    "Obama-waterMeta": {
        "name": "art1",
        "type": "article",
        "data": [
            metaphorSentance(
                "President Obama urged Congress on Tuesday to quickly provide almost $4 billion \
                to confront a surge of young migrants from Central America crossing the border into \
                Texas, calling it “an urgent humanitarian situation.”",
                [15, 18]
            ),
            metaphorSentance(
                "The president said he needed the money to set up new detention facilities, \
                conduct more aerial surveillance and hire immigration judges and Border \
                Patrol agents to respond to the flood of 52,000 children.",
                [30, 35]
            ),
            metaphorSentance(
                "Their sudden mass migration has overwhelmed local resources and touched off protests from residents angry about the impact on the local economy.",
                [3, 5]
            ),
            metaphorSentance(
                "In a letter to congressional leaders, Mr. Obama urged them to “act expeditiously” on his request.",
                []
            )
        ]
    },
}

In [52]:
start = ET.Element('group')
for key in data:
    child = ET.SubElement(start, 'text', {'xmlns': '', 'xml:id': key})
    child = ET.SubElement(child, 'body')
    child = ET.SubElement(child, 'div1', {'n': data[key]['name']})
    child = ET.SubElement(child, 'head', {'type': data[key]['type']})
    for idx, sent in enumerate(data[key]['data']):
        sentEt = ET.SubElement(child, 's', {'n': str(idx + 1)})
        for meta, orig, tag, lemma in sent:
            if tag == 'PUN':
                word = ET.SubElement(sentEt, 'c', {'type': tag})
                word.text = orig
                continue

            word = ET.SubElement(sentEt, 'w', {'lemma': lemma, 'type': tag})

            if meta:
                word = ET.SubElement(word, 'seg', {'function': 'mrw', 'type': 'met', 'vici:morph':'n'})
                word.text = orig
            else:
                word.text = orig
            


In [53]:
ET.tostring(start)

b'<group><text xmlns="" xml:id="Obama-waterMeta"><body><div1 n="art1"><head type="article"><s n="1"><w lemma="President" type="NNP">President</w><w lemma="Obama" type="NNP">Obama</w><w lemma="urge" type="VBD">urged</w><w lemma="Congress" type="NNP">Congress</w><w lemma="on" type="IN">on</w><w lemma="Tuesday" type="NNP">Tuesday</w><w lemma="to" type="TO">to</w><w lemma="quickly" type="RB">quickly</w><w lemma="provide" type="VB">provide</w><w lemma="almost" type="RB">almost</w><w lemma="$" type="$">$</w><w lemma="4" type="CD">4</w><w lemma="billion" type="CD">billion</w><w lemma="to" type="TO">to</w><w lemma="confront" type="VB">confront</w><w lemma="a" type="DT"><seg function="mrw" type="met" vici:morph="n">a</seg></w><w lemma="surge" type="NN">surge</w><w lemma="of" type="IN">of</w><w lemma="young" type="JJ"><seg function="mrw" type="met" vici:morph="n">young</seg></w><w lemma="migrant" type="NNS">migrants</w><w lemma="from" type="IN">from</w><w lemma="Central" type="NNP">Central</w><w