In [51]:
import xml.etree.ElementTree as ET
import os
import glob

In [18]:
# Get a list of words in the textblock along with its corresponding references
# words_arr: a list of words in the text block 
# ref_arr: a list of refs which correspond the each word in words_arr
def read_ink(path):
    tree = ET.parse(path)
    root = tree.getroot()

    traceView = root.find("traceView")

    traceView2 = traceView.findall("traceView")

    textBlockRoots = []

    for t in traceView2:
        annotation = t.find("annotation")

        if (annotation.text == "Textblock"):
            textBlockRoots.append(t)

    words_arr = []
    ref_arr = []

    for textBlock in textBlockRoots:
        for textLine in textBlock.findall("traceView"):
            string = textLine[1].text

            for i in range(2, len(textLine)):
                wordElement = textLine[i]
                word = wordElement[1].text

                if word is not None:
                    words_arr.append(word)

                    refs = wordElement.findall('traceView')
                    
                    ref_list = []
                
                for ref in refs:
                    ref_list.append(ref.attrib['traceDataRef'])

                ref_arr.append(ref_list)
                    
    return words_arr, ref_arr
    

In [43]:
# Returns a dict that maps ref id -> digital ink time series
def get_digital_ink(path):
    tree = ET.parse(path)
    root = tree.getroot()
    traces = root.findall("trace")

    digital_ink_dict = {}


    for trace in traces:
        digital_ink_dict[trace.attrib['{http://www.w3.org/XML/1998/namespace}id']] = trace.text

    return digital_ink_dict

In [41]:
words_arr, ref_arr = read_ink("datasets/001e.inkml")

In [42]:
print(words_arr)

['And', 'he', 'had', 'a', 'feeling', '-', 'thanks', 'to', 'the', 'girl', '-', 'that', 'things', 'would', 'get', 'worse', 'before', 'they', 'got', 'better', '.', 'They', 'had', 'the', 'house', 'cleaned', 'up', 'by', 'noon', ',', 'and', 'Wilson', 'sent', 'the', 'boy', 'out', 'to', 'the', 'meadow', 'to', 'bring', 'in', 'the', 'horses']


In [32]:
print(ref_arr)

[['#t42', '#t43', '#t44', '#t45', '#t46', '#t47'], ['#t48'], ['#t49', '#t50', '#t51', '#t52'], ['#t53'], ['#t54', '#t55', '#t56', '#t57', '#t58', '#t59'], ['#t60'], ['#t61', '#t62', '#t63', '#t64', '#t65', '#t66', '#t67', '#t68'], ['#t69', '#t70'], ['#t71', '#t72', '#t73'], ['#t74', '#t75', '#t76', '#t77', '#t78'], ['#t79'], ['#t80', '#t81', '#t82', '#t83', '#t84', '#t85'], ['#t86', '#t87', '#t88', '#t89', '#t90', '#t91', '#t92'], ['#t93', '#t94', '#t95', '#t96', '#t97'], ['#t98', '#t99', '#t100'], ['#t101', '#t102', '#t103', '#t104'], ['#t105', '#t106'], ['#t107', '#t108', '#t109', '#t110', '#t111'], ['#t112', '#t113', '#t114'], ['#t115', '#t116'], ['#t117'], ['#t118', '#t119', '#t120', '#t121', '#t122'], ['#t123', '#t124', '#t125', '#t126'], ['#t127', '#t128', '#t129'], ['#t130', '#t131', '#t132', '#t133'], ['#t134', '#t135', '#t136', '#t137', '#t138', '#t139', '#t140'], ['#t141', '#t142'], ['#t143', '#t144', '#t145'], ['#t146', '#t147', '#t148', '#t149'], ['#t150'], ['#t151', '#t152

In [55]:
ref_dict = get_digital_ink("./datasets/001e.inkml")

In [56]:
print(ref_dict['t0'])

 150.625 423.25 1233499039.688 0,'0'-.125'.026'94,"-.125"0".014"-76, .125 .125-.026-16, 0 0-.001 0, 0-.125 0-2, .25 .125 .041 2, 2 2.125 .106 0, 2.25 2.625-.067-2, 2.875 1.25 .04 0, 1.75 3.75 .067 0,-4-5.125-.026 0,-5.125-4.75-.148-4,-.25 .125-.012-10,-.25 .125-.001-88


In [54]:
for filename in glob.glob("./datasets/*"):
    print(filename)

./datasets/007.inkml
./datasets/001f.inkml
./datasets/001g.inkml
./datasets/001.inkml
./datasets/001e.inkml
./datasets/003.inkml
./datasets/001a.inkml
./datasets/002.inkml
./datasets/001b.inkml
./datasets/001c.inkml
./datasets/008.inkml
./datasets/009.inkml
./datasets/005.inkml
./datasets/001d.inkml
./datasets/006.inkml
