In [1]:
%load_ext autoreload
%autoreload 2

Make plain text data, remembering the nodes that the text comes from.

In [2]:
import os
import sys
import json
from unicodedata import name as uname
from IPython.display import display, HTML

from tf.app import use
from tf.convert.recorder import Recorder
from tf.core.helpers import specFromRanges, rangesFromSet

In [3]:
# NOTE(review): this looks like a leftover scratch cell. It writes a byte order mark
# followed by the letter "x", encoded as UTF-16-LE, to a file in ~/Downloads.
# Nothing in the visible part of this notebook reads that file back — confirm and
# consider removing the cell.
with open(os.path.expanduser("~/Downloads/resultsp.tsv"), "w", encoding="utf_16_le") as fh:
    fh.write("\ufeffx")

In [4]:
A = use("nena:clone", checkout="clone", hoist=globals())

In [5]:
# One report line per node type: number of nodes, type name, rounded average size.
for (nodeType, avgSize, firstNode, lastNode) in C.levels.data:
    nNodes = lastNode - firstNode + 1
    avgChars = int(round(avgSize))
    print(f"{nNodes:>6} {nodeType:<12} nodes average {avgChars:>6} chars")

     2 dialect      nodes average 269689 chars
   126 text         nodes average   4281 chars
   350 paragraph    nodes average   1541 chars
  2544 line         nodes average    212 chars
 16326 sentence     nodes average     33 chars
 24497 subsentence  nodes average     22 chars
 36444 inton        nodes average     15 chars
 93766 stress       nodes average      6 chars
120151 word         nodes average      4 chars
539378 letter       nodes average      1 chars


# Generate full layered data

We use the `full` transcription.

We remember nodes of the types *letter*, *word*, *sentence*, *line*, and *text*.

We store the positions by node type.

In [6]:
# Locations on the local file system; everything lives in a clone of the repo under ~/github.
GH = os.path.expanduser("~/github")
ORG = "CambridgeSemiticsLab"
REPO = "nena_tf"
REL = "nena2search/app"
# Directory that receives the generated config.js and corpus.js (see dumpData).
OUTPUT = f"{GH}/{ORG}/{REPO}/{REL}"
# Directory that receives the human readable debug files (see dumpData with debug=True).
DEBUG_OUTPUT = f"{GH}/{ORG}/{REPO}/_local"
# Directory that receives one debug text file per text node.
TEST_TEXTS_DIR = f"{DEBUG_OUTPUT}/texts"

In [7]:
# General explanation of layered search.
# NOTE(review): HELP is not referenced in the visible part of this notebook
# (the generated config uses DESCRIPTION) — confirm whether it is still needed.
HELP = """
This is a <i>layered</i> search interface.

The corpus is divided into levels, e.g. book/chapter/verse/sentence/word/line/letter.

The corpus can be represented at each level, e.g. by book title, chapter number
Each layer is a text representation of the complete corpus.
"""

In [8]:
# Placeholder used as `default` for the phonetic layers and the pos layer.
PH_ABSENT = "z"
# Placeholder used as `default` for the textual layers.
CH_ABSENT = "¿"

# The node type that acts as the lowest level of the generated data;
# it selects which entry of SETTINGS is used (see the call to record()).
BASE_LEVEL = "word"
# BASE_LEVEL = "letter"

# Layer settings per base level; filled in by the cells below.
SETTINGS = {}
# Value of the `simpleBase` config setting, per base level.
SIMPLE_BASE=dict(letter=True, word=False)

# Name of the app, passed on in the generated config.
NAME = "nena"

# HTML description of the app, passed on in the generated config.
DESCRIPTION = """
<p>Phonetic search interface for the Northeastern Neo-Aramaic Text-Fabric Corpus.</p>
<p>Based on <a href="https://github.com/CambridgeSemiticsLab/nena_tf" target="_blank">NENA data in Text-Fabric format</a>.</p>
<p>See the
<a href="https://github.com/CambridgeSemiticsLab/nena_tf/blob/master/docs/features.md" target="_blank">data documentation</a>.</p>
<p>This is a standalone app. You download it to your computer, and then it works without
connection to the internet.</p>
<p>This web app is by:</p>
<ul>
<li> <a href="https://www.ames.cam.ac.uk/people/professor-geoffrey-khan" target="_blank">Geoffrey Khan</a> (initiator)
<li> <a href="https://www.linkedin.com/in/cody-kingham-1135018a" target="_blank">Cody Kingham</a> (corpus developer)
<li> <a href="https://pure.knaw.nl/portal/en/persons/dirk-roorda" target="_blank">Dirk Roorda</a> (software developer)
</ul>
"""

In [9]:
# Descriptions of the individual layers, shown to the user of the search interface.
# Each description of a word/letter layer ends in an example of the layer's representation.
DESC_TEXT = "text precise, complete, uses non-ascii: <code>maqəlbə̀nna</code>"
DESC_FULL = "text representation: <code>maq9lb9`nna</code>"
DESC_FUZZY = "text representation: <code>maqilbinna</code>"
DESC_LITE = "text representation: <code>maq9lb9nna</code>"
DESC_POS = "part-of-speech"
DESC_CLS = "phonetic class: <code>CVCVCCVCCV</code>"
DESC_VOICE = "phonation: <code>PzzzPVzPPz</code>"
# The place and manner layers used to carry the label "phonation" as well (copied from
# DESC_VOICE); they are now labelled after the features they represent.
DESC_PLACE = "phonetic place: <code>BzXzDBzDDz</code>"
DESC_MANNER = "phonetic manner: <code>NzAzLAzNNz</code>"

DESC_LANG = "language, indicated by a number"
DESC_SPEAKER = "speaker, indicated by a number"

DESC_NUMBER = "line number"

DESC_TITLE = "title of a text"
DESC_DIALECT = "dialect of a text <code>Barwar Urmi_C</code>"
DESC_TID = "id of a text"
DESC_TPLACE = "place of a text"

# Descriptions of the levels (node types).
DESC_L_LETTER = "Some letters are expressed by multiple characters in some representations."
DESC_L_WORD = "Some words are affixed to others without intervening space."
DESC_L_SENTENCE = "Sentences are delimited by full stops."
DESC_L_LINE = "Lines are really paragraphs."
DESC_L_TEXT = "Texts are stories, having some metadata, consisting of lines."

In [10]:
# Mappings from feature values to the short codes that end up in the layer texts.

# Languages and speakers are numbered from 1 in the order in which freqList() delivers them
# (presumably by descending frequency — see the Text-Fabric docs for freqList).
MAP_LANG = {x[0]: i + 1 for (i, x) in enumerate(F.lang.freqList())}
MAP_SPEAKER = {x[0]: i + 1 for (i, x) in enumerate(F.speaker.freqList())}

# The phonetic properties are all mapped to exactly one character per value.
MAP_CLS = {
    "vowel": "V",
    "consonant": "C",
}
MAP_VOICE = {
    "plain": "P",
    "unvoiced_aspirated": "H",
    "voiced": "V",
    "unvoiced": "F",
    "unvoiced_unaspirated": "G",
    "emphatic": "X",
}
# This used to read `MAP_PLACE = map = {...}`, which rebound the builtin `map`
# for the rest of the notebook; only MAP_PLACE is assigned now.
MAP_PLACE = {
    "dental-alveolar": "D",
    "labial": "B",
    "palatal-alveolar": "C",
    "palatal": "J",
    "velar": "G",
    "uvular": "X",
    "pharyngeal": "Q",
    "laryngeal": "H",
}
MAP_MANNER = {
    "affricative": "A",
    "nasal": "N",
    "other": "X",
    "fricative": "F",
    "lateral": "L",
    "sibilant": "S",
}
# Part-of-speech codes: the atomic values first,
# then the compound values, whose codes are the codes of their parts joined by "|".
MAP_POS = {
    "NOUN": "n",
    "PART": "pt",
    "PRON": "pn",
    "NUMR": "nr",
    "ADJV": "aj",
    "ADVB": "ab",
    "MODI": "m",
    "INTJ": "i",
    "PREP": "pp",
    "VERB": "v",
    "NOUN|PART": "n|pt",
    "NOUN|NOUN": "n|n",
    "PRON|PART": "pn|pt",
    "PART|PRON": "pt|pn",
    "MODI|NOUN": "m|n",
    "MODI|PRON": "m|pn",
    "PART|NOUN": "pt|n",
    "ADVB|NOUN": "ab|n",
    "NOUN|ADVB": "n|ab",
    "NOUN|ADJV": "n|aj",
    "ADJV|ADJV": "aj|aj",
    "ADJV|NOUN": "aj|n",
    "NUMR|NUMR": "nr|nr",
    "ADJV|ADVB": "aj|ab",
    # was "n|intj", which did not match the code "i" of the atomic value INTJ
    "NOUN|INTJ": "n|i",
    "NOUN|NOUN|NOUN": "n|n|n",
    "PART|PART|PART": "pt|pt|pt",
    "ADJV|NOUN|NOUN": "aj|n|n",
    "ADJV|NOUN|NOUN|NOUN": "aj|n|n|n",
    "NOUN|NOUN|NOUN|NOUN": "n|n|n|n",
}

In [11]:
# Style sheet for the character-by-character display produced by showString/showChar:
# `div.sr` is the row of characters, `div.ch` is the box of a single character,
# `div.cn` is a line inside such a box, `span.cni` frames the character itself,
# `.lc` styles the character and `.c` styles its code point and name.
# NOTE(review): showString uses the class `r`, for which there is no rule here.
CSS = """
<style>
.lc {
    font-family: normal, sans-serif;
    font-size: 22pt;
    background-color: white;
    border: 2pt solid #ffcccc;
}
.l {
    font-family: normal, sans-serif;
    font-size: x-large;
    direction: ltr;
    unicode-bidi: isolate-override;
}
.c {
    font-family: monospace;
    font-size: x-small;
    direction: ltr;
    unicode-bidi: isolate-override;
}
.p {
    font-family: monospace;
    font-size: medium;
    font-weight: bold;
    background-color: yellow;
    direction: ltr;
    unicode-bidi: isolate-override;
}
.lrg {
    font-size: 22pt;
    font-weight: bold;
}
span.sp {
    background-color: rgba(0, 255, 0, 0.5);
}
div.cn {
    text-align: center
}
div.ch.p {
    background-color: #ffeedd;
    text-align: center
}
span.cni {
    background-color: #eeeeee;
    padding-top: 4pt;
    padding-bottom: 4pt;
    padding-left: 8pt;
    padding-right: 8pt;
    border: 2pt solid #66aaaa;
    display: inline-block;
}
div.ch,div.cht,div.chs {
    border: 2pt solid #cccccc;
    display: inline-flex;
    flex-flow: column nowrap;
    max-width: 10em;
}
div.ch {
    background-color: #ddffff;
}
div.chs {
    background-color: #ccffcc;
}
div.chm {
    background-color: #44ff44;
}
div.sr {
    display: flex;
    flex-flow: row wrap;
}
:root {
    --fog-rim:          hsla(  0,   0%,  60%, 0.5  );
}
</style>
"""

# Load the style sheet into the notebook, so that later cell outputs can use it.
display(HTML(CSS))


def showString(s, asString=False):
    """Show a string as a whole, followed by a box for each of its characters.

    The string is HTML-escaped before it is put in the markup, so that strings
    containing `<`, `>` or `&` are shown literally.

    Parameters
    ----------
    s: string
        The string to show.
    asString: boolean, optional False
        If True, the HTML is returned instead of displayed.

    Returns
    -------
    string | None
        The HTML if `asString` is True; otherwise nothing is returned
        and the HTML is displayed in the notebook.
    """
    from html import escape

    shtml = f"""<span class="r">{escape(s)}</span>"""
    # one box per character, laid out in a row
    boxes = """<div class="sr">""" + "".join(showChar(c) for c in s) + "</div>"

    if asString:
        return f"""<span>{shtml}</span>{boxes}"""

    display(HTML(f"""<p>{shtml}</p>{boxes}"""))
    
def showChar(c):
    """Produce the HTML of a box that shows a single character.

    The box has three lines: the code point in hexadecimal, the character itself,
    and the Unicode name of the character.

    Parameters
    ----------
    c: string
        A single character, or the empty string.
        The empty string gives a box with a non-breaking space and the name `EMPTY`.

    Returns
    -------
    string
        The HTML of the box.
    """
    from html import escape

    if c == "":
        ccode = ""
        crep = "\u00a0"
        cname = "EMPTY"
    else:
        ccode = f"""<span class="c">{ord(c):>04x}</span>"""
        # escape the character, so that "<" and "&" do not break the markup,
        # and close the span that is opened here
        crep = f"""<span class="lc">{escape(c)}</span>"""
        # not all characters have a name (e.g. newline and tab):
        # without a default, the lookup raises a ValueError for them
        cname = f"""<span class="c">{uname(c, "NO NAME")}</span>"""

    return f"""
<div class="ch">
    <div class="cn">{ccode}</div>
    <div class="cn"><span class="cni">{crep}</span></div>
    <div class="cn">{cname}</div>
</div>
"""
    
def findFirstDiff(feat1, feat2):
    """Report the first word whose values for two features differ in length.

    The words are visited in the order delivered by `F.otype.s("word")`.
    Only the lengths of the values are compared, not the values themselves;
    a missing value counts as the empty string.

    If such a word is found, its node and both values are shown.
    Otherwise it is reported that the features are equal.

    Parameters
    ----------
    feat1, feat2: string
        The names of the features to compare.
    """
    valueOf1 = Fs(feat1).v
    valueOf2 = Fs(feat2).v

    for node in F.otype.s("word"):
        text1 = valueOf1(node) or ""
        text2 = valueOf2(node) or ""

        if len(text1) != len(text2):
            print(f"{feat1} and {feat2} NOT EQUAL")
            print(f"node {node}")
            showString(text1)
            showString(text2)
            return

    print(f"{feat1} and {feat2} EQUAL")

In [12]:
# Compare the textual features pairwise, to see whether their values have equal lengths.
for featurePair in (("text", "fuzzy"), ("text", "lite"), ("fuzzy", "lite")):
    findFirstDiff(*featurePair)

text and fuzzy NOT EQUAL
node 713443


text and lite NOT EQUAL
node 713443


fuzzy and lite NOT EQUAL
node 713454


In [13]:
# Layer settings for the case where the word is the base level.
#
# The keys are node types, listed from the base level upwards; the order matters,
# because checkSettings derives the sequence of levels from it.
#
# Per node type (as used by checkSettings and record):
#   description:  description of the level
#   layers:       the layers of this level, see below; the first layer is the one
#                 whose value record() reports back for each node
#   afterFeature: feature with the material that follows a node of this type
#   afterDefault: material that follows a node if afterFeature gives no value
#   by:           whether this node type is the container of results
#
# Per layer (as used by checkSettings and record):
#   feature:     the feature from which the values are taken
#   description: description of the layer
#   descend:     take the values from the nodes of this lower type
#                inside the node and concatenate them
#   ascend:      take the value from the node of this higher type that contains the node
#   map:         mapping from feature values to the codes that end up in the layer text
#   default:     what to use if there is no value (or no code for it)
#   pos:         None if the layer gets its own recorder and positions;
#                otherwise the name of the layer whose positions this layer shares
#   visible:     whether the layer is initially visible in the results
#   example:     initial value for the search in this layer
SETTINGS["word"] = dict(
    word=dict(
        description=DESC_L_WORD,
        layers=dict(
            lang=dict(
                feature="lang",
                description=DESC_LANG,
                map=MAP_LANG,
                default=0,
                pos=None,
                visible=False,
            ),
            speaker=dict(
                feature="speaker",
                description=DESC_SPEAKER,
                map=MAP_SPEAKER,
                default=0,
                pos=None,
                visible=False,
            ),
            text=dict(
                feature="text",
                description=DESC_TEXT,
                map=None,
                default=CH_ABSENT,
                pos=None,
                visible=True,
            ),
            full=dict(
                feature="full",
                description=DESC_FULL,
                map=None,
                default=CH_ABSENT,
                pos=None,
                visible=False,
            ),
            fuzzy=dict(
                feature="fuzzy",
                description=DESC_FUZZY,
                map=None,
                default=CH_ABSENT,
                pos=None,
                visible=True,
                example="mute",
            ),
            lite=dict(
                feature="lite",
                description=DESC_LITE,
                map=None,
                default=CH_ABSENT,
                pos=None,
                visible=False,
            ),
            pos=dict(
                feature="pos",
                description=DESC_POS,
                map=MAP_POS,
                default=PH_ABSENT,
                visible=False,
                pos=None,
            ),
            # the phonetic layers are composed from the letters of the word;
            # cls carries the positions, the other phonetic layers share them
            cls=dict(
                feature="phonetic_class",
                description=DESC_CLS,
                descend="letter",
                map=MAP_CLS,
                default=PH_ABSENT,
                visible=False,
                pos=None,
            ),
            voice=dict(
                feature="phonation",
                description=DESC_VOICE,
                descend="letter",
                map=MAP_VOICE,
                default=PH_ABSENT,
                pos="cls",
                visible=False,
            ),
            place=dict(
                feature="phonetic_place",
                description=DESC_PLACE,
                descend="letter",
                map=MAP_PLACE,
                default=PH_ABSENT,
                pos="cls",
                visible=False,
            ),
            manner=dict(
                feature="phonetic_manner",
                description=DESC_MANNER,
                descend="letter",
                map=MAP_MANNER,
                default=PH_ABSENT,
                pos="cls",
                visible=False,
            ),
        ),
        afterFeature="full_end",
        afterDefault="/",
    ),
    # sentences have no layers of their own; they are the containers of the results
    sentence=dict(
        description=DESC_L_SENTENCE,
        afterDefault="\n",
        by=True,
    ),
    line=dict(
        description=DESC_L_LINE,
        layers=dict(
            number=dict(
                feature="line_number",
                description=DESC_NUMBER,
                map=None,
                default=-1,
                pos=None,
                visible=False,
            ),
        ),
        afterDefault="\n",
    ),
    text=dict(
        description=DESC_L_TEXT,
        layers=dict(
            title=dict(
                feature="title",
                description=DESC_TITLE,
                map=None,
                default="¿",
                pos=None,
                visible=False,
                example="A",
            ),
            # the dialect is a feature of the dialect node that contains the text
            dialect=dict(
                feature="dialect",
                description=DESC_DIALECT,
                ascend="dialect",
                map=None,
                default="¿",
                pos=None,
                visible=False,
            ),
            tid=dict(
                feature="text_id",
                description=DESC_TID,
                map=None,
                default="¿",
                pos=None,
                visible=False,
            ),
            place=dict(
                feature="place",
                description=DESC_TPLACE,
                map=None,
                default="¿",
                pos=None,
                visible=False,
                example="Dure",
            ),
        ),
        afterDefault="\n",
    ),
)

In [14]:
# Layer settings for the case where the letter is the base level.
#
# The keys are node types, listed from the base level upwards; the order matters,
# because checkSettings derives the sequence of levels from it.
#
# The textual and phonetic layers are layers of the letter here, taken directly from
# the letter nodes (no `descend`); the word only keeps its lang and speaker layers.
# The sentence, line and text levels have the same settings as in SETTINGS["word"].
#
# The keys per node type and per layer have the same meaning as in SETTINGS["word"]:
# they are read by checkSettings and record.
SETTINGS["letter"] = dict(
    letter=dict(
        description=DESC_L_LETTER,
        layers=dict(
            text=dict(
                feature="text",
                description=DESC_TEXT,
                map=None,
                default=CH_ABSENT,
                pos=None,
                visible=True,
            ),
            full=dict(
                feature="full",
                description=DESC_FULL,
                map=None,
                default=CH_ABSENT,
                pos=None,
                visible=False,
            ),
            fuzzy=dict(
                feature="fuzzy",
                description=DESC_FUZZY,
                map=None,
                default=CH_ABSENT,
                pos=None,
                visible=True,
                example="mute",
            ),
            lite=dict(
                feature="lite",
                description=DESC_LITE,
                map=None,
                default=CH_ABSENT,
                pos=None,
                visible=False,
            ),
            # NOTE(review): in SETTINGS["word"] the pos layer is a layer of the word;
            # here it is read from letter nodes — confirm that letters carry a `pos` feature.
            pos=dict(
                feature="pos",
                description=DESC_POS,
                map=MAP_POS,
                default=PH_ABSENT,
                visible=False,
                pos=None,
            ),
            # cls carries the positions, the other phonetic layers share them
            cls=dict(
                feature="phonetic_class",
                description=DESC_CLS,
                map=MAP_CLS,
                default=PH_ABSENT,
                visible=False,
                pos=None,
            ),
            voice=dict(
                feature="phonation",
                description=DESC_VOICE,
                map=MAP_VOICE,
                default=PH_ABSENT,
                pos="cls",
                visible=False,
            ),
            place=dict(
                feature="phonetic_place",
                description=DESC_PLACE,
                map=MAP_PLACE,
                default=PH_ABSENT,
                pos="cls",
                visible=False,
            ),
            manner=dict(
                feature="phonetic_manner",
                description=DESC_MANNER,
                map=MAP_MANNER,
                default=PH_ABSENT,
                pos="cls",
                visible=False,
            ),
        ),
    ),
    word=dict(
        description=DESC_L_WORD,
        layers=dict(
            lang=dict(
                feature="lang",
                description=DESC_LANG,
                map=MAP_LANG,
                default=0,
                pos=None,
                visible=False,
            ),
            speaker=dict(
                feature="speaker",
                description=DESC_SPEAKER,
                map=MAP_SPEAKER,
                default=0,
                pos=None,
                visible=False,
            ),
        ),
        afterFeature="full_end",
        afterDefault="/",
    ),
    # sentences have no layers of their own; they are the containers of the results
    sentence=dict(
        description=DESC_L_SENTENCE,
        afterDefault="\n",
        by=True,
    ),
    line=dict(
        description=DESC_L_LINE,
        layers=dict(
            number=dict(
                description=DESC_NUMBER,
                feature="line_number",
                map=None,
                default=-1,
                pos=None,
                visible=False,
            ),
        ),
        afterDefault="\n",
    ),
    text=dict(
        description=DESC_L_TEXT,
        layers=dict(
            title=dict(
                feature="title",
                description=DESC_TITLE,
                map=None,
                default="¿",
                pos=None,
                visible=False,
                example="A",
            ),
            # the dialect is a feature of the dialect node that contains the text
            dialect=dict(
                feature="dialect",
                description=DESC_DIALECT,
                ascend="dialect",
                map=None,
                default="¿",
                pos=None,
                visible=False,
            ),
            tid=dict(
                feature="text_id",
                description=DESC_TID,
                map=None,
                default="¿",
                pos=None,
                visible=False,
            ),
            place=dict(
                feature="place",
                description=DESC_TPLACE,
                map=None,
                default="¿",
                pos=None,
                visible=False,
                example="Dure",
            ),
        ),
        afterDefault="\n",
    ),
)

In [15]:
def checkSettings(baseLevel):
    """Collect and check the settings for a base level.

    The layer settings for the base level are looked up in `SETTINGS`.
    It is reported which node type is the container of results (`by`)
    and which layers are visible (`visible`); problems go to stderr.

    As a side effect, every value map that is present in a layer with a non-None default
    gets an extra entry that maps the empty string to that default.
    The maps are modified in place.

    Parameters
    ----------
    baseLevel: string
        The node type that acts as base level; must be a key of `SETTINGS`.

    Returns
    -------
    dict
        With keys `name`, `description`, `layerSettings`, `typeSeq` (the node types
        from the base level upwards), `typesLower` (per node type: the types up to and
        including that type), `simpleBase` and `containerType` (the first node type
        with `by`, or None if there is none).
    """
    # the newline was missing here, which glued the next message to this one
    sys.stdout.write(f"Making data based on {baseLevel}-settings\n")

    layerSettings = SETTINGS[baseLevel]
    typeSeq = list(layerSettings)
    typesLower = {tp: typeSeq[0 : i + 1] for (i, tp) in enumerate(typeSeq)}

    settings = dict(
        name=NAME,
        description=DESCRIPTION,
        layerSettings=layerSettings,
        typeSeq=typeSeq,
        typesLower=typesLower,
        simpleBase=SIMPLE_BASE[baseLevel],
    )

    # check visible and by attributes

    theBys = []
    theVisibles = []

    for (nType, typeInfo) in layerSettings.items():
        if typeInfo.get("by", False):
            theBys.append(nType)

        for (name, layerInfo) in typeInfo.get("layers", {}).items():
            if layerInfo.get("visible", False):
                theVisibles.append((nType, name))

            # make sure that the empty value is mapped to the default
            theMap = layerInfo.get("map", None)
            if theMap is not None:
                default = layerInfo.get("default", None)
                if default is not None:
                    theMap[""] = default

    if len(theBys) == 0:
        containerType = None
        sys.stderr.write("No node type is declared as result container ('by')\n")
    else:
        # if there are several candidates, the first one wins
        containerType = theBys[0]
        if len(theBys) > 1:
            sys.stderr.write("Multiple node types declared as result container ('by'):\n")
            sys.stderr.write("\t" + (", ".join(theBys)) + "\n")
        else:
            sys.stdout.write("Node type declared as result container ('by'):\n")
            sys.stdout.write(f"\t{containerType}\n")

    settings["containerType"] = containerType

    sys.stderr.flush()
    sys.stdout.flush()

    if len(theVisibles) == 0:
        sys.stderr.write("No layer type is declared as visible in the result ('visible')\n")
    else:
        sys.stdout.write("Layers declared as visible in the result ('visible'):\n")
        sys.stdout.write("\t" + (", ".join("/".join(s) for s in theVisibles)) + "\n")

    sys.stderr.flush()
    sys.stdout.flush()
    return settings

We take care that for every phonetic property, the value is always exactly one character, no more no less.
That means that all recorded phonetic texts have the same mapping between character positions and slot numbers.

For the full text it is different: there are 18 letters with an empty full text, and some letters use multiple characters for their full text.

In the end, we only have to produce two mappings for the character node type: for the full text and for the phonetics.
We choose the phonetic `cls` text to carry the phonetic mapping.

As to the mapping from letter nodes to words, sentences, lines and texts: we only need to do that once, and we create
it as a single *up* relation, stored outside the recorders.

The *up* relation goes from nodes from one type to containing nodes of another type.

We make use of the fact that texts are built from lines, which are built from sentences, which are built from words,
which are built from characters. 

This simplifies the *up* relation considerably: we may assume that every node *n* has a single *up* parent:

* look in the node type that is one level higher than the type of *n*
* pick a node *u* in that type that embeds *n*
* *u* must be the only node with that property w.r.t. *n*, since the nodes of these types act as building blocks.

Another simplifying hypothesis that holds for this data, is that each character position corresponds with at most
one node per node type.

So if we have a set of nodes that all correspond with the same character position, they must all belong to different types.
Hence, when we organize mappings from character positions to nodes, and we do that for each node type separately,
then such mappings map each character position to at most one node.

Characters on positions that are not mapped by a layer to nodes cannot be compared with character positions in other layers.
So they will fall out of the results if more than one layer is being compared.

**N.B.**

These simplifying hypotheses make it easier to code the layered search interface in JavaScript.
But they are not needed for the concept to work.

Since this is my first implementation of layered search, written under time constraints, I gratefully make use of these
simplifications.

In [16]:
def compress(data):
    """Represent a mapping of nodes to values as a compact list of lines.

    The nodes that have the same value are taken together.
    Each line consists of a specification of such a group of nodes, a tab, and the value.
    A group of one node is specified by the node itself; bigger groups are specified
    by what `specFromRanges(rangesFromSet(...))` makes of them.
    The lines are ordered by the first and then the last node of their groups.

    Parameters
    ----------
    data: dict
        Keyed by node, the values must be hashable.

    Returns
    -------
    list of string
    """
    nodesByValue = {}

    # the nodes are visited in sorted order, so every group ends up sorted
    for node in sorted(data):
        nodesByValue.setdefault(data[node], []).append(node)

    def firstAndLast(item):
        nodes = item[1]
        return (nodes[0], nodes[-1])

    lines = []

    for (value, nodes) in sorted(nodesByValue.items(), key=firstAndLast):
        if len(nodes) == 1:
            nodeSpec = nodes[0]
        else:
            nodeSpec = specFromRanges(rangesFromSet(nodes))
        lines.append(f"{nodeSpec}\t{value}")

    return lines

def invert(data):
    """Swap the keys and values of a dict.

    If a value occurs more than once, the key that comes last wins.
    """
    flipped = {}
    for (key, value) in data.items():
        flipped[value] = key
    return flipped

In [23]:
def invertMap(map):
    """Swap the keys and values of a value map, passing None through.

    If a value occurs more than once, the key that comes last wins.
    """
    if map is None:
        return None

    return dict(zip(map.values(), map.keys()))


def record(baseLevel):
    """Generate the layered data for the search interface.

    The corpus is walked text by text, line by line, sentence by sentence,
    word by word and, if the base level is `letter`, letter by letter.
    For every node the values of the layers of its type are added to the layer texts,
    followed by the material after the node.

    Layers whose `pos` setting is None get a Text-Fabric `Recorder`,
    from which the text and the positions are obtained in the end.
    Layers whose `pos` setting names another layer only accumulate text in a list.

    Parameters
    ----------
    baseLevel: string
        The node type that acts as base level; it is passed to `checkSettings`.

    Returns
    -------
    tuple
        `(config, data, testTexts)` where

        * `config` is a dict with the configuration of the search interface;
        * `data` is a dict with keys `texts` (layer texts by node type and layer name),
          `positions` (per node type and recorded layer: a list of nodes, see `deliverAll`)
          and `up` (the compressed mapping of nodes to their parents one level higher);
        * `testTexts` is a list of pairs `(text node, list of strings)`
          with a human readable rendering of what has been added for that text.
    """
    A.indent(reset=True)
    A.info("preparing ... ")
    settings = checkSettings(baseLevel)
    layerSettings = settings["layerSettings"]
    typeSeq = settings["typeSeq"]
    typesLower = settings["typesLower"]
    letterLevel = baseLevel == "letter"

    A.info("start recording")

    up = {}  # node => the node one level higher that contains it
    visible = {}  # node type => layer name => whether the layer is visible
    layers = {}  # node type => layer name => layer info for the config
    levels = {}  # node type => description of the level
    texts = {}  # node type => layer name => text of the layer
    positions = {}  # node type => layer name => list of nodes (recorded layers only)
    recorders = {}  # node type => layer name => recorder (layers with pos None)
    accumulators = {}  # node type => layer name => list of strings (layers with pos set)
    testTexts = []

    # what precedes and follows a node in the test texts
    preSep = dict(
        text="text",
        line="\tline",
        sentence="\t\tsent",
        word="\t\t\tword",
        letter="\t\t\t\tletter",
    )
    postSep = dict(
        text="\n",
        line="\n",
        sentence="\n",
        word="\n",
        letter="\n",
    )

    for (nType, typeInfo) in layerSettings.items():
        levels[nType] = typeInfo.get("description", "")
        ti = typeInfo.get("layers", None)
        # node types without layers only contribute their description
        if ti is None:
            continue

        visible[nType] = {name: ti[name].get("visible", False) for name in ti}
        layers[nType] = {
            name: dict(
                valueMap=invertMap(ti[name]["map"]),
                # a layer without pos setting has its own positions
                pos=ti[name]["pos"] or name,
                value=ti[name].get("example", ""),
                description=ti[name].get("description", ""),
            )
            for name in ti
        }
        texts[nType] = {name: None for name in ti}
        positions[nType] = {name: None for name in ti if ti[name]["pos"] is None}
        recorders[nType] = {
            name: Recorder(A.api) for name in ti if ti[name]["pos"] is None
        }
        accumulators[nType] = {name: [] for name in ti if ti[name]["pos"] is not None}

    # counts the letters for which the first letter layer yields CH_ABSENT
    nChAbsent = 0

    def addValue(node):
        # Adds the values of all layers of the type of the node
        # to the recorders/accumulators of those layers and to the current test text.
        # Returns the value for the first layer, or None if the type has no layers.
        returnValue = None

        nType = F.otype.v(node)
        typeInfo = layerSettings[nType]
        theseLayers = typeInfo.get("layers", {})

        first = True

        pre = preSep[nType]
        post = postSep[nType]

        # every text node starts a new test text,
        # all other nodes contribute to the test text of the current text
        if nType == "text":
            testText = []
            testTexts.append((node, testText))
        else:
            testText = testTexts[-1][-1]

        testText.append(f"{pre} {node} [")

        for name in theseLayers:
            info = theseLayers[name]
            descend = info.get("descend", False)
            ascend = info.get("ascend", False)
            vMap = info["map"]
            default = info["default"]
            pos = info["pos"]
            if descend:
                # concatenate the (mapped) values of the lower nodes inside the node
                value = ""
                for n in L.d(node, otype=descend):
                    val = Fs(info["feature"]).v(n)
                    if vMap:
                        val = vMap.get(val, default)
                    else:
                        val = val or default
                    value += str(val)
            else:
                # take the value from the node itself,
                # or from the first node of a higher type delivered by L.u
                refNode = L.u(node, otype=ascend)[0] if ascend else node
                value = Fs(info["feature"]).v(refNode)
                if vMap:
                    value = vMap.get(value, default)
                else:
                    value = value or default
                value = str(value)

            if pos is None:
                recorders[nType][name].add(value)
            else:
                accumulators[nType][name].append(value)

            testText.append(("" if first else "|") + value)

            if first:
                returnValue = value
                first = False

        testText.append(f"]{post}")

        return returnValue

    def addAfterValue(node):
        # Adds the material that follows the node:
        # the value of the afterFeature if there is one and it is not empty,
        # otherwise the afterDefault if there is one.
        nType = F.otype.v(node)
        typeInfo = layerSettings[nType]
        afterFeature = typeInfo.get("afterFeature", None)
        afterDefault = typeInfo.get("afterDefault", None)
        value = ""
        if afterFeature is not None:
            value = Fs(afterFeature).v(node)
        if afterDefault is not None:
            if not value:
                value = afterDefault
        if value:
            addAll(nType, value)

    def addAll(nType, value):
        # Adds a value to all layers of the given node type and of all lower types.
        # N.B. the loop variable reuses the name of the parameter nType.
        lowerTypes = typesLower[nType]
        for nType in lowerTypes:
            if nType in recorders:
                for x in recorders[nType].values():
                    x.add(value)
            if nType in accumulators:
                for x in accumulators[nType].values():
                    x.append(value)

    def deliverAll():
        # Collects the texts of all layers and the positions of the recorded layers.
        for (nType, typeInfo) in recorders.items():
            for (name, x) in typeInfo.items():
                texts[nType][name] = x.text()
                # here we are going to use that there is at most one node per node type
                # that corresponds to a character position
                positions[nType][name] = [
                    tuple(nodes)[0] if nodes else None for nodes in x.positions()
                ]

        for (nType, typeInfo) in accumulators.items():
            for (name, x) in typeInfo.items():
                texts[nType][name] = "".join(x)

    def startNode(node):
        # we have organized recorders by node type
        # we only record nodes of matching type in recorders

        nType = F.otype.v(node)

        if nType in recorders:
            for rec in recorders[nType].values():
                rec.start(node)

    def endNode(node):
        # we have organized recorders by node type
        # we only record nodes of matching type in recorders
        nType = F.otype.v(node)

        if nType in recorders:
            for rec in recorders[nType].values():
                rec.end(node)

    # note the `up[n] = m` statements below:
    # we only let `up` connect nodes from one level to one level higher

    for (i, text) in enumerate(F.otype.s("text")):
        startNode(text)
        title = addValue(text)
        # progress indicator: overwrites the same output line for every text
        sys.stdout.write("\r" + f"{i + 1:>3} {title:<80}")

        for line in L.d(text, otype="line"):
            up[line] = text
            startNode(line)
            addValue(line)

            for sent in L.d(line, otype="sentence"):
                up[sent] = line
                startNode(sent)
                addValue(sent)

                for word in L.d(sent, otype="word"):
                    up[word] = sent
                    startNode(word)
                    addValue(word)

                    # letters are only visited if they are the base level
                    if letterLevel:
                        for letter in L.d(word, otype="letter"):
                            up[letter] = word
                            startNode(letter)

                            ch = addValue(letter)
                            if ch == CH_ABSENT:
                                nChAbsent += 1

                            endNode(letter)
                            addAfterValue(letter)

                    endNode(word)
                    addAfterValue(word)

                endNode(sent)
                addAfterValue(sent)

            endNode(line)
            addAfterValue(line)

        endNode(text)
        addAfterValue(text)

    deliverAll()

    sys.stdout.write("\n")

    if letterLevel:
        # NOTE(review): the count is based on the first letter layer in the settings,
        # which is `text` in SETTINGS["letter"], while the message mentions the full text
        A.info(f"{nChAbsent} letter nodes with empty full text")

    config = dict(
        captions=dict(
            title="NENA phono search",
        ),
        name=settings["name"],
        description=settings["description"],
        containerType=settings["containerType"],
        simpleBase=settings["simpleBase"],
        ntypes=typeSeq,
        # node type => the third member of its entry in C.levels.data
        ntypesinit={level[0]: level[2] for level in C.levels.data},
        # node type => the type one level lower / one level higher in typeSeq
        dtypeOf={typeSeq[i + 1]: tp for (i, tp) in enumerate(typeSeq[0:-1])},
        utypeOf={tp: typeSeq[i + 1] for (i, tp) in enumerate(typeSeq[0:-1])},
        visible=visible,
        levels=levels,
        layers=layers,
    )
    data = dict(
        texts=texts,
        positions=positions,
        up=compress(up),
    )

    return (config, data, testTexts)

In [24]:
def dumpData(config, data, testTexts, debug=False):
    """Write the generated configuration and data to file.

    The configuration goes to `config.js` and the data to `corpus.js` in `OUTPUT`,
    both as a JavaScript constant with a JSON value.

    All files are written in UTF-8, irrespective of the default encoding of the platform:
    the data contains non-ASCII characters, which are written as such.

    Parameters
    ----------
    config: dict
        The configuration, must be serializable as JSON.
    data: dict
        The data, must be serializable as JSON.
        If `debug` is True, it must have the structure that `record` delivers:
        `up` is a list of strings, the other members are dicts by node type and layer name.
    testTexts: iterable of tuple
        Pairs `(node, list of strings)`; only used if `debug` is True.
    debug: boolean, optional False
        If True, the data is also written in human readable form to `DEBUG_OUTPUT`,
        one file per kind/node type/layer, and the test texts are written
        to `TEST_TEXTS_DIR`, one file per node.
    """
    A.indent(reset=True)
    A.info("Dumping data to a single compact json file")

    # TEST_TEXTS_DIR lies inside DEBUG_OUTPUT, so creating it creates DEBUG_OUTPUT as well
    for d in (OUTPUT,) + ((TEST_TEXTS_DIR,) if debug else ()):
        os.makedirs(d, exist_ok=True)

    fileNameConfig = f"{OUTPUT}/config.js"
    fileNameData = f"{OUTPUT}/corpus.js"

    with open(fileNameConfig, "w", encoding="utf8") as fh:
        fh.write("const config = ")
        json.dump(config, fh, ensure_ascii=False, indent=1)
    A.info(f"Config written to file {fileNameConfig}")

    with open(fileNameData, "w", encoding="utf8") as fh:
        fh.write("const corpus = ")
        json.dump(data, fh, ensure_ascii=False, indent=None, separators=(',', ':'))
    A.info(f"Data written to file {fileNameData}")

    if debug:
        A.info("Writing same data in human readable form")
        for (kind, subData) in data.items():
            # the up relation is a flat list of lines
            if kind == "up":
                fileName = f"{DEBUG_OUTPUT}/{kind}.tsv"
                A.info(fileName)
                with open(fileName, "w", encoding="utf8") as fh:
                    fh.write("\n".join(subData) + "\n")
                continue

            # the other kinds are organized by node type and layer name
            for (nType, typeData) in subData.items():
                for (name, layerData) in typeData.items():
                    ext = "txt" if kind == "texts" else "tsv" if kind == "positions" else "json"
                    fileName = f"{DEBUG_OUTPUT}/{kind}-{nType}-{name}.{ext}"
                    A.info(fileName)
                    with open(fileName, "w", encoding="utf8") as fh:
                        if ext == "json":
                            json.dump(layerData, fh, ensure_ascii=False, indent=1)
                        elif ext == "tsv":
                            for entry in layerData:
                                fh.write(f"{entry}\n")
                        else:
                            fh.write(layerData)

        for (node, testText) in testTexts:
            fileName = f"{TEST_TEXTS_DIR}/{node:>06}.txt"

            with open(fileName, "w", encoding="utf8") as fh:
                fh.write("".join(testText))
        A.info(f"Test texts written to directory {TEST_TEXTS_DIR}")

In [25]:
(config, data, testText) = record(BASE_LEVEL)

  0.00s preparing ... 
Making data based on word-settingsNode type declared as result container ('by'):
	sentence
Layers declared as visible in the result ('visible'):
	word/text, word/fuzzy
  0.00s start recording
126 Women Do Things Best                                                            


In [26]:
dumpData(config, data, testText, debug=True)

  0.00s Dumping data to a single compact json file
  0.00s Config written to file ~/github/CambridgeSemiticsLab/nena_tf/nena2search/app/config.js
  2.64s Data written to file ~/github/CambridgeSemiticsLab/nena_tf/nena2search/app/corpus.js
  2.64s Writing same data in human readable form
  2.64s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-lang.txt
  2.64s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-speaker.txt
  2.64s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-text.txt
  2.65s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-full.txt
  2.65s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-fuzzy.txt
  2.65s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-lite.txt
  2.65s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-pos.txt
  2.66s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-cls.txt
  2.66s ~/github/CambridgeSemiticsLab/nena_tf/_local/texts-word-voice.txt
  2.66s ~/github/CambridgeSemiticsLab/nena_tf/_local