In [1]:
# Load the autoreload extension; mode 2 reloads imported modules before each cell runs,
# so edits to library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Make plain text data, remembering the nodes that the text comes from.

In [2]:
import os
import sys
import json

from tf.app import use
from tf.convert.recorder import Recorder
from tf.core.helpers import specFromRanges, rangesFromSet

In [3]:
# Load the corpus. `hoist=globals()` is presumably what makes the API handles used
# further down (`F`, `Fs`, `L`, `C`) available as global names — TODO confirm.
A = use("nena:clone", checkout="clone", hoist=globals())

In [4]:
# One row per node type: the number of nodes, the type name, and the rounded average.
for levelInfo in C.levels.data:
    (tp, av, start, end) = levelInfo
    amount = end - start + 1
    average = int(round(av))
    print(f"{amount:>6} {tp:<12} nodes average {average:>6} chars")

     2 dialect      nodes average 269689 chars
   126 text         nodes average   4281 chars
   350 paragraph    nodes average   1541 chars
  2544 line         nodes average    212 chars
 16326 sentence     nodes average     33 chars
 24497 subsentence  nodes average     22 chars
 36444 inton        nodes average     15 chars
 93766 stress       nodes average      6 chars
120151 word         nodes average      4 chars
539378 letter       nodes average      1 chars


# Generate full layered data

We use the `full` transcription.

We remember nodes of the types *letter*, *word*, *sentence*, *line*, and *text*.

We store the positions by node type.

In [63]:
# Where the generated files go: everything lives under ~/github/<ORG>/<REPO>.
GH = os.path.expanduser("~/github")
ORG = "CambridgeSemiticsLab"
REPO = "nena_tf"
REL = "nena2search/app"

# directory of the compact corpus file
OUTPUT = "/".join((GH, ORG, REPO, REL))

# directory of the human readable debug files and of the per-text listings
DEBUG_OUTPUT = "/".join((GH, ORG, REPO, "_local"))
TEST_TEXTS_DIR = "/".join((DEBUG_OUTPUT, "texts"))

In [64]:
# Placeholders for missing values:
# PH_ABSENT is the default of the phonetic layers, CH_ABSENT that of the `full` layers.
PH_ABSENT = "z"
CH_ABSENT = "¿"

# Layer configurations, keyed by the kind of data to generate.
CONFIG = dict()

In [65]:
# Configuration for word-level data: the phonetic layers are attached to words and
# their values are collected from the letters inside each word (`descend="letter"`).
#
# Node types are listed from the lowest to the highest level; `checkConfig()` derives
# `typeSeq` and `typesLower` from this order, so do not reorder the entries.
#
# Keys per layer, as they are used by `record()` and `checkConfig()`:
#   feature  name of the feature that delivers the value
#   descend  node type: take the feature from the contained nodes of that type
#            and concatenate the (mapped) values
#   ascend   node type: take the feature from the first containing node of that type
#   map      translation table for feature values; None means: use the value as is
#   default  used when the value is empty or, with a map, when it is not in the map
#   pos      None: the layer gets its own recorder and positions;
#            otherwise the name of another layer, and the text is merely accumulated
#   show     copied into the `show` part of the output
#   example  copied into the `layers` part of the output as `value`
# Keys per node type:
#   afterFeature  feature with the material that follows a node of this type
#   afterDefault  material that follows a node if `afterFeature` yields nothing
#   by            marks this node type as the result container
CONFIG["word"] = dict(
    word=dict(
        layers=dict(
            lang=dict(
                feature="lang",
                map={x[0]: i + 1 for (i, x) in enumerate(F.lang.freqList())},
                default=0,
                pos=None,
            ),
            speaker=dict(
                feature="speaker",
                map={x[0]: i + 1 for (i, x) in enumerate(F.speaker.freqList())},
                default=0,
                show=True,
                pos=None,
            ),
            full=dict(
                feature="full",
                map=None,
                default=CH_ABSENT,
                pos=None,
                show=True,
                example="mu83",
            ),
            cls=dict(
                feature="phonetic_class",
                descend="letter",
                map={
                    "vowel": "V",
                    "consonant": "C",
                },
                default=PH_ABSENT,
                show=True,
                pos=None,
            ),
            voice=dict(
                feature="phonation",
                descend="letter",
                map={
                    "plain": "P",
                    "unvoiced_aspirated": "H",
                    "voiced": "V",
                    "unvoiced": "F",
                    "unvoiced_unaspirated": "G",
                    "emphatic": "X",
                },
                default=PH_ABSENT,
                pos="cls",
            ),
            place=dict(
                feature="phonetic_place",
                descend="letter",
                map={
                    "dental-alveolar": "D",
                    "labial": "B",
                    "palatal-alveolar": "C",
                    "palatal": "J",
                    "velar": "G",
                    "uvular": "X",
                    "pharyngeal": "Q",
                    "laryngeal": "H",
                },
                default=PH_ABSENT,
                pos="cls",
            ),
            manner=dict(
                feature="phonetic_manner",
                descend="letter",
                map={
                    "affricative": "A",
                    "nasal": "N",
                    "other": "X",
                    "fricative": "F",
                    "lateral": "L",
                    "sibilant": "S",
                },
                default=PH_ABSENT,
                pos="cls",
            ),
        ),
        afterFeature="full_end",
        afterDefault="/",
    ),
    sentence=dict(
        afterDefault="\n",
        by=True,
    ),
    line=dict(
        layers=dict(
            number=dict(
                feature="line_number",
                map=None,
                default=-1,
                pos=None,
                show=True,
            ),
        ),
        afterDefault="\n",
    ),
    text=dict(
        layers=dict(
            title=dict(
                feature="title",
                map=None,
                default="¿",
                pos=None,
                show=True,
                example="A",
            ),
            dialect=dict(
                feature="dialect",
                ascend="dialect",
                map=None,
                default="¿",
                pos=None,
                show=True,
            ),
            tid=dict(
                feature="text_id",
                map=None,
                default="¿",
                pos=None,
            ),
            place=dict(
                feature="place",
                map=None,
                default="¿",
                pos=None,
                example="Dure",
            ),
        ),
        afterDefault="\n",
    ),
)

In [66]:
# Configuration for letter-level data: the `full` text and the phonetic layers are
# attached to the letters themselves; words only carry `lang` and `speaker`.
#
# Node types are listed from the lowest to the highest level; `checkConfig()` derives
# `typeSeq` and `typesLower` from this order, so do not reorder the entries.
#
# Keys per layer, as they are used by `record()` and `checkConfig()`:
#   feature  name of the feature that delivers the value
#   ascend   node type: take the feature from the first containing node of that type
#   map      translation table for feature values; None means: use the value as is
#   default  used when the value is empty or, with a map, when it is not in the map
#   pos      None: the layer gets its own recorder and positions;
#            otherwise the name of another layer, and the text is merely accumulated
#   show     copied into the `show` part of the output
#   example  copied into the `layers` part of the output as `value`
# Keys per node type:
#   afterFeature  feature with the material that follows a node of this type
#   afterDefault  material that follows a node if `afterFeature` yields nothing
#   by            marks this node type as the result container
CONFIG["letter"] = dict(
    letter=dict(
        layers=dict(
            full=dict(
                feature="full",
                map=None,
                default=CH_ABSENT,
                pos=None,
                show=True,
                example="mu83",
            ),
            cls=dict(
                feature="phonetic_class",
                map={
                    "vowel": "V",
                    "consonant": "C",
                },
                default=PH_ABSENT,
                show=True,
                pos=None,
            ),
            voice=dict(
                feature="phonation",
                map={
                    "plain": "P",
                    "unvoiced_aspirated": "H",
                    "voiced": "V",
                    "unvoiced": "F",
                    "unvoiced_unaspirated": "G",
                    "emphatic": "X",
                },
                default=PH_ABSENT,
                pos="cls",
            ),
            place=dict(
                feature="phonetic_place",
                map={
                    "dental-alveolar": "D",
                    "labial": "B",
                    "palatal-alveolar": "C",
                    "palatal": "J",
                    "velar": "G",
                    "uvular": "X",
                    "pharyngeal": "Q",
                    "laryngeal": "H",
                },
                default=PH_ABSENT,
                pos="cls",
            ),
            manner=dict(
                feature="phonetic_manner",
                map={
                    "affricative": "A",
                    "nasal": "N",
                    "other": "X",
                    "fricative": "F",
                    "lateral": "L",
                    "sibilant": "S",
                },
                default=PH_ABSENT,
                pos="cls",
            ),
        ),
    ),
    word=dict(
        layers=dict(
            lang=dict(
                feature="lang",
                map={x[0]: i + 1 for (i, x) in enumerate(F.lang.freqList())},
                default=0,
                pos=None,
            ),
            speaker=dict(
                feature="speaker",
                map={x[0]: i + 1 for (i, x) in enumerate(F.speaker.freqList())},
                default=0,
                pos=None,
                show=True,
            ),
        ),
        afterFeature="full_end",
        afterDefault="/",
    ),
    sentence=dict(
        afterDefault="\n",
        by=True,
    ),
    line=dict(
        layers=dict(
            number=dict(
                feature="line_number",
                map=None,
                default=-1,
                pos=None,
                show=True,
            ),
        ),
        afterDefault="\n",
    ),
    text=dict(
        layers=dict(
            title=dict(
                feature="title",
                map=None,
                default="¿",
                pos=None,
                show=True,
                example="A",
            ),
            dialect=dict(
                feature="dialect",
                ascend="dialect",
                map=None,
                default="¿",
                pos=None,
                show=True,
            ),
            tid=dict(
                feature="text_id",
                map=None,
                default="¿",
                pos=None,
            ),
            place=dict(
                feature="place",
                map=None,
                default="¿",
                pos=None,
                example="Dure",
            ),
        ),
        afterDefault="\n",
    ),
)

In [67]:
def checkConfig(kind):
    """Check the layer configuration of `kind` and derive type information from it.

    The configuration is looked up in the global `CONFIG`. The check reports on
    stdout/stderr which node type is the result container (`by`) and which layers
    are result showers (`show`); problems are reported but do not raise.

    Parameters
    ----------
    kind: string
        Key in `CONFIG`.

    Returns
    -------
    dict
        `layerConfig`: the configuration itself;
        `typeSeq`: the node types in the order of the configuration;
        `typesLower`: per node type the list of types up to and including itself.

    Raises
    ------
    KeyError
        If `kind` is not a key of `CONFIG`.
    """
    # the newline keeps this message from fusing with the first line of the report
    sys.stdout.write(f"Making data based on {kind}-config\n")

    layerConfig = CONFIG[kind]
    typeSeq = list(layerConfig)
    typesLower = {tp: typeSeq[0 : i + 1] for (i, tp) in enumerate(typeSeq)}

    config = dict(layerConfig=layerConfig, typeSeq=typeSeq, typesLower=typesLower)

    # check show and by attributes

    theBys = [
        nType for (nType, typeInfo) in layerConfig.items() if typeInfo.get("by", False)
    ]
    theShows = [
        (nType, name)
        for (nType, typeInfo) in layerConfig.items()
        for (name, layerInfo) in typeInfo.get("layers", {}).items()
        if layerInfo.get("show", False)
    ]

    if len(theBys) == 0:
        sys.stderr.write("No node type is declared as result container ('by')\n")
    elif len(theBys) > 1:
        sys.stderr.write("Multiple node types declared as result container ('by'):\n")
        sys.stderr.write("\t" + (", ".join(theBys)) + "\n")
    else:
        sys.stdout.write("Node type declared as result container ('by'):\n")
        sys.stdout.write(f"\t{theBys[0]}\n")

    # flush both streams so that errors and messages do not end up out of order
    sys.stderr.flush()
    sys.stdout.flush()

    if len(theShows) == 0:
        sys.stderr.write("No layer type is declared as result shower ('show')\n")
    else:
        sys.stdout.write("Layers declared as result showers ('show'):\n")
        sys.stdout.write("\t" + (", ".join("/".join(s) for s in theShows)) + "\n")

    sys.stderr.flush()
    sys.stdout.flush()
    return config

We take care that for every phonetic property, the value is always exactly one character, no more no less.
That means that all recorded phonetic texts have the same mapping between character positions and slot numbers.

For the full text it is different: there are 18 letters with an empty full text, and some letters use multiple characters for their full text.

In the end, we only have to produce two mappings for the character node type: for the full text and for the phonetics.
We choose the phonetic `cls` text to carry the phonetic mapping.

As to the mapping from letter nodes to words, sentences, lines and texts: we only need to do that once, and we create
it as a single *up* relation, stored outside the recorders.

The *up* relation goes from nodes from one type to containing nodes of another type.

We make use of the fact that texts are built from lines, which are built from sentences, which are built from words,
which are built from characters. 

This simplifies the *up* relation considerably: we may assume that every node *n* has a single *up* parent:

* look in the node type that is one level higher than the type of *n*
* pick a node *u* in that type that embeds *n*
* *u* must be the only node with that property w.r.t. *n*, since the nodes of these types act as building blocks.

Another simplifying hypothesis that holds for this data, is that each character position corresponds with at most
one node per node type.

So if we have a set of nodes that all correspond with the same character position, they must all belong to different types.
Hence, when we organize mappings from character positions to nodes, and we do that for each node type separately,
then such mappings map each character position to at most one node.

Characters on positions that are not mapped by a layer to nodes cannot be compared with character positions in other layers.
So they will fall out of the results if more than one layer is being compared.

**N.B.**

These simplifying hypotheses make it easier to code the layered search interface in JavaScript.
But they are not needed for the concept to work.

Since this is my first implementation of layered search, written under time constraints, I thankfully make use of these
simplifications.

In [68]:
def compress(data):
    """Compress a node mapping by grouping the nodes that have the same value.

    Parameters
    ----------
    data: dict
        Maps nodes (integers) to values.

    Returns
    -------
    list of string
        One entry `nodeSpec<TAB>value` per distinct value, ordered by the first
        and then the last node that has that value.
        For a single node `nodeSpec` is the node itself, otherwise it is the
        result of `specFromRanges(rangesFromSet(nodes))`.
    """
    sets = {}

    for n in sorted(data):
        sets.setdefault(data[n], []).append(n)

    compressed = []

    for (value, nset) in sorted(sets.items(), key=lambda x: (x[1][0], x[1][-1])):
        # take the only member of the set itself;
        # do not use the loop variable `n` of the loop above here:
        # after that loop it is always the largest node in `data`
        nSpec = nset[0] if len(nset) == 1 else specFromRanges(rangesFromSet(nset))
        compressed.append(f"{nSpec}\t{value}")

    return compressed

def invert(data):
    """Return a new dict that maps the values of `data` back to their keys.

    When a value occurs more than once, the key that comes last in `data` wins.
    """
    inverse = {}
    for (key, value) in data.items():
        inverse[value] = key
    return inverse

In [69]:
def record(kind):
    """Walk through the corpus and record the text of all configured layers.

    The configuration `CONFIG[kind]` is checked by `checkConfig()` and then the
    corpus is traversed text by text, line by line, sentence by sentence and
    word by word; if `kind` is `"letter"`, the letters of each word are visited too.

    Layers with `pos=None` get a `Recorder` of their own, which keeps track of the
    nodes next to the text; the other layers only accumulate their text in a list.

    Parameters
    ----------
    kind: string
        Key in `CONFIG`.

    Returns
    -------
    tuple
        `data`: dict with the keys `captions`, `ntypes`, `dtypeOf`, `utypeOf`,
        `by`, `show`, `layers`, `texts`, `positions` and `up`;
        `testTexts`: list of pairs (text node, list of strings) with a readable
        listing of what has been recorded for that text.
    """
    A.indent(reset=True)
    A.info("preparing ... ")
    config = checkConfig(kind)
    layerConfig = config["layerConfig"]
    typeSeq = config["typeSeq"]
    typesLower = config["typesLower"]
    letterLevel = kind == "letter"

    A.info("start recording")

    up = {}
    by = {}
    show = {}
    layers = {}
    texts = {}
    positions = {}
    recorders = {}
    accumulators = {}
    testTexts = []

    # what precedes and follows the listing of a node in the test texts
    preSep = dict(
        text="text",
        line="\tline",
        sentence="\t\tsent",
        word="\t\t\tword",
        letter="\t\t\t\tletter",
    )
    postSep = dict(
        text="\n",
        line="\n",
        sentence="\n",
        word="\n",
        letter="\n",
    )

    # set up the bookkeeping per node type;
    # node types without layers only get an entry in `by`
    for (nType, typeInfo) in layerConfig.items():
        ti = typeInfo.get("layers", None)
        by[nType] = typeInfo.get("by", False)
        if ti is None:
            continue

        show[nType] = {name: ti[name].get("show", False) for name in ti}
        layers[nType] = {
            name: dict(
                map=ti[name]["map"],
                pos=ti[name]["pos"] or name,
                value=ti[name].get("example", ""),
            )
            for name in ti
        }
        texts[nType] = {name: None for name in ti}
        # layers with pos=None get a recorder and positions, the others an accumulator
        positions[nType] = {name: None for name in ti if ti[name]["pos"] is None}
        recorders[nType] = {
            name: Recorder(A.api) for name in ti if ti[name]["pos"] is None
        }
        accumulators[nType] = {name: [] for name in ti if ti[name]["pos"] is not None}

    nChAbsent = 0

    def addValue(node):
        """Add the values of all layers of the type of `node`.

        Returns the value of the first layer, or None if the type has no layers.
        """
        returnValue = None

        nType = F.otype.v(node)
        typeInfo = layerConfig[nType]
        theseLayers = typeInfo.get("layers", {})

        first = True

        pre = preSep[nType]
        post = postSep[nType]

        # a text node opens a new test text,
        # all other nodes add to the test text that has been opened last
        if nType == "text":
            testText = []
            testTexts.append((node, testText))
        else:
            testText = testTexts[-1][-1]

        testText.append(f"{pre} {node} [")

        for name in theseLayers:
            info = theseLayers[name]
            descend = info.get("descend", False)
            ascend = info.get("ascend", False)
            vMap = info["map"]
            default = info["default"]
            pos = info["pos"]
            if descend:
                # concatenate the (mapped) values of the contained nodes
                value = ""
                for n in L.d(node, otype=descend):
                    val = Fs(info["feature"]).v(n)
                    if vMap:
                        val = vMap.get(val, default)
                    else:
                        val = val or default
                    value += str(val)
            else:
                # take the value from the node itself or from the first containing
                # node of type `ascend`
                refNode = L.u(node, otype=ascend)[0] if ascend else node
                value = Fs(info["feature"]).v(refNode)
                if vMap:
                    value = vMap.get(value, default)
                else:
                    value = value or default
                value = str(value)

            if pos is None:
                recorders[nType][name].add(value)
            else:
                accumulators[nType][name].append(value)

            testText.append(("" if first else "|") + value)

            if first:
                returnValue = value
                first = False

        testText.append(f"]{post}")

        return returnValue

    def addAfterValue(node):
        """Add the material that follows `node`.

        That is the value of the `afterFeature` of its type, or, if that is empty,
        the `afterDefault` of its type. Nothing is added if both are empty.
        """
        nType = F.otype.v(node)
        typeInfo = layerConfig[nType]
        afterFeature = typeInfo.get("afterFeature", None)
        afterDefault = typeInfo.get("afterDefault", None)
        value = ""
        if afterFeature is not None:
            value = Fs(afterFeature).v(node)
        if afterDefault is not None:
            if not value:
                value = afterDefault
        if value:
            addAll(nType, value)

    def addAll(nType, value):
        """Add `value` to all layers of `nType` and of the types listed before it."""
        lowerTypes = typesLower[nType]
        for nType in lowerTypes:
            if nType in recorders:
                for x in recorders[nType].values():
                    x.add(value)
            if nType in accumulators:
                for x in accumulators[nType].values():
                    x.append(value)

    def deliverAll():
        """Move the results of the recorders and accumulators to `texts` and `positions`."""
        for (nType, typeInfo) in recorders.items():
            for (name, x) in typeInfo.items():
                texts[nType][name] = x.text()
                # here we are going to use that there is at most one node per node type
                # that corresponds to a character position
                positions[nType][name] = [
                    tuple(nodes)[0] if nodes else None for nodes in x.positions()
                ]

        for (nType, typeInfo) in accumulators.items():
            for (name, x) in typeInfo.items():
                texts[nType][name] = "".join(x)

    def startNode(node):
        """Signal the start of `node` to the recorders of its type."""
        # we have organized recorders by node type
        # we only record nodes of matching type in recorders

        nType = F.otype.v(node)

        if nType in recorders:
            for rec in recorders[nType].values():
                rec.start(node)

    def endNode(node):
        """Signal the end of `node` to the recorders of its type."""
        # we have organized recorders by node type
        # we only record nodes of matching type in recorders
        nType = F.otype.v(node)

        if nType in recorders:
            for rec in recorders[nType].values():
                rec.end(node)

    # note the `up[n] = m` statements below:
    # we only let `up` connect nodes from one level to one level higher

    # NOTE(review): every node is ended before its after-material is added,
    # presumably so that this material is not attributed to the node itself —
    # confirm against the semantics of `Recorder`

    for (i, text) in enumerate(F.otype.s("text")):
        startNode(text)
        title = addValue(text)
        # progress indicator, overwritten for each next text
        sys.stdout.write("\r" + f"{i + 1:>3} {title:<80}")

        for line in L.d(text, otype="line"):
            up[line] = text
            startNode(line)
            addValue(line)

            for sent in L.d(line, otype="sentence"):
                up[sent] = line
                startNode(sent)
                addValue(sent)

                for word in L.d(sent, otype="word"):
                    up[word] = sent
                    startNode(word)
                    addValue(word)

                    if letterLevel:
                        for letter in L.d(word, otype="letter"):
                            up[letter] = word
                            startNode(letter)

                            # count the letters whose first layer has the absent marker
                            ch = addValue(letter)
                            if ch == CH_ABSENT:
                                nChAbsent += 1

                            endNode(letter)
                            addAfterValue(letter)

                    endNode(word)
                    addAfterValue(word)

                endNode(sent)
                addAfterValue(sent)

            endNode(line)
            addAfterValue(line)

        endNode(text)
        addAfterValue(text)

    deliverAll()

    sys.stdout.write("\n")

    if letterLevel:
        A.info(f"{nChAbsent} letter nodes with empty full text")

    # dtypeOf maps a node type to the type that precedes it in the configuration,
    # utypeOf maps a node type to the type that follows it
    data = dict(
        captions=dict(
            title="NENA phono search",
        ),
        ntypes=typeSeq,
        dtypeOf={typeSeq[i + 1]: tp for (i, tp) in enumerate(typeSeq[0:-1])},
        utypeOf={tp: typeSeq[i + 1] for (i, tp) in enumerate(typeSeq[0:-1])},
        by=by,
        show=show,
        layers=layers,
        texts=texts,
        positions=positions,
        up=compress(up),
    )

    return (data, testTexts)

In [70]:
def dumpData(data, testTexts, debug=False):
    """Write the recorded data to file.

    The data is always written as a compact `corpus.js` in `OUTPUT`.
    All files are written in UTF-8, independent of the platform default:
    the data contains non-ASCII characters and is dumped with `ensure_ascii=False`.

    Parameters
    ----------
    data: dict
        The data as delivered by `record()`.
    testTexts: iterable of (node, list of string)
        The test texts as delivered by `record()`; only used if `debug` is true.
    debug: boolean, optional False
        If true, the data is also written to `DEBUG_OUTPUT`: as an indented
        `corpus.js`, and split up in one file per part; the test texts are
        written to `TEST_TEXTS_DIR`, one file per text node.
    """
    A.indent(reset=True)
    A.info("Dumping data to a single compact json file")

    # TEST_TEXTS_DIR lies inside DEBUG_OUTPUT, so this creates DEBUG_OUTPUT as well
    for d in (OUTPUT,) + ((TEST_TEXTS_DIR,) if debug else ()):
        os.makedirs(d, exist_ok=True)

    fileName = f"{OUTPUT}/corpus.js"

    with open(fileName, "w", encoding="utf8") as fh:
        fh.write("const corpus = ")
        json.dump(data, fh, ensure_ascii=False, indent=None, separators=(",", ":"))

    A.info(f"Data written to file {fileName}")

    if debug:
        A.info("Writing same data as non-compact json file")
        fileName = f"{DEBUG_OUTPUT}/corpus.js"
        with open(fileName, "w", encoding="utf8") as fh:
            fh.write("const corpus = ")
            json.dump(data, fh, ensure_ascii=False, indent=1)
        A.info(f"Data written to file {fileName}")

        A.info("Writing same data as separate, human readable files")
        for (kind, subData) in data.items():
            # parts that are lists of strings: one item per line
            if kind in {"ntypes", "up"}:
                fileName = f"{DEBUG_OUTPUT}/{kind}.tsv"
                A.info(fileName)
                with open(fileName, "w", encoding="utf8") as fh:
                    fh.write("\n".join(subData) + "\n")
                continue

            # small parts that are written as a whole
            if kind in {"captions", "by", "show", "utypeOf", "dtypeOf"}:
                fileName = f"{DEBUG_OUTPUT}/{kind}.json"
                A.info(fileName)
                with open(fileName, "w", encoding="utf8") as fh:
                    json.dump(subData, fh, ensure_ascii=False, indent=1)
                continue

            # the remaining parts are organized by node type and layer name:
            # one file per layer
            for (nType, typeData) in subData.items():
                for (name, layerData) in typeData.items():
                    ext = (
                        "txt"
                        if kind == "texts"
                        else "tsv"
                        if kind == "positions"
                        else "json"
                    )
                    fileName = f"{DEBUG_OUTPUT}/{kind}-{nType}-{name}.{ext}"
                    A.info(fileName)
                    with open(fileName, "w", encoding="utf8") as fh:
                        if ext == "json":
                            json.dump(layerData, fh, ensure_ascii=False, indent=1)
                        elif ext == "tsv":
                            for entry in layerData:
                                fh.write(f"{entry}\n")
                        else:
                            fh.write(layerData)

        for (node, testText) in testTexts:
            fileName = f"{TEST_TEXTS_DIR}/{node:>06}.txt"

            with open(fileName, "w", encoding="utf8") as fh:
                fh.write("".join(testText))
        A.info(f"Test texts written to directory {TEST_TEXTS_DIR}")

In [75]:
# Choose the configuration to generate data for: "letter" or "word".
kind = "letter"

# `record()` delivers the corpus data and the readable test texts.
data, testText = record(kind)

  0.00s preparing ... 
Making data based on letter-configNode type declared as result container ('by'):
	sentence
Layers declared as result showers ('show'):
	letter/full, letter/cls, word/speaker, line/number, text/title, text/dialect
  0.00s start recording
126 Women Do Things Best                                                            
    12s 18 letter nodes with empty full text


In [76]:
# Write the compact corpus file and, because of `debug=True`, also the readable debug files.
dumpData(data, testText, debug=True)

  0.00s Dumping data to a single compact json file
  1.37s Data written to file ~/github/CambridgeSemiticsLab/nena_tf/nena2search/app/corpus.js
  1.37s Writing same data as non-compact json file
  2.77s Data written to file ~/github/CambridgeSemiticsLab/nena_tf/_local/corpus.js
  2.77s Writing same data as separate, human readable files
  2.77s ~/github/CambridgeSemiticsLab/nena_tf/_local/captions.json
  2.77s ~/github/CambridgeSemiticsLab/nena_tf/_local/ntypes.tsv
  2.77s ~/github/CambridgeSemiticsLab/nena_tf/_local/dtypeOf.json
  2.77s ~/github/CambridgeSemiticsLab/nena_tf/_local/utypeOf.json
  2.77s ~/github/CambridgeSemiticsLab/nena_tf/_local/by.json
  2.78s ~/github/CambridgeSemiticsLab/nena_tf/_local/show.json
  2.78s ~/github/CambridgeSemiticsLab/nena_tf/_local/layers-letter-full.json
  2.78s ~/github/CambridgeSemiticsLab/nena_tf/_local/layers-letter-cls.json
  2.78s ~/github/CambridgeSemiticsLab/nena_tf/_local/layers-letter-voice.json
  2.78s ~/github/CambridgeSemiticsLab/nena_