In [1]:
%load_ext autoreload
%autoreload 2

Make plain text data, remembering the nodes that the text comes from.

In [49]:
import os
import sys
import json

from tf.app import use
from tf.convert.recorder import Recorder
from tf.core.helpers import specFromRangesLogical, rangesFromSet

In [3]:
# Load the corpus from the local clone.
# NOTE(review): hoist=globals() presumably injects the Text-Fabric API handles
# (F, L, C, ... as used in the cells below) into the notebook namespace — confirm.
A = use("nena:clone", checkout="clone", hoist=globals())

In [4]:
# Overview of the node types: per type the number of nodes (end - start + 1)
# and a rounded average size.
# NOTE(review): `av` is presumably the average number of slots (letters) per node — confirm.
for (tp, av, start, end) in C.levels.data:
    print(f"{end - start + 1:>6} {tp:<12} nodes average {int(round(av)):>6} chars")

     2 dialect      nodes average 269689 chars
   126 text         nodes average   4281 chars
   350 paragraph    nodes average   1541 chars
  2544 line         nodes average    212 chars
 16326 sentence     nodes average     33 chars
 24497 subsentence  nodes average     22 chars
 36444 inton        nodes average     15 chars
 93766 stress       nodes average      6 chars
120151 word         nodes average      4 chars
539378 letter       nodes average      1 chars


# Generate full text

We use the `full` transcription.

We remember nodes of the types *letter*, *word*, *sentence*, *line*, and *text*.

We store the positions by node type.

In [47]:
# Location of the output directory, relative to the user's local github directory.
GH = os.path.expanduser("~/github")
ORG = "CambridgeSemiticsLab"
REPO = "nena_tf"
REL = "nena2search/app"
OUTPUT = f"{GH}/{ORG}/{REPO}/{REL}"

# exist_ok=True already makes this a no-op when the directory exists,
# so a separate (and racy) os.path.exists() check is not needed.
os.makedirs(OUTPUT, exist_ok=True)

In [35]:
# Code used in the phonetic texts when a letter has no value for a phonetic feature.
PH_ABSENT = "z"
# Stand-in (a no-break space) used in the full text when a letter has an empty full text.
CH_ABSENT = "\u00a0"

In [36]:
# Inspect the values of the `phonation` feature and how often each occurs.
F.phonation.freqList()

(('plain', 140049),
 ('unvoiced_aspirated', 56659),
 ('voiced', 50887),
 ('unvoiced', 44043),
 ('unvoiced_unaspirated', 10671),
 ('emphatic', 4472))

In [37]:
# Inspect the values of the `phonetic_class` feature and how often each occurs.
F.phonetic_class.freqList()

(('consonant', 310962), ('vowel', 228398))

In [38]:
# Inspect the values of the `phonetic_manner` feature and how often each occurs.
F.phonetic_manner.freqList()

(('affricative', 108364),
 ('nasal', 52559),
 ('other', 48948),
 ('fricative', 40138),
 ('lateral', 39248),
 ('sibilant', 21705))

In [39]:
# Inspect the values of the `phonetic_place` feature and how often each occurs.
F.phonetic_place.freqList()

(('dental-alveolar', 150177),
 ('labial', 62013),
 ('velar', 32035),
 ('palatal', 25837),
 ('laryngeal', 23789),
 ('palatal-alveolar', 12557),
 ('uvular', 4181),
 ('pharyngeal', 373))

We take care that for every phonetic property, the value is always exactly one character, no more, no less.
That means that all recorded phonetic texts share the same mapping between character positions and slot numbers.

For the full text it is different: there are 18 letters with an empty full text, and some letters use multiple characters for their full text.

In the end, we only have to produce two mappings: for the full text and for the phonetics.
We choose the phonetic class text to carry the phonetic mapping.

The mapping from letter nodes up to words, sentences, lines, and texts needs to be made only once:
we create it as a single *parent* relation (`up` in the code), stored outside the recorders.

In [40]:
# Inspect the values of the `lang` feature and how often each occurs.
F.lang.freqList()

(('NENA', 117093),
 ('K.', 1767),
 ('A.', 775),
 ('K./A.', 263),
 ('A.|A.|K.', 65),
 ('A.|K.', 35),
 ('K./T.', 32),
 ('K.|K.', 26),
 ('K.|K.|K.', 18),
 ('A.|A.', 16),
 ('Urm.', 16),
 ('E.', 12),
 ('K./A./E.', 9),
 ('P.', 5),
 ('A./K.', 4),
 ('K./A.|K./A.', 4),
 ('T.', 4),
 ('Ṭiy.', 3),
 ('A./E.', 2),
 ('K./E.', 1),
 ('K./T.|K./T.', 1))

In [41]:
# Inspect the values of the `speaker` feature and how often each occurs.
F.speaker.freqList()

(('Dawið ʾAdam', 21184),
 ('Yulia Davudi', 18191),
 ('Yuwarəš Xošăba Kena', 10124),
 ('Manya Givoyev', 6746),
 ('Yuwəl Yuḥanna', 5953),
 ('Nanəs Bənyamən', 5424),
 ('Yosəp bet Yosəp', 5256),
 ('Yonan Petrus', 4345),
 ('Natan Khoshaba', 4176),
 ('Arsen Mikhaylov', 3338),
 ('Xošebo ʾOdišo', 3281),
 ('Nancy George', 3131),
 ('Awiko Sulaqa', 3096),
 ('Maryam Gwirgis', 2954),
 ('Alice Bet-Yosəp', 2618),
 ('Bənyamən Bənyamən', 2598),
 ('MB', 2317),
 ('Mišayel Barčəm', 1818),
 ('Nadia Aloverdova', 1754),
 ('Frederic Ayyubkhan', 1615),
 ('Victor Orshan', 1426),
 ('Merab Badalov', 1162),
 ('Sophia Danielova', 1109),
 ('Blandina Barwari', 1030),
 ('YD', 998),
 ('Dawið Gwərgəs', 865),
 ('Gwərgəs Dawið', 658),
 ('AB', 603),
 ('Jacob Petrus', 534),
 ('Dawid Adam', 492),
 ('NK', 326),
 ('YP', 320),
 ('JP', 261),
 ('Kena Kena', 174),
 ('Nawiya ʾOdišo', 102),
 ('GK', 101),
 ('Leya ʾOraha', 71))

In [67]:
def compress(data):
    """Group the keys of a mapping by their value and represent each group compactly.

    Parameters
    ----------
    data: dict
        Mapping from sortable keys (nodes) to hashable values.

    Returns
    -------
    list
        A list of `[nodeSpec, value]` pairs, one per distinct value, ordered by
        the first and then the last node of the group that has that value.
        `nodeSpec` is the node itself when the group has exactly one member;
        otherwise it is the result of `specFromRangesLogical(rangesFromSet(nodes))`
        (presumably a list of single nodes and node ranges — see the
        Text-Fabric helpers documentation).
    """
    sets = {}

    # Nodes are visited in sorted order, so every group ends up sorted as well.
    for n in sorted(data):
        sets.setdefault(data[n], []).append(n)

    compressed = []

    for (value, nset) in sorted(
        sets.items(), key=lambda x: (x[1][0], x[1][-1])
    ):
        # A singleton group is represented by its own node.
        # (Do not use the loop variable of the grouping loop above here:
        # after that loop it is stuck at the largest key in `data`.)
        nodeSpec = (
            nset[0] if len(nset) == 1 else specFromRangesLogical(rangesFromSet(nset))
        )
        compressed.append([nodeSpec, value])

    return compressed

def invert(data):
    """Return a new dict in which the values of `data` are the keys and vice versa.

    When several keys share a value, the key that comes last in `data` wins.
    """
    inverted = {}
    for (key, value) in data.items():
        inverted[value] = key
    return inverted

In [68]:
def record():
    """Walk through the corpus and collect the texts and mappings for the search app.

    The corpus is traversed text by text, line by line, sentence by sentence,
    word by word and letter by letter.
    For every letter, its full text and a one-character code for its phonetic
    class are added to two recorders (`full` and `cv`), which remember the
    letter node that the material belongs to.
    One-character codes for phonation, manner and place are appended to plain
    lists (accumulators), in step with the `cv` recorder.
    After each word its `full_end` material is added; after each sentence,
    line and text a newline is added, to all recorders and accumulators alike.

    Returns
    -------
    dict
        With the following keys:

        *   `texts`: the recorded text per name
            (`full`, `cv`, `voice`, `place`, `manner`);
        *   `maps`: for `full` and `cv`, a list built from `recorder.positions()`,
            holding per entry one node, or `None` if there are no nodes
            (presumably one entry per character position — see the
            Text-Fabric Recorder documentation);
        *   `legend`: the inverted code tables for the phonetic properties,
            the languages and the speakers;
        *   `textInfo`: per text node its title, text id, place and speakers;
        *   `lineInfo`: per line node its line number;
        *   `language`, `speakers`: compressed mappings from word nodes to
            language codes and speaker numbers (only words with a non-empty code);
        *   `up`: compressed mapping from each node to its parent
            (letter to word, word to sentence, sentence to line, line to text).
    """
    A.indent(reset=True)
    A.info("preparing ... ")

    # Code tables: every feature value gets exactly one character,
    # a missing value ("") gets PH_ABSENT.

    phVoice = {
        "": PH_ABSENT,
        "plain": "P",
        "unvoiced_aspirated": "H",
        "voiced": "V",
        "unvoiced": "F",
        "unvoiced_unaspirated": "G",
        "emphatic": "X",
    }

    phClass = {
        "": PH_ABSENT,
        "vowel": "V",
        "consonant": "C",
    }

    phPlace = {
        "": PH_ABSENT,
        "dental-alveolar": "D",
        "labial": "B",
        "palatal-alveolar": "C",
        "palatal": "J",
        "velar": "G",
        "uvular": "X",
        "pharyngeal": "Q",
        "laryngeal": "H",
    }

    phManner = {
        "": PH_ABSENT,
        "affricative": "A",
        "nasal": "N",
        "other": "X",
        "fricative": "F",
        "lateral": "L",
        "sibilant": "S",
    }

    # Language codes are the feature values without dots;
    # a missing language and "NENA" both map to the empty code and are not stored per word.
    languageMap = {lang[0]: lang[0].replace(".", "") for lang in F.lang.freqList()}
    languageMap[""] = ""
    languageMap["NENA"] = ""

    # Speakers are numbered from 1 in the order of the frequency list; 0 means: no speaker.
    speakerMap = {speaker[0]: i + 1 for (i, speaker) in enumerate(F.speaker.freqList())}
    speakerMap[""] = 0

    A.info("start recording")

    up = {}
    textInfo = {}
    lineInfo = {}
    language = {}
    speakers = {}

    recorders = dict(
        full=Recorder(A.api),
        cv=Recorder(A.api),
    )
    recFull = recorders["full"]
    recCv = recorders["cv"]

    accumulators = dict(
        voice=[],
        place=[],
        manner=[],
    )
    accManner = accumulators["manner"]
    accVoice = accumulators["voice"]
    accPlace = accumulators["place"]

    def addToAll(material):
        # Add the same material to every recorder and every accumulator.
        for rec in recorders.values():
            rec.add(material)
        for acc in accumulators.values():
            acc.append(material)

    nChAbsent = 0

    for (i, text) in enumerate(F.otype.s("text")):
        # A missing title must not break the formatting of the progress line.
        title = F.title.v(text) or ""
        sys.stdout.write("\r" + f"{i + 1:>3} {title:<80}")
        textInfo[text] = [
            title,
            F.text_id.v(text) or "",
            F.place.v(text) or "",
            F.speakers.v(text) or "",
        ]
        for line in L.d(text, otype="line"):
            lineInfo[line] = F.line_number.v(line)
            up[line] = text

            for sent in L.d(line, otype="sentence"):
                up[sent] = line

                for word in L.d(sent, otype="word"):
                    up[word] = sent

                    lang = languageMap[F.lang.v(word) or ""]
                    if lang:
                        language[word] = lang

                    speaker = speakerMap[F.speaker.v(word) or ""]
                    if speaker:
                        speakers[word] = speaker

                    for letter in L.d(word, otype="letter"):
                        up[letter] = word

                        for rec in recorders.values():
                            rec.start(letter)

                        ch = F.full.v(letter)
                        if not ch:
                            ch = CH_ABSENT
                            nChAbsent += 1

                        recFull.add(ch)

                        ph = F.phonetic_class.v(letter) or ""
                        recCv.add(phClass[ph])

                        ph = F.phonation.v(letter) or ""
                        accVoice.append(phVoice[ph])

                        ph = F.phonetic_manner.v(letter) or ""
                        accManner.append(phManner[ph])

                        ph = F.phonetic_place.v(letter) or ""
                        accPlace.append(phPlace[ph])

                        for rec in recorders.values():
                            rec.end(letter)

                    # material after the word, outside any letter node
                    addToAll(F.full_end.v(word) or "")

                # end of sentence
                addToAll("\n")

            # end of line
            addToAll("\n")

        # end of text
        addToAll("\n")

    sys.stdout.write("\n")

    A.info(f"{nChAbsent} letter nodes with empty full text")

    data = dict(
        texts={},
        maps={},
        legend={
            name: invert(table)
            for (name, table) in (
                ("phVoice", phVoice),
                ("phClass", phClass),
                ("phPlace", phPlace),
                ("phManner", phManner),
                ("language", languageMap),
                ("speaker", speakerMap),
            )
        },
        textInfo=textInfo,
        lineInfo=lineInfo,
        language=compress(language),
        speakers=compress(speakers),
        up=compress(up),
    )

    texts = data["texts"]
    maps = data["maps"]

    for (name, recorder) in recorders.items():
        texts[name] = recorder.text()
        # keep a single node per entry, None where there are no nodes
        maps[name] = [
            tuple(nodes)[0] if nodes else None for nodes in recorder.positions()
        ]

    for (name, accumulator) in accumulators.items():
        texts[name] = "".join(accumulator)

    return data

In [69]:
def dumpData(data):
    """Write the recorded data as a JavaScript file that defines the constant `corpus`.

    The file is `corpus.js` in the `OUTPUT` directory; it consists of the text
    `const corpus = ` followed by `data` serialized as compact JSON.

    Parameters
    ----------
    data: dict
        The JSON-serializable data to write.
    """
    A.indent(reset=True)
    A.info("Dumping data")

    fileName = f"{OUTPUT}/corpus.js"

    # ensure_ascii=False writes non-ASCII characters as they are,
    # so the encoding must not depend on the platform default.
    with open(fileName, "w", encoding="utf8") as fh:
        fh.write("const corpus = ")
        json.dump(data, fh, ensure_ascii=False, indent=None, separators=(",", ":"))

    A.info(f"Data written to {fileName}")

In [70]:
# Collect the texts, position maps and node information from the corpus.
data = record()

  0.00s preparing ... 
  0.07s start recording
126 Women Do Things Best                                                            
  5.35s 18 letter nodes with empty full text


In [71]:
# Write the collected data to corpus.js in the OUTPUT directory.
dumpData(data)

  0.00s Dumping data
  1.65s Data written to ~/github/CambridgeSemiticsLab/nena_tf/nena2search/app/corpus.js
