In [None]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before code is executed,
# so edits to library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

We generate plain text from the corpus and remember, for every piece of text, the nodes that it comes from.

In [1]:
import os
import sys

from tf.app import use
from tf.convert.recorder import Recorder

In [2]:
# Load the NENA corpus. NOTE(review): `hoist=globals()` presumably injects the API
# names used further down without definition (`C`, `F`, `L`) into this namespace — confirm.
A = use("nena:clone", checkout="clone", hoist=globals())

In [3]:
# One summary row per node type in `C.levels.data`: the number of nodes
# (last node - first node + 1), the type name, and the rounded average.
for (nodeType, avgSize, firstNode, lastNode) in C.levels.data:
    nNodes = lastNode - firstNode + 1
    avgChars = int(round(avgSize))
    print(f"{nNodes:>6} {nodeType:<12} nodes average {avgChars:>6} chars")

     2 dialect      nodes average 269689 chars
   126 text         nodes average   4281 chars
   350 paragraph    nodes average   1541 chars
  2544 line         nodes average    212 chars
 16326 sentence     nodes average     33 chars
 24497 subsentence  nodes average     22 chars
 36444 inton        nodes average     15 chars
 93766 stress       nodes average      6 chars
120151 word         nodes average      4 chars
539378 letter       nodes average      1 chars


# Generate full text

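We use the `full` transcription, and next to it we generate four phonetic encodings of the same letters: `cv`, `manner`, `voice`, and `place`.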

We remember nodes of the types *letter*, *word*, *sentence*, *line*, and *text*.

We store the positions by node type.

In [4]:
# Directory that receives the generated text files and position files.
OUTPUT = os.path.expanduser("~/local/nena")

# `exist_ok=True` already makes this a no-op when the directory is present, so no
# separate existence check is needed. Unlike the guarded form, this also fails
# right here (FileExistsError) when the path exists but is not a directory,
# instead of failing later when the files are written.
os.makedirs(OUTPUT, exist_ok=True)

In [5]:
# Inventory of the `phonation` values with their frequencies; the mapping below needs a code for each.
F.phonation.freqList()

(('plain', 140049),
 ('unvoiced_aspirated', 56659),
 ('voiced', 50887),
 ('unvoiced', 44043),
 ('unvoiced_unaspirated', 10671),
 ('emphatic', 4472))

In [6]:
# One-letter code for every value of the `phonation` feature.
phVoice = {
    "plain": "P",
    "unvoiced_aspirated": "H",
    "voiced": "V",
    "unvoiced": "F",
    "unvoiced_unaspirated": "G",
    "emphatic": "X",
}

In [7]:
# Inventory of the `phonetic_class` values with their frequencies.
F.phonetic_class.freqList()

(('consonant', 310962), ('vowel', 228398))

In [8]:
# One-letter code for every value of the `phonetic_class` feature.
phClass = {
    "vowel": "V",
    "consonant": "C",
}

In [9]:
# Inventory of the `phonetic_manner` values with their frequencies.
F.phonetic_manner.freqList()

(('affricative', 108364),
 ('nasal', 52559),
 ('other', 48948),
 ('fricative', 40138),
 ('lateral', 39248),
 ('sibilant', 21705))

In [10]:
# One-letter code for every value of the `phonetic_manner` feature.
phManner = {
    "affricative": "A",
    "nasal": "N",
    "other": "O",
    "fricative": "F",
    "lateral": "L",
    "sibilant": "S",
}

In [11]:
# Inventory of the `phonetic_place` values with their frequencies.
F.phonetic_place.freqList()

(('dental-alveolar', 150177),
 ('labial', 62013),
 ('velar', 32035),
 ('palatal', 25837),
 ('laryngeal', 23789),
 ('palatal-alveolar', 12557),
 ('uvular', 4181),
 ('pharyngeal', 373))

In [12]:
# One-letter code for every value of the `phonetic_place` feature.
# Written as pairs because the feature values contain hyphens.
phPlace = dict(
    (
        ("dental-alveolar", "D"),
        ("labial", "B"),
        ("palatal-alveolar", "C"),
        ("palatal", "J"),
        ("velar", "G"),
        ("uvular", "X"),
        ("pharyngeal", "Q"),
        ("laryngeal", "H"),
    )
)

In [14]:
def record():
    """Record the corpus as plain text in five parallel encodings.

    Walks every text node down through its lines, sentences, words and letters.
    Five recorders are fed in lockstep: `full` receives the `full` feature of
    each letter, while `voice`, `cv`, `manner` and `place` receive the
    one-letter code of the letter's phonation, phonetic class, manner and
    place. A letter without a value for one of those features adds nothing to
    the corresponding recorder. Every recorder receives the `full_end` value of
    a word after its letters, and a newline after each sentence, line and text.

    Returns
    -------
    tuple
        `(recorders, info)`: the dict of recorders keyed by encoding name, and
        a dict with the keys `text` (title, id, place and speakers per text
        node), `line` (line number per line node), `speaker` and `lang`
        (speaker and language per word node).
    """
    A.indent(reset=True)
    A.info("start recording")

    recorders = {
        name: Recorder(A.api) for name in ("full", "cv", "manner", "voice", "place")
    }
    allRecs = tuple(recorders.values())

    # (recorder name, letter feature, value-to-code table), in the order in
    # which the encodings are fed for each letter.
    codings = (
        ("voice", F.phonation, phVoice),
        ("cv", F.phonetic_class, phClass),
        ("manner", F.phonetic_manner, phManner),
        ("place", F.phonetic_place, phPlace),
    )

    def openAll(node):
        # Make `node` active in every recorder.
        for rec in allRecs:
            rec.start(node)

    def closeAll(node, *trailer):
        # Add the trailing material (if any) to every recorder, then
        # deactivate `node` in it.
        for rec in allRecs:
            for material in trailer:
                rec.add(material)
            rec.end(node)

    metaByText = {}
    numberByLine = {}
    speakerByWord = {}
    langByWord = {}

    for (seq, text) in enumerate(F.otype.s("text"), start=1):
        title = F.title.v(text)
        # Progress indicator that overwrites itself on a single line.
        sys.stdout.write(f"\r{seq:>3} {title:<80}")
        metaByText[text] = {
            "title": title,
            "tid": F.text_id.v(text),
            "place": F.place.v(text),
            "speakers": F.speakers.v(text),
        }
        openAll(text)

        for line in L.d(text, otype="line"):
            numberByLine[line] = F.line_number.v(line)
            openAll(line)

            for sent in L.d(line, otype="sentence"):
                openAll(sent)

                for word in L.d(sent, otype="word"):
                    speakerByWord[word] = F.speaker.v(word)
                    langByWord[word] = F.lang.v(word)
                    openAll(word)

                    for letter in L.d(word, otype="letter"):
                        openAll(letter)
                        recorders["full"].add(F.full.v(letter))
                        for (recName, feature, codeOf) in codings:
                            value = feature.v(letter)
                            if value is not None:
                                recorders[recName].add(codeOf[value])
                        closeAll(letter)

                    closeAll(word, F.full_end.v(word))

                closeAll(sent, "\n")

            closeAll(line, "\n")

        closeAll(text, "\n")

    sys.stdout.write("\n")
    A.info("done")

    info = {
        "text": metaByText,
        "line": numberByLine,
        "speaker": speakerByWord,
        "lang": langByWord,
    }
    return (recorders, info)

In [15]:
def writeInfo(recorders, only=None):
    """Write recorded texts and their node positions to files under `OUTPUT`.

    Parameters
    ----------
    recorders: dict
        Recorders keyed by name. The recorder named `name` is written with
        `{OUTPUT}/{name}.txt` as text path and `{OUTPUT}/{name}.tsv` as
        position path, with `byType=True` (the captured runs in this notebook
        show one position file per node type, e.g. `place-line.tsv`).
    only: collection of str, optional
        When given, recorders whose name is not in it are skipped.
    """
    A.info("writing")
    for (name, recorder) in recorders.items():
        wanted = only is None or name in only
        if wanted:
            textPath = f"{OUTPUT}/{name}.txt"
            posPath = f"{OUTPUT}/{name}.tsv"
            # Report each recorder one indent level deeper, with its own timer.
            A.indent(level=True, reset=True)
            A.info(name)
            recorder.write(textPath, posPath=posPath, byType=True)
            A.info("done")
            A.indent(level=False)
    A.info("done")

In [16]:
# Run the recording pass once; the write cells below reuse `recorders`.
(recorders, info) = record()

  0.00s start recording
126 Women Do Things Best                                                            
  9.65s done


In [59]:
# Re-export only the `place` recorder.
# NOTE(review): the next cell writes all recorders, `place` included, so this cell
# looks like a leftover from iterating on a single encoding — confirm it is still wanted.
writeInfo(recorders, only=["place"])

    13s writing
   |     0.00s place
   |      |     0.00s gathering nodes ...
   |      |     0.14s found 137415 nodes in 4 types
   |      |     0.14s partitioning nodes over types ...
   |      |     3.79s done
   |      |     0.00s line                 => ~/local/nena/place-line.tsv
   |      |     0.03s sentence             => ~/local/nena/place-sentence.tsv
   |      |     0.08s word                 => ~/local/nena/place-word.tsv
   |      |     0.26s text                 => ~/local/nena/place-text.tsv
   |     4.14s done
    17s done


In [17]:
# Write every recorder: one text file plus its position files per encoding.
writeInfo(recorders)

    20s writing
   |     0.00s full
   |      |     0.00s gathering nodes ...
   |      |     0.42s found 576627 nodes in 5 types
   |      |     0.42s partitioning nodes over types ...
   |      |     8.15s done
   |      |     0.00s sentence             => ~/local/nena/full-sentence.tsv
   |      |     0.11s word                 => ~/local/nena/full-word.tsv
   |      |     0.37s text                 => ~/local/nena/full-text.tsv
   |      |     0.46s line                 => ~/local/nena/full-line.tsv
   |      |     0.55s letter               => ~/local/nena/full-letter.tsv
   |     9.53s done
   |     0.00s cv
   |      |     0.00s gathering nodes ...
   |      |     0.45s found 675228 nodes in 5 types
   |      |     0.45s partitioning nodes over types ...
   |      |     7.69s done
   |      |     0.00s sentence             => ~/local/nena/cv-sentence.tsv
   |      |     0.12s word                 => ~/local/nena/cv-word.tsv
   |      |     0.38s text                 => ~/local/n