In [49]:
import os
import re
from tf.extra.bhsa import Bhsa

In [2]:
# Load the BHSA corpus through the Text-Fabric Bhsa helper.
# NOTE(review): hoist=globals() presumably injects the API handles used in the
# cells below (F, L, T, C, TF, Fs) into this notebook's namespace - they are
# never assigned explicitly anywhere else; confirm against the Text-Fabric docs.
B = Bhsa(hoist=globals())

Using bhsa-c r1.4 in ~/text-fabric-data/etcbc/bhsa/tf/c
Using phono-c r1.1 in ~/text-fabric-data/etcbc/phono/tf/c
Using parallels-c r1.1 in ~/text-fabric-data/etcbc/parallels/tf/c


**Documentation:** <a target="_blank" href="https://etcbc.github.io/bhsa" title="provenance of this corpus">BHSA</a> <a target="_blank" href="https://etcbc.github.io/bhsa/features/hebrew/c/0_home.html" title="BHSA feature documentation">Feature docs</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/Bhsa/" title="BHSA API documentation">BHSA API</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/General/" title="text-fabric-api">Text-Fabric API 6.2.2</a> <a target="_blank" href="https://dans-labs.github.io/text-fabric/Api/General/#search-templates" title="Search Templates Introduction and Reference">Search Reference</a>


This notebook online:
<a target="_blank" href="https://nbviewer.jupyter.org/github/etcbc/lingo/blob/master/learning/baayen.ipynb">NBViewer</a>
<a target="_blank" href="https://github.com/etcbc/lingo/blob/master/learning/baayen.ipynb">GitHub</a>


In [7]:
# Inspect the node types of the corpus. Each row pairs a type name with three
# numbers - presumably average size in words, first node and last node; TODO confirm.
C.levels.data

(('book', 10938.051282051281, 426585, 426623),
 ('chapter', 459.18622174381056, 426624, 427552),
 ('lex', 46.2021011588866, 1437567, 1446799),
 ('verse', 18.37694395381898, 1414354, 1437566),
 ('half_verse', 9.441876936697653, 606362, 651541),
 ('sentence', 6.693928789994822, 1172290, 1236016),
 ('sentence_atom', 6.611142967841921, 1236017, 1300541),
 ('clause', 4.8408892318516585, 427553, 515673),
 ('clause_atom', 4.703863796753705, 515674, 606361),
 ('phrase', 1.684724355961723, 651542, 904748),
 ('phrase_atom', 1.5944621572020736, 904749, 1172289),
 ('subphrase', 1.4231715460584122, 1300542, 1414353),
 ('word', 1, 1, 426584))

In [3]:
# Scratch directory for every file generated by this notebook, and the path of
# the phrase table written further down.
TEMP = "_temp"
PHRASES = f"{TEMP}/phrases.tsv"

# exist_ok=True already makes this a no-op when the directory is present, so a
# separate os.path.exists() check is redundant (and racy). Without that check a
# stray regular file named "_temp" is reported here instead of failing later
# when the output files are opened.
os.makedirs(TEMP, exist_ok=True)

In [4]:
# Column names of one word record in the phrase table.
headersLex = (
    "word",
    "gloss",
    "lex",
    "pdp",
    "stem",
    "tense",
    "p-n-g",
    "suffix-p-n-h",
)

# Width of a word record; used to pad the verb slot of phrases without a verb.
nHeaders = len(headersLex)

In [5]:
# Placeholder written for a feature value that is missing or not applicable.
emptyS = ""
# Value of the part-of-speech column (pdp) that marks a word as a verb.
VERB = "verb"
# Feature values that count as "no value" and are blanked or skipped on export.
# NOTE(review): a later inspection cell shows this set as
# {'NA', None, 'none', 'unknown'}, so the kernel held a different definition
# when that cell ran - confirm which definition is the intended one.
NONE_VALUES = {"NA", "unknown"}

# Generate sentences

 beresith bara elohim

Phonetic transcription, one sentence per line.

In [10]:
# List the text formats available for T.text(..., fmt=...); the sentence export
# below uses "text-phono-full".
T.formats

{'lex-default',
 'lex-orig-full',
 'lex-orig-plain',
 'lex-trans-full',
 'lex-trans-plain',
 'text-orig-full',
 'text-orig-full-ketiv',
 'text-orig-plain',
 'text-phono-full',
 'text-trans-full',
 'text-trans-full-ketiv',
 'text-trans-plain'}

In [17]:
"bᵊrēšˌîṯ bārˈā ʔᵉlōhˈîm ʔˌēṯ haššāmˌayim wᵊʔˌēṯ hāʔˈāreṣ .".rstrip(".").rstrip()

'bᵊrēšˌîṯ bārˈā ʔᵉlōhˈîm ʔˌēṯ haššāmˌayim wᵊʔˌēṯ hāʔˈāreṣ'

In [50]:
# Matches from the first period (including any whitespace right before it) up to
# the end of the line; substituting it with "" cuts off the sentence-final
# punctuation and everything that follows it.
endPat = r"\s*\..*"
endRe = re.compile(endPat)

In [51]:
# Export every non-Aramaic sentence:
#   sentences.txt  - one line of cleaned phonetic text per sentence
#   nodes.txt      - the sentence node number, on the same line number
#
# Both files are opened in a single `with` so that nodes.txt is flushed and
# closed as well (it used to be opened bare and was never closed). The phonetic
# transcription is non-ASCII, so the encoding is pinned to UTF-8 instead of
# relying on the platform default.

# Delete brackets, asterisks and the two stress marks; turn "-" into a space.
cleanTable = str.maketrans("-", " ", "[]*ˌˈ")

with open(f"{TEMP}/nodes.txt", "w", encoding="utf8") as nf, open(
    f"{TEMP}/sentences.txt", "w", encoding="utf8"
) as sf:
    for s in F.otype.s("sentence"):
        words = L.d(s, otype="word")
        # skip any sentence that contains at least one Aramaic word
        if any(F.language.v(w) == "Aramaic" for w in words):
            continue
        sText = T.text(words, fmt="text-phono-full")
        # cut the final period and whatever trails it, then strip the markup
        sText = endRe.sub("", sText).translate(cleanTable)
        sf.write(f"{sText}\n")
        nf.write(f"{s}\n")

# Generate featured data

  be(prep)_resith(noun:sg)_bara(verb:3:s:m)

  be386_prep_resith256_noun_s bara356_verb_qal_perfect_3_s_m

  lexeme, vocalized lexeme (ETCBC transliteration) plus node number (for disambiguation)

In [35]:
# Inspect the current contents of NONE_VALUES.
NONE_VALUES

{'NA', None, 'none', 'unknown'}

In [40]:
# Phrase function labels that are copied verbatim into the suffix tag of a word
# in the lemma export.
suffixInfo = {
    "ExsS",
    "IntS",
    "ModS",
    "NCoS",
    "PrcS",
    "PreO",
    "PreS",
    "PtcO",
}

In [42]:
# Export one line per non-Aramaic sentence to lemmas.txt. Every word becomes
#   <word features joined by "_"><"$"-prefixed suffix features joined by "_"><optional "_$<tag>">
# where <tag> is the function of the enclosing phrase if it is listed in
# suffixInfo, or "Poss" when the word's pdp is "subs". Words are glued with "_"
# when the word has an empty trailer, and separated by a space otherwise.

# Feature values that must not show up in the output. None is listed explicitly:
# str.join raises TypeError on a None item, and NONE_VALUES as defined in this
# notebook does not contain it.
noneValues = {"n/a", None} | NONE_VALUES

features = """
  lex
  pdp
  vs
  vt
  ps
  nu
  gn
  st
""".strip().split()
suffixFeatures = """
  prs_ps
  prs_nu
  prs_gn
""".strip().split()

# Resolve the feature objects once instead of once per word and feature.
featureObjs = [Fs(feature) for feature in features]
suffixFeatureObjs = [Fs(feature) for feature in suffixFeatures]

TF.indent(reset=True)
TF.info("writing lemmas")
with open(f"{TEMP}/lemmas.txt", "w", encoding="utf8") as sf:
    i = 0
    for s in F.otype.s("sentence"):
        words = L.d(s, otype="word")
        # skip any sentence that contains at least one Aramaic word
        if any(F.language.v(w) == "Aramaic" for w in words):
            continue
        i += 1
        chunks = []
        for w in words:
            sep = "_" if F.trailer.v(w) == "" else " "

            # each feature is looked up once, then filtered
            values = [fObj.v(w) for fObj in featureObjs]
            contrib = "_".join(v for v in values if v not in noneValues)

            suffixValues = [fObj.v(w) for fObj in suffixFeatureObjs]
            suffixContrib = "_".join(
                f"${v}" for v in suffixValues if v not in noneValues
            )

            # only words that carry a pronominal suffix get an extra tag
            suffixExtra = ""
            if suffixContrib:
                phraseFunction = F.function.v(L.u(w, otype="phrase")[0])
                if phraseFunction in suffixInfo:
                    suffixExtra = f"_${phraseFunction}"
                elif F.pdp.v(w) == "subs":
                    suffixExtra = "_$Poss"

            chunks.append(f"{contrib}{suffixContrib}{suffixExtra}{sep}")
        sf.write("".join(chunks) + "\n")
TF.info(f"{i} sentences")

  0.00s writing lemmas
    13s 62934 sentences


In [6]:
# Export one tab-separated line per phrase to PHRASES:
#   passage columns, then the record of the first verb in the phrase (empty
#   columns when there is none), then the records of all remaining words.
# A word record has nHeaders columns. No header row is written.
TF.indent(reset=True)
TF.info("Writing phrases")
nP = nV = nW = 0

# Word-level features that follow the four leading columns; the last six are
# folded into the two composite "x_y_z" columns.
morphFeatures = (F.vs, F.vt, F.ps, F.nu, F.gn, F.prs_ps, F.prs_nu, F.prs_gn)

with open(PHRASES, "w", encoding="utf_16_le") as ph:
    # the byte-order mark is written by hand
    ph.write("\uFEFF")
    for p in F.otype.s("phrase"):
        nP += 1
        passage = [str(part) for part in T.sectionFromNode(p)]
        firstVerb = None
        otherWords = []

        for w in L.d(p, otype="word"):
            nW += 1
            lx = L.u(w, otype="lex")[0]
            rawValues = [
                F.g_word_utf8.v(w),
                F.gloss.v(lx),
                F.lex_utf8.v(w),
                F.pdp.v(w),
            ]
            rawValues.extend(feat.v(w) or emptyS for feat in morphFeatures)
            # blank out missing values and the "no value" markers
            cleaned = [
                emptyS if value is None or value in NONE_VALUES else value
                for value in rawValues
            ]
            png = "{}_{}_{}".format(*cleaned[6:9])
            suffixPng = "{}_{}_{}".format(*cleaned[9:12])
            wordInfo = (*cleaned[:6], png, suffixPng)

            # the first verb of the phrase gets its own slot; every other word
            # (later verbs included) is appended in phrase order
            if firstVerb is None and wordInfo[3] == VERB:
                nV += 1
                firstVerb = wordInfo
            else:
                otherWords.append(wordInfo)

        if firstVerb is None:
            firstVerb = ("",) * nHeaders

        row = "\t".join(
            "\t".join(cells) for cells in (passage, firstVerb, *otherWords)
        )
        ph.write(row + "\n")

TF.info(
    f"""Done
  {nP} phrases
  {nW} words
  {nV} verbs (first in phrase)
"""
)

  0.00s Writing phrases
    21s Done 
  253207 phrases
  426584 words
  69024 verbs (first in phrase)



In [44]:
# Pretty-display node 1174017; the number lies inside the sentence node range
# (1172290-1236016) listed by C.levels.data.
B.pretty(1174017)

In [45]:
# Pretty-display node 1172386; the number lies inside the sentence node range
# (1172290-1236016) listed by C.levels.data.
B.pretty(1172386)