In [12]:
import yaml
import collections

from tf.fabric import Fabric
from tf.convert.walker import CV
from pathlib import Path
from functools import cmp_to_key

## Load Data

In [61]:
BHSA_CORE_DATA = Path('/Users/cody/github/etcbc/bhsa')
BHSA_TF = BHSA_CORE_DATA / 'tf/2021'
BHSA_YAML = BHSA_CORE_DATA / 'yaml'
BHSA_METADATA_FILES = ['core.yaml', 'lexicon.yaml', 'ketivqere.yaml', 'paragraph.yaml', 'stats.yaml']
BHSA_METADATA_PATHS = [BHSA_YAML / file for file in BHSA_METADATA_FILES]

BHSA_GENERIC = BHSA_YAML / 'generic.yaml'

In [30]:
def load_yaml(filepath):
    """Load yaml config as dict."""
    with open(filepath, 'r') as infile:
        return yaml.load(infile, Loader=yaml.FullLoader)
    

def load_all_feature_metadata(feature_metadata_paths):
    """Load all feature metadata into a single dictionary."""
    return {
        feature: value
        for path in feature_metadata_paths
        for feature, value in load_yaml(path).items()
    }

In [8]:

tf_builder = Fabric(locations='./test_corpus')
cv = CV(tf_builder)

  0.00s Not all of the warp features otype and oslots are present in
test_corpus
  0.00s Only the Feature and Edge APIs will be enabled
  0.00s Warp feature "otext" not found. Working without Text-API



In [9]:
TF = Fabric('test_corpus')

  0.00s Not all of the warp features otype and oslots are present in
test_corpus
  0.00s Only the Feature and Edge APIs will be enabled
  0.00s Warp feature "otext" not found. Working without Text-API



In [43]:
TF.save(
    nodeFeatures={
        'test': {1: 'yes', 2: 'ooo yeah'},
    },
    edgeFeatures={
        'tedge': {1: {3, 4}, 2: {5, 6}},
        'oslots': {1: {1}, 2: {2}, 3: {3}, 4: {4}, 5: {5}}
    },
)

  0.00s Exporting 1 node and 2 edge and 0 config features to test_corpus:
  0.01s VALIDATING oslots feature


  0.01s ERROR: cannot check validity of oslots feature
   |     0.00s test: Missing @valueType. Should be one of str, int


   |     0.00s T test                 to test_corpus


   |     0.00s oslots: Missing @valueType. Should be one of str, int


   |     0.00s T oslots               to test_corpus


   |     0.00s tedge: Missing @valueType. Should be one of str, int


   |     0.00s T tedge                to test_corpus
  0.02s Exported 1 node features and 2 edge features and 0 config features to test_corpus


False

In [11]:
tf_bhsa = Fabric('/Users/cody/github/etcbc/bhsa/tf/2021')
bhsa = tf_bhsa.loadAll()

  2.04s Feature overview: 109 for nodes; 6 for edges; 1 configs; 9 computed


In [75]:
SLOT_TYPE = 'word'

OTEXT = {

    "fmt:lex-default": "{voc_lex_utf8}",
    "fmt:lex-orig-full": "{g_lex_utf8}",
    "fmt:lex-orig-plain": "{lex_utf8}",
    "fmt:lex-trans-full": "{g_lex}",
    "fmt:lex-trans-plain": "{lex}",
    "fmt:text-orig-full": "{qere_utf8/g_word_utf8}{qere_trailer_utf8/trailer_utf8}",
    "fmt:text-orig-full-ketiv": "{g_word_utf8}{trailer_utf8}",
    "fmt:text-orig-plain": "{g_cons_utf8}{trailer_utf8}",
    "fmt:text-trans-full": "{qere/g_word}{qere_trailer/trailer}",
    "fmt:text-trans-full-ketiv": "{g_word}{trailer}",
    "fmt:text-trans-plain": "{g_cons}{trailer}",

}

GENERIC = {
    "name": "BHSA-Kingham-thesis",
    "description": "A modified version of the ETCBC's BHSA for my Cambridge PhD thesis",
    "version": "1.0",
    "editor": "Cody Kingham",
    "source": "Eep Talstra Centre for Bible and Computer",
    "source-url": "https://github.com/etcbc/bhsa",
    "encoders": "Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)",
}

INT_FEATURES = {}

GENERIC_META = load_yaml(BHSA_GENERIC)
GENERIC_META.update({
    'dateWritten': None,
    'writtenBy': None,
})

In [68]:
bhsa.Eall()

['distributional_parent',
 'functional_parent',
 'mother',
 'omap@2017-2021',
 'omap@c-2021',
 'oslots']

In [77]:
bhsa.Fall()

['book',
 'book@am',
 'book@ar',
 'book@bn',
 'book@da',
 'book@de',
 'book@el',
 'book@en',
 'book@es',
 'book@fa',
 'book@fr',
 'book@he',
 'book@hi',
 'book@id',
 'book@ja',
 'book@ko',
 'book@la',
 'book@nl',
 'book@pa',
 'book@pt',
 'book@ru',
 'book@sw',
 'book@syc',
 'book@tr',
 'book@ur',
 'book@yo',
 'book@zh',
 'chapter',
 'code',
 'det',
 'dist',
 'dist_unit',
 'domain',
 'freq_lex',
 'freq_occ',
 'function',
 'g_cons',
 'g_cons_utf8',
 'g_lex',
 'g_lex_utf8',
 'g_nme',
 'g_nme_utf8',
 'g_pfm',
 'g_pfm_utf8',
 'g_prs',
 'g_prs_utf8',
 'g_uvf',
 'g_uvf_utf8',
 'g_vbe',
 'g_vbe_utf8',
 'g_vbs',
 'g_vbs_utf8',
 'g_word',
 'g_word_utf8',
 'gloss',
 'gn',
 'instruction',
 'is_root',
 'kind',
 'kq_hybrid',
 'kq_hybrid_utf8',
 'label',
 'language',
 'languageISO',
 'lex',
 'lex0',
 'lex_utf8',
 'lexeme_count',
 'ls',
 'mother_object_type',
 'nametype',
 'nme',
 'nu',
 'number',
 'otype',
 'pargr',
 'pdp',
 'pfm',
 'prs',
 'prs_gn',
 'prs_nu',
 'prs_ps',
 'ps',
 'qere',
 'qere_trail

In [71]:
list(
    bhsa.EdgeString('mother').items()
)[:5]

[(427567, frozenset({427566})),
 (427579, frozenset({105})),
 (427580, frozenset({115})),
 (427593, frozenset({427592})),
 (427596, frozenset({651689}))]

In [72]:
list(
    bhsa.EdgeString('omap@c-2021').items()
)[:5]

[(1, {1: None}),
 (2, {2: None}),
 (3, {3: None}),
 (4, {4: None}),
 (5, {5: None})]

In [78]:
bhsa.FeatureString('function').data

{651573: 'Time',
 651574: 'Pred',
 651575: 'Subj',
 651576: 'Objc',
 651577: 'Conj',
 651578: 'Subj',
 651579: 'Pred',
 651580: 'PreC',
 651581: 'Conj',
 651582: 'Subj',
 651583: 'PreC',
 651584: 'Conj',
 651585: 'Subj',
 651586: 'PreC',
 651587: 'Cmpl',
 651588: 'Conj',
 651589: 'Pred',
 651590: 'Subj',
 651591: 'Pred',
 651592: 'Subj',
 651593: 'Conj',
 651594: 'Pred',
 651595: 'Subj',
 651596: 'Conj',
 651597: 'Pred',
 651598: 'Subj',
 651599: 'Objc',
 651600: 'Conj',
 651601: 'Pred',
 651602: 'Conj',
 651603: 'Pred',
 651604: 'Subj',
 651605: 'Cmpl',
 651606: 'Conj',
 651607: 'Pred',
 651608: 'Subj',
 651609: 'Cmpl',
 651610: 'Objc',
 651611: 'Conj',
 651612: 'Cmpl',
 651613: 'Pred',
 651614: 'Objc',
 651615: 'Conj',
 651616: 'Pred',
 651617: 'Subj',
 651618: 'Conj',
 651619: 'Pred',
 651620: 'Subj',
 651621: 'PreC',
 651622: 'Conj',
 651623: 'Pred',
 651624: 'Subj',
 651625: 'Pred',
 651626: 'Subj',
 651627: 'PreC',
 651628: 'Conj',
 651629: 'Pred',
 651630: 'PreC',
 651631: 'Cmpl

In [85]:
def get_all_feature_dicts(tf_api):
    features = {}
    edges = {}
    for feature in tf_api.Fall():
        try:
            features[feature] = bhsa.FeatureString(feature).data.copy()
        except AttributeError:
            print(feature)
    return features

In [86]:
all_features = get_all_feature_dicts(bhsa)

otype


In [87]:
all_features['pdp'][1]

'prep'

In [73]:
def get_non_generic_meta(metadata):
    return {k: v for k,v in metadata.items() if k not in GENERIC_META}

In [74]:
get_non_generic_meta(bhsa.FeatureString('function').meta)

{'description': '✅ syntactic function of phrase (Cmpl; Objc; Pred; ...)',
 'valueType': 'str',
 'version': '2021'}

In [58]:
bhsa.FeatureString('function').meta

{'author': 'Eep Talstra Centre for Bible and Computer',
 'dataset': 'BHSA',
 'datasetName': 'Biblia Hebraica Stuttgartensia Amstelodamensis',
 'description': '✅ syntactic function of phrase (Cmpl; Objc; Pred; ...)',
 'email': 'shebanq@ancient-data.org',
 'encoders': 'Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)',
 'valueType': 'str',
 'version': '2021',
 'website': 'https://shebanq.ancient-data.org',
 'writtenBy': 'Text-Fabric',
 'dateWritten': '2021-12-09T14:17:57Z'}

In [59]:
bhsa.FeatureString('suffix_gender').meta

{'author': 'Eep Talstra Centre for Bible and Computer',
 'dataset': 'BHSA',
 'datasetName': 'Biblia Hebraica Stuttgartensia Amstelodamensis',
 'email': 'shebanq@ancient-data.org',
 'encoders': 'Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)',
 'valueType': 'str',
 'version': '2021',
 'website': 'https://shebanq.ancient-data.org',
 'writtenBy': 'Text-Fabric',
 'dateWritten': '2021-12-09T14:18:15Z'}

In [53]:
bhsa.EdgeString('mother')

<tf.core.edgefeature.EdgeFeature at 0x296fb7610>

In [54]:
bhsa.FeatureString('function')

<tf.core.nodefeature.NodeFeature at 0x296fb6c20>

In [56]:
list(bhsa.EdgeString('omap@c-2021').items())

[(1, {1: None}),
 (2, {2: None}),
 (3, {3: None}),
 (4, {4: None}),
 (5, {5: None}),
 (6, {6: None}),
 (7, {7: None}),
 (8, {8: None}),
 (9, {9: None}),
 (10, {10: None}),
 (11, {11: None}),
 (12, {12: None}),
 (13, {13: None}),
 (14, {14: None}),
 (15, {15: None}),
 (16, {16: None}),
 (17, {17: None}),
 (18, {18: None}),
 (19, {19: None}),
 (20, {20: None}),
 (21, {21: None}),
 (22, {22: None}),
 (23, {23: None}),
 (24, {24: None}),
 (25, {25: None}),
 (26, {26: None}),
 (27, {27: None}),
 (28, {28: None}),
 (29, {29: None}),
 (30, {30: None}),
 (31, {31: None}),
 (32, {32: None}),
 (33, {33: None}),
 (34, {34: None}),
 (35, {35: None}),
 (36, {36: None}),
 (37, {37: None}),
 (38, {38: None}),
 (39, {39: None}),
 (40, {40: None}),
 (41, {41: None}),
 (42, {42: None}),
 (43, {43: None}),
 (44, {44: None}),
 (45, {45: None}),
 (46, {46: None}),
 (47, {47: None}),
 (48, {48: None}),
 (49, {49: None}),
 (50, {50: None}),
 (51, {51: None}),
 (52, {52: None}),
 (53, {53: None}),
 (54, {54: 

In [52]:
dir(bhsa.FeatureString('function'))

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'api',
 'data',
 'freqList',
 'items',
 'meta',
 's',
 'v']

## Conversion Script

In [14]:
BOOK_CUTOFF = bhsa.T.nodeFromSection(('2_Kings',))

NON_SLOT_OTYPES = {
    # NB: exclude books since these will be handled separately
    # in order to exclude books past the BOOK_CUTOFF
    'chapter',
    'lex',
    'verse',
    'half_verse',
    'sentence',
    'sentence_atom',
    'clause',
    'clause_atom',
    'phrase',
    'phrase_atom',
    'subphrase',
}

In [15]:
bhsa.F.otype.all[:-1]

('book',
 'chapter',
 'lex',
 'verse',
 'half_verse',
 'sentence',
 'sentence_atom',
 'clause',
 'clause_atom',
 'phrase',
 'phrase_atom',
 'subphrase')

In [16]:
bhsa.L.u(1, 'clause')

(427559,)

In [17]:
for otype in bhsa.F.otype.all[:-1]:

    slot_order = sorted(
        (bhsa.L.d(n, 'word'), -len(bhsa.L.d(n, 'word')), n)
        for n in bhsa.F.otype.s(otype)
    )

    iter_order = [
        (bhsa.L.d(n, 'word'), -len(bhsa.L.d(n, 'word')), n)
        for n in bhsa.F.otype.s(otype)
    ]
    
    try:
        assert slot_order == iter_order
    except:
        print(otype)

subphrase


In [18]:
from functools import cmp_to_key


def convert_to_chunk(node):
    slots = bhsa.L.d(node, 'word')
    return (node, set(slots))


def _canonical_order(chunk_a, chunk_b):
    (n1, slotsA) = chunk_a
    (n2, slotsB) = chunk_b
    
    if slotsA == slotsB:
        return 0

    aWithoutB = slotsA - slotsB
    if not aWithoutB:
        return 1

    bWithoutA = slotsB - slotsA
    if not bWithoutA:
        return -1

    aMin = min(aWithoutB)
    bMin = min(bWithoutA)
    return -1 if aMin < bMin else 1


canonical_order = cmp_to_key(_canonical_order)

otype = 'subphrase'
canon_order = sorted(
    (convert_to_chunk(n)
    for n in bhsa.F.otype.s(otype)),
    key=canonical_order,
)
iter_order = [
    convert_to_chunk(n) for n in bhsa.F.otype.s(otype)
]

assert canon_order == iter_order

In [20]:
canon_order[32:35]

[(1300573, {256, 257, 258, 259, 260}),
 (1300571, {256, 257}),
 (1300572, {259, 260})]

In [21]:
iter_order[32:35]

[(1300573, {256, 257, 258, 259, 260}),
 (1300571, {256, 257}),
 (1300572, {259, 260})]

In [119]:
merges = [
    # merge operation
    # actions to take:
    # 1) delete last nodes, leave first
    # 2) update oslots for first node
    # 3) delete all features for last nodes
    # 4) update features for first
    [
        # nodes to merge
        [427559, 427560],
        # new features
        ['XQtl', ...],
        # new edges
        [...],
    ],
]

splits = [
    # split operation
    # actions:
    # 1) add new nodes with new oslots
    # 2) update oslots for node
    # 3) update features for all new nodes and for first
    [
        # node
        427559,
        # new oslot map
        [(1, 2, 3, 4), (5, 6, 7)],
        # new cl features
        [
            ('XQtl', ...),
        ],
        # new cl edges
        [
            (...),
            ...
        ],
    ],
]

In [103]:
oslots = collections.defaultdict(dict)

In [110]:
for node in bhsa.N.walk():
    
    otype = bhsa.F.otype.v(node)
    
    # skip slots
    if otype == 'word':
        continue
        
    slots = bhsa.L.d(node, 'word')
    oslots[otype].append((
        slots[0],
        slots[-1] + 1,
    ))

In [111]:
oslots['clause'][:100]

[(1, 12),
 (12, 19),
 (19, 24),
 (24, 32),
 (32, 35),
 (35, 37),
 (37, 40),
 (40, 46),
 (46, 48),
 (48, 58),
 (58, 65),
 (65, 71),
 (71, 74),
 (74, 77),
 (77, 79),
 (79, 82),
 (82, 88),
 (88, 95),
 (95, 101),
 (101, 116),
 (106, 112),
 (116, 122),
 (122, 125),
 (125, 132),
 (132, 135),
 (135, 138),
 (138, 140),
 (140, 143),
 (143, 153),
 (153, 157),
 (157, 160),
 (160, 167),
 (167, 174),
 (174, 177),
 (177, 179),
 (179, 182),
 (182, 191),
 (187, 189),
 (191, 201),
 (195, 198),
 (201, 204),
 (204, 216),
 (210, 214),
 (216, 223),
 (218, 221),
 (223, 226),
 (226, 228),
 (228, 231),
 (231, 234),
 (234, 236),
 (236, 239),
 (239, 245),
 (245, 254),
 (254, 266),
 (266, 274),
 (274, 279),
 (279, 282),
 (282, 314),
 (314, 322),
 (322, 327),
 (327, 337),
 (337, 347),
 (347, 350),
 (350, 352),
 (352, 355),
 (355, 358),
 (358, 360),
 (360, 363),
 (363, 369),
 (369, 380),
 (380, 402),
 (392, 394),
 (394, 396),
 (396, 400),
 (402, 409),
 (409, 412),
 (412, 414),
 (414, 418),
 (418, 420),
 (420, 421)

In [85]:
precedence = (
    'book',
    'chapter',
    'lex',
    'verse',
    'half_verse',
    'sentence',
    'sentence_atom',
    'clause',
    'clause_atom',
    'phrase',
    'phrase_atom',
    'subphrase',
)


class NodeManager:
    """A manager object for adding / removing nodes."""
    
    def __init__(
        self, 
        cv: CV, 
        precedence: Tuple[str]
    ) -> None:
        """Initialize the manager."""
        self.cv = cv
        self.active = []
        self.precedence: Dict[str, int] = self._get_prec_dict(precedence)

    @staticmethod
    def _get_prec_dict(precedence: Tuple[str]) -> Dict[str, int]:
        """Assign precedence dict."""
        return {otype: i for i, otype in enumerate(precedence)}
    
    def add_node(self, node: int, otype: str):
        """Add new nodes and terminate old ones as needed."""
        for node in active:

    def add_slot(self):
        """Add new slots."""
        pass

IndentationError: expected an indented block after function definition on line 33 (3058577605.py, line 36)

In [89]:
test = CV.node('test')

TypeError: CV.node() missing 1 required positional argument: 'nType'

In [86]:
def build_corpus(
    tf_api,
    cv,
    book_cutoff=None,
) -> None:
    """Build a text-fabric corpus from bhsa."""

    for book_node in tf_api.F.otype.s('book'):
        
        # skip books that don't make the cutoff
        if book_cutoff and book_node > book_cutoff:
            continue
        
        book = cv.node('book')
        for node in tf_api.L.d(book):
            otype = tf_api.F.otype.v(node)
            
            else:
                cv.terminate()
        
        cv.terminate(book)

SyntaxError: invalid syntax (455042060.py, line 18)

In [87]:
build_corpus(
    bhsa,
    book_cutoff=BOOK_CUTOFF,
)

426591 Genesis
426592 Exodus
426593 Leviticus
426594 Numeri
426595 Deuteronomium
426596 Josua
426597 Judices
426598 Samuel_I
426599 Samuel_II
426600 Reges_I
426601 Reges_II
