In [79]:
import collections
import os
import shutil
from copy import deepcopy
from functools import cmp_to_key
from pathlib import Path
from typing import Dict, Tuple, Union, Optional, Set, List, Callable, TypedDict, NamedTuple

import yaml
from tf.convert.walker import CV
from tf.core.api import Api
from tf.fabric import Fabric

## Load Data

In [80]:
# Root of a local clone of the ETCBC BHSA repository. The default is the
# original author's checkout; set the BHSA_CORE_DATA environment variable to
# point somewhere else without editing the notebook.
BHSA_CORE_DATA = Path(
    os.environ.get('BHSA_CORE_DATA', '/Users/cody/github/etcbc/bhsa')
)
# Text-Fabric feature files of the 2021 version.
BHSA_TF = BHSA_CORE_DATA / 'tf/2021'
# YAML files holding the feature metadata.
BHSA_YAML = BHSA_CORE_DATA / 'yaml'
BHSA_METADATA_FILES = ['core.yaml', 'lexicon.yaml', 'ketivqere.yaml', 'paragraph.yaml', 'stats.yaml']
BHSA_METADATA_PATHS = [BHSA_YAML / file for file in BHSA_METADATA_FILES]

# Metadata fields shared by all features (loaded into GENERIC_META below).
BHSA_GENERIC = BHSA_YAML / 'generic.yaml'

In [81]:
def load_yaml(filepath):
    """Load a YAML file and return its parsed content.

    Args:
        filepath: path of the YAML file to read.

    Returns:
        Whatever ``yaml.load`` produces for the document (callers in this
        notebook treat the result as a dict).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(filepath, 'r', encoding='utf-8') as infile:
        # NOTE: FullLoader is richer than the safe loader; only point this
        # at trusted configuration files.
        return yaml.load(infile, Loader=yaml.FullLoader)
    

def load_all_feature_metadata(feature_metadata_paths):
    """Merge the feature metadata of several YAML files into one dict.

    Files are read in the order given; when the same feature name occurs
    in more than one file, the entry from the later file wins.
    """
    merged = {}
    for path in feature_metadata_paths:
        merged.update(load_yaml(path).items())
    return merged

In [82]:
# Load every feature of the BHSA 2021 dataset. The location comes from
# BHSA_TF so that the corpus path is configured in one place only.
tf_bhsa = Fabric(str(BHSA_TF))
bhsa = tf_bhsa.loadAll()

  1.76s Feature overview: 109 for nodes; 6 for edges; 1 configs; 9 computed


In [85]:
# Collect every clause node for which L.u finds no enclosing verse node.
verseless_clauses = [
    cl
    for cl in bhsa.F.otype.s('clause')
    if not bhsa.L.u(cl, 'verse')
]

In [86]:
len(verseless_clauses)

50

In [87]:
verseless_clauses[0]

428158

In [88]:
bhsa.T.text(428158)

'מִן־הַבְּהֵמָה֙ הַטְּהֹורָ֔ה וּמִן־הַ֨בְּהֵמָ֔ה וּמִ֨ן־הָעֹ֔וף וְכֹ֥ל שְׁנַ֨יִם שְׁנַ֜יִם בָּ֧אוּ אֶל־נֹ֛חַ אֶל־הַתֵּבָ֖ה זָכָ֣ר וּנְקֵבָ֑ה '

In [89]:
bhsa.T.sectionFromNode(428158)

('Genesis', 7, 8)

In [5]:
# Corpus-level metadata; ThesisCorpusBuilder._rebuild_metadata merges these
# fields into the metadata of every feature it writes.
BHSAKT_METADATA = {
    "corpus": "BHSA-Kingham-thesis",
    "description": "A modified version of the ETCBC's BHSA for my Cambridge PhD thesis",
    "version": "1.0",
    "editor": "Cody Kingham",
    "source": "Eep Talstra Centre for Bible and Computer",
    "source-url": "https://github.com/etcbc/bhsa",
    "encoders": "Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)",
}


# Generic BHSA metadata plus two extra keys. _rebuild_metadata only tests
# key membership against this dict, so the None values act as markers for
# fields that must be stripped from the per-feature metadata.
GENERIC_META = {
    **load_yaml(BHSA_GENERIC),
    'dateWritten': None,
    'writtenBy': None,
}

In [6]:
GENERIC_META

{'dataset': 'BHSA',
 'datasetName': 'Biblia Hebraica Stuttgartensia Amstelodamensis',
 'author': 'Eep Talstra Centre for Bible and Computer',
 'encoders': 'Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)',
 'website': 'https://shebanq.ancient-data.org',
 'email': 'shebanq@ancient-data.org',
 'dateWritten': None,
 'writtenBy': None}

In [7]:
def convert_to_chunk(node):
    """Pair a node with the set of word slots it contains.

    Reads the module-level ``bhsa`` API. Returns a 2-tuple
    ``(node, slots)``.

    NOTE(review): ``_canonical_order`` unpacks 3-tuples
    ``(node, rank, slots)``, so this result cannot be sorted with
    ``canonical_order`` as is — confirm whether a rank should be included.
    """
    return node, set(bhsa.L.d(node, 'word'))


def _canonical_order(node_chunk_a, node_chunk_b):
    """Three-way comparison of two ``(node, precedence, slots)`` chunks.

    Returns -1 when ``a`` sorts before ``b``, 1 when it sorts after and 0
    when both have the same precedence and the same slot set.

    Ordering rules, in this order:
    1. the chunk with the higher precedence comes first;
    2. equal slot sets compare equal;
    3. a chunk whose slots are a proper subset of the other's comes after it;
    4. otherwise the chunk owning the smallest slot that the other one
       lacks comes first.
    """
    _, rank_a, slots_a = node_chunk_a
    _, rank_b, slots_b = node_chunk_b

    # rule 1: precedence decides whenever it differs
    if rank_a > rank_b:
        return -1
    if rank_b > rank_a:
        return 1

    # rule 2: identical extent
    if slots_a == slots_b:
        return 0

    # rule 3: the embedded chunk goes after its embedder
    only_in_a = slots_a - slots_b
    if not only_in_a:
        return 1
    only_in_b = slots_b - slots_a
    if not only_in_b:
        return -1

    # rule 4: compare the first slot that distinguishes each chunk
    if min(only_in_a) < min(only_in_b):
        return -1
    return 1


# key function usable with sorted() / list.sort()
canonical_order = cmp_to_key(_canonical_order)

In [8]:
# test the canonical sorting

# Each entry has the (node, precedence, slot set) shape that
# _canonical_order unpacks.
test = [
    (1, 9, {1, 2, 3}),
    (2, 10, {1, 2, 3}),
    (3, 5, {3, 4}),
    (3, 5, {1, 2}),
    (3, 6, {1, 2, 3}),
    (3, 6, {1, 2})
]

# Expected order: higher precedence first; within one precedence a superset
# precedes its subset, otherwise the chunk with the smaller distinguishing
# slot comes first.
sorted(test, key=canonical_order)

[(2, 10, {1, 2, 3}),
 (1, 9, {1, 2, 3}),
 (3, 6, {1, 2, 3}),
 (3, 6, {1, 2}),
 (3, 5, {1, 2}),
 (3, 5, {3, 4})]

In [39]:
def _copy_meta_dicts(feature_dict):
    """Return ``{feature name: deep copy of that feature object's .meta}``."""
    return {
        feat: deepcopy(feat_obj.meta)
        for feat, feat_obj in feature_dict.items()
    }


def _copy_feature_dicts(feature_dict):
    """Return ``{feature name: dict built from that feature object's items()}``."""
    return {
        feat: dict(feat_obj.items())
        for feat, feat_obj in feature_dict.items()
    }


def get_copy_of_corpus(tf_fabric: Fabric):
    """Copy the data of a loaded Text-Fabric corpus into plain dicts.

    Args:
        tf_fabric: a Fabric object whose ``api`` attribute is available
            (i.e. the corpus has been loaded).

    Returns:
        A dict with three keys:
        ``'nodeFeatures'`` and ``'edgeFeatures'`` map feature names to the
        data of every attribute found on ``api.F`` / ``api.E``;
        ``'metaData'`` maps feature names (plus ``'otext'``) to copies of
        their metadata.
    """
    tf_api = tf_fabric.api
    node_features = _copy_feature_dicts(tf_api.F.__dict__)
    edge_features = _copy_feature_dicts(tf_api.E.__dict__)
    metadata = {
        **_copy_meta_dicts(tf_api.F.__dict__),
        **_copy_meta_dicts(tf_api.E.__dict__),
    }
    # Deep-copy like every other metadata entry: the result is edited in
    # place later on and must not write through to the loaded corpus.
    metadata['otext'] = deepcopy(tf_fabric.features['otext'].metaData)
    return {
        'nodeFeatures': node_features,
        'edgeFeatures': edge_features,
        'metaData': metadata,
    }


# Type aliases for the plain-dict corpus copy produced by get_copy_of_corpus.
featureType = Union[str, int]
# an edge target collection: either bare node sets or {target node: value}
edgeType = Union[Set[int], Dict[int, featureType]]
nodeFeatureDict = Dict[str, Dict[int, featureType]]
edgeFeatureDict = Dict[str, Dict[int, edgeType]]
metaDataDict = Dict[str, Dict[str, str]]

class corpusData(TypedDict):
    """Shape of the corpus copy; keys match the dict built by get_copy_of_corpus."""
    nodeFeatures: nodeFeatureDict
    edgeFeatures: edgeFeatureDict
    metaData: metaDataDict

# A callable taking the corpus copy and returning new node features.
# The argument list of Callable must itself be a list.
featureGenerator = Callable[[corpusData], nodeFeatureDict]


class editAction:
    """Plain container grouping the corpus edits that belong together.

    Attributes:
        deletions: nodes to delete.
        feature_updates: ``{feature: {node: value}}`` node feature changes.
        edge_updates: ``{feature: {node: edges}}`` edge feature changes.
    """

    def __init__(
            self,
            deletions: Optional[Set[int]] = None,
            feature_updates: Optional[nodeFeatureDict] = None,
            edge_updates: Optional[edgeFeatureDict] = None,
    ):
        """Initialize edit action object; omitted parts default to empty."""
        self.deletions = deletions or set()
        self.feature_updates = feature_updates or {}
        self.edge_updates = edge_updates or {}

    def __repr__(self):
        """Show the contained edits instead of the default object address."""
        return (
            f'{type(self).__name__}('
            f'deletions={self.deletions!r}, '
            f'feature_updates={self.feature_updates!r}, '
            f'edge_updates={self.edge_updates!r})'
        )


class ThesisCorpusBuilder:
    """Build an edited, renumbered copy of a Text-Fabric corpus.

    ``build`` runs the whole pipeline: load the source corpus, copy its
    feature data, keep only the nodes inside the retained books, apply
    node / feature / edge edits, renumber the non-slot nodes, rewrite the
    metadata, delete / rename features and save the result with Text-Fabric.
    """

    def __init__(
        self,
        locations: Optional[List[str]] = None,
        book_limit: Optional[str] = None,
        delete_features: Optional[Set[str]] = None,
        rename_features: Optional[Dict[str, str]] = None,
        add_features: Optional[Dict[str, featureGenerator]] = None,
        update_metadata: Optional[metaDataDict] = None,
        update_features: Optional[nodeFeatureDict] = None,
        update_edges: Optional[edgeFeatureDict] = None,
        delete_nodes: Optional[Set[int]] = None,
        edit_actions: Optional[List[editAction]] = None,
        tf_fabric: Optional[Fabric] = None,
    ):
        """Initialize the thesis corpus builder.

        Args:
            locations: locations handed to ``Fabric`` when no ``tf_fabric``
                is supplied.
            book_limit: section heading of the last book to keep (it is
                passed to ``T.nodeFromSection``); a falsy value keeps all
                books.
            delete_features: feature names to drop from the output.
            rename_features: ``{old name: new name}``, applied after the
                deletions.
            add_features: stored on the instance; no method of this class
                reads it.
            update_metadata: ``{feature: {field: value}}`` merged into the
                copied metadata.
            update_features: ``{feature: {node: value}}`` node feature edits.
            update_edges: ``{feature: {node: edges}}`` edge feature edits.
            delete_nodes: nodes to remove.
            edit_actions: grouped edits; each one is merged into
                ``delete_nodes`` / ``update_features`` / ``update_edges``.
            tf_fabric: an already loaded Fabric object to reuse.
        """
        self.locations = locations or ''
        self.book_limit = book_limit
        self.add_features = add_features or {}
        self.update_metadata = update_metadata or {}
        self.delete_features = delete_features or set()
        self.rename_features = rename_features or {}
        # Work on private copies: the edit actions are merged into these
        # containers and that must not leak into the caller's arguments.
        self.delete_nodes = set(delete_nodes or ())
        self.update_features = {
            feature: dict(node_map)
            for feature, node_map in (update_features or {}).items()
        }
        self.update_edges = {
            feature: dict(edge_map)
            for feature, edge_map in (update_edges or {}).items()
        }
        self._add_edit_actions(
            edit_actions or [],
            self.delete_nodes,
            self.update_features,
            self.update_edges,
        )
        self.tf_fabric = tf_fabric
        self.tf_api = tf_fabric.api if tf_fabric else None

    @staticmethod
    def _add_edit_actions(
            edit_actions: List[editAction],
            delete_nodes: Set[int],
            update_features: nodeFeatureDict,
            update_edges: edgeFeatureDict,
    ) -> None:
        """Merge all edit actions into the given containers (in place).

        Updates are merged per feature, so several actions may touch the
        same feature (e.g. ``oslots``) without discarding each other's
        nodes. For the same feature and node the later action wins.
        """
        for action in edit_actions:
            delete_nodes.update(action.deletions)
            for feature, node_map in action.feature_updates.items():
                update_features.setdefault(feature, {}).update(node_map)
            for feature, edge_map in action.edge_updates.items():
                update_edges.setdefault(feature, {}).update(edge_map)

    def _get_keep_node_set(self):
        """Get the set of nodes to keep and the highest kept slot.

        Walks the book nodes in the order ``F.otype.s('book')`` yields
        them and stops after the book named by ``book_limit``.

        Returns:
            ``(keep_nodes, max_slot)``: every kept book node plus all nodes
            ``L.d`` reports below it, and the largest ``word`` node seen.

        Raises:
            ValueError: if ``book_limit`` is set but does not resolve to a
                node; previously this silently disabled the limit.
        """
        api = self.tf_api
        limit_node = None
        if self.book_limit:
            limit_node = api.T.nodeFromSection((self.book_limit,))
            if limit_node is None:
                raise ValueError(
                    f'book_limit {self.book_limit!r} is not a known book'
                )

        keep_nodes = set()
        max_slot = 0
        for book_node in api.F.otype.s('book'):
            if limit_node and book_node > limit_node:
                break
            keep_nodes.add(book_node)
            for node in api.L.d(book_node):
                keep_nodes.add(node)
                if api.F.otype.v(node) == 'word':
                    # do not rely on the iteration order of L.d
                    max_slot = max(max_slot, node)
        return keep_nodes, max_slot

    @staticmethod
    def _filter_nodes_from_feature_dict(feature_dict, keep_nodes):
        """Return a copy of ``feature_dict`` restricted to ``keep_nodes``.

        Only the keys of each per-feature mapping are checked.
        NOTE(review): for edge features this keeps edges whose *targets*
        are outside ``keep_nodes`` — confirm that is acceptable.
        """
        return {
            feature: {
                node: value
                for node, value in node_dict.items()
                if node in keep_nodes
            }
            for feature, node_dict in feature_dict.items()
        }

    def _filter_feature_dict_nodes(self, corpus_data, keep_node_set):
        """Restrict node and edge features of ``corpus_data`` in place."""
        for data_type in ('nodeFeatures', 'edgeFeatures'):
            corpus_data[data_type] = self._filter_nodes_from_feature_dict(
                corpus_data[data_type],
                keep_node_set,
            )

    def _rebuild_nodes_from_oslots(
            self,
            oslot_map,
            otype_map,
            max_slot,
    ) -> Dict[int, int]:
        """Rebuild node numbering scheme from oslots.

        Every node in ``oslot_map`` is sorted with ``canonical_order``
        (type rank first, then slot extent) and numbered consecutively
        starting at ``max_slot + 1``.

        Returns:
            ``{old node: new node}`` for all nodes in ``oslot_map``.
        """
        # get sorted list of oslot data
        oslots = []
        for node, oslot_set in oslot_map.items():
            otype = otype_map[node]
            otype_rank = self.tf_api.Nodes.otypeRank[otype]
            oslots.append((node, otype_rank, set(oslot_set)))
        oslots.sort(key=canonical_order)

        # create mapping to new node numbers
        return {
            old_node: (i + max_slot)
            for i, (old_node, _, _) in enumerate(oslots, 1)
        }

    @staticmethod
    def _reindex_node_features(old_node_features, remapper):
        """Return node features with every node key passed through ``remapper``.

        Features without any node do not appear in the result.
        """
        node_features = collections.defaultdict(dict)
        for feature, node_dict in old_node_features.items():
            for node, fvalue in node_dict.items():
                node_features[feature][remapper(node)] = fvalue
        return node_features

    @staticmethod
    def _reindex_edge_features(old_edge_features, remapper):
        """Return edge features with sources and targets passed through ``remapper``.

        Valued edges (``{target: value}``) keep their values; unvalued
        edges become a set of remapped targets. Features without any node
        do not appear in the result.
        """
        edge_features = collections.defaultdict(dict)
        for feature, edge_dict in old_edge_features.items():
            for node, edges in edge_dict.items():
                if isinstance(edges, dict):
                    # keep the edge values, only translate the targets
                    remapped = {
                        remapper(target): value
                        for target, value in edges.items()
                    }
                else:
                    remapped = {remapper(target) for target in edges}
                edge_features[feature][remapper(node)] = remapped
        return edge_features

    def _reindex_nodes(self, corpus_data: corpusData, max_slot):
        """Renumber the nodes of ``corpus_data`` in place.

        Also stores the edge feature ``omap@2021-KT`` whose edges run from
        each new node number to ``{old node number: None}``.
        """
        # rebuild node numbering from oslot data
        new_node_map = self._rebuild_nodes_from_oslots(
            corpus_data['edgeFeatures']['oslots'],
            corpus_data['nodeFeatures']['otype'],
            max_slot,
        )

        def remapper(node):
            """Translate an old node; nodes without oslots entry (slots) stay."""
            return new_node_map.get(node, node)

        # remap all features using the new numbering scheme
        node_features = self._reindex_node_features(corpus_data['nodeFeatures'], remapper)
        edge_features = self._reindex_edge_features(corpus_data['edgeFeatures'], remapper)

        # apply the changes to the dict
        corpus_data['nodeFeatures'] = node_features
        corpus_data['edgeFeatures'] = edge_features

        # add a helper map back to old nodes for referencing
        corpus_data['edgeFeatures']['omap@2021-KT'] = {
            new_node: {old_node: None}
            for old_node, new_node in new_node_map.items()
        }

    def _update_metadata(self, corpus_data: corpusData):
        """Merge ``self.update_metadata`` into the metadata, field by field."""
        for field, data in self.update_metadata.items():
            corpus_data['metaData'].setdefault(field, {}).update(data)

    def _rebuild_metadata(self, corpus_data: corpusData):
        """Replace generic BHSA metadata by the thesis corpus metadata.

        For every feature the keys listed in ``GENERIC_META`` are dropped
        and ``BHSAKT_METADATA`` is added; feature-specific keys win over
        the corpus-level ones.
        """
        new_metadata = {}
        for feature, meta in corpus_data['metaData'].items():
            unique_meta = {
                k: v for k, v in meta.items()
                if k not in GENERIC_META
            }
            new_metadata[feature] = {
                **BHSAKT_METADATA,
                **unique_meta,
            }
        corpus_data['metaData'] = new_metadata

    def _delete_features(self, corpus_data: corpusData):
        """Remove features from node data, edge data and metadata."""
        for feature in self.delete_features:
            for data_dict in corpus_data.values():
                data_dict.pop(feature, None)

    def _rename_features(self, corpus_data: corpusData):
        """Rename features in node data, edge data and metadata."""
        for old_name, new_name in self.rename_features.items():
            for data_dict in corpus_data.values():
                if old_name in data_dict:
                    # pop-then-assign also survives old_name == new_name
                    data_dict[new_name] = data_dict.pop(old_name)

    def _update_features(self, corpus_data: corpusData):
        """Apply ``self.update_features`` to the node features (per node)."""
        for feature, update_dict in self.update_features.items():
            corpus_data['nodeFeatures'].setdefault(feature, {}).update(update_dict)

    def _update_edges(self, corpus_data: corpusData):
        """Apply ``self.update_edges`` to the edge features (per node)."""
        for feature, update_dict in self.update_edges.items():
            corpus_data['edgeFeatures'].setdefault(feature, {}).update(update_dict)

    def _delete_nodes(self, corpus_data: corpusData):
        """Delete ``self.delete_nodes`` from the corpus.

        Removes them as keys of node features and as both sources and
        targets of edges. Edge features left without any source node do
        not appear in the result.
        """
        # delete from feature values
        for feature, node_data in corpus_data['nodeFeatures'].items():
            corpus_data['nodeFeatures'][feature] = {
                node: value
                for node, value in node_data.items()
                if node not in self.delete_nodes
            }
        # delete from edge relations
        new_edges = collections.defaultdict(dict)
        for feature, edge_data in corpus_data['edgeFeatures'].items():
            for node, edges in edge_data.items():
                if node in self.delete_nodes:
                    continue
                if isinstance(edges, dict):
                    new_edges[feature][node] = {
                        n: value
                        for n, value in edges.items()
                        if n not in self.delete_nodes
                    }
                else:
                    new_edges[feature][node] = {
                        n for n in edges
                        if n not in self.delete_nodes
                    }
        corpus_data['edgeFeatures'] = new_edges

    @staticmethod
    def _clear_directory(dest_dir: str):
        """Leave ``dest_dir`` as an existing, empty directory.

        Old content is removed; a destination that does not exist yet
        (first build) is simply created, including missing parents.
        """
        target = Path(dest_dir)
        if target.exists():
            shutil.rmtree(target)
        target.mkdir(parents=True)

    def _load_tf_corpus(self):
        """Load the Text-Fabric corpus unless a Fabric object was supplied."""
        if not self.tf_fabric:
            self.tf_fabric = Fabric(self.locations)
            self.tf_api = self.tf_fabric.loadAll()

    def build(self, dest_dir: str):
        """Build the corpus and save it to ``dest_dir``.

        ``dest_dir`` is emptied first. Edits are applied before the
        renumbering, so the node numbers in ``delete_nodes``,
        ``update_features`` and ``update_edges`` refer to the source
        corpus.

        Returns:
            The final corpus data dict that was handed to ``Fabric.save``.
        """
        print('Loading TF corpus...')
        self._load_tf_corpus()

        print('Getting a copy of the corpus...')
        corpus_data = get_copy_of_corpus(self.tf_fabric)

        print('Filtering the nodes...')
        keep_node_set, max_slot = self._get_keep_node_set()
        self._filter_feature_dict_nodes(corpus_data, keep_node_set)

        print('Applying graph edits...')
        self._delete_nodes(corpus_data)
        self._update_features(corpus_data)
        self._update_edges(corpus_data)

        print('Reindexing the nodes...')
        self._reindex_nodes(corpus_data, max_slot)

        print('Rebuilding metadata...')
        self._update_metadata(corpus_data)
        self._rebuild_metadata(corpus_data)

        print('Refactoring features...')
        self._delete_features(corpus_data)
        self._rename_features(corpus_data)

        print('Saving new corpus...')
        self._clear_directory(dest_dir)
        saver = Fabric(dest_dir)
        saver.save(**corpus_data)
        return corpus_data

In [66]:
# NOTE(review): book_limit is assigned here, but the builder below is
# constructed with book_limit=None, so this value is never used — confirm
# which of the two is intended.
book_limit = '2_Kings'

# Features dropped from the output (node data, edge data and metadata).
delete_features = {
    'book@am', 'book@ar', 'book@bn', 'book@da',
    'book@de', 'book@el', 'book@es', 'book@fa', 
    'book@fr', 'book@he', 'book@hi', 'book@id', 
    'book@ja', 'book@ko', 'book@la', 'book@nl', 
    'book@pa', 'book@pt', 'book@ru', 'book@sw', 
    'book@syc', 'book@tr', 'book@ur', 'book@yo', 
    'book@zh', 'book', 'dist', 'dist_unit', 
    'mother_object_type', 'number',
    'functional_parent', 'distributional_parent',
    'language', 'languageISO',
    'omap@c-KT', 'omap@c-2021', 'omap@2017-2021',
}

# build() renames after deleting, so 'book@en' takes over the name 'book'
# that delete_features has just removed.
rename_features = {
    'book@en': 'book',
}

# Text-Fabric data directories loaded together as one corpus.
locations = [
    '/Users/cody/github/etcbc/bhsa/tf/2021',
    '/Users/cody/github/etcbc/genre_synvar/tf/2021',
]

# build() applies these edits BEFORE the nodes are renumbered, so all node
# numbers must be those of the source corpus.
edit_actions = [
    # Re-bracket clauses 484384 / 484385: slots 283997-284001 move from
    # 484385 to 484384, and one phrase gets new function / prep_type values.
    # NOTE(review): 974852 shows up elsewhere in this notebook only as a
    # phrase node of the rebuilt test corpus; in the source corpus the phrase
    # over slot 283997 is reported as 819184 — confirm the node number.
    editAction(
        feature_updates={
            'function': {974852: 'LocaTime'},
            'prep_type': {974852: 'B_simul'},
        },
        edge_updates={
            'oslots': {
                484384: {283995, 283996, 283997, 283998, 
                         283999, 284000, 284001},
                484385: {284002, 284003, 284004, 284005},
            },
        },
    ),
    # Delete node 454316 and extend 454315 over slots 143086-143099, i.e.
    # its own slots plus those of the deleted node.
    editAction(
        deletions={454316},
        edge_updates={
            'oslots': {
                454315: {143086, 143087, 143088, 143089,
                         143090, 143091, 143092, 143093,
                         143094, 143095, 143096, 143097,
                         143098, 143099}
            },
        },
    ),
]


# Metadata merged into the copied metadata, field by field.
update_metadata = {
    'omap@2021-KT': {
        'description': 'Mapping between nodes in BHSA 2021 version to BHSA Kingham Thesis version',
        'valueType': 'int',
    },
    # NOTE(review): 'test123' looks like a placeholder description.
    'prep_type': {'description': 'test123', 'valueType': 'str'},
}

corpus_builder = ThesisCorpusBuilder(
    locations,
    book_limit=None,
    update_metadata=update_metadata,
    delete_features=delete_features,
    rename_features=rename_features,
    edit_actions=edit_actions,
)

In [67]:
test_corpus = corpus_builder.build('test_corpus')

Loading TF corpus...
  1.91s Feature overview: 110 for nodes; 6 for edges; 1 configs; 9 computed
Getting a copy of the corpus...
Filtering the nodes...
Applying node edits...
Reindexing the nodes...
Rebuilding metadata...
Refactoring features...
Saving new corpus...
  0.00s Not all of the warp features otype and oslots are present in
test_corpus
  0.00s Only the Feature and Edge APIs will be enabled
  0.00s Warp feature "otext" not found. Working without Text-API

  0.00s Exporting 79 node and 3 edge and 1 config features to test_corpus:
  0.00s VALIDATING oslots feature
  0.07s VALIDATING oslots feature
  0.07s maxSlot=     426590
  0.07s maxNode=    1441824
  0.14s OK: oslots is valid
   |     0.00s T book                 to test_corpus
   |     0.01s T chapter              to test_corpus
   |     0.04s T code                 to test_corpus
   |     0.26s T det                  to test_corpus
   |     0.04s T domain               to test_corpus
   |     0.20s T freq_lex             t

In [None]:
# Inspect the Fabric object held by the builder (the trailing dot of the
# unfinished attribute access was a syntax error).
corpus_builder.tf_fabric

In [65]:
test_corpus['edgeFeatures']['omap@2021-KT'].get(655162)

{454315: None}

In [52]:
bhsa.T.text(454315)

'בַּיֹּ֤ום הַהוּא֙ אָקִ֣ים אֶל־עֵלִ֔י אֵ֛ת כָּל־'

In [29]:
bhsa.T.text(454315)

'בַּיֹּ֤ום הַהוּא֙ אָקִ֣ים אֶל־עֵלִ֔י אֵ֛ת כָּל־'

In [33]:
bhsa.L.d(454316, 'word')

(143096, 143097, 143098, 143099)

In [31]:
bhsa.L.d(454315, 'word')

(143086,
 143087,
 143088,
 143089,
 143090,
 143091,
 143092,
 143093,
 143094,
 143095)

In [27]:
bhsa.Es('mother').f(543347)

(543346,)

In [28]:
bhsa.T.text(543346)

'בַּיֹּ֤ום הַהוּא֙ אָקִ֣ים אֶל־עֵלִ֔י אֵ֛ת כָּל־'

In [14]:
bhsa = corpus_builder.tf_api

In [19]:
bhsa.Es('omap@c-2021').f(454299)

((454315, None),)

In [68]:
# load the corpus and test
test_fabric = Fabric('test_corpus')
test_api = test_fabric.loadAll()

   |     1.29s T otype                from test_corpus
   |       16s T oslots               from test_corpus
   |     0.01s T qere_utf8            from test_corpus
   |     1.47s T g_lex_utf8           from test_corpus
   |     0.06s T verse                from test_corpus
   |     1.54s T g_word               from test_corpus
   |     0.01s T qere_trailer         from test_corpus
   |     1.42s T g_cons               from test_corpus
   |     1.38s T lex                  from test_corpus
   |     0.00s T book                 from test_corpus
   |     1.39s T lex_utf8             from test_corpus
   |     1.42s T g_cons_utf8          from test_corpus
   |     0.01s T qere_trailer_utf8    from test_corpus
   |     1.41s T g_lex                from test_corpus
   |     1.24s T trailer_utf8         from test_corpus
   |     1.23s T trailer              from test_corpus
   |     0.01s T qere                 from test_corpus
   |     1.54s T g_word_utf8          from test_corpus
   |     1

In [77]:
test_api.Es('omap@2021-KT').t(454317)

(655163,)

In [71]:
test_api.L.u(655162, 'sentence')

(519398,)

In [70]:
test_api.T.text(655162)

'בַּיֹּ֤ום הַהוּא֙ אָקִ֣ים אֶל־עֵלִ֔י אֵ֛ת כָּל־אֲשֶׁ֥ר דִּבַּ֖רְתִּי אֶל־בֵּיתֹ֑ו '

In [72]:
test_api.T.text(519398)

'בַּיֹּ֤ום הַהוּא֙ אָקִ֣ים אֶל־עֵלִ֔י אֵ֛ת כָּל־אֲשֶׁ֥ר דִּבַּ֖רְתִּי אֶל־בֵּיתֹ֑ו הָחֵ֖ל וְכַלֵּֽה׃ '

In [78]:
test_api.T.text(655163)

'הָחֵ֖ל '

In [24]:
for ph in test_api.L.d(685231, 'phrase'):
    print(ph)
    print(test_api.F.function.v(ph))
    print(test_api.T.text(ph))
    print()

974850
Conj
וְ

974851
Pred
הָיָ֣ה׀ 

974852
Time
בַּיֹּ֣ום הַה֗וּא 



In [21]:
test_api.L.d(685232, 'word')

(284002, 284003, 284004, 284005)

In [165]:
test_api.T.text(484385)

'מִשְׁפַּ֨חַת בֵּית־דָּוִ֤יד לְבָד֙ וּנְשֵׁיהֶ֣ם לְבָ֔ד מִשְׁפַּ֤חַת בֵּית־נָתָן֙ לְבָ֔ד וּנְשֵׁיהֶ֖ם לְבָֽד׃ '

In [None]:
test_api.T.text(484384)

In [None]:
bhsa = corpus_builder.tf_api

In [None]:
bhsa.F.mother_object_type.v(484385)

'clause'

In [None]:
bhsa.F.typ.v(484385)

'xYqX'

In [None]:
bhsa.Es('omap@c-2021').f(484368)

((484385, None),)

In [None]:
bhsa.T.text(484385)

'בַּיֹּ֣ום הַה֗וּא יַעֲל֤וּ דְבָרִים֙ עַל־לְבָבֶ֔ךָ '

In [None]:
bhsa.E.functional_parent.f(484386)

(1213703,)

In [78]:
bhsa.T.text(1213703)

'וְחָשַׁבְתָּ֖ מַחֲשֶׁ֥בֶת רָעָֽה׃ '

In [68]:
bhsa.F.otype.v(1213703)

'sentence'

In [67]:
bhsa.E.functional_parent.t(484386)

(574184, 819188, 819189, 819190)

In [59]:
# get all clause annotations

In [60]:
corpus_copy = get_copy_of_corpus(corpus_builder.tf_fabric)

In [110]:
def find_all_features(node, corpus_copy):
    """List the features that carry data for ``node``.

    Args:
        node: node number to look up.
        corpus_copy: dict with ``'nodeFeatures'`` and ``'edgeFeatures'``
            entries, each mapping feature names to per-node mappings.

    Returns:
        ``{'nodeFeatures': set of names, 'edgeFeatures': set of names}``
        with the features whose mapping has ``node`` as a key.
    """
    return {
        feat_type: {
            feature
            for feature, node_dict in corpus_copy[feat_type].items()
            if node in node_dict
        }
        for feat_type in ('nodeFeatures', 'edgeFeatures')
    }

In [111]:
find_all_features(484385, corpus_copy)

{'nodeFeatures': {'dist',
  'dist_unit',
  'domain',
  'kind',
  'mother_object_type',
  'number',
  'otype',
  'rela',
  'txt',
  'typ'},
 'edgeFeatures': {'functional_parent',
  'omap@2017-2021',
  'omap@c-2021',
  'oslots'}}

In [120]:
find_all_features(1081944, corpus_copy)

{'nodeFeatures': {'det',
  'dist',
  'dist_unit',
  'number',
  'otype',
  'rela',
  'typ'},
 'edgeFeatures': {'distributional_parent',
  'functional_parent',
  'omap@2017-2021',
  'omap@c-2021',
  'oslots'}}

In [None]:
# Inspect the edge-feature API (the trailing dot of the unfinished
# attribute access was a syntax error).
bhsa.E

In [113]:
bhsa.E.functional_parent.f(819184)

(484385,)

In [117]:
bhsa.T.text(819184)

'בַּיֹּ֣ום הַה֗וּא '

In [114]:
bhsa.F.otype.v(484385)

'phrase'

In [115]:
bhsa.T.text(484385)

'בַּיֹּ֣ום הַה֗וּא יַעֲל֤וּ דְבָרִים֙ עַל־לְבָבֶ֔ךָ '

In [79]:
to_delete = {
    'dist', 'dist_unit', 
    'mother_object_type', 'number',
    
}

In [63]:
all_cl_feats

{'nodeFeatures': {'dist',
  'dist_unit',
  'domain',
  'kind',
  'mother_object_type',
  'number',
  'otype',
  'rela',
  'txt',
  'typ'},
 'edgeFeatures': {'functional_parent',
  'omap@2017-2021',
  'omap@c-2021',
  'oslots'}}

In [103]:
for slot in bhsa.L.d(484385, 'word'):
    print(slot, bhsa.T.text(slot))

283997 בַּ
283998 
283999 יֹּ֣ום 
284000 הַ
284001 ה֗וּא 
284002 יַעֲל֤וּ 
284003 דְבָרִים֙ 
284004 עַל־
284005 לְבָבֶ֔ךָ 


In [144]:
for slot in bhsa.L.d(484384, 'word'):
    print(slot, bhsa.T.text(slot))

283995 וְ
283996 הָיָ֣ה׀ 


In [106]:
bhsa.L.u(283997, 'phrase')

(819184,)

In [107]:
bhsa.L.u(283997, 'phrase_atom')

(1081944,)

In [118]:
bhsa.T.text(484384)

'וְהָיָ֣ה׀ '

In [143]:
bhsa.T.text(484385)

'בַּיֹּ֣ום הַה֗וּא יַעֲל֤וּ דְבָרִים֙ עַל־לְבָבֶ֔ךָ '

In [142]:
test

<__main__.editAction at 0x3368a95a0>

In [119]:
# Draft specifications for further kinds of edit operations.
# NOTE(review): nothing in the visible notebook reads shift_slots, merges
# or splits — presumably design sketches; confirm before relying on them.
shift_slots = (
    (
        # shift from left to right
        # left, slots, right
        [
            484385, 
            {283997, 283998, 283999, 284000, 284001}, 
            484384
        ],
        
        # node feature updates
        {},
        
        # edge feature updates
        {},
    ),
)

merges = (
    (
        # nodes to merge, to leftmost
        [],
        # node feature updates
        {},
        # edge feature updates
        {},
    ),
)

splits = [
    # split operation
    # actions:
    # 1) add new nodes with new oslots
    # 2) update oslots for node
    # 3) update features for all new nodes and for first
    [
        # node
        427559,
        # new oslot map
        [(1, 2, 3, 4), (5, 6, 7)],
        # features for new nodes
        [],
        # edges for new nodes
        [],
        # (existing) node feature updates
        [],
        # edge feature updates
        [],
    ],
]

typing.Tuple[typing.Tuple[int], typing.Dict[str, typing.Union[str, int]], typing.Union[typing.Set[int], typing.Dict[int, typing.Union[str, int]]]]

typing.Tuple[int, typing.List[typing.Dict[str, typing.Union[str, int]]], typing.List[typing.Union[typing.Set[int], typing.Dict[int, typing.Union[str, int]]]]]