In [1]:
import collections
import os
from copy import deepcopy
from functools import cmp_to_key
from pathlib import Path
from typing import Dict, Tuple, Union, Optional, Set, List

import yaml
from tf.convert.walker import CV
from tf.core.api import Api
from tf.fabric import Fabric

## Load Data

In [2]:
# Root of the local BHSA checkout. Set the BHSA_CORE_DATA environment
# variable to point somewhere else; the fallback is the original hard-coded
# location, so existing runs are unaffected.
BHSA_CORE_DATA = Path(
    os.environ.get('BHSA_CORE_DATA', '/Users/cody/github/etcbc/bhsa')
).expanduser()

# Text-Fabric feature files and the YAML feature-metadata directory.
BHSA_TF = BHSA_CORE_DATA / 'tf/2021'
BHSA_YAML = BHSA_CORE_DATA / 'yaml'

# Per-feature metadata files, resolved to full paths below.
BHSA_METADATA_FILES = ['core.yaml', 'lexicon.yaml', 'ketivqere.yaml', 'paragraph.yaml', 'stats.yaml']
BHSA_METADATA_PATHS = [BHSA_YAML / file for file in BHSA_METADATA_FILES]

# Metadata shared by every feature of the source dataset.
BHSA_GENERIC = BHSA_YAML / 'generic.yaml'

In [3]:
def load_yaml(filepath):
    """Load a YAML file and return its parsed content.

    Args:
        filepath: path of the YAML file to read.

    Returns:
        Whatever the YAML document parses to (a dict for the config files
        used in this notebook).
    """
    # YAML is read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(filepath, 'r', encoding='utf-8') as infile:
        # NOTE(review): FullLoader is kept to preserve behaviour. It is meant
        # for trusted input only; consider yaml.safe_load if these files ever
        # come from an untrusted source.
        return yaml.load(infile, Loader=yaml.FullLoader)
    

def load_all_feature_metadata(feature_metadata_paths):
    """Merge the top-level entries of several YAML files into one dict.

    Files are read in the order given; when the same key occurs in more than
    one file, the value from the later file wins.
    """
    merged = {}
    for yaml_path in feature_metadata_paths:
        for name, meta in load_yaml(yaml_path).items():
            merged[name] = meta
    return merged

In [4]:
# tf_bhsa = Fabric('/Users/cody/github/etcbc/bhsa/tf/2021')
# bhsa = tf_bhsa.loadAll()

In [5]:
# Metadata written onto every feature of the derived corpus
# (see ThesisCorpusBuilder._rebuild_metadata).
BHSAKT_METADATA = {
    "corpus": "BHSA-Kingham-thesis",
    "description": "A modified version of the ETCBC's BHSA for my Cambridge PhD thesis",
    "version": "1.0",
    "editor": "Cody Kingham",
    "source": "Eep Talstra Centre for Bible and Computer",
    "source-url": "https://github.com/etcbc/bhsa",
    "encoders": "Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)",
}


# Generic metadata of the source dataset. Only the *keys* matter downstream:
# they are stripped from each feature's metadata before BHSAKT_METADATA is
# applied. The two extra keys are registered here (with no value) so that
# they are stripped as well.
GENERIC_META = load_yaml(BHSA_GENERIC)
GENERIC_META.update(dateWritten=None, writtenBy=None)

In [6]:
GENERIC_META

{'dataset': 'BHSA',
 'datasetName': 'Biblia Hebraica Stuttgartensia Amstelodamensis',
 'author': 'Eep Talstra Centre for Bible and Computer',
 'encoders': 'Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)',
 'website': 'https://shebanq.ancient-data.org',
 'email': 'shebanq@ancient-data.org',
 'dateWritten': None,
 'writtenBy': None}

In [7]:
def convert_to_chunk(node, api=None):
    """Pair a node with the set of word slots it contains.

    Args:
        node: the node to look up.
        api: object exposing ``L.d(node, otype)`` (a loaded Text-Fabric API).
            When omitted, the notebook-level global ``bhsa`` is used, which
            must have been defined beforehand.

    Returns:
        A ``(node, slots)`` tuple where ``slots`` is a set.

    Note:
        ``_canonical_order`` unpacks 3-tuples ``(node, rank, slots)``; the
        2-tuple returned here carries no rank and cannot be passed to it
        unchanged.
    """
    if api is None:
        # Backward-compatible fallback to the original global lookup.
        api = bhsa
    slots = api.L.d(node, 'word')
    return (node, set(slots))


def _canonical_order(node_chunk_a, node_chunk_b):
    """Three-way comparison of two ``(node, precedence, slots)`` tuples.

    Ordering rules, applied in turn:
      1. higher precedence sorts first;
      2. identical slot sets compare equal;
      3. a slot set that fully contains the other sorts first;
      4. otherwise the chunk whose lowest non-shared slot is smaller
         sorts first.

    Returns -1 if ``a`` goes before ``b``, 1 if after, 0 if they tie.
    """
    _, rank_a, slots_a = node_chunk_a
    _, rank_b, slots_b = node_chunk_b

    # rule 1: precedence decides whenever it differs
    if rank_a > rank_b:
        return -1
    if rank_b > rank_a:
        return 1

    # rule 2: same precedence, same slots
    if slots_a == slots_b:
        return 0

    # rule 3: the embedding chunk precedes the embedded one
    only_in_a = slots_a - slots_b
    if not only_in_a:
        return 1  # a lies inside b
    only_in_b = slots_b - slots_a
    if not only_in_b:
        return -1  # b lies inside a

    # rule 4: partial or no overlap, so compare the first slots not shared
    if min(only_in_a) < min(only_in_b):
        return -1
    return 1


# sort key built from the comparator, for use with sorted() / list.sort()
canonical_order = cmp_to_key(_canonical_order)

In [8]:
# Sanity check for the canonical sort key.
# Each entry is (node, precedence, slots), the 3-tuple shape that
# `_canonical_order` unpacks. Expected order: highest precedence first; for
# equal precedence the enclosing slot set precedes the enclosed one, and
# otherwise the chunk with the earlier non-shared slot comes first.

test = [
    (1, 9, {1, 2, 3}),
    (2, 10, {1, 2, 3}),
    (3, 5, {3, 4}),
    (3, 5, {1, 2}),
    (3, 6, {1, 2, 3}),
    (3, 6, {1, 2})
]

sorted(test, key=canonical_order)

[(2, 10, {1, 2, 3}),
 (1, 9, {1, 2, 3}),
 (3, 6, {1, 2, 3}),
 (3, 6, {1, 2}),
 (3, 5, {1, 2}),
 (3, 5, {3, 4})]

In [9]:
def _copy_meta_dicts(feature_dict):
    """Extract metakwargs."""
    return {
        feat: deepcopy(feat_obj.meta)
        for feat, feat_obj in feature_dict.items()
    }


def _copy_feature_dicts(feature_dict):
    """Extract feature dicts."""
    return {
        feat: dict(feat_obj.items())
        for feat, feat_obj in feature_dict.items()
    }

    
def get_copy_of_corpus(tf_fabric: Fabric):
    """Copy a loaded corpus into plain dictionaries.

    Args:
        tf_fabric: a Fabric instance whose ``api`` attribute is populated
            (i.e. features have been loaded).

    Returns:
        A dict with three entries:
          * ``'nodeFeatures'``: ``{feature: {node: value}}`` for every
            attribute found on ``api.F``;
          * ``'edgeFeatures'``: the same for every attribute on ``api.E``;
          * ``'metaData'``: ``{feature: metadata dict}`` for all of the
            above, plus ``'otext'`` when the corpus has that feature.
    """
    tf_api = tf_fabric.api
    node_feature_objs = tf_api.F.__dict__
    edge_feature_objs = tf_api.E.__dict__

    metadata = {
        **_copy_meta_dicts(node_feature_objs),
        **_copy_meta_dicts(edge_feature_objs),
    }

    # `otext` is a config feature and is not exposed through F/E, so take its
    # metadata from the Fabric object. Deep-copy it like every other metadata
    # dict so the result never aliases the live corpus, and skip it if the
    # corpus does not have it.
    otext = tf_fabric.features.get('otext')
    if otext is not None:
        metadata['otext'] = deepcopy(otext.metaData)

    return {
        'nodeFeatures': _copy_feature_dicts(node_feature_objs),
        'edgeFeatures': _copy_feature_dicts(edge_feature_objs),
        'metaData': metadata,
    }


class ThesisCorpusBuilder:
    """Build a trimmed, renumbered copy of a Text-Fabric corpus.

    The pipeline run by :meth:`build` is:

    1. load the source corpus (unless a loaded Fabric was passed in);
    2. copy all node/edge features and metadata into plain dicts;
    3. keep only the books up to and including ``book_limit``;
    4. renumber the remaining non-slot nodes contiguously after the last slot;
    5. replace the dataset-level metadata with ``BHSAKT_METADATA``;
    6. delete and rename features as configured;
    7. write the result to the destination directory.
    """

    def __init__(
        self,
        locations: Optional[List[str]] = None,
        book_limit: Optional[str] = None,
        delete_features: Optional[Set[str]] = None,
        rename_features: Optional[Dict[str, str]] = None,
        tf_fabric: Optional["Fabric"] = None,
    ):
        """Initialize the thesis corpus builder.

        Args:
            locations: directories handed to ``Fabric`` when the corpus has
                to be loaded by the builder itself.
            book_limit: name of the last book to keep (passed to
                ``T.nodeFromSection``); ``None`` keeps every book.
            delete_features: names of features to drop from the output.
            rename_features: ``{old name: new name}``; applied after deletion.
            tf_fabric: an already loaded Fabric instance; when given,
                ``locations`` is not used for loading.
        """
        self.locations = locations or ''
        self.book_limit = book_limit
        self.delete_features = delete_features or set()
        self.rename_features = rename_features or {}
        self.tf_fabric = tf_fabric
        self.tf_api = tf_fabric.api if tf_fabric else None

    def _get_keep_node_set(self):
        """Collect the nodes of all books up to and including ``book_limit``.

        Returns:
            ``(keep_nodes, max_slot)``: the set of nodes to keep and the
            highest ``word`` node among them (0 if there is none).

        Raises:
            ValueError: if ``book_limit`` is set but does not resolve to a
                node. Previously this case silently kept the whole corpus.
        """
        limit_node = None
        if self.book_limit:
            limit_node = self.tf_api.T.nodeFromSection((self.book_limit,))
            if not limit_node:
                raise ValueError(
                    f'book_limit {self.book_limit!r} does not match any book'
                )

        keep_nodes = set()
        max_slot = 0
        otype_of = self.tf_api.F.otype.v
        for book_node in self.tf_api.F.otype.s('book'):
            # book nodes are visited in order; stop after the limit book
            if limit_node is not None and book_node > limit_node:
                break
            keep_nodes.add(book_node)
            for node in self.tf_api.L.d(book_node):
                keep_nodes.add(node)
                if otype_of(node) == 'word':
                    # do not rely on the order in which L.d yields words
                    max_slot = max(max_slot, node)
        return keep_nodes, max_slot

    @staticmethod
    def _filter_nodes_from_feature_dict(feature_dict, keep_nodes):
        """Return a copy of ``feature_dict`` restricted to ``keep_nodes``.

        Only the keys (source nodes) of each feature are filtered; for edge
        features the targets are left as they are.
        """
        return {
            feature: {
                node: value
                for node, value in node_dict.items()
                if node in keep_nodes
            }
            for feature, node_dict in feature_dict.items()
        }

    def _filter_feature_dict_nodes(self, corpus_data, keep_node_set):
        """Restrict node and edge features in ``corpus_data`` (in place)."""
        for data_type in ('nodeFeatures', 'edgeFeatures'):
            corpus_data[data_type] = self._filter_nodes_from_feature_dict(
                corpus_data[data_type],
                keep_node_set,
            )

    def _rebuild_nodes_from_oslots(
            self,
            oslot_map,
            otype_map,
            max_slot,
    ) -> Dict[int, int]:
        """Compute new node numbers for every node that has an oslots entry.

        Nodes are sorted with ``canonical_order`` (object-type rank first,
        then slot positions) and numbered ``max_slot + 1, max_slot + 2, ...``.

        Args:
            oslot_map: ``{node: slots}`` for the nodes to renumber.
            otype_map: ``{node: object type}``.
            max_slot: highest slot number that is kept.

        Returns:
            ``{old node: new node}``.
        """
        otype_rank = self.tf_api.Nodes.otypeRank
        chunks = [
            (node, otype_rank[otype_map[node]], set(slots))
            for node, slots in oslot_map.items()
        ]
        chunks.sort(key=canonical_order)

        return {
            old_node: position + max_slot
            for position, (old_node, _, _) in enumerate(chunks, 1)
        }

    @staticmethod
    def _reindex_node_features(old_node_features, remapper):
        """Re-key every node feature through ``remapper``.

        Features without any entry do not appear in the result.
        """
        node_features = collections.defaultdict(dict)
        for feature, node_dict in old_node_features.items():
            for node, fvalue in node_dict.items():
                node_features[feature][remapper(node)] = fvalue
        return node_features

    @staticmethod
    def _reindex_edge_features(old_edge_features, remapper):
        """Re-key every edge feature (sources and targets) through ``remapper``.

        Two target shapes are handled:
          * a dict ``{target: value}`` (edges with values): the values are
            carried over to the remapped targets;
          * any other iterable of targets: it becomes a set of remapped
            targets.

        Features without any entry do not appear in the result.
        """
        edge_features = collections.defaultdict(dict)
        for feature, edge_dict in old_edge_features.items():
            for node, edges in edge_dict.items():
                if isinstance(edges, dict):
                    # keep the edge values; a set here would discard them
                    new_edges = {
                        remapper(target): value
                        for target, value in edges.items()
                    }
                else:
                    new_edges = {remapper(target) for target in edges}
                edge_features[feature][remapper(node)] = new_edges
        return edge_features

    def _reindex_nodes(self, corpus_data, max_slot):
        """Renumber all nodes in ``corpus_data`` (in place)."""
        # rebuild node numbering from oslot data
        new_node_map = self._rebuild_nodes_from_oslots(
            corpus_data['edgeFeatures']['oslots'],
            corpus_data['nodeFeatures']['otype'],
            max_slot,
        )

        def remapper(node):
            # nodes without a new number (the slots) keep their own number
            return new_node_map.get(node, node)

        # remap all features using the new numbering scheme
        node_features = self._reindex_node_features(corpus_data['nodeFeatures'], remapper)
        edge_features = self._reindex_edge_features(corpus_data['edgeFeatures'], remapper)

        # apply the changes to the dict
        corpus_data['nodeFeatures'] = node_features
        corpus_data['edgeFeatures'] = edge_features

    def _rebuild_metadata(self, corpus_data):
        """Swap the source dataset's generic metadata for this project's.

        For every feature, keys that occur in ``GENERIC_META`` are dropped
        and ``BHSAKT_METADATA`` is added; feature-specific keys win when a
        key occurs in both.
        """
        corpus_data['metaData'] = {
            feature: {
                **BHSAKT_METADATA,
                **{k: v for k, v in meta.items() if k not in GENERIC_META},
            }
            for feature, meta in corpus_data['metaData'].items()
        }

    def _delete_features(self, corpus_data):
        """Remove ``self.delete_features`` from data and metadata (in place)."""
        for feature in self.delete_features:
            for data_dict in corpus_data.values():
                data_dict.pop(feature, None)

    def _rename_features(self, corpus_data):
        """Apply ``self.rename_features`` to data and metadata (in place).

        An existing entry under the new name is overwritten. Renaming a
        feature to its own name leaves it untouched.
        """
        for old_name, new_name in self.rename_features.items():
            for data_dict in corpus_data.values():
                if old_name in data_dict:
                    # pop first so that old_name == new_name is a no-op
                    # instead of deleting the feature
                    data_dict[new_name] = data_dict.pop(old_name)

    @staticmethod
    def _clear_directory(dest_dir: str):
        """Delete every ``*.tf`` file directly inside ``dest_dir``."""
        for file in Path(dest_dir).glob('*.tf'):
            file.unlink()

    def _load_tf_corpus(self):
        """Load the source corpus unless a Fabric instance was supplied."""
        if not self.tf_fabric:
            self.tf_fabric = Fabric(self.locations)
            self.tf_api = self.tf_fabric.loadAll()

    def build(self, dest_dir: str):
        """Run the whole pipeline and save the new corpus in ``dest_dir``.

        Existing ``*.tf`` files in ``dest_dir`` are removed before saving.
        """
        print('Loading TF corpus...')
        self._load_tf_corpus()

        print('Getting a copy of the corpus...')
        corpus_data = get_copy_of_corpus(self.tf_fabric)

        print('Filtering the nodes...')
        keep_node_set, max_slot = self._get_keep_node_set()
        self._filter_feature_dict_nodes(corpus_data, keep_node_set)

        print('Reindexing the nodes...')
        self._reindex_nodes(corpus_data, max_slot)

        print('Rebuilding metadata...')
        self._rebuild_metadata(corpus_data)

        print('Refactoring features...')
        # deletion runs first so a rename may reuse a just-deleted name
        self._delete_features(corpus_data)
        self._rename_features(corpus_data)

        print('Saving new corpus...')
        self._clear_directory(dest_dir)
        saver = Fabric(dest_dir)
        saver.save(**corpus_data)

In [10]:
# Last book to keep; every book after it is dropped from the new corpus.
book_limit = '2_Kings'

# Language codes of the localized book-name features that are not needed.
_BOOK_NAME_LANGS = (
    'am', 'ar', 'bn', 'da',
    'de', 'el', 'es', 'fa',
    'fr', 'he', 'hi', 'id',
    'ja', 'ko', 'la', 'nl',
    'pa', 'pt', 'ru', 'sw',
    'syc', 'tr', 'ur', 'yo',
    'zh',
)

# Drop the localized book names and the original `book` feature. The builder
# deletes before it renames, so `book@en` can then take over the name `book`.
delete_features = {f'book@{lang}' for lang in _BOOK_NAME_LANGS} | {'book'}

rename_features = {
    'book@en': 'book',
}

# Source feature directories, derived from the notebook's path constants
# instead of repeating the absolute paths. `genre_synvar` is expected to be
# checked out next to the BHSA repository.
locations = [
    str(BHSA_TF),
    str(BHSA_CORE_DATA.parent / 'genre_synvar' / 'tf' / '2021'),
]

corpus_builder = ThesisCorpusBuilder(
    locations,
    book_limit=book_limit,
    delete_features=delete_features,
    rename_features=rename_features,
)

In [11]:
# Run the full pipeline. Output goes to the relative directory `test_corpus`;
# any *.tf files already in it are deleted before the new ones are saved.
corpus_builder.build('test_corpus')

Loading TF corpus...
  2.01s Feature overview: 110 for nodes; 6 for edges; 1 configs; 9 computed
Getting a copy of the corpus...
Filtering the nodes...
Reindexing the nodes...
Rebuilding metadata...
Refactoring features...
Saving new corpus...
  0.00s Not all of the warp features otype and oslots are present in
test_corpus
  0.00s Only the Feature and Edge APIs will be enabled
  0.00s Warp feature "otext" not found. Working without Text-API

  0.00s Exporting 84 node and 6 edge and 1 config features to test_corpus:
  0.00s VALIDATING oslots feature
  0.03s VALIDATING oslots feature
  0.03s maxSlot=     212072
  0.03s maxNode=     694564
  0.06s OK: oslots is valid
   |     0.00s T book                 to test_corpus
   |     0.01s T chapter              to test_corpus
   |     0.02s T code                 to test_corpus
   |     0.12s T det                  to test_corpus
   |     0.17s T dist                 to test_corpus
   |     0.15s T dist_unit            to test_corpus
   |     

In [28]:
# Handle on the Fabric instance the builder loaded, i.e. the *source* corpus
# (the rebuilt corpus is written through a separate Fabric inside build()).
tf_fabric = corpus_builder.tf_fabric

In [30]:
# Spot-check the source corpus: value of the `pdp` feature on node 1.
tf_fabric.api.F.pdp.v(1)

'prep'

In [119]:
# Draft data layout for planned node merge/split operations.
# NOTE(review): no code visible in this notebook consumes `merges` or `splits`
# yet, and every `...` below is a literal Ellipsis placeholder still to be
# filled in.
merges = [
    # One entry per merge operation: [nodes, new features, new edges].
    # Actions to take for each entry:
    # 1) delete the last nodes, keep the first
    # 2) update oslots for the first node
    # 3) delete all features of the last nodes
    # 4) update the features of the first node
    [
        # nodes to merge (the first one survives)
        [427559, 427560],
        # new feature values for the surviving node
        ['XQtl', ...],
        # new edges for the surviving node
        [...],
    ],
]

splits = [
    # One entry per split operation: [node, oslot map, features, edges].
    # Actions to take for each entry:
    # 1) add new nodes with new oslots
    # 2) update oslots for the original node
    # 3) update features for all new nodes and for the first
    [
        # node to split
        427559,
        # new oslot map: one tuple of slots per resulting node
        [(1, 2, 3, 4), (5, 6, 7)],
        # new clause ("cl") features, presumably one tuple per resulting node
        [
            ('XQtl', ...),
        ],
        # new clause ("cl") edges, presumably one tuple per resulting node
        [
            (...),
            ...
        ],
    ],
]