# Autolabeling

In [383]:
import collections

from datetime import datetime
from typing import NamedTuple, Tuple, List, Dict, Set, Optional
from textwrap import dedent
from abc import abstractmethod, ABC

from tf.fabric import Fabric
from tf.core.api import Api
from tf.app import use

In [85]:
# Absolute path of the directory passed to Text-Fabric as the corpus location.
CORPUS = '/Users/cody/github/BH_time_collocations/data/data/corpus/'

# Build the Fabric object for that location and load it; the object returned
# by `loadAll()` exposes the F/E/T/L/S members bound below.
tf_fabric = Fabric(locations=CORPUS)
api = tf_fabric.loadAll()

# Module-level shorthands for the API members.
F, E, T, L, S = api.F, api.E, api.T, api.L, api.S

  1.26s Feature overview: 81 for nodes; 3 for edges; 2 configs; 9 computed


In [396]:
# Start the 'ETCBC/bhsa' Text-Fabric app, handing it the already loaded `api`.
# `app` is used further down for `app.search` and `app.show`.
app = use('ETCBC/bhsa', api=api)

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,11,19279.27,100
chapter,334,634.95,100
verse,10171,20.85,100
half_verse,19786,10.72,100
sentence,29519,7.18,100
sentence_atom,29764,7.13,100
clause,40592,5.22,100
clause_atom,41834,5.07,100
lex,1310,1.97,1
phrase,122547,1.73,100


In [397]:
# Set display options on the app.
# NOTE(review): presumably this condenses displayed results per clause and
# shows node numbers next to objects — confirm against the Text-Fabric docs.
app.displaySetup(
    condenseType='clause',
    withNodes=True,
)

In [289]:
class AnnotationObjectSpecifier(NamedTuple):
    """
    Object of interest to collect for annotation.

    Pairs a name with the Text-Fabric query whose results make up
    the set of nodes stored under that name.
    """
    # key under which the nodes found by `query` are stored
    name: str
    # Text-Fabric search query that selects the nodes
    query: str


class ObjectIdentifier(NamedTuple):
    """
    Slot-based identifier for a linguistic object.

    The identifier is a pair of the object's slots (a tuple holding any
    number of ints) and its object type name.
    """
    # variable-length tuple of slot numbers (not a 1-tuple)
    slots: Tuple[int, ...]
    # name of the object type
    otype: str

    def serialize(self) -> Tuple[Tuple[int, ...], str]:
        """
        Serialize the identifier.

        :return: a plain ``(slots, otype)`` pair
        """
        return self.slots, self.otype

    @classmethod
    def from_serialization(
            cls,
            serialization: Tuple[Tuple[int, ...], str],
    ) -> 'ObjectIdentifier':
        """
        Get an ObjectIdentifier from a serialized format.

        :param serialization: a ``(slots, otype)`` pair as produced by
            `serialize`; the slots may be any iterable of ints (for
            instance a list, if the pair went through a format without
            tuples)
        :return: the reconstructed identifier
        :raises ValueError: if `serialization` does not unpack into
            exactly two items
        """
        slots, otype = serialization
        # Coerce to a tuple so the identifier stays hashable and compares
        # equal to the identifier it was serialized from.
        return cls(tuple(slots), otype)

    
class LingLabel(NamedTuple):
    """
    Object for storing linguistic labels.

    One item records a single label assignment for a single node.
    """
    # name of the label, e.g. 'cl_type'
    label: str
    # value assigned for the label; the empty string marks a label that
    # still needs a value
    value: str
    # Text-Fabric node the label belongs to
    node: int
    # name of the annotation-object set the node was collected under
    target: str

In [344]:
class BaseLabelProcessor(ABC):
    """
    Base object for label processing.

    Concrete processors implement `label`, which turns collected
    annotation objects into a list of labels.
    """

    @abstractmethod
    def label(
            self,
            annotation_objects: Dict[str, Set[int]],
    ) -> List['LingLabel']:
        """
        Produce labels for the supplied annotation objects.

        :param annotation_objects: maps the name of an annotation object
            to the set of Text-Fabric nodes collected under that name
        :return: a list of LingLabel items, one per label assigned
            to a node
        """


class LabelQuery:
    """Container tying a query to the label and value it assigns."""

    def __init__(
            self,
            targets: Set[str],
            label: str,
            value: str,
            query: str,
    ) -> None:
        """
        Store the query together with its label metadata.

        :param targets: names of the annotation-object sets to query
        :param label: name of the label
        :param value: value of the label
        :param query: raw query text; it may carry leading indentation
        :return: None
        """
        self.label, self.value, self.targets = label, value, targets
        # kept raw; the indentation is stripped on access via `query`
        self._query = query

    @property
    def query(self) -> str:
        """Return the query text with its common leading whitespace removed."""
        dedented = dedent(self._query)
        return dedented
        
        
class QueryLabeler(BaseLabelProcessor):
    """
    Processor for autolabeling with Text-Fabric queries.

    The processor is initialized with a list of LabelQuery objects. Each
    one holds a label, the value to assign, the names of the annotation
    object sets to run against (its "targets"), and the query. E.g.

        LabelQuery(
            targets={'time_clause'},
            label='cl_type',
            value='x_clause',
            query=[query],
        )

    [query] should utilize the custom set named `target`, which is bound
    to the nodes of the annotation-object set being processed.
    For example, "x_clause" could be defined as:

        t:target
        /with/
            phrase function=Time
            < phrase function=Pred
        /-/

    NB: Queries are processed in order. If an earlier query and a later
    one both assign the same label to the same node of the same target,
    the later one takes precedence and only its value is kept.
    """

    def __init__(
            self,
            tf_fabric: Fabric,
            label_queries: List[LabelQuery],
    ) -> None:
        """
        Initialize the labeler.

        :param tf_fabric: Text-Fabric object to use for running the queries;
            its `api` attribute is read here
        :param label_queries: the LabelQuery objects to run, in order
        :return: None
        """
        self.tf_fabric = tf_fabric
        self.api = tf_fabric.api
        self.label_queries = label_queries

    def _run_query(
            self,
            query: str,
            targets: Set[int],
    ) -> Set[int]:
        """
        Execute query and return results.

        :param query: the Text-Fabric query text
        :param targets: nodes made available to the query as the
            custom set `target`
        :return: the result of the shallow search
        """
        result_set = self.api.S.search(
            query,
            shallow=True,
            sets={'target': targets},
        )
        return result_set

    def label(
            self,
            annotation_objects: Dict[str, Set[int]],
    ) -> List[LingLabel]:
        """
        Assign labels to targets based on queries.

        :param annotation_objects: maps the name of an annotation object
            to its set of nodes; every target named by a query is looked
            up here
        :return: one LingLabel per (label, node, target) combination
            matched by at least one query
        """
        # Keyed by (label, node, target) so that a later query's value
        # replaces the value an earlier query gave the same node.
        labeled_targets: Dict[Tuple[str, int, str], LingLabel] = {}
        for label_query in self.label_queries:
            for target in label_query.targets:
                target_set = annotation_objects[target]
                print(f'Running query for: {label_query.label}={label_query.value}...')
                query_results = self._run_query(label_query.query, target_set)
                print(f'\tresults: {len(query_results)}')
                for node in query_results:
                    labeled_targets[(label_query.label, node, target)] = LingLabel(
                        label_query.label,
                        label_query.value,
                        node,
                        target,
                    )
        return list(labeled_targets.values())

In [374]:
class AutoLabeler:
    """
    Object for assigning labels to linguistic objects automatically.

    Collects the annotation objects described by the specs, lets every
    label processor label them, adds empty placeholder labels for labels
    that were expected but not produced, and clusters all labels by clause.
    """

    def __init__(
            self,
            outdir: str,
            tf_fabric: Fabric,
            annotation_obj_specs: List[AnnotationObjectSpecifier],
            label_specs: Dict[str, Set[str]],
            label_processors: List[BaseLabelProcessor],
    ) -> None:
        """
        Initialize the autolabeler.

        :param outdir: output directory; stored, but no method of this
            class reads it yet
        :param tf_fabric: Text-Fabric object; its `api` attribute is used
            for queries and node lookups
        :param annotation_obj_specs: the objects to collect for annotation
        :param label_specs: maps the name of an annotation object to the
            label names expected for each of its nodes
        :param label_processors: processors that produce the labels
        :return: None
        """
        self.outdir = outdir
        self.annotation_obj_specs = annotation_obj_specs
        self.label_specs = label_specs
        self.label_processors = label_processors
        self.tf_fabric = tf_fabric
        self.tf_api = tf_fabric.api
        # cached for the rank comparison in _get_clause_node
        self.clause_rank = self.tf_api.Nodes.otypeRank['clause']

    def _log(self, message: object, ts=False, indent=0) -> None:
        """
        Print log messages.

        :param message: anything that can be formatted into a string
        :param ts: prefix the message with the current time if true
        :param indent: number of tab characters to put in front
        """
        indent_str = '\t' * indent
        now = f'{datetime.now()}  ' if ts else ''
        print(f'{indent_str}{now}{message}')

    def _run_object_query(self, query: str) -> Set[int]:
        """Run a Text-Fabric query for an annotation object."""
        result_set = self.tf_api.S.search(query, shallow=True)
        return result_set

    def _collect_annotation_objects(
        self,
        object_specs: List[AnnotationObjectSpecifier]
    ) -> Dict[str, Set[int]]:
        """
        Collect all annotation objects.

        :param object_specs: the specs whose queries are run
        :return: a defaultdict mapping each spec name to the result of
            its query; names without a spec yield an empty set
        """
        annotation_objects: Dict[str, Set[int]] = collections.defaultdict(set)
        for spec in object_specs:
            annotation_objects[spec.name] = self._run_object_query(spec.query)
        return annotation_objects

    def _filter_labeled_objects(
        self,
        annotation_objects: Dict[str, Set[int]],
    ) -> Dict[str, Set[int]]:
        """
        Filter out objects that have already been annotated.

        Not implemented yet: the input is returned unchanged.
        """
        # TODO
        return annotation_objects

    def _get_auto_labels(
            self,
            annotation_objects: Dict[str, Set[int]],
    ) -> List[LingLabel]:
        """
        Get autolabels for all targeted nodes.

        :param annotation_objects: maps the name of an annotation object
            to its set of nodes
        :return: the labels produced by the processors, followed by a
            label with an empty value for every expected label that no
            processor produced for a node
        :raises KeyError: if a name in `annotation_objects` has no entry
            in `label_specs`
        """
        auto_labels: List[LingLabel] = []
        # label name -> nodes that received that label from a processor
        covered_targets: Dict[str, Set[int]] = collections.defaultdict(set)
        n_labeled: collections.Counter = collections.Counter()

        # collect all labels produced by processors
        for processor in self.label_processors:
            for label in processor.label(annotation_objects):
                covered_targets[label.label].add(label.node)
                auto_labels.append(label)
                n_labeled[label.label] += 1

        # append empty labels for unlabeled targets
        n_unlabeled: collections.Counter = collections.Counter()
        for name, nodes in annotation_objects.items():
            expected_labels = self.label_specs[name]
            for node in nodes:
                for label_str in expected_labels:
                    if node not in covered_targets[label_str]:
                        n_unlabeled[label_str] += 1
                        auto_labels.append(
                            LingLabel(label_str, '', node, name)
                        )

        # give report on labeling outcome
        self._log('**** Successfully Autolabeled ****')
        self._log(n_labeled.most_common(), ts=False, indent=1)
        self._log('********* Needs Labels ***********')
        self._log(n_unlabeled.most_common(), ts=False, indent=1)

        # done
        return auto_labels

    def _get_clause_node(self, node: int) -> int:
        """
        Assign a clause node for a given node.

        :param node: a Text-Fabric node
        :return: the node itself if it is a clause, otherwise the first
            clause returned by `L.u(node, 'clause')`
        :raises ValueError: if the rank of the node's otype is higher
            than the rank of 'clause'
        """
        label_otype = self.tf_api.F.otype.v(node)
        rank = self.tf_api.Nodes.otypeRank[label_otype]
        if rank > self.clause_rank:
            raise ValueError(f'node {node} has a otype > clause!')
        if label_otype == 'clause':
            return node
        return self.tf_api.L.u(node, 'clause')[0]

    def _cluster_labels_by_clause(
            self,
            labels: List[LingLabel],
    ) -> Dict[int, List[LingLabel]]:
        """
        Cluster labels by clause.

        :param labels: the labels to cluster
        :return: a defaultdict mapping a clause node to the labels whose
            nodes resolve to that clause, in input order
        """
        cl_clustered_labels: Dict[int, List[LingLabel]] = collections.defaultdict(list)
        for label in labels:
            cl_node = self._get_clause_node(label.node)
            cl_clustered_labels[cl_node].append(label)
        return cl_clustered_labels

    def labelize(self) -> Dict[int, List[LingLabel]]:
        """
        Generate labels and return them clustered by clause.

        NB: no annotation file is written yet; `outdir` is not used.

        :return: a dict mapping clause nodes to their labels
        """
        annotation_objects = self._collect_annotation_objects(self.annotation_obj_specs)
        new_annotation_objs = self._filter_labeled_objects(annotation_objects)
        auto_labels = self._get_auto_labels(new_annotation_objs)
        cl_clustered_labels = self._cluster_labels_by_clause(auto_labels)
        return cl_clustered_labels

In [375]:
# Objects to collect for annotation: each name is paired with the query
# whose results are stored under that name.
annotation_obj_specs = [
    AnnotationObjectSpecifier(
        name="time_clause",
        query=dedent("""
            clause
            /with/
                phrase function=Time
            /-/
        """),
    ),
    AnnotationObjectSpecifier(
        name="time_phrase",
        query=dedent("""
            phrase function=Time
        """),
    ),
    AnnotationObjectSpecifier(
        name="verb",
        query=dedent("""
            w:word pdp=verb
            /with/
            clause
                phrase function=Time
                w
            /-/
        """),
    ),
]


# Label names expected for every node of each annotation object.
label_specs = dict(
    time_clause={'cl_type'},
    time_phrase={'tp_cluster'},
    verb={'tense'},
)


# Queries that assign label values automatically; `target` inside a query
# is the set of nodes of the annotation object being processed.
label_queries = [
    LabelQuery(
        targets={'time_clause'},
        label='cl_type',
        value='x_clause',
        query="""
            t:target
            /with/
                phrase function=Time
                < phrase function=Pred
            /-/
        """,
    ),
]


# Label processors handed to the autolabeler.
processors = [
    QueryLabeler(tf_fabric=tf_fabric, label_queries=label_queries),
]


# outdir is left empty here.
labeler = AutoLabeler(
    outdir='',
    tf_fabric=tf_fabric,
    annotation_obj_specs=annotation_obj_specs,
    label_specs=label_specs,
    label_processors=processors,
)

In [376]:
# Run the autolabeler; the result maps clause nodes to the labels
# clustered under them.
labels = labeler.labelize()

Running query for: cl_type=x_clause...
	results: 408
**** Successfully Autolabeled ****
	[('cl_type', 408)]
********* Needs Labels ***********
	[('tp_cluster', 2168), ('tense', 1775), ('cl_type', 1679)]


In [379]:
# Peek at the first 25 (clause node, labels) pairs.
list(labels.items())[:25]

[(306178,
  [LingLabel(label='cl_type', value='x_clause', node=306178, target='time_clause'),
   LingLabel(label='tp_cluster', value='', node=395151, target='time_phrase'),
   LingLabel(label='tense', value='', node=15925, target='verb')]),
 (324610,
  [LingLabel(label='cl_type', value='x_clause', node=324610, target='time_clause'),
   LingLabel(label='tp_cluster', value='', node=450979, target='time_phrase'),
   LingLabel(label='tense', value='', node=115422, target='verb')]),
 (303108,
  [LingLabel(label='cl_type', value='x_clause', node=303108, target='time_clause'),
   LingLabel(label='tp_cluster', value='', node=385809, target='time_phrase'),
   LingLabel(label='tense', value='', node=751, target='verb')]),
 (303109,
  [LingLabel(label='cl_type', value='x_clause', node=303109, target='time_clause'),
   LingLabel(label='tp_cluster', value='', node=385814, target='time_phrase'),
   LingLabel(label='tense', value='', node=761, target='verb')]),
 (343044,
  [LingLabel(label='cl_type',

# Build Queries

In [422]:
# Search for clauses that contain a verb with one of the lexemes
# BW>[, HLK[, CWB[, QRB[ or >MR[, a phrase with function=Time, and a phrase
# with function=Cmpl that contains the preposition >L.
search = app.search("""

clause
    word pdp=verb lex=BW>[|HLK[|CWB[|QRB[|>MR[
    phrase function=Time
    phrase function=Cmpl
        word lex=>L pdp=prep

""")

  0.39s 56 results


In [423]:
# Display the results of the search above through the app.
app.show(search)

In [424]:
from pathlib import Path

In [427]:
# List the `.tf` files found in the corpus directory (path relative to the
# current working directory).
list(Path('../../data/corpus/').glob('*.tf'))

[PosixPath('../../data/corpus/g_vbe_utf8.tf'),
 PosixPath('../../data/corpus/function.tf'),
 PosixPath('../../data/corpus/uvf.tf'),
 PosixPath('../../data/corpus/g_vbs.tf'),
 PosixPath('../../data/corpus/vbe.tf'),
 PosixPath('../../data/corpus/prs_gn.tf'),
 PosixPath('../../data/corpus/genre.tf'),
 PosixPath('../../data/corpus/instruction.tf'),
 PosixPath('../../data/corpus/tab.tf'),
 PosixPath('../../data/corpus/qere_trailer_utf8.tf'),
 PosixPath('../../data/corpus/prs.tf'),
 PosixPath('../../data/corpus/prs_nu.tf'),
 PosixPath('../../data/corpus/root.tf'),
 PosixPath('../../data/corpus/prs_ps.tf'),
 PosixPath('../../data/corpus/pargr.tf'),
 PosixPath('../../data/corpus/qere.tf'),
 PosixPath('../../data/corpus/book.tf'),
 PosixPath('../../data/corpus/vbs.tf'),
 PosixPath('../../data/corpus/g_vbe.tf'),
 PosixPath('../../data/corpus/g_uvf.tf'),
 PosixPath('../../data/corpus/g_vbs_utf8.tf'),
 PosixPath('../../data/corpus/kq_hybrid.tf'),
 PosixPath('../../data/corpus/verse.tf'),
 PosixPat