# Annotations

In [1]:
from tf.fabric import Fabric
from tf.app import use

In [9]:
import docx
import json
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE
from docx.shared import Pt, RGBColor
from docx.opc.constants import RELATIONSHIP_TYPE as RT

In [22]:
document = Document()

document.core_properties.comments = json.dumps(list(range(200)))

document.add_heading('Document Title', 0)

p = document.add_paragraph('A plain paragraph having some ')
p.add_run('bold').bold = True
p.add_run(' and some ')
p.add_run('italic.').italic = True

document.add_heading('Heading, level 1', level=1)
document.add_paragraph('Intense quote', style='Intense Quote')

document.add_paragraph(
    'first item in unordered list', style='List Bullet'
)
document.add_paragraph(
    'first item in ordered list', style='List Number'
)

records = (
    (3, '101', 'Spam'),
    (7, '422', 'Eggs'),
    (4, '631', 'Spam, spam, eggs, and spam')
)

table = document.add_table(rows=1, cols=3)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Qty'
hdr_cells[1].text = 'Id'
hdr_cells[2].text = 'Desc'
for qty, _id, desc in records:
    row_cells = table.add_row().cells
    row_cells[0].text = str(qty)
    row_cells[1].text = _id
    row_cells[2].text = desc

document.add_page_break()

document.save('demo.docx')

ValueError: exceeded 255 char limit for property, got:

'[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199]'

## Build Prototype Annotation Doc

In [34]:
CORPUS = '/Users/cody/github/BH_time_collocations/data/data/corpus/'

In [204]:
tf_fabric = Fabric(CORPUS)
tf_api = tf_fabric.loadAll()

  1.27s Feature overview: 81 for nodes; 3 for edges; 2 configs; 9 computed


In [206]:
app = use('ETCBC/bhsa', api=tf_api)

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,11,19279.27,100
chapter,334,634.95,100
verse,10171,20.85,100
half_verse,19786,10.72,100
sentence,29519,7.18,100
sentence_atom,29764,7.13,100
clause,40592,5.22,100
clause_atom,41834,5.07,100
lex,1310,1.97,1
phrase,122547,1.73,100


In [211]:
LINK = 'https://shebanq.ancient-data.org/hebrew/text?book={book}&chapter={chapter}&verse={verse}&version=2021'
clauses = list(tf_api.F.otype.s('clause'))[:100]

In [286]:
# create document
document = Document()
styles = document.styles

# define a heading style
heading_style = styles.add_style('Reference', WD_STYLE_TYPE.PARAGRAPH)
heading_style.font.name = 'Times New Roman'
heading_style.font.bold = True
heading_style.font.size = Pt(12)
heading_style.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
heading_style.paragraph_format.keep_with_next = True


# define a Hebrew text style
hebrew_style = styles.add_style('Hebrew', WD_STYLE_TYPE.PARAGRAPH)
hebrew_style.font.size = Pt(15)
hebrew_style.font.name = 'SBL BibLit'
hebrew_style.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
hebrew_style.paragraph_format.keep_with_next = True
hebrew_style.paragraph_format.space_after = 0


# define a table style
# table_style = styles.add_style('Annotation', WD_STYLE_TYPE.TABLE)
# table_style.font.size = Pt(10)
# table_style.font.name = 'Arial'

def add_context(document, context):
    """Append context text to a paragraph."""
    for cl in context:
        cl_text = tf_api.T.text(cl)
        cl_text = document.add_paragraph(cl_text, style=hebrew_style.name)
        cl_text.runs[0].font.color.rgb = RGBColor(169, 169, 169)


def crawl_clauses(clause, direction, limit, position=0):
    if position < limit:
        next_cl = direction(clause, 'clause')
        if next_cl:
            yield next_cl[0]
            yield from crawl_clauses(next_cl[0], direction, limit, position=position+1)

GRAY = RGBColor(130, 130, 130)


for clause in clauses:

    # add reference
    book, ch, vs = tf_api.T.sectionFromNode(clause)
    ref = f'{clause}, {book} {ch}:{vs}'
    shebanq_link = LINK.format(book=book, chapter=str(ch), verse=str(vs))
    heading = document.add_paragraph(style=heading_style.name)
    add_hyperlink(heading, shebanq_link, ref)
    
    # add biblical text
    verse_node = tf_api.L.u(clause, 'verse')
    text = tf_api.T.text(clause)
    clause_text = document.add_paragraph(text, style=hebrew_style.name)
    verse_text = document.add_paragraph(tf_api.T.text(verse_node), style=hebrew_style.name)
    verse_text.runs[0].font.color.rgb = GRAY
    
    # add node-identified text
    node_text = document.add_paragraph(style=hebrew_style.name)
    for word in tf_api.L.d(clause, 'word'):
        node = node_text.add_run(str(word))
        node.font.superscript = True
        node.font.rtl = True
        node.font.color.rgb = RGBColor(255, 0, 0) 
        heb_word = node_text.add_run(tf_api.T.text(word))
        heb_word.font.rtl = True
        heb_word.font.name = 'Times New Roman'
        heb_word.font.color.rgb = GRAY
    node_text.add_run('\n')
    
    # add annotation table
    rows = [
        ['tense', '3', 'verb', 'ברא', 'past'],
        ['aspect', '302968', 'time_clause', '', 'ach_id'],
        ['tp_cluster', '385394', 'time_phrase', 'בראשׁית'] + ['1.1.2'],
        ['tp_head', '385394', 'time_phrase', 'בראשׁית'] + ['2'],
        ['is_rela', '385394', 'time_phrase', 'בראשׁית', 'CT'],
    ]

    table = document.add_table(rows=0, cols=5, style='Table Grid')
    table.alignment = WD_ALIGN_PARAGRAPH.CENTER
    table.style.font.name = 'Helvetica Neue'
    table.style.paragraph_format.keep_with_next = True
    table.autofit = True
    
    for row_data in rows:
        row_cells = table.add_row().cells
        for cell, data in zip(row_cells, row_data):
            cell.text = data
    
    # fix autofit bug:
    # https://github.com/python-openxml/python-docx/issues/209#issuecomment-344417132
    for col in table.columns:
        for cell in col.cells:
            cell._tc.tcPr.tcW.type = 'auto'

    document.add_paragraph('\n')

document.save('annotation.docx')

In [40]:
import yaml

In [42]:
with open("/Users/cody/github/BH_time_collocations/data/pipeline/scripts/labeling/kingham_labeling_project.yaml", 'r') as infile:
    configs = yaml.safe_load(infile)

In [47]:
configs['labels']['tp_cluster']

{'targets': 'time_phrase',
 'values': ['1.1.1.1', '1.1.1.2.1', '1.1.1.2.2', '1.1.1.3', '1.1.2']}

In [291]:
tf_api.F.otype.v(323695)

'clause'

In [297]:
verseless_clauses = []
for cl in tf_api.F.otype.s('clause'):
    if not tf_api.L.u(cl, 'verse'):
        verseless_clauses.append(cl)

In [298]:
len(verseless_clauses)

31

In [316]:
verses = set()
for word in tf_api.L.d(323695, 'word'):
    verses.add(
        tf_api.L.u(word, 'verse')[0]
    )

In [317]:
verses

{219467, 219468}

In [318]:
tf_api.T.text(sorted(verses))

'וַיְצַ֥ו מֹשֶׁ֖ה אֹותָ֣ם לֵאמֹ֑ר מִקֵּ֣ץ׀ שֶׁ֣בַע שָׁנִ֗ים בְּמֹעֵ֛ד שְׁנַ֥ת הַשְּׁמִטָּ֖ה בְּחַ֥ג הַסֻּכֹּֽות׃ בְּבֹ֣וא כָל־יִשְׂרָאֵ֗ל לֵרָאֹות֙ אֶת־פְּנֵי֙ יְהוָ֣ה אֱלֹהֶ֔יךָ בַּמָּקֹ֖ום אֲשֶׁ֣ר יִבְחָ֑ר תִּקְרָ֞א אֶת־הַתֹּורָ֥ה הַזֹּ֛את נֶ֥גֶד כָּל־יִשְׂרָאֵ֖ל בְּאָזְנֵיהֶֽם׃ '

In [313]:
tf_api.F.otype.v(39804)

'word'

In [311]:
tf_api.L.d(39804, 'phrase')

()

In [307]:
verses

set()

In [305]:
tf_api.L.u(39789, 'verse')

(215791,)

In [314]:
app.show([(323695,)], withNodes=True)

In [296]:
tf_api.T.text(39804)

'נֹתֵ֥ן '

In [292]:
tf_api.L.u(323695, 'verse')

()

In [263]:
doc = docx.Document("annotation.docx")

In [264]:
len(doc.tables)

100

In [283]:
from typing import NamedTuple, Tuple


class LingLabel(NamedTuple):
    """Object for storing linguistic labels."""
    
    label: str
    value: str
    node: int
    target: str


class FrozenLingLabel(NamedTuple):
    """
    Object for long-term storage of linguistic labels.
    
    Since node numbers might change after annotations are completed,
    we store the final annotation data under the slots and otype associated
    with the original node. Obsoleted FrozenLingLabels can easily be 
    identified by searching for nodes with the same otype and oslots
    within a newer version of the corpus. Failure to find a match indicates
    the label should be redone.
    """
    
    label: str
    value: str
    node_data: Tuple[str, Tuple[int]]
    target: str
    
    @classmethod
    def from_ling_label(cls, ling_label: LingLabel, tf_api):
        """Get FrozenLingLabel from LingLabel object."""
        otype = tf_api.F.otype.v(ling_label.node)
        oslots = (
            tf_api.L.d(ling_label.node, 'word') 
            if otype != 'word' 
            else ling_label.node
        )
        return FrozenLingLabel(
            label=ling_label.label,
            value=ling_label.value,
            node_data=(otype, oslots),
            target=ling_label.target,
        )


In [280]:
def label_from_row(row):
    """Extract cell valeus from a table row into a LingLabel object."""
    label_cell, node_cell, target_cell, text_cell, value_cell = row.cells
    ling_label = LingLabel(
        label=label_cell.text,
        value=value_cell.text,
        node=int(node_cell.text),
        target=target_cell.text
    )
    return ling_label


labels = []
for table in doc.tables:
    for row in table.rows:
        label = label_from_row(row)
        labels.append(FrozenLingLabel.from_ling_label(label, tf_api))

In [281]:
labels

[FrozenLingLabel(label='tense', value='past', node_data=('word', 3), target='verb'),
 FrozenLingLabel(label='aspect', value='ach_id', node_data=('clause', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)), target='time_clause'),
 FrozenLingLabel(label='tp_cluster', value='1.1.2', node_data=('phrase', (1, 2)), target='time_phrase'),
 FrozenLingLabel(label='tp_head', value='2', node_data=('phrase', (1, 2)), target='time_phrase'),
 FrozenLingLabel(label='is_rela', value='CT', node_data=('phrase', (1, 2)), target='time_phrase'),
 FrozenLingLabel(label='tense', value='past', node_data=('word', 3), target='verb'),
 FrozenLingLabel(label='aspect', value='ach_id', node_data=('clause', (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)), target='time_clause'),
 FrozenLingLabel(label='tp_cluster', value='1.1.2', node_data=('phrase', (1, 2)), target='time_phrase'),
 FrozenLingLabel(label='tp_head', value='2', node_data=('phrase', (1, 2)), target='time_phrase'),
 FrozenLingLabel(label='is_rela', value='CT', node_data=('phras