In [1]:
import re
import collections
import pathlib
import logging
import unicodedata

from IPython.display import display, HTML
from tf.fabric import Fabric
import pandas as pd

In [2]:
from nena_corpus import html_to_text, parse_metadata

logging.getLogger().setLevel(logging.DEBUG)

files_barwar = pathlib.Path.cwd().glob('texts/bar text *.html')
files_urmi_c = pathlib.Path.cwd().glob('texts/cu *.html')

# Characters to be replaced
replace = {
    '\u2011': '\u002d',  # U+2011 NON-BREAKING HYPHEN -> U+002D HYPHEN-MINUS
    '\u01dd': '\u0259',  # U+01DD LATIN SMALL LETTER TURNED E -> U+0259 LATIN SMALL LETTER SCHWA
    '\uf1ea': '\u003d',  # U+F1EA Deprecated SIL character -> U+003D '=' EQUALS SIGN
    '\u2026': '...',  # U+2026 '…' HORIZONTAL ELLIPSIS -> three dots
    'J\u0335': '\u0248',  # 'J' + U+0335 COMBINING SHORT STROKE OVERLAY -> U+0248 'Ɉ' LATIN CAPITAL LETTER J WITH STROKE
    'J\u0336': '\u0248',  # 'J' + U+0336 COMBINING LONG STROKE OVERLAY -> U+0248 'Ɉ' LATIN CAPITAL LETTER J WITH STROKE
    '\u002d\u032d': '\u032d\u002d',  # Switch positions of Hyphen and Circumflex accent below
    '\u2011\u032d': '\u032d\u002d',  # Switch positions of Non-breaking hyphen and Circumflex accent below
}

In [3]:
def iterateKey(dictionary):
    '''
    Auto increments a key from a dictionary.
    '''
    return max(dictionary.keys(), default=0)+1

def combine_chars(text):
    """Yield letters combined with combining diacritics"""
    
    char = []
    
    for c in text:
        if unicodedata.category(c) == 'Mn':  # 'Mn': non-spacing combining mark
            char.append(c)
            continue
        
        if char:
            yield ''.join(char)
        char = [c]
        
    yield ''.join(char)

raw_node_features = collections.defaultdict(lambda:collections.defaultdict(set))
raw_oslots = collections.defaultdict(lambda:collections.defaultdict(set))

slot = 0

this_sentence = 1 # for first iteration since only sentence ends are marked

# units:
#     book/publication/dialect?
#     text
#     paragraph
#     line/verse
#     sentence
#     subsentence
#     word
#     morpheme
#     char

# oslots
# node_features

for dialect, files in (('Barwar', files_barwar), ('Urmi_C', files_urmi_c)):
    
    # TODO At this point record book/publication/dialect?
    # E.g. SSLL_2016_Urmi_C, HOS_2008_Barwar?
    
    for file in files:
        
        logging.info(f'Processing file {file.name} ...')
        
        for p in html_to_text(file, replace=replace):
            # metadata:
            # - dialect
            # - file.name
            
            if p.type.startswith('gp-') and str(p).strip():
                # store metadata from headings:
                # - text_id
                # - title
                # - informant
                # - place
                # - version (if applicable -- only Urmi_C A35)
                if p.type.startswith('gp-sectionheading'):
                    metadata = {}
                for k, v in parse_metadata(p):
                    metadata[k] = v
            #
            elif p.type == 'p':
                # regular paragraphs
                
                # first check if we need to update metadata
                # TODO for now we do not store informant, place, and version,
                # since those are not always features of a text, but of a section
                # of the text, and I do not know how to do that.
                # QUESTION -- do we need to add a layer 'subsection'?
                if (metadata
                    and (not raw_node_features['text_id']
                         or raw_node_features['text_id'][this_text] != metadata['text_id'])):
                    this_text = iterateKey(raw_oslots['text'])
                    raw_node_features['text_id'][this_text] = metadata['text_id']
                    raw_node_features['title'][this_text] = metadata['title']
                
                # increment paragraph
                this_paragraph = iterateKey(raw_oslots['paragraph'])
                
                marker_stack = []
                word_end = True
                
                for text, text_style in p:
                    # check if need to increment verse/line
                    if text_style == 'verse_no':
                        this_line = iterateKey(raw_oslots['line'])
                        raw_node_features['line'][this_line] = text.strip(' ()') # TODO int()?
                        metadata['verse_no'] = text.strip(' ()')  # TODO Remove from metadata dict?
                        continue
                        
                    elif text_style == 'fn_anchor':
                        # TODO handle footnotes in some way, discard for now
                        continue
                    
                    elif text_style == 'comment':
                        continue  # TODO handle comments
                    
                    elif text_style == 'marker':
                        if marker_stack and marker_stack[-1] == text:
                            marker_stack.pop()
                        else:
                            marker_stack.append(text)
                        continue
                    
                    elif text_style not in ('', 'foreign'):
                        logging.debug(f'Unhandled text_style: {repr(text_style)}, {repr(text)}')
                        continue
                    
                    elif text_style == 'foreign':
                        pass  # TODO store feature
                    
                    else: # text_style == '':
                        pass
                    
                    if (text_style == '' and marker_stack
                        and any(c.isalpha() for c in text)
                        and not text.isalpha()):
                        # In one case, there is no closing marker tag, so force closing the marker
                        # Urmi_C A42 9: 'RzdànyəlaR' (p.154, r.28) 'zdàny' roman, 'əla' cursive
                        # Urmi_C A43 17: 'ʾe-Rbuk̭ḗṱ' (p. 174, r.14), no closing 'R'
                        # Urmi_C B2 16: 'Pʾafšɑ̄rī̀P' (p.250 r.17), inital 'ʾ' cursive
                        marker = marker_stack.pop()
                        logging.warning(f'Unfinished marker: {repr(marker)}.')
                        logging.debug(f'{dialect}, {metadata["text_id"]}:{metadata["verse_no"]}')
                        logging.debug(f'Text: {repr(text)}')
                    
                    # If we got this far, we have a text string,
                    # with either text_style '' or 'foreign'.
                    # We will iterate over them character by character.
                    char = []
                    for c in combine_chars(text):
                        
                        if word_end and c[0].isalpha():
                            word_end = False
                            this_word = iterateKey(raw_oslots['word'])
                            
                        elif not word_end and not c[0].isalpha(): # TODO and not in ('-', '=')?
                            word_end = True
                        
                        # TODO check for sentence, subsentence, morpheme boundaries
                        
                        slot += 1
                        raw_node_features['char'][slot] = c

                        raw_oslots['text'][this_text].add(slot)
                        raw_oslots['paragraph'][this_paragraph].add(slot)
                        raw_oslots['line'][this_line].add(slot)
                        if not word_end:
                            raw_oslots['word'][this_word].add(slot)
                
            else:
                logging.debug(f'Unhandled paragraph type: {repr(p.type)}.')
                logging.debug(f'Text: {str(p)}.')

INFO:root:Processing file bar text A14.html ...
INFO:root:Processing file bar text a29.html ...
INFO:root:Processing file bar text A49.html ...
INFO:root:Processing file bar text a28.html ...
INFO:root:Processing file bar text a50-A52.html ...
INFO:root:Processing file bar text A45.html ...
INFO:root:Processing file bar text a31-A33.html ...
INFO:root:Processing file bar text A42-A44.html ...
INFO:root:Processing file bar text a25.html ...
INFO:root:Processing file bar text a30.html ...
INFO:root:Processing file bar text a34.html ...
INFO:root:Processing file bar text a19-A23.html ...
INFO:root:Processing file bar text a24.html ...
DEBUG:root:Unhandled paragraph type: 'footer'.
DEBUG:root:Text:  7 .
INFO:root:Processing file bar text a18.html ...
INFO:root:Processing file bar text A37-A40.html ...
INFO:root:Processing file bar text a1-A7.html ...
DEBUG:root:Unhandled paragraph type: 'sdfootnote1'.
DEBUG:root:Text:  1 The name Čuxo means ‘one who wears the woolen čuxa garment’. .
INFO

## Reindex Objects Above Slot Levels

In [4]:
otype2feature = {
    'text': {'text_id', 'title'},
    'paragraph': {},
    'line': {'line'},
    'word': {}
}

node_features = collections.defaultdict(lambda:collections.defaultdict())

node_features['char'] = raw_node_features['char'] # add slot features
# node_features['trailer'] = raw_node_features['trailer']

In [5]:
for slot in node_features['char']:
    node_features['otype'][slot] = 'char'    

In [6]:
edge_features = collections.defaultdict(lambda:collections.defaultdict(set)) # oslots will go here

onode = max(raw_node_features['char']) # max slot, incremented +1 in loop

for otype in raw_oslots.keys():
    for oID, slots in raw_oslots[otype].items():
        
        # make new object node number
        onode += 1
        node_features['otype'][onode] = otype
        
        # remap node features to node number
        for feat in otype2feature[otype]:
            node_features[feat][onode] = raw_node_features[feat][oID]
        edge_features['oslots'][onode] = raw_oslots[otype][oID]

In [7]:
node_features.keys()

dict_keys(['char', 'otype', 'text_id', 'title', 'line'])

In [8]:
edge_features.keys()

dict_keys(['oslots'])

In [9]:
otext = {
    'sectionTypes': 'text,paragraph,line,',
    'sectionFeatures': 'text_id,line',
    'fmt:text-orig-full': '{char}'
}

meta = {'':{'author': 'Geoffrey Khan, Cody Kingham, and Hannes Vlaardingerbroek'},
        'oslots':{'edgeValues':False, 'valueType':'int'},
        'otype':{'valueType':'str'},
        'text':{'valueType':'str'},
        'paragraph':{'valueType':'str'},
        'line':{'valueType':'str'},
        'word':{'valueType':'str'},
        'char':{'valueType':'str'},
        'text_id':{'valueType':'str'},
        'title':{'valueType':'str'},
        'otext':otext}

TFs = Fabric(locations=['new_tf/'])

This is Text-Fabric 7.8.0
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

10 features found and 0 ignored


In [10]:
TFs.save(nodeFeatures=node_features, edgeFeatures=edge_features, metaData=meta)

  0.00s Exporting 5 node and 1 edge and 4 config features to new_tf/:
  0.00s VALIDATING oslots feature
  0.12s VALIDATING oslots feature
  0.12s maxSlot=     730892
  0.12s maxNode=     854244
  0.15s OK: oslots is valid
   |     1.13s T char                 to new_tf
   |     0.00s T line                 to new_tf
   |     0.28s T otype                to new_tf
   |     0.00s T text_id              to new_tf
   |     0.00s T title                to new_tf
   |     0.59s T oslots               to new_tf
   |     0.00s M otext                to new_tf
   |     0.00s M paragraph            to new_tf
   |     0.00s M text                 to new_tf
   |     0.00s M word                 to new_tf
  2.16s Exported 5 node features and 1 edge features and 4 config features to new_tf/


True

In [11]:
TF = Fabric(locations='new_tf/')

N = TF.load('''

text_id paragraph line word char otype title

''')

N.makeAvailableIn(globals())
print()

This is Text-Fabric 7.8.0
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

10 features found and 0 ignored
  0.00s loading features ...
   |     0.37s T otype                from new_tf
   |     0.00s Not enough info for sections in otext, section functionality will not work
   |     0.00s Not enough info for structure in otext, structure functionality will not work
   |     1.95s T char                 from new_tf
   |      |     0.12s C __levels__           from otype, oslots, otext
   |      |     5.48s C __order__            from otype, oslots, __levels__
   |      |     0.40s C __rank__             from otype, __order__
   |      |     5.61s C __levUp__            from otype, oslots, __rank__
   |      |     0.53s C __levDown__          from otype, __levUp__, __rank__
   |      |     4.47s C __boundary__         from otype, oslots, __rank__
   |     0.00s T text_id              from new_tf
   |     0.02s T line                 from new_tf
   |     0.00s T titl

In [12]:
print(len(list(F.otype.s('word'))), 'words in the corpus')

120141 words in the corpus


In [13]:
print('books and their word counts: \n')
for text in F.otype.s('text'):
    text_words = L.d(text, 'word')
    print(text, F.title.v(text))
    print(f'\t{len(text_words)} words')

books and their word counts: 

730893 TALES FROM THE 1001 NIGHTS
	4195 words
730894 THE TALE OF RUSTAM (2)
	2254 words
730895 THE CROW AND THE CHEESE
	71 words
730896 THE TALE OF RUSTAM (1)
	1316 words
730897 THE SISISAMBƏR PLANT
	385 words
730898 QAṬINA RESCUES HIS NEPHEW FROM LELIΘA
	492 words
730899 THE BATTLE WITH YUWANƏS THE ARMENIAN
	780 words
730900 THE FOX AND THE STORK
	102 words
730901 THE GIANT’S CAVE
	336 words
730902 THE FOX AND THE MILLER
	1101 words
730903 THE LION WITH A SWOLLEN LEG
	494 words
730904 THE FOX AND THE LION
	124 words
730905 SOUR GRAPES
	82 words
730906 THE CAT AND THE MICE
	138 words
730907 THE TALE OF FARXO AND SƏTTIYA
	3303 words
730908 THE CRAFTY HIRELING
	1818 words
730909 THE GIRL AND THE SEVEN BROTHERS
	1030 words
730910 THE LELIΘA FROM Č̭ĀL
	321 words
730911 THE BEAR AND THE FOX
	506 words
730912 THE DAUGHTER OF THE KING
	1716 words
730913 THE SALE OF AN OX
	1711 words
730914 THE MAN WHO WANTED TO WORK
	1461 words
730915 THE TALE OF PARIZADA, WA

In [14]:
text = F.otype.s('text').start

In [15]:
len(L.d(text, 'word'))

4195

In [16]:
print(F.title.v(text))

for sent in F.otype.s('line')[:10]:
    print(sent, T.text(sent))


TALES FROM THE 1001 NIGHTS
731561 xa-màlka| kút-yum ðà-brata gawə́rwa.| mbádla qayə́mwa qaṭə̀lwala.| wăzī̀r| xðírre xðìrre,| bnáθa prìqla.| kút-yum ðà,| lìθ.| ʾáwwa wăzī́r ʾíθwale ða-bràta.| ʾa-bráta mə́ra ṭla-wằzir,| ṭla-bába dìya,| mə́ra bábi ʾána nàbəlli| gawrànne ʾáwwa málka| mparqànnux m-áyya qə́ṣṣət.|
731562 qìmtɛla| ʾítwala ða-qàṭu,| nubàltəlla mə́nna díya.| nubáltəlla qáṭu mə́nna dìya,| gwìrtəlle málka.| ʾaw-dmìxɛle,| píštɛla mtanóye ða-qə̀ṣṣət| ṭla-qàṭu.|
731563 mə́ra ṭla-d-à-qaṭu| mə̀ra| qáṭu lɛ́le rìxɛle| mtányən ða-qə̀ṣṣət.| ʾɛ́-dana mbádla qáyəm málka qaṭə̀lli.| sab-kəmà-ṱ-ile gwára,| ʾaṣə́rta gawə̀rra| mbádla qaṭə̀lla.| yăðána mbádla qaṭə́lli ʾaw-màlka.|  
731564 mə́ra ʾíθwa lìθwa,| biš-m-álaha góṛa čú-məndi lìθwa.| — ʾáyya tuníθa ṭla-qàṭu —| mára ʾíθwa xá bàxta,| ʾìtwala| xa-bróna šə́mme díye Kărī̀m-addīn.| mára ʾàwwa,| ʾó Kărī̀m,| bábe mìtle.| bábe díye mì