In [1]:
import re
import collections
import pathlib
import logging
import unicodedata

from IPython.display import display, HTML
from tf.fabric import Fabric
import pandas as pd

In [2]:
from nena_corpus import html_to_text, parse_metadata

logging.getLogger().setLevel(logging.DEBUG)

files_barwar = pathlib.Path.cwd().glob('texts/bar text *.html')
files_urmi_c = pathlib.Path.cwd().glob('texts/cu *.html')

# Characters to be replaced
replace = {
    '\u2011': '\u002d',  # U+2011 NON-BREAKING HYPHEN -> U+002D HYPHEN-MINUS
    '\u01dd': '\u0259',  # U+01DD LATIN SMALL LETTER TURNED E -> U+0259 LATIN SMALL LETTER SCHWA
    '\uf1ea': '\u003d',  # U+F1EA Deprecated SIL character -> U+003D '=' EQUALS SIGN
    '\u2026': '...',  # U+2026 '…' HORIZONTAL ELLIPSIS -> three dots
    'J\u0335': '\u0248',  # 'J' + U+0335 COMBINING SHORT STROKE OVERLAY -> U+0248 'Ɉ' LATIN CAPITAL LETTER J WITH STROKE
    'J\u0336': '\u0248',  # 'J' + U+0336 COMBINING LONG STROKE OVERLAY -> U+0248 'Ɉ' LATIN CAPITAL LETTER J WITH STROKE
    '\u002d\u032d': '\u032d\u002d',  # Switch positions of Hyphen and Circumflex accent below
    '\u2011\u032d': '\u032d\u002d',  # Switch positions of Non-breaking hyphen and Circumflex accent below
}

In [3]:
def combine_chars(text):
    """Yield letters combined with combining diacritics"""
    
    char = []
    
    for c in text:
        if unicodedata.category(c) == 'Mn':  # 'Mn': non-spacing combining mark
            char.append(c)
            continue
        
        if char:
            yield ''.join(char)
        char = [c]
        
    yield ''.join(char)

raw_node_features = collections.defaultdict(lambda:collections.defaultdict(set))
raw_oslots = collections.defaultdict(lambda:collections.defaultdict(set))

# initialize counters (will be increased to start from 1)
this_text = 0
this_paragraph = 0
this_line = 0
this_sentence = 0
this_subsentence = 0
this_word = 0
this_morpheme = 0

slot = 0

for dialect, files in (('Barwar', files_barwar), ('Urmi_C', files_urmi_c)):
    
    # TODO At this point record book/publication/dialect?
    # E.g. SSLL_2016_Urmi_C, HOS_2008_Barwar?
    
    for file in files:
        
        logging.info(f'Processing file {file.name} ...')
        
        for p in html_to_text(file, replace=replace):
            # metadata:
            # - dialect
            # - file.name
            
            if p.type.startswith('gp-') and str(p).strip():
                # store metadata from headings:
                # - text_id
                # - title
                # - informant
                # - place
                # - version (if applicable -- only Urmi_C A35)
                if p.type.startswith('gp-sectionheading'):
                    metadata = {}
                for k, v in parse_metadata(p):
                    metadata[k] = v
            #
            elif p.type == 'p':
                # regular paragraphs
                
                # first check if we need to update metadata
                # TODO for now we do not store informant, place, and version,
                # since those are not always features of a text, but of a section
                # of the text, and I do not know how to do that.
                # QUESTION -- do we need to add a layer 'subsection'?
                if (metadata
                    and (not raw_node_features['text_id']
                         or raw_node_features['text_id'][this_text] != metadata['text_id'])):
                    this_text += 1
                    raw_node_features['text_id'][this_text] = metadata['text_id']
                    raw_node_features['title'][this_text] = metadata['title']
                    raw_node_features['dialect'][this_text] = dialect
                    raw_node_features['filename'][this_text] = file.name
                
                # increment paragraph
                this_paragraph += 1
                
                # start paragraph with an empty marker stack
                marker_stack = []
                
                # set end-of-unit markers to True at the beginning of paragraph,
                # so the units can be increased on encounter of first word character
                sentence_end = True
                subsentence_end = True
                word_end = True
                morpheme_end = True
                
                for text, text_style in p:
                    
                    if text_style == 'verse_no':
                        this_line += 1
                        raw_node_features['line'][this_line] = text.strip(' ()') # TODO int()?
                        metadata['verse_no'] = text.strip(' ()')  # TODO Remove from metadata dict?
                        continue
                        
                    elif text_style == 'fn_anchor':
                        # TODO handle footnotes in some way, discard for now
                        continue
                    
                    elif text_style == 'comment':
                        continue  # TODO handle comments
                    
                    elif text_style == 'marker':
                        if marker_stack and marker_stack[-1] == text:
                            marker_stack.pop()
                        else:
                            marker_stack.append(text)
                        continue
                    
                    elif text_style not in ('', 'foreign'):
                        logging.debug(f'Unhandled text_style: {repr(text_style)}, {repr(text)}')
                        continue
                    
                    elif text_style == 'foreign':
                        pass  # TODO store feature
                    
                    else: # text_style == '':
                        pass
                    
                    if (text_style == '' and marker_stack
                        and any(c.isalpha() for c in text)
                        and not text.isalpha()):
                        # In one case, there is no closing marker tag, so force closing the marker
                        # Urmi_C A42 9: 'RzdànyəlaR' (p.154, r.28) 'zdàny' roman, 'əla' cursive
                        # Urmi_C A43 17: 'ʾe-Rbuk̭ḗṱ' (p. 174, r.14), no closing 'R'
                        # Urmi_C B2 16: 'Pʾafšɑ̄rī̀P' (p.250 r.17), inital 'ʾ' cursive
                        marker = marker_stack.pop()
                        logging.warning(f'Unfinished marker: {repr(marker)}, closed forcibly..')
                        logging.debug(f'{dialect}, {metadata["text_id"]}:{metadata["verse_no"]}')
                        logging.debug(f'Text: {repr(text)}')
                    
                    # If we got this far, we have a text string,
                    # with either text_style '' or 'foreign'.
                    # We will iterate over them character by character.
                    for c in combine_chars(text):
                        
                        if c[0].isalpha() or c == '+':
                            
                            # Increment text units on start of new word
                            if morpheme_end:
                                this_morpheme += 1
                                morpheme_end = False
                            if word_end:
                                this_word += 1
                                word_end = False
                            if subsentence_end:
                                this_subsentence += 1
                                subsentence_end = False
                            if sentence_end:
                                this_sentence += 1
                                sentence_end = False
                            
                        else:  # if c is anything but a letter or '+':
                            if not morpheme_end:
                                morpheme_end = True
                            if c not in ('-', '=') and not word_end:
                                word_end = True
                            if c == ',' and not subsentence_end:
                                subsentence_end = True
                            if c in ('.', '!', '?') and not sentence_end:
                                sentence_end = True
                        
                        slot += 1
                        raw_node_features['char'][slot] = c

                        raw_oslots['text'][this_text].add(slot)
                        raw_oslots['paragraph'][this_paragraph].add(slot)
                        raw_oslots['line'][this_line].add(slot)
                        raw_oslots['sentence'][this_sentence].add(slot)
                        raw_oslots['subsentence'][this_subsentence].add(slot)
                        if not word_end:
                            raw_oslots['word'][this_word].add(slot)
                        if not morpheme_end:
                            raw_oslots['morpheme'][this_morpheme].add(slot)
                
            else:
                logging.debug(f'Unhandled paragraph type: {repr(p.type)}.')
                logging.debug(f'Text: {repr(str(p))}.')

INFO:root:Processing file bar text A14.html ...
INFO:root:Processing file bar text a29.html ...
INFO:root:Processing file bar text A49.html ...
INFO:root:Processing file bar text a28.html ...
INFO:root:Processing file bar text a50-A52.html ...
INFO:root:Processing file bar text A45.html ...
INFO:root:Processing file bar text a31-A33.html ...
INFO:root:Processing file bar text A42-A44.html ...
INFO:root:Processing file bar text a25.html ...
INFO:root:Processing file bar text a30.html ...
INFO:root:Processing file bar text a34.html ...
INFO:root:Processing file bar text a19-A23.html ...
INFO:root:Processing file bar text a24.html ...
DEBUG:root:Unhandled paragraph type: 'footer'.
DEBUG:root:Text: ' 7 '.
INFO:root:Processing file bar text a18.html ...
INFO:root:Processing file bar text A37-A40.html ...
INFO:root:Processing file bar text a1-A7.html ...
DEBUG:root:Unhandled paragraph type: 'sdfootnote1'.
DEBUG:root:Text: ' 1 The name Čuxo means ‘one who wears the woolen čuxa garment’. '.
IN

## Reindex Objects Above Slot Levels

In [4]:
otype2feature = {
    'text': {'text_id', 'title', 'dialect', 'filename'},
    'paragraph': {},
    'line': {'line'},
    'sentence': {},
    'subsentence': {},
    'word': {},
    'morpheme': {},
}

node_features = collections.defaultdict(lambda:collections.defaultdict())

node_features['char'] = raw_node_features['char'] # add slot features
# node_features['trailer'] = raw_node_features['trailer']

In [5]:
for slot in node_features['char']:
    node_features['otype'][slot] = 'char'    

In [6]:
edge_features = collections.defaultdict(lambda:collections.defaultdict(set)) # oslots will go here

onode = max(raw_node_features['char']) # max slot, incremented +1 in loop

for otype in raw_oslots.keys():
    for oID, slots in raw_oslots[otype].items():
        
        # make new object node number
        onode += 1
        node_features['otype'][onode] = otype
        
        # remap node features to node number
        for feat in otype2feature[otype]:
            node_features[feat][onode] = raw_node_features[feat][oID]
        edge_features['oslots'][onode] = raw_oslots[otype][oID]

In [7]:
node_features.keys()

dict_keys(['char', 'otype', 'title', 'filename', 'dialect', 'text_id', 'line'])

In [8]:
edge_features.keys()

dict_keys(['oslots'])

In [9]:
otext = {
    'sectionTypes': 'text,paragraph,line,sentence',
    'sectionFeatures': 'text_id,line',
    'fmt:text-orig-full': '{char}'
}

meta = {'':{'author': 'Geoffrey Khan, Cody Kingham, and Hannes Vlaardingerbroek'},
        'oslots':{'edgeValues':False, 'valueType':'int'},
        'otype':{'valueType':'str'},
        'text':{'valueType':'str'},
        'paragraph':{'valueType':'str'},
        'line':{'valueType':'str'},
        'word':{'valueType':'str'},
        'char':{'valueType':'str'},
        'text_id':{'valueType':'str'},
        'title':{'valueType':'str'},
        'dialect':{'valueType':'str'},
        'filename':{'valueType':'str'},
        'otext':otext}

In [10]:
TFs = Fabric(locations=['new_tf/'])

This is Text-Fabric 7.8.0
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

12 features found and 0 ignored


In [11]:
TFs.save(nodeFeatures=node_features, edgeFeatures=edge_features, metaData=meta)

  0.00s Exporting 7 node and 1 edge and 4 config features to new_tf/:
  0.00s VALIDATING oslots feature
  0.12s VALIDATING oslots feature
  0.12s maxSlot=     730892
  0.12s maxNode=     973009
  0.16s OK: oslots is valid
   |     1.62s T char                 to new_tf
   |     0.00s T dialect              to new_tf
   |     0.00s T filename             to new_tf
   |     0.01s T line                 to new_tf
   |     0.37s T otype                to new_tf
   |     0.00s T text_id              to new_tf
   |     0.00s T title                to new_tf
   |     1.43s T oslots               to new_tf
   |     0.00s M otext                to new_tf
   |     0.00s M paragraph            to new_tf
   |     0.00s M text                 to new_tf
   |     0.00s M word                 to new_tf
  3.62s Exported 7 node features and 1 edge features and 4 config features to new_tf/


True

In [12]:
TF = Fabric(locations='new_tf/')

This is Text-Fabric 7.8.0
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

12 features found and 0 ignored


In [13]:
N = TF.load('''

text_id paragraph line word char otype title

''')

N.makeAvailableIn(globals())
print()

  0.00s loading features ...
   |     0.40s T otype                from new_tf
   |     0.00s Not enough info for sections in otext, section functionality will not work
   |     0.00s Not enough info for structure in otext, structure functionality will not work
   |     2.74s T char                 from new_tf
   |      |     0.26s C __levels__           from otype, oslots, otext
   |      |       11s C __order__            from otype, oslots, __levels__
   |      |     0.57s C __rank__             from otype, __order__
   |      |       13s C __levUp__            from otype, oslots, __rank__
   |      |     2.59s C __levDown__          from otype, __levUp__, __rank__
   |      |     5.83s C __boundary__         from otype, oslots, __rank__
   |     0.00s T text_id              from new_tf
   |     0.01s T line                 from new_tf
   |     0.00s T title                from new_tf
    42s All features loaded/computed - for details use loadLog()



In [14]:
print(len(list(F.otype.s('word'))), 'words in the corpus')
print(len(list(F.otype.s('morpheme'))), 'morphemes in the corpus')

93762 words in the corpus
120134 morphemes in the corpus


In [15]:
print('books and their word counts: \n')
for text in F.otype.s('text'):
    text_words = L.d(text, 'word')
    text_morphemes = L.d(text, 'morpheme')
    print(text, F.title.v(text))
    print(f'\t{len(text_words)} words, {len(text_morphemes)} morphemes')

books and their word counts: 

730893 TALES FROM THE 1001 NIGHTS
	3018 words, 4195 morphemes
730894 THE TALE OF RUSTAM (2)
	1645 words, 2254 morphemes
730895 THE CROW AND THE CHEESE
	51 words, 71 morphemes
730896 THE TALE OF RUSTAM (1)
	1008 words, 1316 morphemes
730897 THE SISISAMBƏR PLANT
	289 words, 385 morphemes
730898 QAṬINA RESCUES HIS NEPHEW FROM LELIΘA
	358 words, 492 morphemes
730899 THE BATTLE WITH YUWANƏS THE ARMENIAN
	607 words, 780 morphemes
730900 THE FOX AND THE STORK
	70 words, 102 morphemes
730901 THE GIANT’S CAVE
	238 words, 336 morphemes
730902 THE FOX AND THE MILLER
	841 words, 1101 morphemes
730903 THE LION WITH A SWOLLEN LEG
	378 words, 494 morphemes
730904 THE FOX AND THE LION
	95 words, 124 morphemes
730905 SOUR GRAPES
	62 words, 82 morphemes
730906 THE CAT AND THE MICE
	99 words, 138 morphemes
730907 THE TALE OF FARXO AND SƏTTIYA
	2490 words, 3303 morphemes
730908 THE CRAFTY HIRELING
	1362 words, 1818 morphemes
730909 THE GIRL AND THE SEVEN BROTHERS
	744 words,

In [16]:
text = F.otype.s('text')[0]

In [17]:
len(L.d(text, 'word'))

3018

In [18]:
print(F.title.v(text))

for sent in F.otype.s('line')[:10]:
    print(sent, T.text(sent))


TALES FROM THE 1001 NIGHTS
731561 xa-màlka| kút-yum ðà-brata gawə́rwa.| mbádla qayə́mwa qaṭə̀lwala.| wăzī̀r| xðírre xðìrre,| bnáθa prìqla.| kút-yum ðà,| lìθ.| ʾáwwa wăzī́r ʾíθwale ða-bràta.| ʾa-bráta mə́ra ṭla-wằzir,| ṭla-bába dìya,| mə́ra bábi ʾána nàbəlli| gawrànne ʾáwwa málka| mparqànnux m-áyya qə́ṣṣət.|
731562 qìmtɛla| ʾítwala ða-qàṭu,| nubàltəlla mə́nna díya.| nubáltəlla qáṭu mə́nna dìya,| gwìrtəlle málka.| ʾaw-dmìxɛle,| píštɛla mtanóye ða-qə̀ṣṣət| ṭla-qàṭu.|
731563 mə́ra ṭla-d-à-qaṭu| mə̀ra| qáṭu lɛ́le rìxɛle| mtányən ða-qə̀ṣṣət.| ʾɛ́-dana mbádla qáyəm málka qaṭə̀lli.| sab-kəmà-ṱ-ile gwára,| ʾaṣə́rta gawə̀rra| mbádla qaṭə̀lla.| yăðána mbádla qaṭə́lli ʾaw-màlka.|  
731564 mə́ra ʾíθwa lìθwa,| biš-m-álaha góṛa čú-məndi lìθwa.| — ʾáyya tuníθa ṭla-qàṭu —| mára ʾíθwa xá bàxta,| ʾìtwala| xa-bróna šə́mme díye Kărī̀m-addīn.| mára ʾàwwa,| ʾó Kărī̀m,| bábe mìtle.| bábe díye mìtle,| ʾáyya bàxta| kùt-yum| goyàwa,| maxláwa ṭla-bróna dìya.|
731565 qímla mšodə́rra brōn-díya mədràsa.| bróna díya 