In [1]:
import docx
import re
import collections
import pandas as pd

In [246]:
# Path of the pre-processed Word transcription that is opened with docx.Document(file).
file = 'files/KTU 1.2_text processed.docx'

In [349]:
# Parse the Word transcription into a table with one row per word.
#
# Paragraph types are recognised by how their text starts:
#   * 'KTU...'             -> corpus title, applies to everything that follows
#   * Roman numeral (I, IV) -> column header
#   * digits               -> a text line: '<line number> <transliteration>'
# Any other paragraph is ignored.
word_doc = docx.Document(file)

# Characters counted as letters of the transliteration ('x' included via a-z).
CONSONANTS = 'a-zḏġḥḫṣšṯṭẓˤ'
LETTER = re.compile(f'[{CONSONANTS}]')
# A single space with a letter on both sides. Look-arounds keep the letters out
# of the match, so runs of one-letter words ("a b c") are all joined; a pattern
# that consumes both letters would skip every second space.
WORD_GAP = re.compile(f'(?<=[{CONSONANTS}]) (?=[{CONSONANTS}])')
# One or more Roman-numeral letters forming a complete token. (The quantifier
# used to sit inside the character class, which made any paragraph starting
# with L, X, V, I or '+' count as a column header.)
COLUMN_HEADER = re.compile(r'[LXVI]+\b')
LINE_NUMBER = re.compile('[0-9]+')

data = {}   # running word index -> [corpus, column, line, word, trailer, font]

corpus = ''
column = ''
line = ''
line_text = ''
n = 0

for para in word_doc.paragraphs:
    text = para.text

    if text.startswith('KTU'):
        corpus = text
        continue
    if COLUMN_HEADER.match(text):
        column = text
        continue

    number = LINE_NUMBER.match(text)
    if number is None:
        continue    # neither title, column header nor numbered line

    line = number.group()
    line_text = text[number.end():].lstrip()

    # One flag per letter of the whole paragraph: 'i' when the run holding the
    # letter is italic, 'o' otherwise. Note that this string describes the
    # complete line and is stored unchanged on every word of that line.
    font = ''.join(
        ('i' if run.italic else 'o') * len(LETTER.findall(run.text))
        for run in para.runs
    )

    # ' . ' separates the graphical units of a line.
    units = line_text.split(' . ')
    for unit_no, u in enumerate(units, start=1):
        # Inside a unit, words are separated by a hyphen or by a space between letters.
        words = WORD_GAP.sub('-', u).split('-')

        for word_no, w in enumerate(words, start=1):
            if word_no < len(words):
                trailer = ''     # more words follow in the same unit
            elif unit_no < len(units):
                trailer = '.'    # last word of a unit that is followed by another unit
            else:
                trailer = ' '    # last word of the line

            data[n] = [corpus, column, int(line), w, trailer, font]
            n += 1

# Rows keep their running index; the column names are given up front so that an
# input without any text line yields an empty, correctly labelled frame.
df = pd.DataFrame.from_dict(
    data,
    orient='index',
    columns=['corpus', 'column', 'line', 'word', 'trailer', 'font'],
)

In [350]:
#df.to_excel('test.xlsx', index=None)

In [351]:
# Inspect the parsed table: one row per word with its corpus, column, line, trailer and font string.
df

Unnamed: 0,corpus,column,line,word,trailer,font
0,KTU 1.2,I,1,[ ]x[ ...,,o
1,KTU 1.2,I,2,k,,oooo
2,KTU 1.2,I,2,ṯb,.,oooo
3,KTU 1.2,I,2,x[ ],,oooo
4,KTU 1.2,I,3,at,.,oioiiiooo
...,...,...,...,...,...,...
854,KTU 1.2,IV,39,ibh,.,oiiio
855,KTU 1.2,IV,39,mš[ ...,,oiiio
856,KTU 1.2,IV,40,bn,.,oiiio
857,KTU 1.2,IV,40,ˤnh[ ...,,oiiio


### Prepare the conversion

In [352]:
from tf.fabric import Fabric
from tf.convert.walker import CV
from tf.app import use

# The Text-Fabric location is ../<DATA_FOLDER>/<VERSION>, resolved against the working directory.
DATA_FOLDER = 'tf'
VERSION = '0.1'

TF_PATH = f'../{DATA_FOLDER}/{VERSION}'
# Fabric instance for that location; it is handed to the CV converter when the walk is set up.
TF = Fabric(locations=TF_PATH, silent=True)

In [353]:
# Maps every character that may occur in a word or its trailer to the value
# stored in the `sign` feature of the corresponding slot. `director` indexes
# this table with each character, so a character that is missing here raises
# KeyError during the walk.
script = {
    'a': 'a',
    'b': 'b',
    'd': 'd',
    'ḏ': '\u1E0F',   # fixed: 'u\1E0F' evaluated to 'u' + '\x01' + 'E0F'
    'g': 'g',        # fixed: was mapped to the empty string
    'ġ': '\u0121',
    'h': 'h',        # fixed: was mapped to the empty string
    'ḥ': '\u1E25',
    'ḫ': '\u1E2B',
    'i': 'i',
    'k': 'k',
    'l': 'l',
    'm': 'm',
    'n': 'n',
    'p': 'p',
    'q': 'q',
    'r': 'r',
    's': 's',
    'ṣ': '\u1E63',
    'š': '\u0161',
    'ṯ': '\u1E6F',
    'ṭ': '\u1E6D',
    't': 't',
    'u': 'u',
    'w': 'w',
    'y': 'y',
    'z': 'z',
    'ẓ': '\u1E93',
    'ˤ': '\u02BF',
    'x': 'x',
    '.': '.',
    ',': ',',
    '[': '[',
    ']': ']',
    '{': '{',        # fixed: was mapped to the closing brace
    '}': '}',
    '<': '<',
    '>': '>',
    '/': '/',
    ' ': ' ',
    '\xa0': ' ',     # a non-breaking space is stored as a plain space
}

# Slot type handed to cv.walk; director creates one slot per character of a word plus its trailer.
slotType = 'sign'

# Dataset-level metadata, passed to cv.walk as `generic`.
generic = {
    'dataset':'cuc',
    'datasetName': 'Copenhagen Ugarit Corpus',
    'encodedBy': 'Christian Canu Højgaard and Martijn Naaijer',
    'convertedToTextFabricBy':'Martijn Naaijer and Christian Canu Højgaard',    
    'source': '',
    'manuscripts':'',
    'licence': 'Creative Commons Attribution-NonCommercial 4.0 International License',
    'licenceUrl': 'http://creativecommons.org/licenses/by-nc/4.0/',
    'version': VERSION
}

# Text format and section configuration, passed to cv.walk as `otext`.
# The single text format renders the `sign` feature; sections are corpus > column > line.
otext = {
    'fmt:text-orig-full': '{sign}',
    'sectionTypes': 'corpus,column,line',
    'sectionFeatures': 'corpus,column,line',
}

# Per-feature descriptions, passed to cv.walk as `featureMeta`.
# NOTE(review): `sign` is described as a consonantal letter, but director also
# stores brackets, word dividers and spaces in it - consider widening the description.
featureMeta = {
    'sign': {
        'description': 'consonantal letter',
    },
    'corpus': {
        'description': 'corpus name'
    },
    'column': {
        'description': 'column number',
    },
    'line': {
        'description': 'line number',
    },
    'g_cons': {
        'description': 'word consonantal-transliterated',
    },
    'trailer': {
        'description': 'interword material',
    },
    'language': {
        'description': 'language',
    }}

# Names of the features passed to cv.walk as `intFeatures`; only `line` carries integers.
intFeatures = {
  'line'
}

In [354]:
def director(cv):
    """Walk the word table ``df`` and issue the Text-Fabric actions for it.

    Every row of ``df`` becomes one ``word`` node carrying ``g_cons``,
    ``trailer`` and ``language``; every character of the word followed by its
    trailer becomes one slot whose ``sign`` value is looked up in ``script``.
    ``corpus``, ``column`` and ``line`` nodes are opened whenever the
    corresponding value of a row differs from the one currently open.

    Parameters
    ----------
    cv
        The converter object handed in by ``cv.walk``; its ``node``, ``slot``,
        ``feature``, ``terminate`` and ``activeTypes`` methods are used.

    Raises
    ------
    KeyError
        If a word or trailer contains a character without an entry in ``script``.
    """
    # Section levels, outermost first.
    levels = ('corpus', 'column', 'line')

    # Marker for "no section open at this level". A dedicated object is used so
    # that an empty label still opens its section.
    unset = object()

    label_dict = dict.fromkeys(levels + ('word',), unset)
    node_dict = dict.fromkeys(levels + ('word',))

    for _, row in df.iterrows():

        for depth, level in enumerate(levels):
            label = row[level]
            if label_dict[level] is not unset and label == label_dict[level]:
                continue

            # Close this level and everything nested in it, innermost first.
            for ntp in reversed(levels[depth:] + ('word',)):
                cv.terminate(node_dict[ntp])
                node_dict[ntp] = None

            # The nested levels were just closed, so they have to be reopened
            # even when their label repeats (for instance a new column whose
            # first line carries the number the previous column ended on).
            for inner in levels[depth + 1:]:
                label_dict[inner] = unset

            label_dict[level] = label
            node_dict[level] = cv.node(level)
            cv.feature(node_dict[level], **{level: label})

        word = row['word']
        trailer = row['trailer']

        node_dict['word'] = cv.node('word')
        cv.feature(
            node_dict['word'],
            g_cons=word,
            trailer=trailer,
            language='Ugaritic',
        )

        for sign in f'{word}{trailer}':
            try:
                value = script[sign]
            except KeyError:
                # Same exception type as before, but say where the character came from.
                raise KeyError(
                    f'no entry in script for character {sign!r} '
                    f'(word {word!r}, column {row["column"]}, line {row["line"]})'
                ) from None
            s = cv.slot()
            cv.feature(s, sign=value)

        cv.terminate(node_dict['word'])

    # just for informational purposes
    print('\nINFORMATION:', cv.activeTypes(), '\n')

    for ntp in ('word', 'line', 'column', 'corpus'):
        cv.terminate(node_dict[ntp])

In [355]:
# Converter bound to the Fabric instance TF.
cv = CV(TF)

# Run director with the slot type and metadata defined earlier; generateTf=True is
# passed so that, presumably, the TF files are written to TF's location - TODO confirm.
good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
    generateTf=True
)

# Show the value returned by the walk (True in the recorded run).
good

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |     0.01s No structure nodes will be set up
   |   SECTION   TYPES:    corpus, column, line
   |   SECTION   FEATURES: corpus, column, line
   |   STRUCTURE TYPES:    
   |   STRUCTURE FEATURES: 
   |   TEXT      FEATURES:
   |      |   text-orig-full       sign
   |     0.01s OK
   |     0.00s Following director... 

INFORMATION: {'corpus', 'line', 'column'} 

   |     0.21s "edge" actions: 0
   |     0.21s "feature" actions: 6015
   |     0.21s "node" actions: 996
   |     0.21s "resume" actions: 0
   |     0.21s "slot" actions: 5019
   |     0.21s "terminate" actions: 1143
   |          4 x "column" node 
   |          1 x "corpus" node 
   |        132 x "line" node 
   |       5019 x "sign" node  = slot type
   |        859 x "word" node 
   |       6015 nodes of all types
   |     0.23s OK
   |     0.00s checking for nodes and edges ... 
   |     0.00s OK
   |     0.00s checking 

True

In [356]:
from tf.app import use

In [357]:
# Open the dataset from the local clone. The version comes from VERSION (the
# constant the conversion is configured with) instead of a second hard-coded
# '0.1', so the loaded data follows the conversion whenever the version is bumped.
# hoist=globals() is what makes names such as `L` available in the cells below.
A = use('dt-ucph/cuc', checkout='clone', version=VERSION, hoist=globals())

**Locating corpus resources ...**

The requested app is not available offline
	~/text-fabric-data/github/dt-ucph/cuc/app not found


File is not a zip file
	could not save corpus data to ~/text-fabric-data/github

rate limit is 5000 requests per hour, with 4992 left for this hour
	connecting to online GitHub repo dt-ucph/cuc ... connected
	cannot find releases
	cannot find releases


No directory /app in #d9153f74966fca172a275486d2c6a00917bc1280	Failed


   |     0.01s T otype                from ~/github/dt-ucph/cuc/tf/0.1
   |     0.09s T oslots               from ~/github/dt-ucph/cuc/tf/0.1
   |     0.03s T corpus               from ~/github/dt-ucph/cuc/tf/0.1
   |     0.01s T line                 from ~/github/dt-ucph/cuc/tf/0.1
   |     0.10s T sign                 from ~/github/dt-ucph/cuc/tf/0.1
   |     0.02s T column               from ~/github/dt-ucph/cuc/tf/0.1
   |      |     0.01s C __levels__           from otype, oslots, otext
   |      |     0.11s C __order__            from otype, oslots, __levels__
   |      |     0.00s C __rank__             from otype, __order__
   |      |     0.43s C __levUp__            from otype, oslots, __rank__
   |      |     0.08s C __levDown__          from otype, __levUp__, __rank__
   |      |     0.00s C __characters__       from otext
   |      |     0.08s C __boundary__         from otype, oslots, __rank__
   |      |     0.00s C __sections__         from otype, oslots, otext, __levUp

Name,# of nodes,# slots/node,% coverage
corpus,1,5019.0,100
column,4,1254.75,100
line,132,38.02,100
word,859,5.84,100
sign,5019,1.0,100


In [358]:
# Sanity check: look upward from slot 1 for its 'line' node (`L` comes from use(..., hoist=globals())).
L.u(1, 'line')

(5025,)

In [360]:
# Pretty-print node 5026. The number is hard-coded; presumably it is the line node
# after 5025 (the one found for slot 1) - TODO confirm whenever the data is regenerated.
A.pretty(5026)