In [1]:
import docx
import re
import collections
import pandas as pd

In [2]:
file = 'files/KTU 1.2_text processed.docx'

In [126]:
word_doc = docx.Document(file)

data = collections.defaultdict()

corpus = ''
column = ''
line = ''
line_text = ''
n=0

for para in word_doc.paragraphs:

    font = ''
    if re.match('KTU', para.text):
        corpus = para.text
    if re.match('[LXVI+]', para.text):
        column = para.text
    elif re.match('^[0-9]', para.text):
        line = re.match('^[0-9]*', para.text).group()
        line_text = re.sub('^[0-9]*', '', para.text).lstrip()
        
        units = line_text.split(' . ')
        
        unit_no=1
        for u in units:
            hyphenize = re.sub('([a-zˤḫ]) ([a-zˤḫ])',r'\1-\2', u) #Insert hyphen between consonants in graph. units
            words = hyphenize.split('-')
            
            word_no=1
            for w in words:
                
                #trailer
                if word_no < len(words):
                    trailer = ''
                elif unit_no < len(units):
                    trailer = '.'
                else:
                    trailer = ' '
        
                data[n] = [corpus, column, int(line), w, trailer]
                n+=1
                word_no+=1                
            unit_no+=1
        
df = pd.DataFrame(data).T
df.columns = ['corpus','column','line','word', 'trailer']

In [85]:
        for run in para.runs:
            letters_only = re.findall('[a-zˤḫ]', run.text)
            for r in letters_only:
                if run.italic:
                    font += 'i'
                else:
                    font += 'o'   

' -'

In [4]:
#df.to_excel('test.xlsx', index=None)

In [136]:
df

Unnamed: 0,corpus,column,line,word,trailer
0,KTU 1.2,I,1,[ ]x[ ...,
1,KTU 1.2,I,2,k,
2,KTU 1.2,I,2,t̠b,.
3,KTU 1.2,I,2,x[ ],
4,KTU 1.2,I,3,at,.
...,...,...,...,...,...
849,KTU 1.2,IV,39,ibh,.
850,KTU 1.2,IV,39,mš[ ...,
851,KTU 1.2,IV,40,bn,.
852,KTU 1.2,IV,40,ˤnh[ ...,


### Prepare the conversion

In [129]:
from tf.fabric import Fabric
from tf.convert.walker import CV
from tf.app import use

DATA_FOLDER = 'tf'
VERSION = '0.1'

TF_PATH = f'../{DATA_FOLDER}/{VERSION}'
TF = Fabric(locations=TF_PATH, silent=True)

In [135]:
slotType = 'sign'

#Metadata
generic = {
    'dataset':'cuc',
    'datasetName': 'Copenhagen Ugarit Corpus',
    'encodedBy': 'Christian Canu Højgaard and Martijn Naaijer',
    'convertedToTextFabricBy':'Martijn Naaijer and Christian Canu Højgaard',    
    'source': '',
    'manuscripts':'',
    'licence': 'Creative Commons Attribution-NonCommercial 4.0 International License',
    'licenceUrl': 'http://creativecommons.org/licenses/by-nc/4.0/',
    'version': VERSION
}

#Representations
otext = {
    'fmt:text-orig-full': '{sign}',
    'sectionTypes': 'corpus,column,line',
    'sectionFeatures': 'corpus,column,line',
}

featureMeta = {
    'sign': {
        'description': 'consonantal letter',
    },
    'corpus': {
        'description': 'corpus name'
    },
    'column': {
        'description': 'column number',
    },
    'line': {
        'description': 'line number',
    },
    'g_cons': {
        'description': 'word consonantal-transliterated',
    },
    'trailer': {
        'description': 'interword material',
    },
    'language': {
        'description': 'language',
    }}

intFeatures = {
  'line'
}

In [139]:
def director(cv):   
    label_dict = dict(
        corpus = '',
        column = '',
        line = '',
        word = '',
    )
    
    node_dict = dict(
        corpus=None,
        column=None,
        line=None,
        word=None,
    )

    for row in df.iterrows():
        
        #book_title, chapter_number, verse_number = reference(line[1]['ref'])
        corpus_title = row[1]['corpus']
        column_number = row[1]['column']
        line_number = row[1]['line']
        
        if corpus_title != label_dict['corpus']:
            label_dict['corpus'] = corpus_title
            for ntp in ('word','line','column','corpus'):
                cv.terminate(node_dict[ntp])
                node_dict[ntp] = None   
            node_dict['corpus'] = cv.node('corpus')
            cv.feature(
              node_dict['corpus'],
              corpus=corpus_title,
            )

        if column_number != label_dict['column']:
            label_dict['column'] = column_number
            for ntp in ('word','line','column'):
                cv.terminate(node_dict[ntp])
                node_dict[ntp] = None
            node_dict['column'] = cv.node('column')
            cv.feature(
              node_dict['column'],
              column = column_number,
            )
         
        if line_number != label_dict['line']:
            label_dict['line'] = line_number
            for ntp in ('word','line'):
                cv.terminate(node_dict[ntp])
                node_dict[ntp] = None   
            node_dict['line'] = cv.node('line')
            cv.feature(
              node_dict['line'],
              line = line_number,
            )
            
        node_dict['word'] = cv.node('word')
        
        cv.feature(
            node_dict['word'],
            g_cons = row[1]['word'],
            trailer = row[1]['trailer'],
            language = 'Ugaritic',
        )
            
        for sign in row[1]['word']:
            s = cv.slot()
            cv.feature(s, 
                    sign=sign
                )
            
        cv.terminate(node_dict['word']) 

    # just for informational purposes
    print('\nINFORMATION:', cv.activeTypes(), '\n')
  
    for ntp in ('word','line','column','corpus'):
        cv.terminate(node_dict[ntp])

In [140]:
cv = CV(TF)

good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
    generateTf=True
)

good

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |     0.00s No structure nodes will be set up
   |   SECTION   TYPES:    corpus, column, line
   |   SECTION   FEATURES: corpus, column, line
   |   STRUCTURE TYPES:    
   |   STRUCTURE FEATURES: 
   |   TEXT      FEATURES:
   |      |   text-orig-full       sign
   |     0.01s OK
   |     0.00s Following director... 

INFORMATION: {'corpus', 'line', 'column'} 

   |     0.08s "edge" actions: 0
   |     0.08s "feature" actions: 5623
   |     0.08s "node" actions: 991
   |     0.09s "resume" actions: 0
   |     0.09s "slot" actions: 4632
   |     0.09s "terminate" actions: 1138
   |          4 x "column" node 
   |          1 x "corpus" node 
   |        132 x "line" node 
   |       4632 x "sign" node  = slot type
   |        854 x "word" node 
   |       5623 nodes of all types
   |     0.10s OK
   |     0.00s checking for nodes and edges ... 
   |     0.00s OK
   |     0.00s checking 

True