In [20]:
"""
Running this script creates line-by-line transcriptions of the original Word files.
Make sure the Word files are in the folder utils/files.
"""
import os
import re
import docx

class TextTranscriber:
    """
    Makes a transcription of original word files line by line.

    Every paragraph of every document is classified by how it starts:

    * text starting with ``KTU`` sets the current corpus heading;
    * text starting with a run of the letters ``L``, ``X``, ``V``, ``I``
      (a Roman numeral) sets the current column heading;
    * text starting with digits is a numbered line; its text, without the
      number and the whitespace that follows it, is stored in ``line_texts``
      under the key ``(corpus, column, line_number)``.

    Any other paragraph is ignored. The current corpus and column are kept
    between paragraphs (and between files); a numbered line that appears
    before any heading is keyed with ``None`` for the missing part.
    """

    # Patterns are raw strings (so ``\d`` / ``\b`` are regex escapes, not
    # string escapes) and compiled once for the whole class.
    _CORPUS_RE = re.compile(r'KTU')
    # ``\b`` requires the numeral to end at a word boundary, so ordinary words
    # that merely begin with one of these letters are not taken as a column.
    _COLUMN_RE = re.compile(r'[LXVI]+\b')
    _LINE_NUMBER_RE = re.compile(r'\d+')

    def __init__(self, word_file_names, word_files_folder):
        """
        Read all given files and collect their numbered lines.

        :param word_file_names: iterable of file names of the word documents
        :param word_files_folder: folder in which those files are located
        """
        self.word_file_names = word_file_names
        self.word_files_folder = word_files_folder
        self.line_texts = {}

        # Parser state; defined up front so read() never hits a missing
        # attribute when a numbered line precedes the first heading.
        self.corpus = None
        self.column = None
        self.line = None
        self.text = None

        for word_file_name in self.word_file_names:
            path = os.path.join(self.word_files_folder, word_file_name)
            print(path)
            self.word_doc = docx.Document(path)
            self.read()

    def read(self):
        """
        Walk the paragraphs of ``self.word_doc`` and fill ``self.line_texts``.
        """
        for para in self.word_doc.paragraphs:
            content = para.text

            if self._CORPUS_RE.match(content):
                self.corpus = content
            elif self._COLUMN_RE.match(content):
                self.column = content
            else:
                number = self._LINE_NUMBER_RE.match(content)
                if number is None:
                    # Neither a heading nor a numbered line: nothing to store.
                    continue

                self.line = int(number.group())
                # Drop the leading number and the whitespace that follows it.
                self.text = content[number.end():].lstrip()

                key = (self.corpus, self.column, self.line)
                self.line_texts[key] = self.text

In [23]:
# Transcribe one document from the 'files' folder and display the collected lines.
TextTranscriber(
    word_file_names=['KTU 1.2_text processed.docx'],
    word_files_folder='files',
).line_texts

files\KTU 1.2_text processed.docx


{('KTU 1.2',
  'I',
  1): '[ ]x[ \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 ]',
 ('KTU 1.2',
  'I',
  2): 'k ṯb . x[ \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 ]',
 ('KTU 1.2',
  'I',
  3): 'at . ypˤt . b a/r[ \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 ]\xa0',
 ('KTU 1.2',
  'I',
  4): 'aliyn . bˤl . [ \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 ]',
 ('KTU 1.2',
  'I',
  5): 'drk{.}tk . [[ṯ]]mšl[ \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 ]',
 ('KTU 1.2',
  'I',
  6): 'b rišk . aymr[ \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 ]',
 ('KTU 1.2',
  'I',
  7): 'ṯpṭ . nhr . yṯb[r \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0  ]',
 ('KTU 1.2', 'I', 8): 'rišk . ˤṯtrt . š[m . bˤl . qdqd  . \xa0  ]',
 ('KTU 1.2',
  'I',
  9): 'rˤt . mṭ . tpln . b gb[