In [1]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell is executed, so edits to library code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [42]:
import os
from shutil import rmtree
from tf.transcription import Transcription

In [37]:
# Root directory under which the GitHub clones live.
GH_BASE = os.path.expanduser('~/github')

# Organisation and repository holding the data.
ORG = 'etcbc'
REPO = 'linksyr'

# Repository-relative locations of the input books (scanned by doBooks)
# and of the Unicode output (written by writeUnicode).
PIL_PATH = 'data/peshitta/pil'
UNI_PATH = 'data/peshitta/uni'

# Full paths of the two `books` directories.
BOOK_DATA = '/'.join((GH_BASE, ORG, REPO, PIL_PATH, 'books'))
BOOK_UNI = '/'.join((GH_BASE, ORG, REPO, UNI_PATH, 'books'))

# Shared Transcription instance; doText calls its can_to_syriac() and
# to_syriac() methods on the verse material.
TR = Transcription()

In [4]:
# Declared book acronyms, one per line, in their intended order.
# A bare split() already ignores the leading and trailing whitespace.
allAcrosSeq = '''
    Gn
    Ex
    Lv
    Nm
    Dt
    Jb
    Jos
    Jd
    1Sm
    2Sm
    Ps
    1Rg
    2Rg
    Pr
    Sap
    Ec
    Ct
    Is
    Jr
    Thr
    EpJr
    EpBar-A
    EpBar-B
    Bar
    Ez
    Hs
    Jl
    Am
    Ob
    Jon
    Mi
    Na
    Hb
    Zf
    Hg
    Sa
    Ml
    Dn
    BelDr
    Ru
    Sus
    Est
    Jdt
    Sir
    1Chr
    2Chr
    ApBar
    4Esr
    Ezr
    Neh
    1Mc-A
    1Mc-B
    2Mc
    3Mc
    4Mc
    Oda
    OrM-A
    OrM-B
    ApcPs-A
    ApcPs-B
    ApcPs
    PsS
    Tb-A
    Tb-B
    3Esr
'''.split()

# The same acronyms as a set, for membership tests (see checks).
allAcros = {*allAcrosSeq}

In [18]:
def doText(book, chapter, verse, text, results, content):
  """Process the raw material of a single verse.

  The stripped text is split on whitespace into words. Words for which
  TR.can_to_syriac() is false are added to results['problems'], and every
  character occurring in the words is added to results['chars'].
  The result of TR.to_syriac() on the stripped text is stored in
  content[book][int(chapter)][int(verse)], creating the nested
  dictionaries when they do not exist yet.

  Parameters
  ----------
  book: key under which the verse is filed in `content`
  chapter, verse: values accepted by int(), e.g. numeric strings
  text: the untranslated verse material
  results: dict with set values under the keys 'problems' and 'chars'
  content: dict that accumulates the converted verses

  Returns nothing; `results` and `content` are modified in place.
  """
  material = text.strip()
  tokens = material.split()

  unconvertible = results['problems']
  unconvertible.update(token for token in tokens if not TR.can_to_syriac(token))

  alphabet = results['chars']
  for token in tokens:
    alphabet.update(token)

  # convert first, then locate the slot, mirroring the evaluation order
  # of a single assignment statement
  converted = TR.to_syriac(material)
  versesOfChapter = content.setdefault(book, {}).setdefault(int(chapter), {})
  versesOfChapter[int(verse)] = converted

In [27]:
def doBooks():
  """Read every book file in BOOK_DATA; collect metadata and verse texts.

  Each regular file in BOOK_DATA is treated as one book; its name is
  printed as a progress indication. Lines starting with `%` are
  directives; the ones acted upon are:

  *   `%bookname ...`: the acronym of the book. When the file name ends in
      `_x` (an underscore plus one character), `-X` is appended to it.
  *   `%language ...`: the language of the book.
  *   `%verse c,v`: starts a new verse with chapter `c` and verse `v`.

  All other lines that come after a `%verse` directive are the text of
  that verse. A verse is handed to doText() when the next `%verse`
  directive is met, and also at the end of the file, so that the final
  verse of a book is not dropped.

  Returns
  -------
  (bookInfo, content)
      bookInfo maps a file name to a dict with the keys `acro` and
      `language` (only if the corresponding directive occurs),
      `problems` (sorted list) and `chars` (sorted string).
      content is the dict filled by doText().

  Raises
  ------
  ValueError, IndexError
      when a `%verse` directive does not have the form `chapter,verse`.
  """
  bookInfo = {}
  content = {}
  with os.scandir(BOOK_DATA) as bookDir:
    for bookEntry in bookDir:
      if not bookEntry.is_file():
        continue
      bookFile = bookEntry.name
      print(bookFile)
      thisBookInfo = {}
      results = dict(chars=set(), problems=set())
      with open(f'{BOOK_DATA}/{bookFile}') as bh:
        (curChapter, curVerse, curText) = (None, None, None)
        for line in bh:
          if line.startswith('%'):
            # split() discards the line terminator itself, so a directive on
            # a final line without newline does not lose its last character
            comps = line[1:].split()
            if len(comps) == 0:
              continue
            keyword = comps[0]
            args = comps[1:]
            if keyword == 'bookname':
              acro = ' '.join(args)
              # a slice instead of an index: no IndexError on 1-char names
              ab = bookFile[-1] if bookFile[-2:-1] == '_' else ''
              abRep = '' if ab == '' else f'-{ab.upper()}'
              thisBookInfo['acro'] = f'{acro}{abRep}'
            elif keyword == 'language':
              thisBookInfo['language'] = ' '.join(args)
            elif keyword == 'verse':
              # a new verse starts: first deliver the previous one
              if curVerse:
                doText(bookFile, curChapter, curVerse, curText, results, content)
              (curChapter, curVerse) = args[0].split(',')
              curText = ''
          elif curVerse:
            curText += line
        # the last verse has no following %verse directive to trigger it
        if curVerse:
          doText(bookFile, curChapter, curVerse, curText, results, content)
      thisBookInfo['problems'] = sorted(results['problems'])
      thisBookInfo['chars'] = ''.join(sorted(results['chars']))
      bookInfo[bookFile] = thisBookInfo
  return (bookInfo, content)

In [29]:
def checks(allAcros, bookInfo):
  """Print a report on the transcription and on the set of books.

  Printed, in this order:

  1.  all characters that occur in any book;
  2.  per book with transcription problems: a header line and its problems;
  3.  the total number of transcription problems, or an OK line;
  4.  books whose acronym is not in `allAcros`, plus a summary line;
  5.  acronyms in `allAcros` that no book has, plus a summary line.

  Parameters
  ----------
  allAcros: collection of declared book acronyms
  bookInfo: dict keyed by book name; each value is a dict with the keys
      `acro`, `language`, `chars` and `problems`

  Returns nothing.
  """
  booksInOrder = sorted(bookInfo.items())

  # character inventory over all books
  inventory = set()
  for info in bookInfo.values():
    inventory.update(info['chars'])
  inventoryRep = ''.join(sorted(inventory))
  print(f'ALL CHARS = {inventoryRep}')

  # transcription problems per book
  nProblems = 0
  for (book, info) in booksInOrder:
    (acro, lang) = (info['acro'], info['language'])
    (chars, problems) = (info['chars'], info['problems'])
    if not problems:
      continue
    print(f'{acro:<5} = {book:<20} in {lang:<7} having {chars}')
    nProblems += len(problems)
    print(f'\tPROBLEMS: {" ".join(problems)}')

  if nProblems:
    print(f'XX: {nProblems} transcription problems')
  else:
    print('OK: Transcription')

  # books in the corpus that have not been declared
  bookAcros = [(book, info['acro']) for (book, info) in booksInOrder]
  undeclared = [pair for pair in bookAcros if pair[1] not in allAcros]
  for (book, acro) in undeclared:
    print(f'CORPUS: book {book} = {acro} not in declared list')
  corpusVerdict = 'XX: some undeclared books' if undeclared else 'OK: all books declared'
  print('CORPUS: ' + corpusVerdict)

  # declared books that are absent from the corpus
  inCorpus = {acro for (book, acro) in bookAcros}
  absent = [acro for acro in sorted(allAcros) if acro not in inCorpus]
  for acro in absent:
    print(f'DECLARED: {acro} not in corpus')
  declaredVerdict = 'XX: some missing books' if absent else 'OK: all books in corpus'
  print('DECLARED: ' + declaredVerdict)

In [44]:
def writeUnicode(content):
  """Write the converted books as text files into BOOK_UNI.

  The directory BOOK_UNI is removed first if it exists, and then created
  afresh, so files of earlier runs do not linger. Every book becomes a file
  `<book>.txt`; per chapter (in ascending order) there is a line
  `Chapter <n>`, a blank line, one line `<verse> <text>` per verse
  (in ascending order) and a closing blank line.

  Parameters
  ----------
  content: dict of the shape `{book: {chapter: {verse: text}}}`

  Returns nothing.
  """
  if os.path.exists(BOOK_UNI):
    rmtree(BOOK_UNI)
  os.makedirs(BOOK_UNI, exist_ok=True)
  for (book, chapters) in content.items():
    # explicit UTF-8: the verse texts contain non-ASCII characters, and the
    # platform default encoding may be unable to represent them
    with open(f'{BOOK_UNI}/{book}.txt', 'w', encoding='utf8') as fh:
      for (chapter, verses) in sorted(chapters.items()):
        fh.write(f'Chapter {chapter}\n\n')
        for (verse, text) in sorted(verses.items()):
          fh.write(f'{verse} {text}\n')
        fh.write('\n')

In [32]:
# Parse all book files: bookInfo receives the per-book metadata,
# content the verse texts as delivered by doText.
(bookInfo, content) = doBooks()

Apocryphal_Psalms_A
Baruch
Tobit_A
Ecclesiastes
2Maccabees
3Maccabees
Ezra
Zephaniah
Prayer_of_Manasseh_B
Haggai
Numbers
Leviticus
Jeremiah
1Chronicles
2Samuel
Joel
Ruth
Amos
Judith
Deuteronomy
Odes
2Kings
Wisdom_of_Solomon
Daniel
Letter_of_Baruch_B
Judges
4Maccabees
Jonah
1Maccabees_B
1Samuel
Tobit_B
Exodus
Apocryphal_Psalms_B
Ezekiel
Apocryphal_Psalms
Proverbs
Letter_of_Jeremiah
Lamentations
Prayer_of_Manasseh_A
1Kings
3Esdras
Susanna
Job
Nahum
Psalms
Psalms_of_Solomon
Habakkuk
Micah
Obadiah
Apocalypse_of_Baruch
Bel_and_the_Dragon
4Esdras
Esther
Hosea
Sirach
Isaiah
Genesis
Nehemia
Malachi
Letter_of_Baruch_A
Joshua
1Maccabees_A
Zechariah
Song_of_Songs
2Chronicles


In [33]:
# Report transcription problems and compare the acronyms found in the
# corpus with the declared list.
checks(allAcros, bookInfo)

ALL CHARS = !"#*./:<=>@ABCDEGHJKLMNPQRSTVWXYZ\^_o
OK: Transcription
CORPUS: OK: all books declared
DECLARED: OK: all books in corpus


In [45]:
# Replace the contents of BOOK_UNI by one text file per book.
writeUnicode(content)