# Link pericopes to Peshitta and Syrnt

In [1]:
%load_ext autoreload
%autoreload 2

In [72]:
import os
from shutil import rmtree
import re
import collections
from tf.app import use
from tf.fabric import Fabric

# Context

In [3]:
P = 'peshitta'
S = 'syrnt'
A = {P: None, S: None}

In [4]:
BASE = os.path.expanduser('~/github')
ORG = 'etcbc'
REPO = 'linksyr'

LECTIONARY_DATA = 'data/lectionaries'
DATA_FILE = 'pericopes.csv'
DATA_PATH = f'{BASE}/{ORG}/{REPO}/{LECTIONARY_DATA}/{DATA_FILE}'

TEMP = '_temp'
TEMP_PATH = f'{BASE}/{ORG}/{REPO}/{TEMP}'
PERI_RAW_FILE = 'periraw.txt'
PERI_RAW_PATH = f'{TEMP_PATH}/{PERI_RAW_FILE}'
PERI_FILE = 'peri.txt'
PERI_PATH = f'{TEMP_PATH}/{PERI_FILE}'
ERROR_FILE = 'error.txt'
ERROR_PATH = f'{TEMP_PATH}/{ERROR_FILE}'

TF_BASE = f'{BASE}/{ORG}/{REPO}/data/tf/lectio'
VERSION = {
  P: '0.1',
  S: '0.1',
}

## TF Data sources

Load the TF data for both volumes.

In [5]:
for dataSource in A:
  A[dataSource] = use(dataSource)

TF app is up-to-date.
Using annotation/app-peshitta commit 1f3f47a5154f5be012f5c42d050baca70a6c7e48 (=latest)
  in /Users/dirk/text-fabric-data/__apps__/peshitta.
Using etcbc/peshitta/tf - 0.1 r0.4 in /Users/dirk/text-fabric-data


**Documentation:** <a target="_blank" href="https://github.com/etcbc/peshitta/blob/master/docs" title="provenance of Peshitta (Old Testament)">PESHITTA</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Writing/Syriac" title="('Syriac characters and transcriptions',)">Character table</a> <a target="_blank" href="https://github.com/etcbc/peshitta/blob/master/docs/transcription-0.1.md#transcription.md" title="PESHITTA feature documentation">Feature docs</a> <a target="_blank" href="https://github.com/annotation/app-peshitta" title="peshitta API documentation">peshitta API</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Api/Fabric/" title="text-fabric-api">Text-Fabric API 7.3.12</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Use/Search/" title="Search Templates Introduction and Reference">Search Reference</a>

TF app is up-to-date.
Using annotation/app-syrnt commit d8cce973438848a1bf7e4f4ab62b2d480206ca9b (=latest)
  in /Users/dirk/text-fabric-data/__apps__/syrnt.
Using etcbc/syrnt/tf - 0.1 r0.3 in /Users/dirk/text-fabric-data


**Documentation:** <a target="_blank" href="https://github.com/etcbc/syrnt/blob/master/docs" title="provenance of SyrNT">SYRNT</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Writing/Syriac" title="('Syriac characters and transcriptions',)">Character table</a> <a target="_blank" href="https://github.com/etcbc/syrnt/blob/master/docs/transcription-0.1.md#transcription.md" title="SYRNT feature documentation">Feature docs</a> <a target="_blank" href="https://github.com/annotation/app-syrnt" title="syrnt API documentation">syrnt API</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Api/Fabric/" title="text-fabric-api">Text-Fabric API 7.3.12</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Use/Search/" title="Search Templates Introduction and Reference">Search Reference</a>

Make the TF handles easily available for both volumes.

In [6]:
api = {}
F = {}
T = {}
L = {}
TF = {}
tfDir = {}
verseNodes = {}
sortNodes = {}
for volume in A:
  thisApi = A[volume].api
  api[volume] = thisApi
  F[volume] = thisApi.F
  T[volume] = thisApi.T
  L[volume] = thisApi.L
  tfDir[volume] = f'{TF_BASE}/{volume}/{VERSION[volume]}'
  TF[volume] = Fabric(locations=tfDir[volume])
  sortNodes[volume] = thisApi.sortNodes
  verseNodes[volume] = thisApi.F.otype.s('verse')

This is Text-Fabric 7.3.12
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

1 features found and 0 ignored


  0.00s Warp feature "otype" not found in
/Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1/
  0.00s Warp feature "oslots" not found in
/Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1/


  0.00s Warp feature "otext" not found. Working without Text-API

This is Text-Fabric 7.3.12
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

1 features found and 0 ignored


  0.00s Warp feature "otype" not found in
/Users/dirk/github/etcbc/linksyr/data/tf/lectio/syrnt/0.1/
  0.00s Warp feature "oslots" not found in
/Users/dirk/github/etcbc/linksyr/data/tf/lectio/syrnt/0.1/


  0.00s Warp feature "otext" not found. Working without Text-API



Make a mapping from each book acronym to

* the volume in which the book resides
* its node number in that volume,
* its English name

The book acronyms in the lectionary data is slightly different from those in the TF data.
Here is a mapping from lectionary book acronyms to TF book names.

In [7]:
bookMapping = {
  P: {
    '1Chr': 'Chr1',
    '2Chr': 'Chr2',
    '1Mc': 'Mc1_A',
    '1Rg': 'Rg1',
    '2Rg': 'Rg2',
    '1Sm': 'Sm1',
    '2Sm': 'Sm2',
    '4Ezra': 'Esr4',
    'Am': 'Am',
    'ApBar': 'ApBar',
    'Bar': 'Bar',
    'Bel_Dr': 'BelDr',
    'Ct': 'Ct',
    'Dn': 'Dn',
    'Dt': 'Dt',
    'Ec': 'Ec',
    'EpBar': 'EpBar_A',
    'Ex': 'Ex',
    'Ez': 'Ez',
    'Gn': 'Gn',
    'Hb': 'Hb',
    'Hg': 'Hg',
    'Hs': 'Hs',
    'Is': 'Is',
    'Jb': 'Jb',
    'Jd': 'Jd',
    'Jl': 'Jl',
    'Jon': 'Jon',
    'Jr': 'Jr',
    'Js': 'Jos',
    'Lm': 'Thr',
    'Lv': 'Lv',
    'Mi': 'Mi',
    'Ml': 'Ml',
    'Na': 'Na',
    'Nm': 'Nm',
    'Ob': 'Ob',
    'Pr': 'Pr',
    'Ru': 'Ru',
    'Sa': 'Sa',
    'Sap': 'Sap',
    'Sir': 'Sir',
    'Su': 'Sus',
    'Zf': 'Zf',
  },
  S: {
    '1Cor': '1Cor',
    '2Cor': '2Cor',
    '1Joh': '1John',
    '1Petr': '1Peter',
    '2Petr': '2Peter',
    '1Thess': '1Thess',
    '2Thess': '2Thess',
    '1Tim': '1Tim',
    '2Tim': '2Tim',
    'Tim': '1Tim',
    'Acts': 'Acts',
    'Col': 'Col',
    'Eph': 'Eph',
    'Gal': 'Gal',
    'Heb': 'Heb',
    'Jas': 'James',
    'Joh': 'John',
    'Jude': 'Jude',
    'Lk': 'Luke',
    'Mat': 'Matt',
    'Mk': 'Mark',
    'Phil': 'Phil',
    'Rom': 'Rom',
    'Tit': 'Titus',
  },
}

Here we construct a mapping from TF book names to TF book nodes.

In [8]:
books = {}
for dataSource in A:
  books[dataSource] = {F[dataSource].book.v(n): n for n in F[dataSource].otype.s('book')}

Here we flatten the `bookMapping` above into a plain mapping from lectionary books to TF books.

We retain the information of which book belongs to which volume (OT or NT) in `whatVolume`, keyed by
the TF book name.

In [9]:
bookFromPeri = {}
whatVolume = {}

for volume in bookMapping:
  for periAcro in bookMapping[volume]:
    bookAcro = bookMapping[volume][periAcro]
    bookFromPeri[periAcro] = bookAcro
    whatVolume[bookAcro] = volume

Finally, we have a function that delivers for a lectionary acronym the volume, the TF book node, and the English book name.

In [10]:
def bookInfo(bookAcro):
  volume = whatVolume[bookAcro]
  node = books[volume][bookAcro]
  name = T[volume].bookName(node)
  return (volume, node, name)

A little check:

In [11]:
bookInfo('Gn')

('peshitta', 427228, 'Genesis')

In [12]:
bookInfo('Matt')

('syrnt', 109641, 'Matthew')

## Read the lectionary file.

In [13]:
if not os.path.exists(TEMP_PATH):
  os.makedirs(TEMP_PATH, exist_ok=True)

In [14]:
def readData():
  with open(DATA_PATH) as fh:
    header = next(fh)
    lines = list(fh)
  header = {n: name for (n, name) in enumerate(header.rstrip('\n').split(';'))}
  lines = [line.rstrip('\n').split(';') for line in lines]
  return (header, lines)

In [15]:
(header, lines) = readData()

In [16]:
print('\n'.join(f'{n:>2} = {name}' for (n, name) in header.items()))

 0 = ﻿no. MS
 1 = No.
 2 = Pericope
 3 = Intro
 4 = Intro.Remarks
 5 = Intro.Transl
 6 = Thales
 7 = Thales ID
 8 = Intro.Fol.A
 9 = 
10 = Intro.Col.A
11 = Intro.Line.A
12 = Intro.Fol.Z
13 = Intro.Col.Z
14 = Intro.Line.Z
15 = Taksa
16 = Taksa.Trans
17 = Taksa.Remarks
18 = Transl.remarks
19 = Taksa.Fol.A
20 = Taksa.Col.A
21 = Taksa.Line.A
22 = Taksa.Fol.Z
23 = Taksa.Col.Z
24 = Taksa.Line.Z
25 = Taksa.Remarks
26 = Siglum
27 = Siglum
28 = Link
29 = Link
30 = MS-List.link
31 = Pericope.Fol.A
32 = Pericope.Col.A
33 = Pericope.Line.A
34 = Pericope.Fol.Z
35 = Pericope.Col.Z
36 = Pericope.Line.Z
37 = Cross reference (corpus) [to be filled in aut.]
38 = Remarks Cross reference (corpus)
39 = Abbreviation
40 = Version
41 = Version.Remarks
42 = Version.Syriac
43 = Version.Transl
44 = Denomination
45 = Literature
46 = Literature.link
47 = Codicology
48 = Pericope.remarks
49 = Ref.remarks.
50 = Cross.ref.Lect
51 = Transl.Cross.ref.
52 = Addition before
53 = Addition after
54 = Addition.Fol.A
55 = Ad

In [17]:
print('\n'.join(f'{header[n]} = {value}' for (n, value) in enumerate(lines[0])))

﻿no. MS = 73
No. = 073-0265
Pericope = Acts#08:05-13.
Intro = ‎‏ܦܪܟܣܝܣ‏‎
Intro.Remarks = 
Intro.Transl = Acts
Thales = 
Thales ID = 
Intro.Fol.A = 161b
 = 
Intro.Col.A = 2
Intro.Line.A = 19
Intro.Fol.Z = 161b
Intro.Col.Z = 2
Intro.Line.Z = 19
Taksa = ‎‏ܬܘܒ ܛܟܣܐ ܥܠ ܚܕ̈ܒܫܒܐ ܐܚܪ̈ܢܐ ܬܡ̈ܢܝܐ ܕܩܝܡܬܐ ܡܫܲܒܚܬܐ‏‎
Taksa.Trans = Furthermore, the order on Last First (Day) of the Week, the Eighth, of the praiseworthy Resurrection
Taksa.Remarks = subrubr ‎‏ܛܟܣܐ ܩܕܡܝܐ ܕܩܝܡܬܐ ܡܲܐܚܝܢܝܬܐ‏‎ (12-13 - after a blank line) (The First Order of the live-giving Resurrection)
Transl.remarks = The First Order of the live-giving Resurrection
Taksa.Fol.A = 159b
Taksa.Col.A = 1.
Taksa.Line.A = 1.
Taksa.Fol.Z = 159b
Taksa.Col.Z = 1.
Taksa.Line.Z = 11.
Taksa.Remarks = 
Siglum = 16l1
Siglum = 16l01
Link = https://archive.org/stream/SMC1.1/SMC%201.1#page/n165/mode/1up
Link = https://archive.org/stream/SMC1.1/SMC%201.1#page/n165/mode/1up
MS-List.link = 16l01
Pericope.Fol.A = 161b
Pericope.Col.A = 2
Pericope.Line.A = 19
Per

# Pericope analysis

In [54]:
PERICOPE_INDEX = 2
VERSION_INDEX = 40
P_VAL = 'P'
P_INDEX = 2

TAKSA = 15
TAKSA_TR = 16 

SIGLUM = 26
LINK = 28

In [55]:
raw = {}

with open(PERI_RAW_PATH, 'w') as fh:
  for (ln, line) in enumerate(lines):
    if line[VERSION_INDEX] != P_VAL:
      continue
    pericopeStr = line[P_INDEX]
    raw[ln] = pericopeStr
    fh.write(f'{ln:>5} {pericopeStr}\n')

In [56]:
exceptions = {
  855: ('01:03:', '01:03-'),
  1209: ('Tit#03:', '+03:'),
  1252: ('-10', ''),
  1253: (')', ''),
  1298: ('Is#01-01', 'Is#01:01'),
  1350: ('Pr#2109', 'Pr#21:09'),
  1425: ('-05-03', '-05:03'),
  1549: ('12:09-09b', '12:09b'),
  1697: ('12:09-09b', '12:09b'),
  2142: ('12:09-09b', '12:09b'),
  2353: ('12:09-09b', '12:09b'),
  2617: ('12:09-09b', '12:09b'),
  2858: ('12:09-09b', '12:09b'),
  4447: ('12:09-09b', '12:09b'),
  4681: ('12:09-09b', '12:09b'),
  4837: ('12:09-09b', '12:09b'),
  8692: ('12:09-09b', '12:09b'),
  1955: (' + small fragments', ''),
  2732: ('-10-37', '-37'),
  3025: ('1Joh#03:02:17', '1Joh#03:02-17'),
  3914: ('-17-21', '+17-21'),
  4244: ('1-17.', ''),
  4263: ('01-02', '01:01-02'),
  4545: ('#24:-', '#24:2-'),
  5196: ('Rom#09:10:17-18', 'Rom#10:17-18'),
  5212: ('Acts#12:25-13-03', 'Acts#12:25-13:03'),
  5261: ('Rom#07:07:16', 'Rom#07:07-16'),
  6168: ('Dn#23', 'Dn#6:23'),
  7964: ('{or Rom#:-13}', ''),
  8896: ('Sa#04', 'Sa#08:04'),
}
skips = {
  887,
  1612,
  1613,
  2864,
  4568,
  4569,
  6333,
  7284,
  7430,
  7519,
}

In [57]:
len(exceptions) + len(skips)

40

In [58]:
test = None
# test = {3534, 3584}

In [117]:
headingPat = r'\s*[()]([^()]*[^()A-Za-z0-9:.?-]+[^()]*)[()]\s*'
headingRe = re.compile(headingPat)

def headingRepl(match):
  head = match.group(1).replace('/', '|').replace('-', '\u2017')
  return f'{{{head}}}'

anglePat = r'\s*<[^>]+>\s*'
angleRe = re.compile(anglePat)
bracketPat = r'\s*\([^)]+\)\s*'
bracketRe = re.compile(bracketPat)

def simplify(x, ln):
  trim = (
    x
    .replace('\u200E', '')
    .replace('\u200F', '')
    .replace('\u2013', '-')
    .replace('[', '')
    .replace(']', '')
    .replace('.', '')
  )
  trim = angleRe.sub('', trim)
  trim = headingRe.sub(headingRepl, trim)
  trim = bracketRe.sub('', trim)
  trim = trim.replace('++', '')
  trim = trim.replace('-?', '')
  trim = trim.replace('?', '')
  if ln in exceptions:
    (offend, better) = exceptions[ln]
    trim = trim.replace(offend, better)
  return trim

def sanitize(x):
    return (
        x
        .strip()
        .replace('\u200E', '')
        .replace('\u200F', '')
        .replace('\u2013', '-')
    )

In [60]:
def parsePericopes(shape, prevData):
  parts = shape.strip().strip('+').split('+')
  result = []
  for part in parts:
    (good, data) = parsePericope(part, prevData)
    if not good:
      return (False, shape)
    result.append(data)
    prevData = data[-1]
  return (True, result)

In [61]:
referencePat = r'^([A-Za-z0-9_]+#)?[0-9]+'
referenceRe = re.compile(referencePat)

def parsePericope(shape, prevData):
  shape = shape.strip().strip('-')
  if not referenceRe.match(shape):
    shape = shape.replace('-', '\u2017')
  parts = shape.split('-')
  if len(parts) > 2:
    return (False, shape)
  if len(parts) == 1:
    (good, data) = parseVerse(shape, prevData)
    if good:
      return (good, [data])
    else:
      return (good, shape)
  result = []
  for part in parts:
    (good, data) = parseVerse(part, prevData)
    if not good:
      return (good, shape)
    else:
      result.append(data)
      prevData = data
  return (True, result)

In [62]:
headPat = r'\{([^\)]+)\}'
headRe = re.compile(headPat)

def stripHeading(shape):
  takeWhole = False
  match = headRe.search(shape)
  if not match:
    if not referenceRe.match(shape):
      takeWhole = True
    else:
      return (shape, None, None)
  head = shape if takeWhole else match.group(1)
  head = head.replace('\u2017', '-')
  headParts = head.split('/' if takeWhole else '|', 1)
  if len(headParts) == 1:
    headParts = (head, None)
  shape = '' if takeWhole else headRe.sub('', shape)
  return (shape, *headParts)

def parseVerse(shape, prevData):
  book = None
  chapter = None
  verse = None
  fullShape = shape.strip()
  (shape, head1, head2) = stripHeading(fullShape)
  if shape == '':
    if prevData is None or len(prevData) < 3:
      return (False, fullShape)
    (book, chapter, verse) = prevData[0:3]
    return (True, (book, chapter, verse, head1, head2))
    
  parts = [p.strip() for p in shape.split('#')]
  chvh = None
  if len(parts) > 2:
    return (False, fullShape)
  
  chvh = shape if len(parts) == 1 else parts[1]
  book = None if len(parts) == 1 else bookFromPeri[parts[0]]
  
  subparts = [sp.strip() for sp in chvh.split(':')]
  if len(subparts) > 2:
    return (False, fullShape)
  
  if len(subparts) == 1:
    chapter = None
    verse = chvh
  else:
    (chapter, verse) = subparts
    
  if book is None:
    if prevData is None:
      return (False, fullShape)
    book = prevData[0]
  if chapter is None:
    if prevData is None or len(prevData) < 2:
      return (False, fullShape)
    chapter = prevData[1]
  else:
    chapter = chapter.lstrip('0')
    if chapter == '':
      chapter = 0
    elif not chapter.isdigit():
      return (False, fullShape)
    else:
      chapter = int(chapter)
    if chapter == 0:
      chapter = 1
    
  if verse is not None:
    verse = verse.replace('a', '').replace('b', '').replace('f', '').lstrip('0')
    if '/' in verse:
      verse = verse.split('/', 1)[0]
    if verse == 'end':
      verse = None
    else:
      verse = verse.lstrip('0')
      if verse == '':
        verse = 0
      elif not verse.isdigit():
        return (False, fullShape)
      else:
        verse = int(verse)
    
  return (True, (book, chapter, verse, head1, head2))

In [118]:
pericopes = {}
errors = {}

sourceColumns = (
  ('taksa', TAKSA, {}),
  ('taksaTr', TAKSA_TR, {}),
  ('siglum', SIGLUM, {}),
  ('link', LINK, {}),
)

for (ln, line) in enumerate(lines):
  if ln in skips:
    continue
  if test is not None and ln not in test:
    continue
  if line[VERSION_INDEX] != P_VAL:
    continue
    
  for (name, index, dest) in sourceColumns:
    dest[ln] = sanitize(line[index])
  
  pericopeStr = (simplify(line[P_INDEX], ln))
  (good, data) = parsePericopes(pericopeStr, None)
  if good:
    pericopes[ln] = data
  else:
    errors[ln] = data
  prevData = data[-1][-1]

In [119]:
if errors:
  for (i, peri) in sorted(errors.items())[0:20]:
    print(f'{i:>5} {raw[i]} => {peri}\n')
print(f'{len(errors):>5} errors')
print(f'{len(pericopes):>5} pericopes')

with open(ERROR_PATH, 'w') as f:
  for (i, peri) in sorted(errors.items()):
    f.write(f'{i:>5} {peri}\n')
    
with open(PERI_PATH, 'w') as f:
  for (i, peri) in sorted(pericopes.items()):
    f.write(f'{i:>5} {peri}\n')
if pericopes:
  for (i, peri) in sorted(pericopes.items())[0:20]:
    print(f'{i:>5} {raw[i]} => {peri}\n')

  705 ?#?:?-? => #:

    1 errors
 8266 pericopes
    0 Acts#08:05-13. => [[('Acts', 8, 5, None, None), ('Acts', 8, 13, None, None)]]

    1 1Sm#16:01-13a. => [[('Sm1', 16, 1, None, None), ('Sm1', 16, 13, None, None)]]

    2 Ct#01:02-14. => [[('Ct', 1, 2, None, None), ('Ct', 1, 14, None, None)]]

    3 Jd#05:01-11. => [[('Jd', 5, 1, None, None), ('Jd', 5, 11, None, None)]]

    4 Ex#03:01-10. => [[('Ex', 3, 1, None, None), ('Ex', 3, 10, None, None)]]

    5 1Sm#01:09-19a.(‎‏ܠܪܡܬܐ‏‎) => [[('Sm1', 1, 9, None, None), ('Sm1', 1, 19, 'ܠܪܡܬܐ', None)]]

    6 Nm#17:16-26. => [[('Nm', 17, 16, None, None), ('Nm', 17, 26, None, None)]]

    7 Jd#13:02-14. => [[('Jd', 13, 2, None, None), ('Jd', 13, 14, None, None)]]

    8 ‎2Cor#03:12-04:04. => [[('2Cor', 3, 12, None, None), ('2Cor', 4, 4, None, None)]]

    9 Gn#12:07-08.+18:01-15. => [[('Gn', 12, 7, None, None), ('Gn', 12, 8, None, None)], [('Gn', 18, 1, None, None), ('Gn', 18, 15, None, None)]]

   10 Gn#03:21-04:07. => [[('Gn', 3, 21, None, 

## Linking

### Resolving verse specifications and ranges

First we define some functions that retrieve verse nodes for a range in a pericope.

In [120]:
def betweenVerses(volume, vStart, vEnd):
  skipping = True
  results = []
  for vn in verseNodes[volume]:
    if vn == vEnd:
      break
    if not skipping:
      results.append(vn)
    else:
      if vn == vStart:
        skipping = False
  return tuple(results)
  
def versesFromSpec(spec):
  (b, c, v) = spec[0:3]
  (volume, bNode, bName) = bookInfo(b)
  if bNode is None or volume is None:
    return f'unknown book {b}'
  if c is None:
    return (volume, L[volume].d(bNode, otype='verse'))
  elif v is None:
    cNode = T[volume].nodeFromSection((bName, c))
    if cNode is None:
      return f'book {bName} has no chapter {c}'
    return (volume, L[volume].d(cNode, otype='verse'))
  else:
    vNode = T[volume].nodeFromSection((bName, c, v))
    if vNode is None:
      return f'book {bName} has no verse {c}:{v}'
    return (volume, (vNode,))

def versesFromRange(rang):
  if len(rang) == 1:
    return versesFromSpec(rang[0])
  (start, end) = rang
  startResult = versesFromSpec(start)
  if type(startResult) is str:
    return f'start: {startResult}'
  endResult = versesFromSpec(end)
  if type(endResult) is str:
    return f'end {endResult}'
  (volume, startVerses) = startResult
  (endVolume, endVerses) = endResult
  if volume != endVolume:
    return f'start in {volume} but end in {endVolume}: not supported'
  lastStart = sortNodes[volume](startVerses)[-1]
  firstEnd = sortNodes[volume](endVerses)[0]
  sortedLF = sortNodes[volume]((lastStart, firstEnd))
  if sortedLF != [lastStart, firstEnd]:
    return (volume, [])
  between = betweenVerses(volume, lastStart, firstEnd)
  return (volume, startVerses + between + endVerses)

### Testing

We want to test these functions.

So we define a function to show a list of verse nodes, and then we call it on the results of a few test cases.

In [121]:
def showVerse(volume, verseNode):
  if verseNode is None:
    print('non existent verse')
  else:
    print('{} {}:{}'.format(*T[volume].sectionFromNode(verseNode)))
  
def showVerses(msg, spec, volume, verseNodes):
  lV = len(verseNodes)
  pl = '' if lV == 1 else 's'
  print(f'{msg} {spec} => {volume}: {lV} verse{pl}')
  if lV <= 3:
    for vn in verseNodes:
      showVerse(volume, vn)
  else:
    showVerse(volume, verseNodes[0])
    print(' ... ')
    showVerse(volume, verseNodes[-1])
  print('')

In [122]:
for spec in (
  ('Gn', 1, 10),
  ('Gn', 1, None),
  ('Gn', None, None),
  ('Gn', 3, 25),
  ('Matt', 1, 10),
  ('Matt', 1, None),
  ('Matt', None, None),
  ('Acts', 8, 5, None, None),
):
  result = versesFromSpec(spec)
  if type(result) is str:
    print(f'SPEC {spec} => ERROR {result}')
  else:
    showVerses('SPEC', spec, *result)

SPEC ('Gn', 1, 10) => peshitta: 1 verse
Genesis 1:10

SPEC ('Gn', 1, None) => peshitta: 31 verses
Genesis 1:1
 ... 
Genesis 1:31

SPEC ('Gn', None, None) => peshitta: 1533 verses
Genesis 1:1
 ... 
Genesis 50:26

SPEC ('Gn', 3, 25) => ERROR book Genesis has no verse 3:25
SPEC ('Matt', 1, 10) => syrnt: 1 verse
Matthew 1:10

SPEC ('Matt', 1, None) => syrnt: 25 verses
Matthew 1:1
 ... 
Matthew 1:25

SPEC ('Matt', None, None) => syrnt: 1071 verses
Matthew 1:1
 ... 
Matthew 28:20

SPEC ('Acts', 8, 5, None, None) => syrnt: 1 verse
Acts 8:5



In [123]:
for rang in (
  (('Gn', 1, 1), ('Matt', 1, 1)),
  (('Gn', 1, 1),),
  (('Gn', 1, 1), ('Gn', 1, 1)),
  (('Gn', 1, 2), ('Gn', 1, 1)),
  (('Gn', 1, 3), ('Gn', 1, 1)),
  (('Gn', 1, 1), ('Gn', 1, 2)),
  (('Gn', 1, 1), ('Gn', 1, 3)),
  (('Gn', 1, 1), ('Gn', 1, 10)),
  (('Gn', 1, None), ('Gn', 1, 10)),
  (('Gn', 1, None), ('Gn', 2, 10)),
  (('Gn', 1, None), ('Gn', 3, None)),
  (('Gn', 40, 13), ('Ex', 3, 5)),
  (('Gn', 3, 13), ('Gn', 3, 25)),
  (('Acts', 8, 5, None, None), ('Acts', 8, 13, None, None)),
):
  result = versesFromRange(rang)
  if type(result) is str:
    print(f'RANGE {rang} => ERROR {result}')
  else:
    showVerses('RANGE', rang, *result)

RANGE (('Gn', 1, 1), ('Matt', 1, 1)) => ERROR start in peshitta but end in syrnt: not supported
RANGE (('Gn', 1, 1),) => peshitta: 1 verse
Genesis 1:1

RANGE (('Gn', 1, 1), ('Gn', 1, 1)) => peshitta: 2 verses
Genesis 1:1
Genesis 1:1

RANGE (('Gn', 1, 2), ('Gn', 1, 1)) => peshitta: 0 verses

RANGE (('Gn', 1, 3), ('Gn', 1, 1)) => peshitta: 0 verses

RANGE (('Gn', 1, 1), ('Gn', 1, 2)) => peshitta: 2 verses
Genesis 1:1
Genesis 1:2

RANGE (('Gn', 1, 1), ('Gn', 1, 3)) => peshitta: 3 verses
Genesis 1:1
Genesis 1:2
Genesis 1:3

RANGE (('Gn', 1, 1), ('Gn', 1, 10)) => peshitta: 10 verses
Genesis 1:1
 ... 
Genesis 1:10

RANGE (('Gn', 1, None), ('Gn', 1, 10)) => peshitta: 0 verses

RANGE (('Gn', 1, None), ('Gn', 2, 10)) => peshitta: 41 verses
Genesis 1:1
 ... 
Genesis 2:10

RANGE (('Gn', 1, None), ('Gn', 3, None)) => peshitta: 80 verses
Genesis 1:1
 ... 
Genesis 3:24

RANGE (('Gn', 40, 13), ('Ex', 3, 5)) => peshitta: 400 verses
Genesis 40:13
 ... 
Exodus 3:5

RANGE (('Gn', 3, 13), ('Gn', 3, 25)) =

### Feature construction

Make a feature `lectio`, for verse nodes, which is 1 for verses mentioned in a pericope.

In [124]:
rangeErrors = []
lectio = {}
mark1 = {}
mark2 = {}
extra = {}
for (name, index, source) in sourceColumns:
  extra[name] = {}

for volume in A:
  lectio[volume] = {}
  mark1[volume] = {}
  mark2[volume] = {}
  for name in extra:
    extra[name][volume] = collections.defaultdict(set)

for (ln, ranges) in pericopes.items():
  for rang in ranges:
    result = versesFromRange(rang)
    if type(result) is str:
      rangeErrors.append((ln, result))
    else:
      (volume, vns) = result
      for vn in vns:
        old = lectio[volume].setdefault(vn, '')
        sep = ',' if old else ''
        lectio[volume][vn] = f'{old}{sep}{str(ln)}'
        for (name, index, source) in sourceColumns:
          val = source[ln]
          if val:
            extra[name][volume][vn].add(source[ln])
        
    for spec in rang:
      result = versesFromSpec(spec)
      if type(result) is str:
        continue
      else:
        (volume, vns) = result
        for (i, m) in enumerate(spec[3:5]):
          if m is not None:
            for vn in vns:
              dest = mark1 if i == 0 else mark2
              old = dest[volume].setdefault(vn, '')
              sep = '|' if old else ''
              dest[volume][vn] = f'{old}{sep}{str(ln)}:{m}'
print('Done')

Done


The values of the extra features are sets.
We convert them to strings, by joining the sorted elements by `|`. 

In [125]:
for (name, nameData) in extra.items():
  for (volume, volumeData) in nameData.items():
    for (vn, valSet) in volumeData.items():
      volumeData[vn] = '|'.join(sorted(valSet))

Checks:

* range errors

In [126]:
if rangeErrors:
  print(f'{len(rangeErrors)} errors')
  for (ln, e) in rangeErrors[0:20]:
    print(f'{ln:>5} {e}')
else:
  print('No range errors')

print('')

print('Annotated verses:')
for volume in lectio:
  print(f'{volume}: {len(lectio[volume])} verses')

20 errors
  492 book Numbers has no verse 9:26
  619 start: book Kings_1 has no verse 17:30
  734 end book Exodus has no verse 3:28
  768 end book Numbers has no verse 12:28
  861 book 2_Thessalonians has no verse 4:13
 1213 start: book Judges has no verse 30:34
 1263 book 1_Thessalonians has no verse 1:25
 3914 end book 1_Corinthians has no verse 5:16
 3914 start: book 1_Corinthians has no verse 5:17
 4042 end book Proverbs has no verse 13:23
 4243 start: book Isaiah has no verse 54:41
 4244 start: book Isaiah has no verse 54:171
 5178 end book 2_Corinthians has no verse 4:21
 5214 start: book Acts has no verse 1:33
 5217 end book Ezekiel has no verse 9:15
 5563 start: book Exodus has no verse 15:1311
 5984 book Exodus has no verse 43:51
 6314 end book Ezekiel has no verse 9:15
 6420 end book Acts has no verse 31:7
 6543 end book Romans has no verse 16:28

Annotated verses:
peshitta: 11370 verses
syrnt: 4696 verses


## Save lectio as TF

First we specify some metadata.

In [127]:
dataName = {
  P: 'lectionary data for the Peshitta (OT)',
  S: 'lectionary data for the Peshitta (NT)',
}
metaDataBase = {
  '' : {
    'editor': 'Geert Jan Veldman',
    'converter': 'Dirk Roorda',
  },
  'lectio': {
    'valueType': 'str',
    'description': 'numbers of lectionaries associated to verses (comma separated)'
  },
  'mark1': {
    'valueType': 'str',
    'description': 'lectionary start mark associated to verses (| separated ln:mark values)'
  },
  'mark2': {
    'valueType': 'str',
    'description': 'lectionary start mark (alternative language) associated to verses (| separated ln:mark values)'
  },
  'taksa': {
    'valueType': 'str',
    'description': 'taksa of lectionaries associated to verses (comma separated)'
  },
  'taksaTr': {
    'valueType': 'str',
    'description': 'taksa (translation) of lectionaries associated to verses (comma separated)'
  },
  'siglum': {
    'valueType': 'str',
    'description': 'siglum of lectionaries associated to verses (comma separated)'
  },
  'link': {
    'valueType': 'str',
    'description': 'archive link to manuscript of lectionaries associated to verses (comma separated)'
  },
}

metaData = {}

for volume in dataName:
  meta = {}
  meta.update(metaDataBase)
  meta['']['name'] = dataName[volume]
  metaData[volume] = meta

Then we make the features ready:

In [128]:
nodeFeatures = {}

collectedData = [
  ('lectio', lectio),
  ('mark1', mark1),
  ('mark2', mark2),
]
for (name, data) in extra.items():
  collectedData.append((name, data))
  
for (name, data) in collectedData:
  for volume in data:
    nodeFeatures.setdefault(volume, {})[name] = data[volume]
  
edgeFeatures = {
  P: {},
  S: {},
}

Now we save the features:

In [129]:
for volume in lectio:
  if os.path.exists(tfDir[volume]):
    rmtree(tfDir[volume])
  os.makedirs(tfDir[volume], exist_ok=True)
  TF[volume].save(nodeFeatures=nodeFeatures[volume], edgeFeatures=edgeFeatures[volume], metaData=metaData[volume])

  0.00s Exporting 7 node and 0 edge and 0 config features to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1:
   |     0.02s T lectio               to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1
   |     0.02s T link                 to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1
   |     0.00s T mark1                to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1
   |     0.00s T mark2                to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1
   |     0.02s T siglum               to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1
   |     0.04s T taksa                to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1
   |     0.04s T taksaTr              to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1
  0.15s Exported 7 node features and 0 edge features and 0 config features to /Users/dirk/github/etcbc/linksyr/data/tf/lectio/peshitta/0.1
  0.00s Exporting 7 node