In [2]:
import os
import collections

# Local topography

**Note that the data source files are not on GitHub!**

They are in my `~/local` directory and also in the Dropbox folder with the Abegg files.

In [3]:
# Name parts that are combined into the paths below.
ORG = 'etcbc'
REPO = 'dss'
VERSION = '0.1'

# The two roots under the user's home directory.
BASE = os.path.expanduser('~/github')
LOCAL = os.path.expanduser('~/local')

# Inside the repository: the directory with the two mapping tables.
REPO_DIR = f'{BASE}/{ORG}/{REPO}'
META_DIR = f'{REPO_DIR}/sources/meta'
CHAR_TABLE = f'{META_DIR}/chars.txt'
MAN_TABLE = f'{META_DIR}/mans.txt'

# Under the local root, not under the repository: the directory with the source data.
DATA_DIR = f'{LOCAL}/{REPO}/sanitized'

# Sources

We have two source files:

* `dss_bib.txt` with biblical material
* `dss_nonbib.txt` with non-biblical material

Throughout this conversion program
we use `True` for the biblical material and `False` for the other material.

In [4]:
# Source name -> is it the biblical source?
SOURCES = {
  'bib': True,
  'nonbib': False,
}

## Tables

We have two tables for mapping certain values in the source data to other values
in a systematic way:

* a manuscript table in order to represent manuscript codes by their names.
  In most cases the name *is* the code.
* a character table relating unicode characters to their transliterations.

In [5]:
# transliteration -> original character, filled by readChars()
origFromTrans = {}
# manuscript code -> name, filled by readBooks()
bookFromCode = {}


def _readPairs(path):
  """Yield the two fields of every non-blank line of a tab-separated file.

  Trailing whitespace is stripped from each line before it is split.
  Lines that are empty after stripping are skipped.

  Raises ValueError, mentioning file and line number, for a line
  that does not consist of exactly two tab-separated fields.
  """
  # The tables contain non-ASCII characters, so do not depend on the
  # default encoding of the locale.
  # NOTE(review): assumes the table files are UTF-8 encoded - confirm.
  with open(path, encoding='utf8') as fh:
    for (i, line) in enumerate(fh):
      line = line.rstrip()
      if not line:
        continue
      fields = line.split('\t')
      if len(fields) != 2:
        raise ValueError(
          f'{path}:{i + 1}: expected 2 tab-separated fields, found {len(fields)}'
        )
      yield fields


def readChars(path=None):
  """Fill `origFromTrans` from the character table and print its size.

  Each line of the table holds an original character and its transliteration,
  in that order; the mapping is stored keyed by the transliteration.

  path: file to read; when None, `CHAR_TABLE` is used.
  """
  for (orig, trans) in _readPairs(CHAR_TABLE if path is None else path):
    origFromTrans[trans] = orig
  print(f'{len(origFromTrans):>4} characters mapped')


def readBooks(path=None):
  """Fill `bookFromCode` from the manuscript table and print its size.

  Each line of the table holds a code and the corresponding name,
  in that order; the mapping is stored keyed by the code.

  path: file to read; when None, `MAN_TABLE` is used.
  """
  for (code, book) in _readPairs(MAN_TABLE if path is None else path):
    bookFromCode[code] = book
  print(f'{len(bookFromCode):>4} mans mapped')
  
# Fill both lookup tables; each call prints how many entries its table holds.
readChars()
readBooks()

  38 characters mapped
 265 mans mapped


# Slurp the data

We take in the data and store the lines in a dictionary of lists of lines,
keyed by the boolean "is biblical?"

In [6]:
# isBib (True/False) -> list of stripped lines of that source
lines = collections.defaultdict(list)

def getSourceLines():
  """Read all source files into `lines` and print a line count per source.

  For every `(src, isBib)` in `SOURCES` the file `{DATA_DIR}/{REPO}_{src}.txt`
  is read. Its lines, stripped of leading and trailing whitespace,
  are appended to `lines[isBib]`.
  """
  for (src, isBib) in SOURCES.items():
    # The data contains non-ASCII characters, so do not depend on the
    # default encoding of the locale.
    # NOTE(review): assumes the source files are UTF-8 encoded - confirm.
    with open(f'{DATA_DIR}/{REPO}_{src}.txt', encoding='utf8') as fh:
      # NOTE(review): strip() also removes trailing tabs, so empty trailing
      # fields of a tab-separated line are gone before the line is split
      # later on - confirm that this is intended.
      lines[isBib].extend(line.strip() for line in fh)
    print(f'{src:<15}: {len(lines[isBib]):>5} lines')
    
# Read the source files now; one line count is printed per source.
getSourceLines()

bib            : 243157 lines
nonbib         : 378023 lines


# Parse lines

We read the stored lines and parse them.

If there are errors, we store all their occurrences in a nested dict, keyed successively by:

* kind of error
* the sore point
* the source (`True` or `False` - biblical or not)

The innermost value is a list of indices into the list of stored lines.

Later when we display the errors

* we replace `True` by `bib` and `False` by `nonbib`
* we display line numbers (one higher than list indices)
* we retrieve the corresponding source line

We show at most `batch` lines per kind of error, per sore point, and per source.

## Line splitting

The lines of the two sources have to be split in different ways.

source | separator | # fields
--- | --- | ---
bib | tab | 5
nonbib | space | 4

Several fields are made up of parts themselves.

**bib**

```Gen 1:19	1Q1 f1:1	w	w◊@Pc	32.5```

* `Gen` is book acronym
* `1` is the chapter
* `19` is the verse
* `1Q1` is the manuscript name
* `f1` *what is this called? folio, part, page?*
* `1` *is the line?*
* `w` is the transcription
* `w◊` is the lexeme
* `Pc` is the morphology code
* `32` is the word number
* `5` *what is this? Sub-number, or part of the wordnumber?*

**nonbib**

```CD 1:2,3.1 ky k;Iy_2@Pc```

* `CD` is manuscript name
* `1` is column
* `2` is line
* `3` is line (again! are lines nested, is it a sub-line, *how shall I call it?*)
* `ky` is transcription (this goes into the plain text)
* `k;Iy_2` is lexeme (*the `_2` is disambiguation?*)
* `Pc` is morphology code

In [7]:
def splitLine(isBib, ln, line):
  """Split a source line into its fields and check the number of fields.

  isBib: whether the line comes from the biblical source;
    biblical lines are split on tabs into 5 fields,
    other lines on single spaces into 4 fields.
  ln: index of the line, recorded when the number of fields is wrong.
  line: the text to split.

  Returns the list of fields, or False when the number of fields is wrong.
  In that case `ln` is appended to
  `errors['wrong number of fields'][<number found>][isBib]`.
  """
  (separator, expFields) = ('\t', 5) if isBib else (' ', 4)
  fields = line.split(separator)
  found = len(fields)
  if found == expFields:
    return fields
  errors['wrong number of fields'][found][isBib].append(ln)
  return False

In [8]:
def _occsBySource():
  """Make the innermost level of `errors`: source (True/False) -> list."""
  return collections.defaultdict(list)


def _sourcesBySore():
  """Make the middle level of `errors`: sore point -> per-source lists."""
  return collections.defaultdict(_occsBySource)


# errors[kind][sore][isBib] is a list; every level is created on first access.
errors = collections.defaultdict(_sourcesBySore)

# Maximum number of occurrences that showErrors prints per list.
batch = 10

def showErrors():
  """Print a report of everything that is stored in `errors`.

  The report is nested: kind of error, then sore point, then source,
  each level in sorted order. Per source the number of occurrences is
  printed, followed by at most `batch` of them, each with its line number
  and the text of the line as found in `lines`. An ellipsis line
  follows when there are more than `batch` occurrences.
  """
  for (kind, bySore) in sorted(errors.items()):
    print(f'ERROR {kind}:')
    for (sore, bySource) in sorted(bySore.items()):
      print(f'\t{sore}:')
      for (isBib, occs) in sorted(bySource.items()):
        srcStr = 'nonbib' if not isBib else 'bib'
        nOccs = len(occs)
        print(f'\t\t{srcStr:<6}: {nOccs:>6}x:')
        shown = occs[:batch]
        for occ in shown:
          # occ is an index into lines[isBib]; it is shown as a 1-based line number
          print(f'\t\t\t{occ + 1:>6} "{lines[isBib][occ]}"')
        if batch < nOccs:
          print('\t\t\t...')

def readData():
  """Run `splitLine` over every stored line that does not start with `>`.

  The index of the line within its source is passed as the line number.
  The fields that `splitLine` returns are not used any further here.
  """
  for (isBib, srcLines) in lines.items():
    for (i, line) in enumerate(srcLines):
      if not line.startswith('>'):
        splitLine(isBib, i, line)
    
# Split all stored lines, then print the report of the errors that were recorded.
readData()
showErrors()

ERROR wrong number of fields:
	1:
		nonbib:   1144x:
			 15051 "(fl)"
			 15053 "(fy)"
			 15799 "(fl)"
			 15801 "(fy)"
			 15807 "(fl)"
			 15809 "(fy)"
			 16153 "(fl)"
			 16155 "(fy)"
			 16843 "(fl)"
			 16845 "(fy)"
			...
	2:
		nonbib:     85x:
			 13082 "1QSa 1:8,6.1"
			 24811 "1QHa 4:14,8.1"
			 24870 "1QHa 4:19,7.1"
			 39282 "e(s9).(xb5). (a)"
			 51846 "e(s9).(xb5). (a)"
			 60231 "e(s9).(xb5). (a)"
			 64990 "e(s9).(xb5). (a)"
			 66125 "e(s9).(xb5). (a)"
			 67575 "e(s9).(xb5). (a)"
			 69024 "e(s9).(xb5). (a)"
			...
	3:
		nonbib:  74516x:
			     2 "CD 1:1,1.1 ≥"
			     3 "CD 1:1,2.1 ≤"
			    16 "CD 1:2,2.1 ."
			    29 "CD 1:2,13.1 ."
			    48 "CD 1:4,3.1 ."
			    64 "CD 1:5,5.1 ."
			    85 "CD 1:7,2.1 ."
			   106 "CD 1:8,6.1 ."
			   118 "CD 1:9,4.1 ."
			   130 "CD 1:10,3.1 ."
			...
		bib   :    496x:
			 42273 "Ps 127:3	1Q11 f2_5:3	(fl)"
			 42275 "Ps 127:3	1Q11 f2_5:3	(fy)"
			 45253 "Ex 4:31	2Q3 f1:1	(fl)"
			 45255 "Ex 4

In [9]:
def convert():
  """Stub: does nothing yet and returns None."""
  return None

convert()