In [1]:
import pandas as pd
import re
import collections

In [4]:
# Open the input text file for reading, decoded as UTF-8.
source_path = 'running_text_genesis_schorch_corrected_CCH.txt'
f = open(source_path, mode='r', encoding='utf8')

Remove page numbers and non-ASCII characters

In [4]:
# Read the whole text and strip the 'PAGE <number>' markers.
# `f` still refers to the open file object at this point; reading inside a
# `with` block closes the handle (even if reading fails) before `f` is
# rebound to the cleaned string, so no file descriptor is leaked.
with f as source_file:
    f = re.sub('PAGE [0-9]*', '', source_file.read())

In [33]:
# Drop every character that cannot be represented in ASCII by round-tripping
# the text through an ASCII byte string.
ascii_bytes = f.encode('ascii', errors='ignore')
f = ascii_bytes.decode()

Convert the transliteration to the ETCBC standard (`A` becomes `>`, `e` becomes `<`)

In [34]:
# Replace the two source symbols: every 'A' becomes '>' and every 'e' becomes '<'.
f = f.replace('A', '>').replace('e', '<')

In [35]:
# Preview the first 1000 characters only, instead of dumping the whole text
# into the notebook output.
f[:1000]

'\n1 BR>CJT BR> >LHJM >T HCMJM W>T H>RY \n2 WH>RY HJTH THW WBHW  WXCK <L PNJ THWM \nWRWX >LHJM MRXPT <L PNJ HMJM  3 WJ>MR\n>LHJM  JHJ >WR  WJHJ >WR  4 WJR> >LHJM >T\nH>WR KJ VWB  WJBDL >LHJM BJN H>WR WBJN\nHXCK  5 WJQR> >LHJM L>WR JWM  WLXCK QR>\nLJLH  WJHJ <RB WJHJ BQR JWM >XD \n6 WJ>MR >LHJM  JHJ RQJ< BTWK HMJM  WJHJ\nMBDJL BJN MJM LMJM  7 WJ<C >LHJM >T\nHRQJ<  WJBDL BJN HMJM >CR MTXT LRQJ< \n\nWBJN HMJM >CR M<L LRQJ< WJHJ KN  8 WJQR>\n>LHJM LRQJ< CMJM  WJHJ <RB WJHJ BQR JWM\nCNJ \n9 WJ>MR >LHJM  JQWW HMJM MTXT HCMJM >L\nMQWM >XD  WTR>H HJBCH WJHJ KN  10 WJQR>\n>LHJM LJBCH >RY WLMQWH HMJM QR> JMJM \nWJR> >LHJM KJ VWB  11 WJ>MR >LHJM  TDC>\nH>RY DC> <CB MZRJ< ZR<  W<Y PRJ <CH\nPRJ LMJNW  >CR ZR<W BW <L H>RY  WJHJ KN \n\n12 WTWY> H>RY DC> <CB MZRJ< ZR< LMJNHW \nW<Y <CH PRJ >CR ZR<W BW LMJNHW  WJR>\n>LHJM KJ VWB  13 WJHJ <RB WJHJ BQR JWM\nCLJCJ \n14 WJ>MR >LHJM  JHJ M>WRWT BRQJ< HCMJM \nLH>JR <L H>RY  WLHBDJL BJN HJWM WBJN\nHLJLH  WHJW L>TWT WLMW<DJM WLJMJM WCNJM \n15 WHJW LM>WRWT BRQJ<

Reading and structuring the input

In [36]:
def parse_running_text(text):
    """Group the words of a running text by chapter and verse.

    The text is split on whitespace. Any token containing a digit is treated
    as a verse number and opens a new verse; every other token is appended to
    the verse that is currently open. Chapters are not marked in the text:
    a new chapter is assumed whenever a verse number is seen that already
    holds words in the current chapter (i.e. the numbering has restarted).

    Parameters
    ----------
    text : str
        The running text, with verse numbers interleaved between the words.

    Returns
    -------
    collections.defaultdict
        ``{chapter (int, starting at 1): {verse number (str): [words]}}``.

    Raises
    ------
    ValueError
        If a word occurs before the first verse number, because there is no
        verse it could belong to.
    """
    verses = collections.defaultdict(lambda: collections.defaultdict(list))
    chapter = 1
    # Start without an open verse. Initialising this explicitly prevents a
    # value left over from an earlier run of the cell from being reused.
    verse_number = None

    for w in text.split():
        if re.search(r'\d', w):  # A digit (verse number) opens a new entry
            verse_number = w
            # If the verse already exists in this chapter, a new chapter starts
            if verse_number in verses[chapter]:
                chapter += 1
        elif verse_number is None:
            raise ValueError(f'word {w!r} occurs before the first verse number')
        else:
            verses[chapter][verse_number].append(w)  # Regular words are added

    return verses


dic = parse_running_text(f)

In [37]:
# Gen 5:1 (chapter keys are integers, verse keys are strings)
dic[5]['1']

['ZH',
 'SPR',
 'TWLDT',
 '>DM',
 'BJWM',
 'BR>',
 '>LHJM',
 '>DM',
 'BDMWT',
 '>LHJM',
 '<CH',
 '>TW']

Writing tab-separated file

In [38]:
# One output line per verse: book name, chapter, verse and the verse's words
# joined by single spaces, separated by tabs.
row_template = '''Genesis\t{}\t{}\t{}\n'''
with open('SP_input', 'w', encoding='utf8') as outf:
    for chapter_no, chapter_verses in dic.items():
        for verse_no, words in chapter_verses.items():
            verse_text = ' '.join(words)
            outf.write(row_template.format(chapter_no, verse_no, verse_text))

#### Comments

1. Shin and Sin are not distinguished

## Correct input

#### 1. Check for errors in chapter and verse

In [35]:
# Load the tab-separated verse table (the file has no header row) and label
# its columns, then show the first rows.
column_names = ['book', 'chapter', 'verse', 'text']
data = pd.read_csv('SP_input_22.09.29.bin', header=None, sep='\t')
data.columns = column_names
data.head()

Unnamed: 0,book,chapter,verse,text
0,Genesis,1,1,BR>CJT BR> >LHJM >T HCMJM W>T H>RY
1,Genesis,1,2,WH>RY HJTH THW WBHW WXCK <L PNJ THWM WRWX >LHJ...
2,Genesis,1,3,WJ>MR >LHJM JHJ >WR WJHJ >WR
3,Genesis,1,4,WJR> >LHJM >T H>WR KJ VWB WJBDL >LHJM BJN H>WR...
4,Genesis,1,5,WJQR> >LHJM L>WR JWM WLXCK QR> LJLH WJHJ <RB W...


In [3]:
from tf.app import use
# Load the 'etcbc/bhsa' dataset through Text-Fabric.
# NOTE(review): hoist=globals() presumably injects the Text-Fabric API names
# (such as the `T` used by the section check further down) into the notebook
# namespace — confirm against the Text-Fabric documentation.
A = use('etcbc/bhsa', hoist=globals())

In [36]:
# Print every row whose book/chapter/verse reference cannot be found via
# T.nodeFromSection, i.e. references that are probably wrong in the input.
for n, row in data.iterrows():
    bo, ch, ve = row['book'], row['chapter'], row['verse']
    try:
        section = (bo, int(ch), int(ve))
    except (TypeError, ValueError):
        # A chapter or verse that is not a whole number is itself an input
        # error, so report it instead of silently skipping the row.
        print(f'{bo} {ch}:{ve}\tnon-numeric chapter or verse')
        continue
    if T.nodeFromSection(section) is None:
        print(f'{bo} {ch}:{ve}')

#### 2. Check for single letters
Single letters are often caused by special signs that the OCR misinterpreted. This check lists every single-letter token so that each one can be inspected and, where necessary, attached to the preceding word.

In [34]:
# Read the tab-separated verse table (no header row), name its four columns
# and display the first rows.
data = pd.read_csv(
    'SP_input_22.09.29.bin',
    sep='\t',
    header=None,
)
verse_columns = ['book', 'chapter', 'verse', 'text']
data.columns = verse_columns
data.head()

Unnamed: 0,book,chapter,verse,text
0,Genesis,1,1,BR>CJT BR> >LHJM >T HCMJM W>T H>RY
1,Genesis,1,2,WH>RY HJTH THW WBHW WXCK <L PNJ THWM WRWX >LHJ...
2,Genesis,1,3,WJ>MR >LHJM JHJ >WR WJHJ >WR
3,Genesis,1,4,WJR> >LHJM >T H>WR KJ VWB WJBDL >LHJM BJN H>WR...
4,Genesis,1,5,WJQR> >LHJM L>WR JWM WLXCK QR> LJLH WJHJ <RB W...


In [37]:
# List every token that consists of a single character, together with the
# reference of the verse it occurs in.
for _, record in data.iterrows():
    for token in record['text'].split():
        if len(token) != 1:
            continue
        print(f"{record['book']} {record['chapter']}:{record['verse']}\t{token}")