# Finding clause boundaries in the biblical scrolls 

In this script, clause boundaries in the BHSA are transferred to the biblical Dead Sea Scrolls by means of sequence alignment. This approach works well as long as clause boundaries in the BHSA correspond with clause boundaries in the scrolls. By far most clause boundaries are placed correctly, but further processing is necessary.

In [None]:
import collections
import re

In [None]:
from Bio import pairwise2
from Bio.Seq import Seq 

In [None]:
from tf.app import use
# Load the 'dss' (Dead Sea Scrolls) dataset. hoist=globals() places the
# Text-Fabric API handles (L, T, F, ...) directly in this namespace.
A = use('dss', hoist=globals())

# Keep the DSS handles under their own names: the next cell loads 'bhsa' with
# hoist=globals() as well, which rebinds the global names L, T and F.
Ldss = L
Tdss = T
Fdss = F

In [None]:
from tf.app import use
# Load the 'bhsa' dataset. From here on the global L, T, F (and A) refer to the
# BHSA; the DSS handles stay reachable as Ldss, Tdss, Fdss.
A = use('bhsa', hoist=globals())

The helper function my_split splits a string on any of a list of strings.

In [None]:
def my_split(s, seps):
    """
    Split a string on every separator in a list of separators.

    Input:
    s is a string
    seps is a list of strings. E.g. if seps = [' ', 'b'], the input string s is split on both a space
    and the letter 'b'. The separators are applied one after another, in list order.
    Output:
    a list with the resulting substrings, in their original order. Empty substrings
    (e.g. between two adjacent separators) are kept. If seps is empty, the result is [s].
    """
    pieces = [s]
    for sep in seps:
        # Re-split every piece obtained so far on the current separator.
        pieces = [fragment for piece in pieces for fragment in piece.split(sep)]
    return pieces

In [None]:
# Glyphs of 1Qisaa, one list entry per word:
#   dss_isa_dict    -- keyed by (chapter, verse)
#   dss_isa_dict_ch -- keyed by chapter
#   lexemes_dss     -- lexemes, keyed by (chapter, verse)
dss_isa_dict = collections.defaultdict(list)

dss_isa_dict_ch = collections.defaultdict(list)

lexemes_dss = collections.defaultdict(list)

# Normalisation applied to every glyph string in one pass: a non-breaking space
# becomes a plain space, apostrophes are dropped, and lower-case k, n, m, y, p are
# upper-cased (presumably final letter forms, so that they match the BHSA
# consonantal text -- TODO confirm).
glyph_normalisation = str.maketrans({
    u'\xa0': u' ',
    "'": None,
    "k": "K",
    "n": "N",
    "m": "M",
    "y": "Y",
    "p": "P",
})

for scr in Fdss.otype.s('scroll'):
    scroll_name = Tdss.scrollName(scr)

    # Only the scroll named '1Qisaa' is processed.
    if scroll_name != '1Qisaa':
        continue

    words = Ldss.d(scr, 'word')

    for w in words:
        glyphs = Fdss.glyphe.v(w)

        # Words without glyphs are skipped.
        if glyphs is None:
            continue

        lexeme = Fdss.glexe.v(w)

        # The consonant '#' is used for both 'C' and 'F'. We check in the lexeme
        # to which of the two alternatives it should be converted. This approach is crude,
        # but works well in practice. There is only one word with both F and C in the
        # lexeme: >RTX##T> >AR:T.AX:CAF:T.:> in 4Q117
        if '#' in glyphs:
            # Hardcode the single word with both 'C' and 'F' in the lexeme.
            if glyphs == '>RTX##T>':
                glyphs = '>RTXCFT>'

            elif 'F' in lexeme:
                glyphs = glyphs.replace('#', 'F')

            # Cases in which 'C' occurs in lexeme or morphology.
            else:
                glyphs = glyphs.replace('#', 'C')

        glyphs = glyphs.translate(glyph_normalisation)

        # Look up chapter and verse once per word instead of once per dictionary.
        chapter = int(Fdss.chapter.v(w))
        verse_key = (chapter, int(Fdss.verse.v(w)))

        dss_isa_dict[verse_key].append(glyphs)
        dss_isa_dict_ch[chapter].append(glyphs)
        lexemes_dss[verse_key].append(lexeme)
                        


## Clause endings

The BHSA data are prepared by constructing the text of each verse, with an 'e' inserted after the last word of every clause.

In [None]:
# all_verses    -- the (chapter, verse) pairs of Isaiah, in order of first occurrence
# lexemes_bhsa  -- per verse, the lexeme of each word repeated once per consonant
# bhsa_isa_dict -- per verse, the consonantal words, with the marker 'e' inserted
#                  after the last word of every clause
all_verses = []
lexemes_bhsa = collections.defaultdict(list)

bhsa_isa_dict = collections.defaultdict(list)

for w in F.otype.s('word'):
    bo, ch, ve = T.sectionFromNode(w)
    if bo != 'Isaiah':
        continue

    verse_key = (ch, ve)

    # A verse is new exactly when it has no entry in bhsa_isa_dict yet; this
    # replaces a linear search through the all_verses list for every word.
    # The test must come before the first append for this verse.
    if verse_key not in bhsa_isa_dict:
        all_verses.append(verse_key)

    g_cons = F.g_cons.v(w)
    bhsa_isa_dict[verse_key].append(g_cons)

    # One lexeme entry per consonant of the word.
    lexemes_bhsa[verse_key].extend([F.lex.v(w)] * len(g_cons))

    # add end (e) of clause
    cl = L.u(w, "clause")[0]
    words_in_cl = L.d(cl, "word")
    if w == words_in_cl[-1]:
        bhsa_isa_dict[verse_key].append('e')
    

In [None]:
# Not used in this cell; the name is kept in case other cells rely on it.
all_verses_split1 = []

# Parallel lists with one entry per DSS word:
#   words_dss  -- the word
#   cl_ends    -- 'e' if the word is the last of a clause, '-' otherwise
#   verse_list -- the (chapter, verse) the word belongs to
words_dss = []
cl_ends = []
verse_list = []

for verse in all_verses:

    # Only verses that are attested in the scroll can be aligned.
    if verse not in dss_isa_dict:
        continue

    seq_dss = ' '.join(dss_isa_dict[verse])
    seq_bhsa = ' '.join(bhsa_isa_dict[verse])

    # Global alignment of the BHSA verse (with 'e' markers) and the DSS verse.
    # Only the first alignment returned is used.
    alignments = pairwise2.align.globalxx(Seq(seq_bhsa), Seq(seq_dss))

    bhsa_al = alignments[0][0]
    dss_al = alignments[0][1]

    print(verse)
    print(dss_al)
    print(bhsa_al)

    # Copy every 'e' of the aligned BHSA text to the same position in the aligned
    # DSS text.
    # NOTE(review): this overwrites whatever the DSS text has at that position. It
    # looks like that can be a consonant rather than a gap if the alignment pairs
    # the marker with a letter -- confirm against the printed output.
    dss_list = list(dss_al)
    for ending in (m.start() for m in re.finditer('e', bhsa_al)):
        dss_list[ending] = 'e'

    dss_new = ''.join(dss_list)

    # Split on markers that stand between words, possibly next to gap characters.
    clauses_dss = my_split(dss_new, ["-e ", " e ", " e", "-e-- ", "-e- "])

    print(clauses_dss)

    clauses_dss2 = []

    for cl in clauses_dss:
        cl = cl.strip('e')

        # No marker left inside the chunk: it is one clause.
        if 'e' not in cl:
            clauses_dss2.append(cl.replace("-", ""))
            continue

        # A marker ended up inside a word, e.g.
        # '>M J-D-WMW K- TWL<-eT K- YMR JHJW'
        space_positions = [m.start() for m in re.finditer(' ', cl)]

        # A single word: nothing to split, just remove marker and gaps.
        if not space_positions:
            clauses_dss2.append(cl.replace('e', '').replace('-', ''))
            continue

        # Cut the chunk at the first space after each marker, so that the word
        # containing the marker closes its clause. A marker inside the last word
        # needs no cut. The positions are in ascending order, so the last space
        # is the largest and the first match is the smallest.
        last_space = space_positions[-1]
        indices = [0]
        for marker_pos in (m.start() for m in re.finditer('e', cl)):
            if last_space > marker_pos:
                indices.append(next(sp for sp in space_positions if sp > marker_pos))

        clauses_dss2.extend(cl[i:j] for i, j in zip(indices, indices[1:] + [None]))

    # Remove gap characters and split every clause into words. Markers that are
    # still inside a word are not removed here.
    cl_dss2 = [cl.replace("-", "").strip(" ").split(" ") for cl in clauses_dss2]
    print(cl_dss2)

    for c in cl_dss2:
        # Skip empty chunks and chunks that consist of a marker only.
        if c == [""] or c == ["e"]:
            continue

        words_dss.extend(c)
        verse_list.extend([verse] * len(c))

        # Only the last word of the clause is flagged as a clause end.
        cl_ends.extend(['-'] * (len(c) - 1) + ['e'])
        

## Clause starts

In [None]:
# Rebuild the BHSA data, this time with the marker 'b' inserted before the first
# word of every clause. This rebinds all_verses, lexemes_bhsa and bhsa_isa_dict,
# so the data with the 'e' markers are no longer available after this cell.
#   all_verses    -- the (chapter, verse) pairs of Isaiah, in order of first occurrence
#   lexemes_bhsa  -- per verse, the lexeme of each word repeated once per consonant
#   bhsa_isa_dict -- per verse, the consonantal words and the 'b' markers
all_verses = []
lexemes_bhsa = collections.defaultdict(list)

bhsa_isa_dict = collections.defaultdict(list)

for w in F.otype.s('word'):
    bo, ch, ve = T.sectionFromNode(w)
    if bo != 'Isaiah':
        continue

    verse_key = (ch, ve)

    # A verse is new exactly when it has no entry in bhsa_isa_dict yet; this
    # replaces a linear search through the all_verses list for every word.
    # The test must come before the first append for this verse.
    if verse_key not in bhsa_isa_dict:
        all_verses.append(verse_key)

    # add beginning (b) of clause
    cl = L.u(w, "clause")[0]
    words_in_cl = L.d(cl, "word")
    if w == words_in_cl[0]:
        bhsa_isa_dict[verse_key].append('b')

    g_cons = F.g_cons.v(w)
    bhsa_isa_dict[verse_key].append(g_cons)

    # One lexeme entry per consonant of the word.
    lexemes_bhsa[verse_key].extend([F.lex.v(w)] * len(g_cons))

In [None]:
# Not used in this cell; the name is kept in case other cells rely on it.
all_verses_split1 = []

# Parallel lists with one entry per DSS word:
#   words_dss2 -- the word
#   cl_starts  -- 'b' if the word is the first of a clause, '-' otherwise
words_dss2 = []
cl_starts = []


for verse in all_verses:

    # Only verses that are attested in the scroll can be aligned.
    if verse not in dss_isa_dict:
        continue
    print(verse)

    seq_dss = ' '.join(dss_isa_dict[verse])
    seq_bhsa = ' '.join(bhsa_isa_dict[verse])

    seq1 = Seq(seq_dss)
    seq2 = Seq(seq_bhsa)

    print(seq1)
    print(seq2)

    # Global alignment of the BHSA verse (with 'b' markers) and the DSS verse.
    # Only the first alignment returned is used.
    alignments = pairwise2.align.globalxx(seq2, seq1)

    bhsa_al = alignments[0][0]
    dss_al = alignments[0][1]

    # Copy every 'b' of the aligned BHSA text to the same position in the aligned
    # DSS text.
    # NOTE(review): this overwrites whatever the DSS text has at that position, and
    # every 'b' is removed from the words further down. If the alignment pairs a
    # marker with a consonant, that consonant is lost -- confirm.
    dss_list = list(dss_al)
    for start in (m.start() for m in re.finditer('b', bhsa_al)):
        dss_list[start] = 'b'

    dss_new = ''.join(dss_list)

    # Split on markers that stand between words, possibly next to gap characters.
    clauses_dss = my_split(dss_new, ["-b ", " b", "b ", "-b- ", "-b-- "])

    clauses_dss2 = []

    for cl in clauses_dss:
        cl = cl.strip('b')

        # No marker left inside the chunk: it is one clause.
        if 'b' not in cl:
            clauses_dss2.append(cl.replace("-", ""))
            continue

        # A marker ended up inside a word.
        space_positions = [m.start() for m in re.finditer(' ', cl)]

        # A single word: nothing to split, just remove marker and gaps.
        if not space_positions:
            clauses_dss2.append(cl.replace('b', '').replace('-', ''))
            continue

        # Cut the chunk at the first space after each marker. A marker inside the
        # last word needs no cut. The positions are in ascending order, so the last
        # space is the largest and the first match is the smallest.
        # NOTE(review): this is the same rule as for clause ends, so the word that
        # contains the start marker is assigned to the preceding clause -- confirm
        # that this is intended.
        last_space = space_positions[-1]
        indices = [0]
        for marker_pos in (m.start() for m in re.finditer('b', cl)):
            if last_space > marker_pos:
                indices.append(next(sp for sp in space_positions if sp > marker_pos))
        print(indices)
        print('clause', cl)
        parts = [cl[i:j] for i, j in zip(indices, indices[1:] + [None])]
        print('parts', parts)
        clauses_dss2.extend(parts)

    # Remove the remaining markers and gap characters and split every clause into
    # words.
    cl_dss2 = [cl.replace("b", "").replace("-", "").strip(" ").split(" ") for cl in clauses_dss2]

    print('cl_dss2', cl_dss2)
    for clause in cl_dss2:

        # Skip empty chunks. No 'b' is left at this point, so a chunk cannot
        # consist of a marker only.
        if clause == [""]:
            continue

        words_dss2.extend(clause)

        # Only the first word of the clause is flagged as a clause start.
        cl_starts.extend(['b'] + ['-'] * (len(clause) - 1))

# Both lists have one entry per word, so these two numbers are equal.
print(len(words_dss2))
print(len(cl_starts))



In [None]:
# Print the words of the clause-start pass next to the words and verses of the
# clause-end pass, to check by eye that both passes yield the same word sequence.
for position, start_pass_word in enumerate(words_dss2):
    print(start_pass_word, words_dss[position], verse_list[position])

In [None]:
# Split the (chapter, verse) pairs into two parallel lists:
# chap_verse[0] holds the chapters, chap_verse[1] the verses.
chap_verse = [[], []]
for chapter_no, verse_no in verse_list:
    chap_verse[0].append(chapter_no)
    chap_verse[1].append(verse_no)

In [None]:
import pandas as pd

# The columns come from two separate alignment passes: chapters, verses and
# clause ends from the clause-end pass, words and clause starts from the
# clause-start pass. zip() stops silently at the shortest list, so if the passes
# produced different numbers of words the rows would be truncated and shifted
# against each other. Refuse to build the table in that case.
column_lengths = {
    'chapter': len(chap_verse[0]),
    'verse': len(chap_verse[1]),
    'word': len(words_dss2),
    'start_of_clause': len(cl_starts),
    'end_of_clause': len(cl_ends),
}
if len(set(column_lengths.values())) > 1:
    raise ValueError(
        'The clause-start and clause-end passes do not line up; '
        'column lengths: {}'.format(column_lengths)
    )

# One row per DSS word.
isa_df = pd.DataFrame(
    list(zip(chap_verse[0], chap_verse[1], words_dss2, cl_starts, cl_ends)),
    columns=['chapter', 'verse', 'word', 'start_of_clause', 'end_of_clause'],
)
        

In [None]:
# Show the table as the output of this notebook cell.
isa_df

In [None]:
# Write the table to '_1_q_isaa.csv' (relative path), without the index column.
isa_df.to_csv("_1_q_isaa.csv", index=False)