# Lexemes of the non-biblical Dead Sea Scrolls

The lexemes of the non-biblical DSS are converted to their ETCBC values using `mapping_dict`, which was created in the notebook lexemes_pos_biblical_books.ipynb.
This means that lexemes of words occurring in the biblical DSS can be used to convert the same words in the non-biblical scrolls.

In [None]:
import dill
from pprint import pprint
import pandas as pd
import collections

In [None]:
from tf.app import use
# Load the Dead Sea Scrolls corpus. hoist=globals() places the Text-Fabric API
# handles (among them L, T and F, used right below) in this notebook's namespace.
A = use('dss', hoist=globals())

# Keep DSS-specific aliases for the hoisted API objects.
Ldss, Tdss, Fdss = L, T, F

In [None]:
# Read the pickled lexeme mapping. As used further down, each DSS lexeme key
# maps to a dict whose values are lists of candidate ETCBC lexemes.
with open("mapping_dict.pkl", mode='rb') as mapping_file:
    mapping_dict = dill.load(mapping_file)

# Show the full mapping for visual inspection.
pprint(mapping_dict)

In [None]:
# Full book names (keys) and their abbreviations (values); going by the
# variable name, keys follow the BHSA and values the DSS dataset.
# NOTE(review): Esther, Nehemiah and 1_Chronicles are not listed - presumably
# because they do not occur in the DSS data; confirm.
book_dict_bhsa_dss = {
    'Genesis': 'Gen',
    'Exodus': 'Ex',
    'Leviticus': 'Lev',
    'Numbers': 'Num',
    'Deuteronomy': 'Deut',
    'Joshua': 'Josh',
    'Judges': 'Judg',
    '1_Samuel': '1Sam',
    '2_Samuel': '2Sam',
    '1_Kings': '1Kgs',
    '2_Kings': '2Kgs',
    'Isaiah': 'Is',
    'Jeremiah': 'Jer',
    'Ezekiel': 'Ezek',
    'Hosea': 'Hos',
    'Joel': 'Joel',
    'Amos': 'Amos',
    'Obadiah': 'Obad',
    'Jonah': 'Jonah',
    'Micah': 'Mic',
    'Nahum': 'Nah',
    'Habakkuk': 'Hab',
    'Zephaniah': 'Zeph',
    'Haggai': 'Hag',
    'Zechariah': 'Zech',
    'Malachi': 'Mal',
    'Psalms': 'Ps',
    'Job': 'Job',
    'Proverbs': 'Prov',
    'Ruth': 'Ruth',
    'Song_of_songs': 'Song',
    'Ecclesiastes': 'Eccl',
    'Lamentations': 'Lam',
    'Daniel': 'Dan',
    'Ezra': 'Ezra',
    '2_Chronicles': '2Chr',
}

In [None]:
# Inverse lookup: abbreviation -> full book name.
book_dict_dss_bhsa = {
    abbreviation: full_name
    for full_name, abbreviation in book_dict_bhsa_dss.items()
}
print(book_dict_dss_bhsa)

In [None]:
from collections import Counter 
  
def most_frequent(lex_list):
    """Return the most common item of lex_list together with its count.

    Parameters:
        lex_list: iterable of hashable items (here: candidate lexemes).

    Returns:
        A tuple (item, count). For items with equal counts,
        Counter.most_common determines the winner (documented as the
        item encountered first).

    Raises:
        IndexError: if lex_list is empty.
    """
    # Count once and unpack the single top entry, instead of asking the
    # Counter for its most common element twice.
    lex, count = Counter(lex_list).most_common(1)[0]
    return lex, count

In [None]:
# Parallel lists with one entry per retained word; they are combined into a
# DataFrame afterwards, so every retained word must append to all five.
tf_nodes = []
scrolls = []
glyphs_list = []
dss_lex = []
etcbc_lex = []

# Best ETCBC candidate per DSS lexeme. mapping_dict is not modified in this
# loop, so the winner for a lexeme is computed once and reused afterwards.
best_candidate_cache = {}

for scr in Fdss.otype.s('scroll'):
    scroll_name = Tdss.scrollName(scr)

    for w in Ldss.d(scr, 'word'):

        # Skip words whose book is one of the books in book_dict_dss_bhsa;
        # only the remaining (non-biblical) words are collected.
        if Fdss.book.v(w) in book_dict_dss_bhsa:
            continue

        lexeme = Fdss.glexe.v(w)

        tf_nodes.append(w)
        scrolls.append(scroll_name)
        glyphs_list.append(Fdss.glyphe.v(w))
        dss_lex.append(lexeme)

        # No lexeme on the word, or a lexeme unknown to the mapping: store an
        # empty placeholder so the lists stay aligned.
        if lexeme is None or lexeme not in mapping_dict:
            etcbc_lex.append('')
            continue

        if lexeme not in best_candidate_cache:
            # Pool the candidates from all sub-lists of this lexeme and keep
            # the one that occurs most often.
            candidates = [
                candidate
                for candidate_list in mapping_dict[lexeme].values()
                for candidate in candidate_list
            ]
            best_candidate_cache[lexeme] = most_frequent(candidates)[0]

        etcbc_lex.append(best_candidate_cache[lexeme])


In [None]:
# One row per collected word, assembled from the five parallel lists.
lex_columns = ['tf_word_id', 'scroll', 'glyphs', 'dss_lex', 'etcbc_lex']
lex_rows = list(zip(tf_nodes, scrolls, glyphs_list, dss_lex, etcbc_lex))
df_lex = pd.DataFrame(lex_rows, columns=lex_columns)

# Bare expression so the notebook displays the table.
df_lex

In [None]:
# Write the lexeme table to CSV; index=False leaves out the DataFrame's row index.
df_lex.to_csv("lexemes_non_bib_books.csv", index=False)

In [None]:
# Known limitation: picking the most frequent candidate does not work well for
# lexemes such as 'H', whose mapping entry is displayed below.
# NOTE(review): presumably one DSS lexeme corresponds to several distinct ETCBC
# lexemes here - confirm. This needs to be corrected in the notebook in which
# mapping_dict is made, not in this one.

mapping_dict['H']