# Import ETCBC data into R

This notebook contains the R instructions to load the big ETCBC4b export and save it in the much more compact .rds format.

We then perform some simple information extraction on the data.
For comparison, the same information extraction has been done in
[Pandas](https://shebanq.ancient-data.org/shebanq/static/docs/tools/r/Hebrew_in_Pandas.html).

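Note that we have to tell `read.table` to ignore quotes and comment signs (`quote=""`, `comment.char=""`), so that such characters in the data are read literally!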

First we load the big text file with all information. This will take 3 minutes or so.

In [38]:
# Tab-separated export with a header row. Quote and comment characters are
# switched off so they are read as ordinary data, and as.is = TRUE keeps
# character columns as plain strings instead of converting them to factors.
etcbc <- read.table(
    file = '/Users/dirk/SURFdrive/laf-fabric-data/r/etcbc4b.txt',
    header = TRUE,
    sep = "\t",
    quote = "",
    comment.char = "",
    as.is = TRUE
)
# Number of rows and columns that were read.
dim(etcbc)

Now we save it into compact rds format.

In [39]:
# Serialize the data frame as a single R object in .rds format.
saveRDS(
    etcbc,
    file = '/Users/dirk/SURFdrive/laf-fabric-data/r/etcbc4b.rds'
)

This data has been saved in the GitHub repo
[ETCBC/laf-fabric-data](https://github.com/ETCBC/laf-fabric-data).

We load the data again, now from the compact representation. Much quicker. Still 40 seconds.

In [1]:
# Restore the data frame from the .rds file.
etcbc <- readRDS('/Users/dirk/SURFdrive/laf-fabric-data/r/etcbc4b.rds')

In [2]:
# Dimensions (rows, columns) of the reloaded data frame.
dim(etcbc)

In [3]:
# Show the first 30 rows of the data frame.
head(etcbc, n=30)

Unnamed: 0,oid,otype,in.subphrase,in.phrase_atom,in.phrase,in.clause_atom,in.clause,in.sentence_atom,in.sentence,in.half_verse,in.verse,in.chapter,in.book,distributional_parent,functional_parent,mother,X,book,chapter,code,det,dist,dist_unit,domain,entry,entry_heb,entryid,freq_lex,freq_occ,function.,g_cons,g_cons_utf8,g_entry,g_entry_heb,g_lex,g_lex_utf8,g_nme,g_nme_utf8,g_pfm,g_pfm_utf8,g_prs,g_prs_utf8,g_qere_utf8,g_uvf,g_uvf_utf8,g_vbe,g_vbe_utf8,g_vbs,g_vbs_utf8,g_word,g_word_utf8,gloss,gn,id,is_root,kind,label,lan,language,lex,lex_utf8,ls,maxmonad,minmonad,monads,mother_object_type,nametype,nme,nu,number,pdp,pfm,phono,phono_sep,pos,prs,ps,qtrailer_utf8,rank_lex,rank_occ,rela,root,sp,st,subpos,tab,trailer_utf8,txt,typ,uvf,vbe,vbs,verse,vs,vt
1,1,book,,,,,,,,,,,,,,,,Genesis,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,28758,1,1-28758,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,chapter,,,,,,,,,,,1.0,,,,,Genesis,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,673,1,1-673,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,verse,,,,,,,,,,2.0,1.0,,,,,Genesis,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"GEN 01,01",,,,,,11,1,1-11,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,
4,11,sentence,,,,,,,,,3.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,1,1-11,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,
5,10,sentence_atom,,,,,,,11.0,,3.0,2.0,1.0,,11.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,1,1-11,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,
6,9,clause,,,,,,10.0,11.0,,3.0,2.0,1.0,,11.0,,,,,,,0.0,clause_atoms,?,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,VC,,,,,,,11,1,1-11,clause,,,,1.0,,,,,,,,,,,,,,,,,,?,xQtX,,,,,,
7,8,clause_atom,,,,,9.0,10.0,11.0,,3.0,2.0,1.0,10.0,9.0,,,,,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True,,,,,,,,11,1,1-11,,,,,1.0,,,,,,,,,,,,,,,,0.0,,,xQtX,,,,,,
8,4,half_verse,,,,,,,,,3.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,A,,,,,,4,1,1-4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,7,phrase,,,,8.0,9.0,10.0,11.0,4.0,3.0,2.0,1.0,,9.0,,,,,,und,0.0,clause_atoms,,,,,,,Time,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,1,1-2,,,,,1.0,,,,,,,,,,,,,,,,,,,PP,,,,,,
10,6,phrase_atom,,,7.0,8.0,9.0,10.0,11.0,4.0,3.0,2.0,1.0,8.0,7.0,,,,,,und,0.0,clause_atoms,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,1,1-2,,,,,1.0,,,,,,,,,,,,,,,,,,,PP,,,,,,


# Books

Let us extract some data.
First a list of the book names.

In [4]:
# Rows of type 'book' carry the book name in the `book` column.
books <- etcbc[etcbc$otype == 'book', 'book']
# Show all names as one string, separated by single spaces.
paste(books, collapse = ' ')

# Text

Now the complete text of the whole Bible.

In [14]:
# Row indices of all word objects.
words <- which(etcbc$otype == 'word')
# Add a line break after the first sof pasuq (׃) of each trailer.
trailers <- sub('׃', '׃\n', etcbc$trailer_utf8[words])
# Glue each word directly to its trailer and concatenate everything into a
# single string.
text <- paste0(etcbc$g_word_utf8[words], trailers, collapse = '')
write(text, file = '/Users/dirk/Downloads/test_r_text.txt')

# Drill down to a passage

Let us get the words from the first verse:

In [6]:
# Object ids of the words whose `in.verse` value is 3, i.e. the words
# contained in the verse object with id 3.
in_first_verse <- etcbc$otype == 'word' & etcbc$in.verse == 3
word_ids <- etcbc$oid[in_first_verse]
word_ids

Now the *text* of the first verse.

In [47]:
# Rows whose object id is one of the selected words.
words <- which(etcbc$oid %in% word_ids)
# Pair every word with its trailer (paste's default separator puts a space
# between the two), concatenate the pairs, then break the line after each
# sof pasuq (׃).
verse_text <- paste(
    etcbc$g_word_utf8[words], etcbc$trailer_utf8[words],
    collapse = ''
)
gsub('׃', '׃\n', verse_text)

Let us get the words and text of an arbitrary passage, say Psalmi 131:2

In [48]:
# Look up the verse object for Psalmi 131:2.
is_wanted_verse <- (
    etcbc$otype == 'verse' &
    etcbc$book == 'Psalmi' &
    etcbc$chapter == 131 &
    etcbc$verse == 2
)
verse_id <- etcbc$oid[is_wanted_verse]
verse_id
# Object ids of the words contained in that verse.
word_ids <- etcbc$oid[etcbc$otype == 'word' & etcbc$in.verse == verse_id]
word_ids
# Rows of those words, then their text with a line break after each sof pasuq.
words <- which(etcbc$oid %in% word_ids)
gsub(
    '׃', '׃\n',
    paste(etcbc$g_word_utf8[words], etcbc$trailer_utf8[words], collapse = '')
)

Now let us organize this in a few functions: functions that return the verse or chapter object given a passage, one that returns the text of the words in a given object, and convenience wrappers that combine them.

In [51]:
# Return the text of all words contained in the object with id `oid`.
#
# The object's type (verse, chapter, ...) selects the matching `in.<otype>`
# column; every word whose value in that column equals `oid` belongs to the
# object. Each word is pasted to its trailer and a line break is inserted
# after every sof pasuq (׃).
#
# Args:
#   oid: id of a single object, as found in etcbc$oid.
# Returns:
#   One character string. It is '' when `oid` does not identify exactly one
#   object, or when there is no `in.<otype>` column for the object's type.
object2text <- function(oid) {
    otype <- etcbc$otype[etcbc$oid == oid]
    if (length(otype) != 1) {
        # Unknown (or ambiguous) id: there is no single container to look in.
        return('')
    }
    # Pick the container column by name instead of building R code from data
    # with eval(parse()); `[[` yields NULL for a column that does not exist,
    # which leads to an empty selection below.
    container <- etcbc[[paste0('in.', otype)]]
    word_ids <- etcbc$oid[etcbc$otype == 'word' & container == oid]
    words <- which(etcbc$oid %in% word_ids)
    # NOTE(review): paste() uses its default sep = ' ' here, so a space is put
    # between each word and its trailer, whereas the full-text cell joins them
    # with sep = ''. Confirm which spacing is intended before changing it.
    gsub('׃', '׃\n',
        paste(etcbc$g_word_utf8[words], etcbc$trailer_utf8[words], collapse = '')
    )
}

# Look up the object id of the verse addressed by book, chapter and verse.
# The result is empty when no row matches all four conditions.
verse2object <- function(book, chapter, verse) {
    is_verse <- etcbc$otype == 'verse'
    in_book <- etcbc$book == book
    in_chapter <- etcbc$chapter == chapter
    has_number <- etcbc$verse == verse
    etcbc$oid[is_verse & in_book & in_chapter & has_number]
}
# Text of one verse: resolve the passage to its verse object with
# verse2object(), then turn that object into text with object2text().
verse2text <- function(book, chapter, verse) {
    verse_oid <- verse2object(book, chapter, verse)
    object2text(verse_oid)
}
# Look up the object id of the chapter addressed by book and chapter number.
# The result is empty when no row matches all three conditions.
chapter2object <- function(book, chapter) {
    is_chapter <- etcbc$otype == 'chapter'
    in_book <- etcbc$book == book
    has_number <- etcbc$chapter == chapter
    etcbc$oid[is_chapter & in_book & has_number]
}
# Text of one chapter: resolve the passage to its chapter object with
# chapter2object(), then turn that object into text with object2text().
chapter2text <- function(book, chapter) {
    chapter_oid <- chapter2object(book, chapter)
    object2text(chapter_oid)
}

In [52]:
# Print the text of Psalmi 131:2; cat() writes the string with its embedded
# line breaks instead of showing them as escapes.
cat(verse2text('Psalmi', 131, 2))

אִם ־לֹ֤א  שִׁוִּ֨יתִי  ׀ וְ דֹומַ֗מְתִּי  נַ֫פְשִׁ֥י  כְּ֭ גָמֻל  עֲלֵ֣י  אִמֹּ֑ו  כַּ  גָּמֻ֖ל  עָלַ֣י  נַפְשִֽׁי ׃


In [53]:
# Print the text of the whole of Psalmi 131; cat() writes the string with its
# embedded line breaks instead of showing them as escapes.
cat(chapter2text('Psalmi', 131))

שִׁ֥יר  הַֽ מַּֽעֲלֹ֗ות  לְ דָ֫וִ֥ד  יְהוָ֤ה  ׀ לֹא ־גָבַ֣הּ  לִ֭בִּי  וְ לֹא ־רָמ֣וּ  עֵינַ֑י  וְ לֹֽא ־הִלַּ֓כְתִּי  ׀ בִּ גְדֹלֹ֖ות  וּ בְ נִפְלָאֹ֣ות  מִמֶּֽנִּי ׃
אִם ־לֹ֤א  שִׁוִּ֨יתִי  ׀ וְ דֹומַ֗מְתִּי  נַ֫פְשִׁ֥י  כְּ֭ גָמֻל  עֲלֵ֣י  אִמֹּ֑ו  כַּ  גָּמֻ֖ל  עָלַ֣י  נַפְשִֽׁי ׃
יַחֵ֣ל  יִ֝שְׂרָאֵל  אֶל ־יְהוָ֑ה  מֵֽ֝ עַתָּ֗ה  וְ עַד ־עֹולָֽם ׃
