# Convert verse ranges of genres to TF verse node features

In [4]:
import collections
import pandas as pd
from tf.fabric import Fabric
from tf.dataset import modify
from tf.app import use

In [10]:
# BHSA data version: selects the corpus release that is loaded and names the
# tf/<VERSION> directory that the genre feature is exported to and reloaded from.
VERSION = '2021'

In [11]:
# Load the BHSA corpus in the configured version.
# NOTE(review): hoist=globals() is presumably what makes the bare API names
# (e.g. T and F) used in later cells available; they are not defined elsewhere here.
A = use('etcbc/bhsa', version=VERSION, hoist=globals())

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [12]:
# Load the genre table (one row per contiguous verse range within a book);
# the path is relative to the working directory.
genre_ranges = pd.read_csv('genre_ranges.csv')

In [13]:
# Display the loaded table for a visual check of its columns and values.
genre_ranges

Unnamed: 0,book,ch start,vs start,ch end,vs end,genre
0,Genesis,1,1,4,16,prose
1,Genesis,4,17,4,23,list
2,Genesis,4,24,4,26,prose
3,Genesis,5,1,5,32,list
4,Genesis,6,1,9,29,prose
...,...,...,...,...,...,...
215,1_Chronicles,16,8,16,36,poetry
216,1_Chronicles,16,37,23,1,prose
217,1_Chronicles,23,2,27,34,list
218,1_Chronicles,28,1,29,30,prose


# Compile data & sanity checks

In [14]:
# list the distinct book names that occur in the ranges table
genre_ranges['book'].unique()

array(['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
       'Joshua', 'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings',
       'Isaiah', 'Jeremiah', 'Ezekiel', 'Hosea', 'Joel', 'Amos',
       'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk', 'Zephaniah',
       'Haggai', 'Zechariah', 'Malachi', 'Psalms', 'Job', 'Proverbs',
       'Ruth', 'Song_of_songs', 'Ecclesiastes', 'Lamentations', 'Esther',
       'Daniel', 'Ezra', 'Nehemiah', '1_Chronicles', '2_Chronicles'],
      dtype=object)

In [15]:
# list the distinct genre labels that occur in the ranges table
genre_ranges['genre'].unique()

array(['prose', 'list', 'poetry', 'instruction', 'prophetic'],
      dtype=object)

In [16]:
# Check that every book name in the ranges table resolves to a BHSA book node
# (BHSA english names). All unresolved names are collected first, so a single
# run reports every mismatch instead of stopping at the first one.
unmatched_books = [
    book for book in genre_ranges.book.unique()
    if not T.nodeFromSection((book,))
]
if unmatched_books:
    raise ValueError(
        f'book names without a matching BHSA book node: {unmatched_books}'
    )

In [32]:
def verse_node_range(start, end, tf_api):
    """Generate a list of verse nodes for a given range of reference tuples.

    Note that start and end are both inclusive bounds.

    Args:
        start: 3-tuple of (book, n_ch, n_vs)
        end: 3-tuple of (book, n_ch, n_vs)
        tf_api: Text-Fabric API object; its T.nodeFromSection and L.n
            methods are used to resolve the references and to step from
            one verse to the next
    Returns:
        list of nodes, from the node of start up to and including the
        node of end
    Raises:
        ValueError: if either reference does not resolve to a node, if
            start lies after end, or if no next verse is found before
            end has been reached
    """
    start_node = tf_api.T.nodeFromSection(start)
    end_node = tf_api.T.nodeFromSection(end)

    # An unresolved reference is expected to come back as None (per the TF
    # docs); comparing None with a node number below would only surface as
    # an opaque TypeError, so report the offending reference instead.
    for reference, node in ((start, start_node), (end, end_node)):
        if node is None:
            raise ValueError(f'no verse node found for reference {reference!r}')

    # Without this guard a reversed range silently yields just the start verse.
    if start_node > end_node:
        raise ValueError(
            f'start {start!r} lies after end {end!r}'
        )

    nodes = [start_node]
    while nodes[-1] < end_node:
        following = tf_api.L.n(nodes[-1], 'verse')
        # An empty result means there is no next verse to step to.
        if not following:
            raise ValueError(
                f'ran out of verses after node {nodes[-1]} '
                f'before reaching end {end!r}'
            )
        nodes.append(following[0])
    return nodes

In [33]:
# Map every verse node inside a genre range to its genre and check for
# double-labeled verses. (Verses without any label are checked separately,
# against the full set of verse nodes.)

verse2genre = {} # will be used for TF export
verse2count = collections.Counter()

for book, startch, startvs, endch, endvs, genre in genre_ranges.values:
    start = (book, startch, startvs)
    end = (book, endch, endvs)
    for verse in verse_node_range(start, end, A.api):
        verse2genre[verse] = genre
        verse2count[verse] += 1

# check for double-labeled verses: a verse covered by more than one range would
# silently keep only the genre of the last range, so list the offenders and stop
double_labeled = [verse for verse, count in verse2count.items() if count > 1]
for verse in double_labeled:
    print(verse, T.sectionFromNode(verse))
if double_labeled:
    raise ValueError(
        f'{len(double_labeled)} verse(s) fall inside more than one genre range'
    )

In [34]:
# report every verse node of the corpus that did not receive a genre label
all_verses = set(F.otype.s('verse'))
unlabeled = all_verses - set(verse2genre)
for node in unlabeled:
    print(node, T.sectionFromNode(node))

In [20]:
#verse2genre

# Export TF Features

In [21]:
# metadata that accompanies the genre feature when it is saved
genre_meta = {
    'description': '(sub)genre of a verse node',
    'authors': 'Dirk Bakker, Marianne Kaajan, Martijn Naaijer, Wido van Peursen, Janet Dyk',
    'tf-conversion': 'Cody Kingham',
    'origin': 'the genre feature was tagged during the NWO-funded syntactic variation project (2013-2018) of the ETCBC, VU Amsterdam',
    'source_URL': 'https://github.com/MartijnNaaijer/phdthesis/blob/master/Various/subgenres_synvar.xls',
    'valueType': 'str',
}
featureMeta = {'genre': genre_meta}

# feature name -> {verse node: genre label}
nodeFeatures = {'genre': verse2genre}

In [22]:
# write the genre feature into the version-specific output directory
export_dir = f'tf/{VERSION}'
TF = Fabric(export_dir)
TF.save(metaData=featureMeta, nodeFeatures=nodeFeatures)

  0.00s Not all of the warp features otype and oslots are present in
tf/2021
  0.00s Only the Feature and Edge APIs will be enabled
  0.00s Warp feature "otext" not found. Working without Text-API

  0.00s Exporting 1 node and 0 edge and 0 config features to tf/2021:
   |     0.03s T genre                to tf/2021
  0.03s Exported 1 node features and 0 edge features and 0 config features to tf/2021


True

## Tests

In [23]:
# load the exported genre feature from two locations (the local BHSA data
# directory and the export directory), then make the API available in globals
bhsa_dir = f'~/github/etcbc/bhsa/tf/{VERSION}'
genre_dir = f'tf/{VERSION}'
TF = Fabric(locations=[bhsa_dir, genre_dir])
API = TF.load('genre')
API.makeAvailableIn(globals())

   |     0.07s T genre                from tf/2021


[('Computed',
  'computed-data',
  ('C Computed', 'Call AllComputeds', 'Cs ComputedString')),
 ('Features', 'edge-features', ('E Edge', 'Eall AllEdges', 'Es EdgeString')),
 ('Fabric', 'loading', ('TF',)),
 ('Locality', 'locality', ('L Locality',)),
 ('Nodes', 'navigating-nodes', ('N Nodes',)),
 ('Features',
  'node-features',
  ('F Feature', 'Fall AllFeatures', 'Fs FeatureString')),
 ('Search', 'search', ('S Search',)),
 ('Text', 'text', ('T Text',))]

In [24]:
# inspect the verse nodes known to the reloaded API
F.otype.s('verse')

range(1414389, 1437602)

In [25]:
# one record per verse node: reference, book, exported genre label and text
verse_data = []
for node in F.otype.s('verse'):
    book_name, chapter_n, verse_number = T.sectionFromNode(node)
    verse_data.append(dict(
        node=node,
        ref=f'{book_name} {chapter_n}:{verse_number}',
        book=book_name,
        genre=F.genre.v(node),
        text=T.text(node),
    ))

# index the table by verse node
verse_df = pd.DataFrame(verse_data).set_index('node')
verse_df.head()

Unnamed: 0_level_0,ref,book,genre,text
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1414389,Genesis 1:1,Genesis,prose,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...
1414390,Genesis 1:2,Genesis,prose,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁך...
1414391,Genesis 1:3,Genesis,prose,וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י אֹ֑ור וַֽיְהִי־אֹֽ...
1414392,Genesis 1:4,Genesis,prose,וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָאֹ֖ור כִּי־טֹ֑וב וַי...
1414393,Genesis 1:5,Genesis,prose,וַיִּקְרָ֨א אֱלֹהִ֤ים׀ לָאֹור֙ יֹ֔ום וְלַחֹ֖שׁ...


In [26]:
# keep a plain .csv copy of the reference -> genre mapping (node index left out)
verse_df.to_csv('verse2genre.csv', columns=['ref', 'genre'], index=False)

In [27]:
# number of verses per genre label
verse_df['genre'].value_counts()

prose          9337
poetry         5286
prophetic      4615
instruction    2319
list           1656
Name: genre, dtype: int64

In [28]:
# number of 'prophetic' verses per book
verse_df.loc[verse_df['genre'] == 'prophetic', 'book'].value_counts()

Jeremiah     1296
Isaiah       1228
Ezekiel      1082
Zechariah     211
Hosea         197
Amos          146
Micah         105
Joel           73
Habakkuk       56
Malachi        55
Zephaniah      53
Nahum          47
Haggai         38
Obadiah        21
2_Kings         7
Name: book, dtype: int64

In [29]:
# number of 'list' verses per book
verse_df.loc[verse_df['genre'] == 'list', 'book'].value_counts()

1_Chronicles    619
Numbers         307
Joshua          222
Nehemiah        184
Genesis         144
Ezra            116
1_Kings          20
Exodus           19
2_Samuel         16
Esther            5
1_Samuel          4
Name: book, dtype: int64

In [30]:
# Verse counts per book (rows) and genre (columns); combinations that do not
# occur are filled with 0.

book2genre = verse_df.pivot_table(
    index='book',
    columns=['genre'],
    aggfunc='size',
    fill_value=0,
)

book2genre

genre,instruction,list,poetry,prophetic,prose
book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1_Chronicles,0,619,29,0,295
1_Kings,0,20,0,0,797
1_Samuel,0,4,9,0,798
2_Chronicles,0,0,0,0,822
2_Kings,0,0,0,7,712
2_Samuel,0,16,64,0,615
Amos,0,0,0,146,0
Daniel,0,0,11,0,346
Deuteronomy,342,0,70,0,547
Ecclesiastes,0,0,222,0,0


In [31]:
# share of each book's verses per genre: every row is divided by its row total,
# giving fractions between 0 and 1 (not multiplied by 100)

row_totals = book2genre.sum(axis=1)
book2genre.div(row_totals, axis=0)

genre,instruction,list,poetry,prophetic,prose
book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1_Chronicles,0.0,0.656416,0.030753,0.0,0.312831
1_Kings,0.0,0.02448,0.0,0.0,0.97552
1_Samuel,0.0,0.004932,0.011097,0.0,0.98397
2_Chronicles,0.0,0.0,0.0,0.0,1.0
2_Kings,0.0,0.0,0.0,0.009736,0.990264
2_Samuel,0.0,0.023022,0.092086,0.0,0.884892
Amos,0.0,0.0,0.0,1.0,0.0
Daniel,0.0,0.0,0.030812,0.0,0.969188
Deuteronomy,0.356621,0.0,0.072993,0.0,0.570386
Ecclesiastes,0.0,0.0,1.0,0.0,0.0
