# Get data for Geoffrey

In [20]:
import pandas as pd

In [44]:
bhsa_df = pd.read_csv('../github/Gesenius_data/results/datasets/qtl_old/bhsa.csv')
eng_df = pd.read_csv('../github/Gesenius_data/results/datasets/qtl_old/eng.csv')
engtext_df = pd.read_csv('../github/Gesenius_data/results/datasets/qtl_old/eng_text.csv')

df = bhsa_df.merge(eng_df, on='bhsa_node')\
    .merge(engtext_df, on='bhsa_node')

In [45]:
df.shape

(14303, 64)

In [46]:
df = df[
    (df.esv_TAM.str.match('.*', na=False))
    & (df.niv_TAM.str.match('.*', na=False))
]
df.shape

(12846, 64)

In [47]:
# select with the 'safe' column
dfs = df[df.safe]

# exclude uses of "did not" for now since these 
# have unique semantic considerations
# remove cases of 'did not' for now since these are semantically ambiguous
dfs = dfs[
    (~dfs.esv_TAMspan.str.match('.*did not.*', na=False))
    & (~dfs.niv_TAMspan.str.match('.*did not.*', na=False))
]

dfs.shape

(11907, 64)

In [48]:
df.columns

Index(['bhsa_node', 'ref', 'book', 'book_super', 'canon_part', 'period',
       'genre', 'domain2', 'text_full', 'text_plain', 'lex', 'lex_etcbc',
       'gloss', 'verb_form', 'stem', 'person', 'gender', 'number', 'valence',
       'clause_atom', 'clause', 'sentence', 'txt_type', 'clause_type',
       'cltype_simp', 'clause_rela', 'cl_args', 'prec_lexes', 'prec_pos',
       'prec_part', 'ref_sbl', 'ref_abbr', 'has_objc', 'objc_pos', 'has_loca',
       'loca_type', 'loca_heads', 'has_time', 'safe', 'esv_tags', 'esv_VBtags',
       'esv_tense', 'esv_aspect', 'esv_modality', 'esv_TAM', 'esv_TAMtag',
       'niv_tags', 'niv_VBtags', 'niv_tense', 'niv_aspect', 'niv_modality',
       'niv_TAM', 'niv_TAMtag', 'eng_fullparse', 'eng_TAM', 'eng_agree', 'esv',
       'esv_verse', 'esv_TAMspan', 'niv', 'niv_verse', 'niv_TAMspan', 'esv_is',
       'niv_is'],
      dtype='object')

In [74]:
from tf.app import use

In [75]:
A = use('bhsa', hoist=globals())

In [63]:
import re

In [112]:
def make_text_examples(df, spread=25, bhs_joiner='', bhs_text=['clause_atom']):
    """Build copy and pastable text samples."""

# old implementation
#     # execute query
#     query = ex_params['query']
#     df = df.query(ex_params['query'])
#     n_results = df.shape[0]

#     # return empty search results
#     if n_results == 0:
#         return [query, '0 results']
    # or process into examples:
#     spread = ex_params.get('spread', 25) 
#     spread_i = get_spread(df.index, spread)
#     df = df.loc[spread_i]

    df = df.sample(n=spread, random_state=42)
    df = df.sort_values(by='bhsa_node')
    
    # sort out texts
    # NB that esv and niv texts might also be similarly formatted later on
    bhs_text = df[bhs_text].astype(str).agg(bhs_joiner.join, axis=1)
       
    exs = [f'{len(df)} results\n']

    for i, node in enumerate(df.index):
        heb_verse = T.text(L.u(node, 'verse')[0])
        ref = df.loc[node]['ref_abbr']
        esv = df.loc[node]['esv'].lower()
        niv = df.loc[node]['niv'].lower()
        esv_verse = df.loc[node]['esv_verse']
        niv_verse = df.loc[node]['niv_verse']
        
        # fix lowercase i
        lower_i = re.compile(r'\bi\b')
        esv, niv = lower_i.sub('I', esv), lower_i.sub('I', niv)
        bhs = bhs_text[node]
    
        if niv == esv:
            ex = f'{esv} (ESV, NIV | BHS {bhs}, {ref})'
        else:
            ex = f'{esv}, {niv} (ESV, NIV | BHS {bhs}, {ref})'

        ex = f'{i+1}. {ref}\n{bhs}\n{esv}; {niv} (ESV, NIV)\n{heb_verse}\n{esv_verse} (ESV)\n{niv_verse}(NIV)\n'
            
        exs.append(ex)
        
    return exs

<hr>

### Inchoative

In [113]:
ppdf = dfs[dfs.eng_TAM == 'PAST..IND']

ppdf.shape

(4116, 64)

In [114]:
inchoates = ppdf[
    (ppdf.niv.str.match('.*became'))
    | (ppdf.esv.str.match('.*became'))
]

inchoates.shape

(64, 64)

In [115]:
inc_exs = make_text_examples(inchoates, spread=64)

In [116]:
from pathlib import Path

In [117]:
ex_texts = '\n'.join(inc_exs)

Path('inchoates.txt').write_text(ex_texts)

35828

## Objects

In [118]:
dfs.columns

Index(['bhsa_node', 'ref', 'book', 'book_super', 'canon_part', 'period',
       'genre', 'domain2', 'text_full', 'text_plain', 'lex', 'lex_etcbc',
       'gloss', 'verb_form', 'stem', 'person', 'gender', 'number', 'valence',
       'clause_atom', 'clause', 'sentence', 'txt_type', 'clause_type',
       'cltype_simp', 'clause_rela', 'cl_args', 'prec_lexes', 'prec_pos',
       'prec_part', 'ref_sbl', 'ref_abbr', 'has_objc', 'objc_pos', 'has_loca',
       'loca_type', 'loca_heads', 'has_time', 'safe', 'esv_tags', 'esv_VBtags',
       'esv_tense', 'esv_aspect', 'esv_modality', 'esv_TAM', 'esv_TAMtag',
       'niv_tags', 'niv_VBtags', 'niv_tense', 'niv_aspect', 'niv_modality',
       'niv_TAM', 'niv_TAMtag', 'eng_fullparse', 'eng_TAM', 'eng_agree', 'esv',
       'esv_verse', 'esv_TAMspan', 'niv', 'niv_verse', 'niv_TAMspan', 'esv_is',
       'niv_is'],
      dtype='object')

In [124]:
obdf = dfs[
    (dfs.eng_TAM == 'PRES.PERF.IND')
    & (dfs.has_objc == 1)
    & (
        (dfs.esv_verse.str.match('.* a '))
        | (dfs.niv_verse.str.match('.* a '))
    )
]

#obdf

In [125]:
obdf_exs = make_text_examples(obdf)

ob_texts = '\n'.join(obdf_exs)

Path('pres_perf_iobjcs.txt').write_text(ob_texts)

14365