# Analyzing the Translation Tendencies of the Qatal Verb

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tf.app import use
import stats.significance as sig

# Project data directory (resolved under the current user's home) and the
# private CSV that holds the verb translation dataset.
PROJ = Path.home() / 'github/CambridgeSemiticsLab/translation_traditions_HB/data/'
DATA_CSV = PROJ / '_private_/translation_dataset.csv'

# Load the dataset into the frame that every later cell works from.
verb_df = pd.read_csv(DATA_CSV)

In [2]:
# dimensions of the loaded dataset as (rows, columns)
verb_df.shape

(20332, 26)

In [3]:
# names of the columns available for analysis
verb_df.columns

Index(['bhsa_node', 'wlc_id', 'ref', 'book', 'text_full', 'text_plain', 'lex',
       'lex_etcbc', 'gloss', 'tense', 'stem', 'person', 'gender', 'number',
       'wlc_morph', 'sentence', 'genre', 'domain', 'txt_type', 'clause_type',
       'clause_rela', 'preceding_waw', 'valence', 'niv_tags', 'niv_dep',
       'niv_words'],
      dtype='object')

In [4]:
# preview the first five rows of the dataset
verb_df.head()

Unnamed: 0,bhsa_node,wlc_id,ref,book,text_full,text_plain,lex,lex_etcbc,gloss,tense,...,genre,domain,txt_type,clause_type,clause_rela,preceding_waw,valence,niv_tags,niv_dep,niv_words
0,3,10010010021,Genesis 1:1,Genesis,בָּרָ֣א,ברא,ברא,BR>[,he created,perf,...,prose,?,?,xQtX,Main,False,d-,VBD,ROOT,created
1,15,10010020021,Genesis 1:2,Genesis,הָיְתָ֥ה,היתה,היה,HJH[,she was,perf,...,prose,?,?,WXQt,Main,False,--,VBD,ROOT,was
2,69,10010050061,Genesis 1:5,Genesis,קָ֣רָא,קרא,קרא,QR>[,he called,perf,...,prose,N,?N,WxQ0,Main,False,l.,VBD,relcl,called
3,172,10010100071,Genesis 1:10,Genesis,קָרָ֣א,קרא,קרא,QR>[,he called,perf,...,prose,N,?N,WxQ0,Main,False,l.,VBD,relcl,called
4,267,10010150012,Genesis 1:15,Genesis,הָי֤וּ,היו,היה,HJH[,let them be,perf,...,prose,Q,?NQ,WQt0,Main,True,-p,VB|VB,ROOT|ccomp,let be


In [5]:
# the 25 most frequent NIV tag sequences (value_counts sorts by descending
# count); renderings made of several words have their tags joined with '|'
verb_df['niv_tags'].value_counts().head(25)

VBD          3835
MD|VB        2161
VBP|VBN      1557
VB           1421
VBZ          1183
VBD|VBN      1113
VBZ|VBN      1005
VBP           968
VBD|RP        354
VBD|VB        353
NN            250
MD|VB|VBN     243
VBD|JJ        233
VBN           230
MD|VB|RP      207
VBG           181
VBZ|TO|VB     138
VBP|JJ        137
MD|VB|JJ      131
VBP|VB        131
JJ            127
VBD|RB        113
VB|RP         110
VBD|IN        110
VBZ|JJ        106
Name: niv_tags, dtype: int64

### Gesenius Groups

* Events in past §106.1
    * an "English perfect definite", e.g. "did VB"
    * tempus historicum (VBD)
    * pluperfect "have + past participle"
    
* Events in past with remaining effects §106.2
    * have + present tense verb

* Events in future

In [6]:
# how many verbs fall under each clause relation label
verb_df['clause_rela'].value_counts()

Main      14646
SubAdv     2891
SubMod     2342
SubArg      453
Name: clause_rela, dtype: int64

# Looking at the modal uses of qatal

In [7]:
# Derive boolean columns that serve as candidate predictors of a modal
# translation in the cells below.
# NOTE(review): str.match anchors at the START of the string, so has_modal is
# True only when the tag sequence begins with 'MD'; a sequence with MD in a
# later position would count as non-modal. Confirm this is intended (the
# alternative would be str.contains).
verb_df['has_modal'] = verb_df['niv_tags'].str.match('MD')
verb_df['is_prose'] = verb_df['genre'].eq('prose')
verb_df['is_speech'] = verb_df['domain'].eq('Q')
verb_df['is_main'] = verb_df['clause_rela'].eq('Main')

In [8]:
# Cross-tabulate modal vs. non-modal translations against every combination
# of the four contextual predictors. aggfunc='size' makes each cell a raw
# count of verbs; combinations that never occur are filled with 0.
modal_corr = verb_df.pivot_table(
    index=['has_modal'],
    columns=['preceding_waw', 'is_prose', 'is_speech', 'is_main'],
    aggfunc='size',
    fill_value=0,
)

In [9]:
# display the table of counts (rows: has_modal; columns: predictor combinations)
modal_corr

preceding_waw,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True
is_prose,False,False,False,False,True,True,True,True,False,False,False,False,True,True,True,True
is_speech,False,False,True,True,False,False,True,True,False,False,True,True,False,False,True,True
is_main,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True
has_modal,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
False,518,1194,1848,3562,1436,1986,1513,1573,9,318,114,1760,9,216,75,945
True,10,35,80,289,8,67,31,42,4,187,20,1775,0,35,11,662


In [10]:
# Turn counts into within-row proportions: every count is divided by the
# total of its has_modal row, so each row sums to 1.
row_totals = modal_corr.sum(axis=1)
modal_prop = modal_corr.div(row_totals, axis=0)

modal_prop

preceding_waw,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True
is_prose,False,False,False,False,True,True,True,True,False,False,False,False,True,True,True,True
is_speech,False,False,True,True,False,False,True,True,False,False,True,True,False,False,True,True
is_main,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True
has_modal,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
False,0.030335,0.069923,0.108222,0.208597,0.084095,0.116304,0.088604,0.092118,0.000527,0.018623,0.006676,0.103069,0.000527,0.012649,0.004392,0.055341
True,0.003071,0.010749,0.02457,0.088759,0.002457,0.020577,0.009521,0.012899,0.001229,0.057432,0.006143,0.545147,0.0,0.010749,0.003378,0.203317


In [11]:
# run some correlation tests
# apply_fishers comes from the project-local stats.significance module, so
# its exact semantics are not visible in this notebook.
# NOTE(review): presumably a Fisher's exact test per cell, with the 0 and 1
# arguments selecting axes -- TODO confirm against stats.significance.
# NOTE(review): the result below contains -inf/inf entries; check how the
# helper transforms extremely small p-values before interpreting them.
# `odds` is not used again in the cells shown in this notebook.

modal_fish, odds = sig.apply_fishers(modal_corr, 0, 1)

modal_fish

Unnamed: 0_level_0,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,True
Unnamed: 0_level_1,False,False,False,False,True,True,True,True,False,False,False,False,True,True,True,True
Unnamed: 0_level_2,False,False,True,True,False,False,True,True,False,False,True,True,False,False,True,True
Unnamed: 0_level_3,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True
False,26.781137,52.351673,64.584147,65.47773,98.527008,82.402339,78.230193,73.017645,-0.848723,-30.345932,0.089587,-inf,0.430628,0.361182,0.256057,-141.57634
True,-26.781137,-52.351673,-64.584147,-65.47773,-98.527008,-82.402339,-78.230193,-73.017645,0.848723,30.345932,-0.089587,inf,-0.430628,-0.361182,-0.256057,141.57634


In [12]:
# Flip the count table so that each row is one combination of predictors and
# the two columns hold the non-modal / modal counts; this is the orientation
# used to ask which features cue the modal response.
corr_modal = modal_corr.transpose()

corr_modal

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,has_modal,False,True
preceding_waw,is_prose,is_speech,is_main,Unnamed: 4_level_1,Unnamed: 5_level_1
False,False,False,False,518,10
False,False,False,True,1194,35
False,False,True,False,1848,80
False,False,True,True,3562,289
False,True,False,False,1436,8
False,True,False,True,1986,67
False,True,True,False,1513,31
False,True,True,True,1573,42
True,False,False,False,9,4
True,False,False,True,318,187


In [13]:
def color_dp(val):
    """
    Map a scalar to a CSS color declaration for table styling.

    Returns `'color: blue'` when `val` is negative and
    `'color: red'` for every other value (zero and
    positive numbers included).
    """
    if val < 0:
        color = 'blue'
    else:
        color = 'red'
    return f'color: {color}'

In [14]:
# apply_deltaP is a project-local helper (stats.significance); the result is
# displayed below with one signed score per predictor combination and outcome.
corr_delt = sig.apply_deltaP(corr_modal, 0, 1)

# Color every cell by sign using color_dp. Styler.applymap was renamed to
# Styler.map in pandas 2.1 and the old name is deprecated, so prefer `map`
# when it exists and fall back to `applymap` on older pandas.
delt_styler = corr_delt.style
style_cells = getattr(delt_styler, 'map', None) or delt_styler.applymap
style_cells(color_dp)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,has_modal,False,True
preceding_waw,is_prose,is_speech,is_main,Unnamed: 4_level_1,Unnamed: 5_level_1
False,False,False,False,0.144967,-0.144967
False,False,False,True,0.140134,-0.140134
False,False,True,False,0.131077,-0.131077
False,False,True,True,0.10498,-0.10498
False,True,False,False,0.166421,-0.166421
False,True,False,True,0.141827,-0.141827
False,True,True,False,0.151574,-0.151574
False,True,True,True,0.145709,-0.145709
True,False,False,False,-0.147645,0.147645
True,False,False,True,-0.215508,0.215508


In [None]:
# preview the first rows (now including the derived boolean predictor
# columns) instead of rendering the entire 20k-row frame
verb_df.head()

### Key Observations

```
-------- 

* There are no cases where the absence of a preceding waw
strongly predicts a modal context.

-------

* ~41 percentage points more likely to see a modal translation (ΔP ≈ 0.41)
    preceding_waw    True
    is_prose         False
    is_speech        True
    is_main          True

-------

However, the waw is clearly not the only factor:

* ~16 percentage points more likely to see a NON-modal translation (ΔP ≈ 0.16; only 9 such verbs, none modal)
    preceding_waw    True
    is_prose         True
    is_speech        False
    is_main          False

```