# Exploring the Dataset

In [1]:
import json
from tf.app import use
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# allow up to 200 rows to be displayed before pandas truncates the output
pd.options.display.max_rows = 200

# custom modules
from scripts.paths import paths
from scripts.export import Exporter
import scripts.nav_tree as nt

# saving paths: exporter built from the project output directory and the
# name 'time_EXPLORE' (presumably the label under which results are saved)
out = Exporter(paths['outdir'], 'time_EXPLORE')

# load BHSA data through Text-Fabric; silent='deep' keeps the loader quiet
bhsa = use('bhsa', silent='deep')

# short aliases for the Text-Fabric API members
F = bhsa.api.F
E = bhsa.api.E
T = bhsa.api.T
L = bhsa.api.L

In [4]:
# read the dataset CSV, using its `node` column as the row index
df = pd.read_csv(
    paths['dataset'],
    index_col='node',
)
df

Unnamed: 0_level_0,verse,book,booksuper,canon_part,period,genre,domain,gendom,function,quality,...,TIMEAPPO,SPEC,SFX:3,SFX,DUAL,DEMON,SFX:1,SFX:2,ADVB,CARDC
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
427553,Genesis 1:1,Genesis,Genesis,Law,SBH,prose,?,prose.?,simultaneous,location,...,,,,,,,,,,
427680,Genesis 2:2,Genesis,Genesis,Law,SBH,prose,N,prose.N,simultaneous,location,...,,,,,,,,,,
427682,Genesis 2:2,Genesis,Genesis,Law,SBH,prose,N,prose.N,simultaneous,location,...,,,,,,,,,,
427693,Genesis 2:5,Genesis,Genesis,Law,SBH,prose,D,prose.D,anterior,sequence,...,,,,,,,,,,
427694,Genesis 2:5,Genesis,Genesis,Law,SBH,prose,D,prose.D,anterior,sequence,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515616,2_Chronicles 36:9,2_Chronicles,Chronicles,Writings,LBH,prose,N,prose.N,atelic_ext,duration,...,,,,,,,,,,
515619,2_Chronicles 36:10,2_Chronicles,Chronicles,Writings,LBH,prose,N,prose.N,simultaneous,location,...,,,,,,,,,,
515624,2_Chronicles 36:11,2_Chronicles,Chronicles,Writings,LBH,prose,N,prose.N,atelic_ext,duration,...,,,,,,,,,,
515659,2_Chronicles 36:21,2_Chronicles,Chronicles,Writings,LBH,prose,N,prose.N,atelic_ext,duration,...,,,,,,,,,,


In [6]:
# tally of each `function` label (value_counts skips missing values by default)
function_ct = df['function'].value_counts()
# the same tallies expressed as a share of all rows that carry a label
function_pr = function_ct / function_ct.sum()

function_ct.head(20)

simultaneous                 1947
atelic_ext                    576
anterior_dur                  434
posterior                     225
posterior_dur                  66
habitual                       62
begin_to_end                   55
purposive_ext                  34
simultaneous + atelic_ext      24
anterior_dur_past?             21
multi_simuls                   18
regular_recurrence             18
anterior                       18
anterior_dur_purposive         15
telic_ext                      14
dist_fut                        6
habitual?                       6
simul_to_end                    6
begin_to_end_habitual           6
dist_posterior                  6
Name: function, dtype: int64

In [7]:
# first 20 entries of the function-label proportions (count / total count)
function_pr.head(20)

simultaneous                 0.533717
atelic_ext                   0.157895
anterior_dur                 0.118969
posterior                    0.061678
posterior_dur                0.018092
habitual                     0.016996
begin_to_end                 0.015077
purposive_ext                0.009320
simultaneous + atelic_ext    0.006579
anterior_dur_past?           0.005757
anterior                     0.004934
multi_simuls                 0.004934
regular_recurrence           0.004934
anterior_dur_purposive       0.004112
telic_ext                    0.003838
habitual?                    0.001645
simul_to_end                 0.001645
dist_fut                     0.001645
dist_posterior               0.001645
begin_to_end_habitual        0.001645
Name: function, dtype: float64

In [9]:
# tally of each `quality` value (rows without a quality are not counted)
qual_ct = df['quality'].value_counts()
# share of each quality among the rows that do have one
qual_pr = qual_ct / qual_ct.sum()

qual_ct

location     1947
duration     1161
sequence      243
iteration     104
Name: quality, dtype: int64

In [10]:
# proportion of each quality, relative to the rows that have a quality value
qual_pr

location     0.563531
duration     0.336035
sequence     0.070333
iteration    0.030101
Name: quality, dtype: float64

In [4]:
# top 20 `function` labels among rows whose `quality` is still unassigned (NaN)

# Select with a plain boolean mask: `isna()` states the condition directly,
# instead of negating `notnull()` inside a query string.
df.loc[df['quality'].isna(), 'function'].value_counts().head(20)

anterior_dur_past?           21
anterior_dur_purposive       15
habitual?                     6
dist_fut                      6
atelic_ext?                   5
dur_to_end                    5
anterior_limitive             5
multi_antdur                  4
posterior + atelic_ext        4
atelic_ext + simultaneous     4
anterior_limitive?            4
posterior_simul               3
dist_past                     3
regular_recurrence?           3
dist_prospective              3
purposive_ext?                2
habitual + begin_to_end       2
multi_posterior_dur           2
anterior_dur + duration       2
purposive?                    2
Name: function, dtype: int64