# Import

In [2]:
import glob, re, collections, operator, scipy.stats, sklearn.metrics, getpass
from itertools import chain
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from discreteMarkovChain import markovChain
from tf.fabric import Fabric
from sklearn.model_selection import KFold

# Loading from database

In [3]:
# format paths for Etienne or Cody
if getpass.getuser() == 'etien':
    locations = 'C:/Users/etien/Documents/github/bhsa/tf'
elif getpass.getuser() == 'cody':
    locations = '~/github/etcbc/bhsa/tf'

TF = Fabric(locations=locations, modules='c', silent=True)

api = TF.load('''
              otype
              book chapter verse
              function domain
              typ pdp
              ''')

api.makeAvailableIn(globals())


  0.00s loading features ...
   |     0.02s B otype                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B book                 from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s B chapter              from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.01s B verse                from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.06s B function             from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.02s B domain               from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.19s B typ                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.10s B pdp                  from /Users/cody/github/etcbc/bhsa/tf/c
   |     0.00s Feature overview: 109 for nodes; 5 for edges; 1 configs; 7 computed
  4.32s All features loaded/computed - for details use loadLog()


In [4]:
lbh_books = {'1_Chronicles', '2_Chronicles', 
             'Ezra', 'Esther', 'Nehemiah'}

sbh_books = {'Genesis', 'Exodus','Leviticus', 
             'Deuteronomy','Joshua', 'Judges', 
             '1_Kings', '2_Kings', '1_Samuel',
             '2_Samuel'}

# don't load clauses with these functions
exclude_functions = {'IntS', 'Ques', 'Exst', 'ModS',
                     'NCoS', 'NCop', 'Supp', 'PrAd',
                     'Frnt', 'Intj', 'EPPr', "ExsS", 
                     "PrcS", "Voct"}
#exclude_functions = set()
data = collections.defaultdict(list) # lbh, sbh, and individual books go in here
       
for i, clause in enumerate(F.otype.s('clause')):
    
    book, chapter, verse = T.sectionFromNode(clause)
    
    # skip clauses in our corpora
    if book not in (lbh_books | sbh_books):
        continue
    # skip non-narrative clauses
    if F.domain.v(clause) != 'N':
        continue
        
    clause_phrases = L.d(clause, otype='phrase')
    phrase_functions = [F.function.v(phrase) for phrase in clause_phrases]
    
    # skip if clause has excluded function
    if set(phrase_functions) & exclude_functions:
        continue
        
    if book in lbh_books:
        data['lbh'].append(phrase_functions)
    elif book in sbh_books:
        data['sbh'].append(phrase_functions)
        
    data[book].append(phrase_functions)
    
print(f'Done with {i} clauses loaded in data...')

Done with 88100 clauses loaded in data...


# MODEL


In [5]:
unique_functions = set(F.function.v(phrase) for phrase in F.otype.s('phrase') 
                           if F.function.v(phrase) not in exclude_functions)
unique_functions = sorted(list(unique_functions))
nodes =  ["Clause_Begin", "Clause_End"] + unique_functions
print(unique_functions)

['Adju', 'Cmpl', 'Conj', 'Loca', 'Modi', 'Nega', 'Objc', 'PreC', 'PreO', 'PreS', 'Pred', 'PtcO', 'Rela', 'Subj', 'Time']


In [6]:
def transitionWalk(clauses):
    states = list()
    for clause in clauses:
        states.append("Clause_Begin")
        states.extend(clause)
        states.append("Clause_End")
    return states

In [7]:
def MarkovModel(transitionWalk):
    transition_Matrix = np.zeros((len(nodes),len(nodes)))
    
    for i in range(0,len(transitionWalk)-1):
        transition_Matrix[nodes.index(transitionWalk[i]),nodes.index(transitionWalk[i+1])] +=1 
        
    df_Trans = pd.DataFrame(transition_Matrix, columns = nodes, index = nodes)    
    
     # Remove unused transition functions
    df_Trans = df_Trans[(df_Trans.T != 0).any()]
    df_Trans = df_Trans.loc[:, (df_Trans != 0).any(axis=0)]
    
    df_Trans = df_Trans.div(df_Trans.sum(axis=1), axis=0)    
    return(df_Trans)

In [8]:
def totalVariationDistance(df_prob_M1, df_prob_M2):
    df_sub = df_prob_M1.subtract(df_prob_M2, fill_value =0)
    maxVar = abs(df_sub.values).sum() *0.5
    return maxVar

In [9]:
def hellingerDistance(df_prob_M1,df_prob_M2):
    df1 = df_prob_M1.applymap(np.sqrt)
    df2 = df_prob_M2.applymap(np.sqrt)
    df_sub = df1.subtract(df2, fill_value=0)
    df_sub = np.power(df_sub, 2)
    sumDiff = np.sum(df_sub.values)
    helligerDistance = np.sqrt(sumDiff)/np.sqrt(2)
    return helligerDistance

In [21]:
K_fold = dict()
for bookname, clauses in data.items():
    
    kf = KFold(n_splits=2, shuffle=True)
    
    clauses = np.array(clauses)
    distances = list()
    
    for train_index, test_index in kf.split(clauses):
        train, test = clauses[train_index], clauses[test_index]
        
        walk_train = transitionWalk(train)
        model_train = MarkovModel(walk_train)
        
        walk_test = transitionWalk(test)
        model_test = MarkovModel(walk_test)
        
        dist = hellingerDistance(model_train,model_test)
        distances.append(dist)
    K_fold[bookname] = round(np.mean(distances), 3)
        

We hope this is 0

In [22]:
K_fold

{'1_Chronicles': 1.345,
 '1_Kings': 0.80700000000000005,
 '1_Samuel': 0.82299999999999995,
 '2_Chronicles': 1.03,
 '2_Kings': 0.88100000000000001,
 '2_Samuel': 1.0149999999999999,
 'Deuteronomy': 1.3660000000000001,
 'Esther': 1.262,
 'Exodus': 0.92900000000000005,
 'Ezra': 1.3660000000000001,
 'Genesis': 0.59799999999999998,
 'Joshua': 0.89100000000000001,
 'Judges': 1.1379999999999999,
 'Leviticus': 1.496,
 'Nehemiah': 1.0429999999999999,
 'lbh': 0.90100000000000002,
 'sbh': 0.30199999999999999}

In [36]:
data['1_Kings'][:2]

[['Conj', 'PreO', 'Cmpl'], ['Conj', 'Pred', 'Cmpl', 'Subj']]