# Import

In [1]:
import glob, re, collections, operator, scipy.stats, sklearn.metrics, getpass, random
from itertools import chain
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from discreteMarkovChain import markovChain
from tf.fabric import Fabric
from sklearn.model_selection import KFold

# Loading from database

In [4]:
# Locate the BHSA Text-Fabric data for the current OS user (Etienne or Cody).
# The machine-specific paths are looked up by login name so that a login
# without an entry fails right here with a clear message, rather than with
# a NameError on `locations` at the Fabric(...) call below.
bhsa_locations = {
    'etien': 'C:/Users/etien/Documents/github/bhsa/tf',
    'cody': '~/github/etcbc/bhsa/tf',
}

current_user = getpass.getuser()
if current_user not in bhsa_locations:
    raise RuntimeError(
        f'No BHSA data location configured for user {current_user!r}; '
        'add an entry to bhsa_locations.'
    )
locations = bhsa_locations[current_user]

TF = Fabric(locations=locations, modules='c', silent=True)

# Features requested from the data set; the cells below read otype,
# function and domain through the F object.
api = TF.load('''
              otype
              book chapter verse
              function domain
              typ pdp
              ''')

# Later cells use F, T and L as bare global names; presumably this call is
# what puts them into the notebook namespace.
api.makeAvailableIn(globals())


  0.00s loading features ...
   |     0.06s B otype                from C:/Users/etien/Documents/github/bhsa/tf/c
   |     0.01s B book                 from C:/Users/etien/Documents/github/bhsa/tf/c
   |     0.01s B chapter              from C:/Users/etien/Documents/github/bhsa/tf/c
   |     0.00s B verse                from C:/Users/etien/Documents/github/bhsa/tf/c
   |     0.07s B function             from C:/Users/etien/Documents/github/bhsa/tf/c
   |     0.02s B domain               from C:/Users/etien/Documents/github/bhsa/tf/c
   |     0.26s B typ                  from C:/Users/etien/Documents/github/bhsa/tf/c
   |     0.21s B pdp                  from C:/Users/etien/Documents/github/bhsa/tf/c
   |     0.00s Feature overview: 109 for nodes; 5 for edges; 1 configs; 7 computed
  5.30s All features loaded/computed - for details use loadLog()


In [3]:
# Books whose clauses are pooled under the 'lbh' and 'sbh' keys of `data`.
lbh_books = {'1_Chronicles', '2_Chronicles', 'Ezra', 'Esther', 'Nehemiah'}

sbh_books = {'Genesis', 'Exodus', 'Leviticus', 'Deuteronomy', 'Joshua',
             'Judges', '1_Kings', '2_Kings', '1_Samuel', '2_Samuel'}

# A clause is not loaded when one of its phrases has any of these functions.
# (Bind an empty set here instead to switch this filter off.)
exclude_functions = {'IntS', 'Ques', 'Exst', 'ModS', 'NCoS', 'NCop', 'Supp',
                     'PrAd', 'Frnt', 'Intj', 'EPPr', "ExsS", "PrcS", "Voct"}

# 'lbh', 'sbh' and every individual book name each map to a list of clauses;
# a clause is stored as the list of function labels of its phrases.
data = collections.defaultdict(list)
typs = set()

for i, clause in enumerate(F.otype.s('clause')):

    book, chapter, verse = T.sectionFromNode(clause)
    clause_phrases = L.d(clause, otype='phrase')
    phrase_functions = [F.function.v(phrase) for phrase in clause_phrases]

    # Skip clauses from books outside both corpora.
    if book not in lbh_books and book not in sbh_books:
        continue

    # Skip clauses that are not narrative (domain other than 'N').
    if F.domain.v(clause) != 'N':
        continue

    # Skip clauses containing at least one excluded phrase function.
    if not exclude_functions.isdisjoint(phrase_functions):
        continue

    # (A further filter on clause type, F.typ.v(clause) != 'Way0',
    #  is currently switched off.)

    # The same list object is filed under the corpus key and the book key.
    corpus_key = 'lbh' if book in lbh_books else 'sbh'
    data[corpus_key].append(phrase_functions)
    data[book].append(phrase_functions)

nlbh, nsbh = len(data['lbh']), len(data['sbh'])
ndat = nlbh + nsbh

print(f'Done with {ndat} clauses loaded in data...')
print(f'\t...{nsbh} clauses in SBH')
print(f'\t...{nlbh} clauses in LBH')

Done with 21395 clauses loaded in data...
	...16381 clauses in SBH
	...5014 clauses in LBH


# Functions

In [349]:
def give_unique_feature(feature, otype='phrase'):
    '''
    List the distinct values of a BHSA feature, most frequent first.

    feature -- feature object; feature.v(node) gives the value for a node
    otype   -- object type whose nodes are tallied (default: 'phrase')

    Every node of the given type (as yielded by F.otype.s) contributes
    one count to the value it carries.
    '''
    tally = collections.Counter(feature.v(node) for node in F.otype.s(otype))
    return [value for value, _ in tally.most_common()]

# MODEL


In [350]:
# Phrase functions found in the data, most frequent first, minus the
# excluded ones.
unique_functions = [
    function
    for function in give_unique_feature(F.function, otype='phrase')
    if function not in exclude_functions
]

# State labels of the Markov chain: the two clause-boundary markers,
# followed by the phrase functions.
nodes = ["Clause_Begin", "Clause_End", *unique_functions]
print(unique_functions)

['Pred', 'Conj', 'Subj', 'Cmpl', 'Objc', 'PreC', 'Adju', 'Rela', 'Nega', 'PreO', 'Time', 'Modi', 'Loca', 'PreS', 'PtcO']


In [351]:
def transitionWalk(clauses):
    '''
    Flatten a sequence of clauses into one list of states.

    Each clause (an iterable of phrase-function labels) is wrapped in a
    leading "Clause_Begin" and a trailing "Clause_End" marker, and the
    wrapped clauses are concatenated in order.
    '''
    return [
        state
        for clause in clauses
        for state in ("Clause_Begin", *clause, "Clause_End")
    ]

In [352]:
def MarkovModel(transitionWalk):
    '''
    Estimate first-order transition probabilities from a walk of states.

    transitionWalk -- sequence of state labels; every label must occur in
                      the module-level list `nodes`

    Returns a DataFrame whose rows are source states and whose columns are
    target states. States that never occur as a source (resp. target) are
    dropped from the rows (resp. columns); each remaining row is divided
    by its row sum.
    '''
    size = len(nodes)
    counts = np.zeros((size, size))

    # Count every pair of consecutive states in the walk.
    for current, following in zip(transitionWalk, transitionWalk[1:]):
        counts[nodes.index(current), nodes.index(following)] += 1

    table = pd.DataFrame(counts, index=nodes, columns=nodes)

    # Keep only rows / columns with at least one observed transition.
    observed = table != 0
    table = table.loc[observed.any(axis=1), observed.any(axis=0)]

    # Turn the counts of each row into probabilities.
    row_totals = table.sum(axis=1)
    return table.div(row_totals, axis=0)

In [353]:
def totalVariationDistance(df_prob_M1, df_prob_M2):
    '''
    Half the sum of absolute cell-wise differences between two
    transition-probability tables.

    df_prob_M1, df_prob_M2 -- DataFrames of probabilities; their row and
                              column labels may differ

    The tables are aligned on the union of their labels. A cell that one
    table lacks counts as probability 0. A cell that BOTH tables lack
    (row known only to one, column known only to the other) also counts
    as a difference of 0; without the final fillna such cells stay NaN
    after the subtraction and turn the whole distance into NaN.

    Returns the distance as a float.
    '''
    df_sub = df_prob_M1.subtract(df_prob_M2, fill_value=0).fillna(0)
    maxVar = np.abs(df_sub.values).sum() * 0.5
    return maxVar

In [354]:
def hellingerDistance(df_prob_M1,df_prob_M2):
    '''
    Hellinger-style distance between two transition-probability tables.

    Computes sqrt(sum((sqrt(p) - sqrt(q)) ** 2)) / sqrt(2), where the sum
    runs over every cell of the two tables at once (not row by row).

    df_prob_M1, df_prob_M2 -- DataFrames of probabilities; their row and
                              column labels may differ

    The tables are aligned on the union of their labels. A cell that one
    table lacks counts as probability 0. A cell that BOTH tables lack
    also counts as a difference of 0; without the fillna such cells stay
    NaN after the subtraction and turn the whole distance into NaN.

    Returns the distance as a float.
    '''
    # np.sqrt works element-wise on a DataFrame and keeps its labels; it
    # replaces DataFrame.applymap, which recent pandas versions deprecate.
    root_M1 = np.sqrt(df_prob_M1)
    root_M2 = np.sqrt(df_prob_M2)

    root_diff = root_M1.subtract(root_M2, fill_value=0).fillna(0)
    sumDiff = np.square(root_diff.values).sum()

    return np.sqrt(sumDiff) / np.sqrt(2)

In [355]:
random.seed(1609) # preserve random state

# Mean Hellinger distance between the two halves of each data set.
K_fold = dict()
# models[name]['train' | 'test'] holds the two transition tables of the
# LAST fold processed for that data set (earlier folds are overwritten).
models = collections.defaultdict(dict)

for bookname, clauses in data.items():

    kf = KFold(n_splits=2)

    # Shuffle a plain-list copy so that `data` itself stays untouched.
    # Clauses have differing lengths, and np.array() on such ragged lists
    # raises on recent NumPy versions; if all clauses happened to share one
    # length, random.shuffle on the resulting 2-D array would corrupt rows.
    # A list avoids both problems and, for the same seed, is shuffled into
    # the same order as a 1-D array of the same length.
    clauses = list(clauses)
    random.shuffle(clauses)

    distances = list()

    # Only the number of samples matters for the split, so hand KFold a
    # simple index array instead of the ragged clause lists.
    for train_index, test_index in kf.split(np.arange(len(clauses))):

        train = [clauses[position] for position in train_index]
        test = [clauses[position] for position in test_index]

        walk_train = transitionWalk(train)
        model_train = MarkovModel(walk_train)

        walk_test = transitionWalk(test)
        model_test = MarkovModel(walk_test)

        dist = hellingerDistance(model_train, model_test)
        distances.append(dist)

        # save models
        models[bookname]['train'] = model_train
        models[bookname]['test'] = model_test

    K_fold[bookname] = round(np.mean(distances),3)


K_fold

{'1_Chronicles': 1.4199999999999999,
 '1_Kings': 0.63600000000000001,
 '1_Samuel': 0.75600000000000001,
 '2_Chronicles': 1.0780000000000001,
 '2_Kings': 0.84799999999999998,
 '2_Samuel': 1.012,
 'Deuteronomy': 1.5209999999999999,
 'Esther': 1.4430000000000001,
 'Exodus': 0.90700000000000003,
 'Ezra': 1.3220000000000001,
 'Genesis': 0.627,
 'Joshua': 0.90100000000000002,
 'Judges': 0.81999999999999995,
 'Leviticus': 1.615,
 'Nehemiah': 0.99099999999999999,
 'lbh': 0.91300000000000003,
 'sbh': 0.28100000000000003}

In [356]:
# Display the 'train' transition table stored for the pooled 'lbh' data
# (the one left in `models` by the last fold of the K-fold loop).
models['lbh']['train']

Unnamed: 0,Clause_Begin,Clause_End,Pred,Conj,Subj,Cmpl,Objc,PreC,Adju,Rela,Nega,PreO,Time,Modi,Loca,PreS
Clause_Begin,0.0,0.0,0.097327,0.664938,0.070203,0.005983,0.009174,0.033107,0.009972,0.078979,0.001197,0.007978,0.009174,0.003989,0.000798,0.00718
Clause_End,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pred,0.0,0.149533,0.0,0.0,0.292835,0.257321,0.190031,0.026168,0.042991,0.0,0.0,0.0,0.018069,0.006854,0.016199,0.0
Conj,0.0,0.0006,0.614277,0.0,0.132573,0.017996,0.059388,0.043791,0.024595,0.002999,0.025195,0.055189,0.013197,0.004799,0.005399,0.0
Subj,0.0,0.311111,0.070531,0.0,0.0,0.160386,0.099517,0.262802,0.050242,0.0,0.009662,0.005797,0.007729,0.006763,0.015459,0.0
Cmpl,0.0,0.713536,0.01469,0.0,0.049318,0.034627,0.064008,0.007345,0.067156,0.0,0.004197,0.0,0.03043,0.006296,0.008395,0.0
Objc,0.0,0.561254,0.064103,0.0,0.005698,0.149573,0.02849,0.001425,0.112536,0.0,0.011396,0.0,0.032764,0.002849,0.029915,0.0
PreC,0.0,0.580858,0.0033,0.0,0.070957,0.173267,0.061056,0.0,0.074257,0.0,0.0,0.0,0.014851,0.009901,0.011551,0.0
Adju,0.0,0.652582,0.032864,0.0,0.068075,0.068075,0.061033,0.016432,0.061033,0.0,0.002347,0.004695,0.011737,0.007042,0.014085,0.0
Rela,0.0,0.0,0.344828,0.0,0.014778,0.0,0.0,0.561576,0.0,0.0,0.044335,0.034483,0.0,0.0,0.0,0.0


In [357]:
# Display the 'test' transition table stored for the pooled 'lbh' data
# (the one left in `models` by the last fold of the K-fold loop).
models['lbh']['test']

Unnamed: 0,Clause_Begin,Clause_End,Pred,Conj,Subj,Cmpl,Objc,PreC,Adju,Rela,Nega,PreO,Time,Modi,Loca,PreS,PtcO
Clause_Begin,0.0,0.0,0.080176,0.666933,0.077383,0.009174,0.00359,0.040287,0.007579,0.081771,0.002393,0.007579,0.008775,0.001197,0.002792,0.010371,0.0
Clause_End,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pred,0.0,0.135897,0.0,0.0,0.303846,0.251282,0.203846,0.024359,0.048718,0.0,0.0,0.0,0.010897,0.007051,0.013462,0.0,0.000641
Conj,0.0,0.000597,0.593079,0.00179,0.149165,0.019093,0.050119,0.048926,0.02327,0.002387,0.020883,0.056683,0.0179,0.00358,0.01074,0.00179,0.0
Subj,0.0,0.329102,0.088849,0.0,0.0,0.138713,0.082502,0.265639,0.050771,0.0,0.00816,0.003626,0.012693,0.007253,0.012693,0.0,0.0
Cmpl,0.0,0.726087,0.017391,0.0,0.053261,0.031522,0.066304,0.007609,0.063043,0.0,0.002174,0.001087,0.018478,0.003261,0.009783,0.0,0.0
Objc,0.0,0.619883,0.068713,0.0,0.002924,0.141813,0.027778,0.002924,0.086257,0.0,0.004386,0.0,0.011696,0.005848,0.027778,0.0,0.0
PreC,0.0,0.566465,0.007553,0.0,0.102719,0.149547,0.066465,0.0,0.083082,0.0,0.0,0.0,0.010574,0.006042,0.007553,0.0,0.0
Adju,0.0,0.624697,0.029056,0.0,0.065375,0.079903,0.089588,0.016949,0.065375,0.0,0.004843,0.002421,0.007264,0.0,0.014528,0.0,0.0
Rela,0.0,0.0,0.392344,0.0,0.019139,0.0,0.0,0.559809,0.0,0.0,0.009569,0.019139,0.0,0.0,0.0,0.0,0.0


In [358]:
# Cell-wise absolute difference between the two stored 'lbh' tables.
model_train, model_test = (models['lbh'][half] for half in ('train', 'test'))

# Cells present in only one table come out of the subtraction as NaN;
# they are turned into 0 before rounding to five decimals.
gap = (model_train - model_test).abs()
diffs = np.nan_to_num(gap.values).round(decimals=5)

diffs.shape

(17, 17)

In [330]:
# Per-state total of the absolute differences between the two stored
# 'sbh' transition tables, largest first.
model_train = models['sbh']['train']
model_test = models['sbh']['test']

# Subtracting aligns the two tables on the union of their labels; cells
# present in only one table come out as NaN and are counted as 0.
abs_diff = (model_train - model_test).abs()
cell_diffs = pd.DataFrame(
    np.nan_to_num(abs_diff.values).round(decimals=5),
    index=abs_diff.index,
    columns=abs_diff.columns,
)

# Label the row sums with the index of the aligned table itself rather than
# with the global `nodes` list: the two only coincide when both tables
# contain exactly the states of `nodes` in that order. Otherwise `nodes`
# has the wrong length (ValueError) or the rows may be mislabelled.
diffs = cell_diffs.sum(axis=1).to_frame('Abs_Diff')

diffs.sort_values('Abs_Diff', ascending=False)

Unnamed: 0,Abs_Diff
Loca,0.21511
PreS,0.16834
Modi,0.14963
PreO,0.10623
Adju,0.10171
Time,0.09504
PreC,0.07543
Nega,0.07096
Cmpl,0.04603
Objc,0.04269
