## Extraction of all explicit subjects, objects and complements in the Psalms

In [1]:
import sys, os, csv, re
import collections
import subprocess

from lxml import etree
from pprint import pprint
#Visualisation
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime

%load_ext autoreload
%autoreload 2
%matplotlib inline
import laf
from laf.fabric import LafFabric
from etcbc.preprocess import prepare
from etcbc.lib import Transcription, monad_set

#from etcbc.mql import MQL
# Create the LAF-Fabric object; the cells below call fabric.load(...) on it and
# read fabric.localnames from it.
fabric = LafFabric()

  0.00s This is LAF-Fabric 4.6.0
API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html
Feature doc: https://shebanq.ancient-data.org/static/docs/featuredoc/texts/welcome.html



In [21]:
# Load the data and the features this notebook needs.
# NOTE(review): the positional arguments look like (source, annox, task) in
# LAF-Fabric terms -- confirm against the LAF-Fabric API reference.
API=fabric.load('etcbc4b', 'lexicon', 'mql', {
    # No XML identifiers are requested for nodes or edges.
    "xmlids": {"node": False, "edge": False},
    # Two whitespace-separated feature lists; presumably the first holds node
    # features and the second edge features -- TODO confirm.
    "features": ('''
        otype nu ps gn vs vt prs ls lex g_cons
        function txt
        book chapter verse label sp kind typ 
        gloss
    ''',
    ''' functional_parent
    '''),
    # `prepare` is imported from etcbc.preprocess at the top of the notebook.
    "prepare": prepare,
}, verbose='DETAIL')
# Execute the code string provided by fabric.localnames; presumably this is what
# brings the short API names used further down (F, L, NN) into the notebook
# namespace -- confirm.
exec(fabric.localnames.format(var='fabric'))
#Q = MQL(API)

  0.00s LOADING API: please wait ... 
  0.00s DETAIL: COMPILING m: UP TO DATE
  0.00s USING main  DATA COMPILED AT: 2015-11-02T15-08-56
  0.00s DETAIL: COMPILING a: UP TO DATE
  0.00s USING annox DATA COMPILED AT: 2016-01-27T19-01-17
  0.01s DETAIL: keep main: G.node_anchor_min
  0.01s DETAIL: keep main: G.node_anchor_max
  0.01s DETAIL: keep main: G.node_sort
  0.01s DETAIL: keep main: G.node_sort_inv
  0.01s DETAIL: keep main: G.edges_from
  0.01s DETAIL: keep main: G.edges_to
  0.01s DETAIL: keep main: F.etcbc4_db_otype [node] 
  0.01s DETAIL: keep main: F.etcbc4_sft_book [node] 
  0.01s DETAIL: keep main: F.etcbc4_sft_chapter [node] 
  0.01s DETAIL: keep main: F.etcbc4_sft_verse [node] 
  0.01s DETAIL: keep annox: F.etcbc4_db_otype [node] 
  0.01s DETAIL: keep annox: F.etcbc4_sft_book [node] 
  0.01s DETAIL: keep annox: F.etcbc4_sft_chapter [node] 
  0.01s DETAIL: keep annox: F.etcbc4_sft_verse [node] 
  0.01s DETAIL: clear main: F.etcbc4_ft_g_word_utf8 [node] 
  0.01s DETAIL: clea

In [25]:
# Lexemes that the clause scan counts as divine names.
divine_lexemes = {
    'CDJ/',
    '>L/',
    '>LH/',
    '>LHJM/',
    '>LJL/',
    '>LWH/',
}

In [26]:
# Assign every clause a state according to the highest-ranking role in which a
# divine lexeme occurs: 3 = subject, 2 = object, 1 = complement, 0 = none.
role_rank = {'Subj': 3, 'Objc': 2, 'Cmpl': 1}
states = {}
for cn in F.otype.s('clause'):
    best = 0
    for pn in L.d('phrase', cn):
        rank = role_rank.get(F.function.v(pn))
        if rank is None:
            # Phrase is not a subject, object or complement.
            continue
        if any(F.lex.v(w) in divine_lexemes for w in L.d('word', pn)):
            best = max(best, rank)
    states[cn] = best

Lexemes whose gloss starts with 'god' (a cross-check for the `divine_lexemes` set):

In [24]:
# Collect the lexeme of every word whose gloss starts with 'god'.
# NOTE(review): the recorded result does not contain 'CDJ/', which
# divine_lexemes includes -- confirm that entry is intentional.
{F.lex.v(w) for w in F.otype.s('word') if F.gloss.v(w).startswith('god')}

{'>L/', '>LH/', '>LHJM/', '>LJL/', '>LWH/'}

In [13]:
# Number of clauses that were assigned a state.
len(states)

88011

In [27]:
# Tally how many clauses ended up in each state.
stats = collections.defaultdict(collections.Counter)
stats['state'].update(states.values())
stats['state']

Counter({0: 85811, 1: 658, 2: 441, 3: 1101})

In [28]:
# States listed in the order in which F.otype.s('clause') yields the clauses.
states_list = [states[clause] for clause in F.otype.s('clause')]
# Pair every state with the state of the clause that follows it.
transitions = list(zip(states_list, states_list[1:]))

In [17]:
# Preview the first ten (state, next state) pairs.
transitions[0:10]

[(3, 0),
 (0, 0),
 (0, 3),
 (3, 3),
 (3, 0),
 (0, 0),
 (0, 3),
 (3, 0),
 (0, 3),
 (3, 3)]

In [29]:
# Count how often each (state, next state) pair occurs.
# A fresh Counter is assigned instead of incrementing the existing one, so
# re-running this cell does not double-count the transitions.
stats['trans'] = collections.Counter(transitions)
stats['trans']

Counter({(0, 0): 83765,
         (0, 1): 601,
         (0, 2): 416,
         (0, 3): 1028,
         (1, 0): 594,
         (1, 1): 32,
         (1, 2): 10,
         (1, 3): 22,
         (2, 0): 413,
         (2, 1): 13,
         (2, 2): 8,
         (2, 3): 7,
         (3, 0): 1039,
         (3, 1): 12,
         (3, 2): 7,
         (3, 3): 43})

In [3]:
def make_words(phrase):
    '''Return the words of a phrase node as a single transliterated string.

    The consonantal form (F.g_cons) of each word under the phrase is taken and
    the forms are joined with underscores. Called from get_soc_info().'''
    return '_'.join(F.g_cons.v(word) for word in L.d('word', phrase))

In [4]:
def get_soc_info():
    '''Collect the explicit subject, object and complement (soc) phrases of all clauses in the Psalms.

    Walks all nodes via NN() and keeps the clause nodes whose book is 'Psalmi'.
    For every such clause that has at least one phrase with function Subj, Objc
    or Cmpl, a row of exactly 13 items is built:

        clause node, chapter, verse,
        subject node, subject words,
        object node, object words, 2nd object node, 2nd object words,
        complement node, complement words, 2nd complement node, 2nd complement words

    Only the first subject and the first two objects/complements are recorded;
    unused slots hold '_'. Phrase nodes are included next to the words for ease
    of reference.

    Returns:
        A tuple (clause_lst, phrases_in_clause_dict): the list of all clause
        nodes in the Psalms, and a defaultdict(list) mapping each clause node
        that has soc phrases to its row.
    '''
    placeholder = '_'
    # (phrase function, number of phrases of that function recorded per row)
    slots_per_function = (('Subj', 1), ('Objc', 2), ('Cmpl', 2))

    clause_lst = []
    phrases_in_clause_dict = collections.defaultdict(list)
    for node in NN():
        if F.otype.v(node) != 'clause':
            continue
        if F.book.v(L.u('book', node)) != 'Psalmi':
            continue
        clause_lst.append(node)

        # Group the clause's soc phrases by their function.
        phrase_identifier = collections.defaultdict(list)
        for phrase in L.d('phrase', node):
            function = F.function.v(phrase)
            if function in ('Subj', 'Objc', 'Cmpl'):
                phrase_identifier[function].append(phrase)

        # Clauses without any soc phrase get no row.
        if not phrase_identifier:
            continue

        clause_info = [
            str(node),
            str(F.chapter.v(L.u('chapter', node))),
            str(F.verse.v(L.u('verse', node))),
        ]
        for function, slots in slots_per_function:
            # Slicing keeps the row width fixed even when a clause has more
            # phrases of one function than there are columns; previously such
            # clauses produced a short row whose later columns shifted left.
            found = phrase_identifier.get(function, [])[:slots]
            for phrase in found:
                clause_info.append(phrase)
                clause_info.append(make_words(phrase))
            # Pad the unused (node, words) slots.
            clause_info.extend([placeholder] * (2 * (slots - len(found))))
        phrases_in_clause_dict[node] = clause_info

    return clause_lst, phrases_in_clause_dict

In [5]:
clause_lst, phrases_in_clause_dict = get_soc_info()

header = ['clnode', 'chapter', 'verse', 'subj_n', 'subj', 'obj_n', 'obj', 'obj_n2', 'obj2',
          'cmpl_n', 'cmpl', 'cmpl_n2', 'cmpl2']

# Write one CSV row per clause that has soc information.
# The csv module takes care of quoting, so a field containing a comma or quote
# cannot corrupt the column layout; newline='' leaves line endings to the writer.
with open("explicit-subjects-objects-complements.csv", 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(header)

    for clause in clause_lst:
        # .get() avoids inserting an empty list into the defaultdict for
        # clauses that have no soc phrases.
        row = phrases_in_clause_dict.get(clause)
        if row:
            writer.writerow(row)