# Generate phrase parsings for multiple functions

In [1]:
import collections
import os
import pickle
import sys
from datetime import datetime
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd

# import custom Construction builders and rules
from cx_analysis.search import SearchCX
from cx_analysis.cx import Construction
from cx_analysis.build import CXbuilder, CXbuilderTF
import cx_analysis.graph_nav as nav
from dataset import build_dataset

# add grammar to path
sys.path.append('../cxs')
from word_grammar import Words
from phrase_grammar import Subphrases
from phrase_classes import SinglePhrase

# import Text-fabric data
import tf_tools.formatting as form
from tf_tools.load import load_tf
from tf_tools.tokenizers import tokenize_surface
from tf_tools.formatting import book2sbl

# Locate the project repository. The location can be overridden with the
# TIME_COLLOCATIONS_REPO environment variable so the notebook is not tied
# to a single machine; when the variable is unset the original path is used.
repo = Path(os.environ.get(
    'TIME_COLLOCATIONS_REPO',
    '/Users/cody/github/CambridgeSemiticsLab/time_collocations',
))
data_path = repo.joinpath('data/advb_article')

# Load Text-Fabric. Later cells use E.nhead and E.mother, so the string
# argument presumably requests those extra features, and hoist=globals()
# presumably puts the API handles (F, L, E, T) into the notebook namespace
# -- NOTE(review): confirm against tf_tools.load.load_tf.
TF, API, A = load_tf('nhead mother', hoist=globals())

This is Text-Fabric 8.3.3
Api reference : https://annotation.github.io/text-fabric/cheatsheet.html

124 features found and 4 ignored
  0.00s loading features ...
   |     0.00s Dataset without structure sections in otext:no structure functions in the T-API
  6.95s All features loaded/computed - for details use loadLog()


In [2]:
# set up the display helpers for Construction objects

cx_show = SearchCX(A)

# expose the helper methods under short names for interactive use
pretty = cx_show.pretty
prettyconds = cx_show.prettyconds
showcx = cx_show.showcx
search = cx_show.search

In [3]:
# phrase function labels to include in the test set
test_functs = {
    'Adju',
    'Cmpl',
    'Loca',
    'Objc',
    'Subj',
    'Time',
}

In [4]:
# build testset: we only want to collect single-atom phrases
# with no mother / daughter relations and exactly 1 head word

# phrase types that are never collected
excluded_types = {'InrP', 'NegP', 'InjP', 'CP', 'VP'}

test_phrases = []

for phrase in F.otype.s('phrase'):

    # cheap feature checks first, so the locality and edge lookups
    # below only run for phrases that can still qualify
    if F.function2.v(phrase) not in test_functs:
        continue
    if F.typ.v(phrase) in excluded_types:
        continue

    # heads are only counted, never indexed: a phrase without any
    # nhead edge is simply skipped instead of raising an IndexError
    if all([
        len(L.d(phrase, 'phrase_atom')) == 1,  # single phrase atom
        len(E.nhead.t(phrase)) == 1,           # exactly one head
        len(E.mother.t(phrase)) == 0,          # no daughters
        len(E.mother.f(phrase)) == 0,          # no mothers
    ]):
        test_phrases.append(phrase)

print(len(test_phrases), 'phrases selected...')

84739 phrases selected...


In [5]:
# gather every word of the sampled phrases; these are the
# starting point for the word-level analysis

phrase_sample = test_phrases

phrase_words = [w for ph in phrase_sample for w in L.d(ph, 'word')]

len(phrase_words)

158671

# Word Analysis

In [6]:
# word CX builder
words = Words(A, context='phrase')

# run the builder over all collected words, timing the run;
# the matches come back as a dict
start = datetime.now()
print('Beginning word construction analysis...')
wordcxs = words.cxdict(phrase_words)
elapsed = datetime.now() - start
print(f'\t{elapsed} COMPLETE \t[ {len(wordcxs)} ] words loaded')

Beginning word construction analysis...
	0:01:35.870065 COMPLETE 	[ 158671 ] words loaded


# Subphrase Analysis

In [7]:
# subphrase CX builder; it is given the word constructions (wordcxs)
# produced by the word analysis, plus the Text-Fabric app object
subphrases = Subphrases(wordcxs, A, context='phrase')

### Troubleshoot

In [8]:
# test_small = subphrases.prep(227481)
# showcx(test_small, conds=True, condenseType='sentence')

In [9]:
# A.search('''

# phrase function=Objc|Adju
#     word pdp=prep
#     <: word pdp=prep prs#n/a|absent
    
# ''')

In [10]:
# test = subphrases.analyzestretch(L.d(783909, 'word'), debug=True)

# for res in test:
#     showcx(res, conds=True, condenseType='phrase',)

### Run on all phrases

In [11]:
silent = True  # set to False to report every failing phrase on stderr

phrase2cxs = collections.defaultdict(list)  # phrase node -> matched constructions
nocxs = []      # phrases without a matching construction, or whose analysis raised
cx_errors = {}  # phrase node -> repr of the exception raised during its analysis

# time it
start = datetime.now()

print(f'{datetime.now()-start} beginning subphrase analysis...')

for i, phrase in enumerate(phrase_sample):

    # analyze all known relas
    elements = L.d(phrase, 'word')

    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit can still stop this long-running
    # loop, and keep a record of the error for later inspection.
    try:
        cxs = subphrases.analyzestretch(elements)
    except Exception as error:
        nocxs.append(phrase)
        cx_errors[phrase] = repr(error)

        if not silent:
            sys.stderr.write(f'error on {phrase}\t{T.text(phrase)}\n')

        # NB: a failing phrase also skips the status report below
        continue

#         to debug a failure, replace the handling above with:
#         sys.stderr.write(f'\nFAIL...running with debug...\n')
#         pretty(phrase)
#         subphrases.analyzestretch(elements, debug=True)
#         raise Exception('...debug complete...')

    # save those phrases that have no matching constructions
    if not cxs:
        nocxs.append(phrase)
    else:
        phrase2cxs[phrase] = cxs

    # report status
    if i % 2000 == 0 and i:
        print(f'\t{datetime.now()-start}\tdone with iter {i}/{len(phrase_sample)}')

print(f'{datetime.now()-start}\tCOMPLETE')
print('-'*20)
print(f'{len(phrase2cxs)} phrases matched with Constructions...')
print(f'{len(nocxs)} phrases not yet matched with Constructions...')
print(f'{len(cx_errors)} of the unmatched phrases raised an error during analysis...')

0:00:00.000042 beginning subphrase analysis...
	0:00:26.189508	done with iter 2000/84739
	0:00:52.341568	done with iter 4000/84739
	0:01:18.599864	done with iter 6000/84739
	0:01:45.953206	done with iter 8000/84739
	0:02:16.074562	done with iter 10000/84739
	0:02:48.669258	done with iter 12000/84739
	0:03:16.536969	done with iter 14000/84739
	0:03:45.172717	done with iter 16000/84739
	0:04:12.636798	done with iter 18000/84739
	0:04:39.394427	done with iter 20000/84739
	0:05:08.933871	done with iter 22000/84739
	0:05:37.976384	done with iter 24000/84739
	0:06:02.594041	done with iter 26000/84739
	0:06:28.701193	done with iter 28000/84739
	0:06:52.988728	done with iter 30000/84739
	0:07:17.070217	done with iter 32000/84739
	0:07:45.440738	done with iter 34000/84739
	0:08:11.897446	done with iter 36000/84739
	0:08:36.632683	done with iter 38000/84739
	0:09:03.497871	done with iter 40000/84739
	0:09:30.464239	done with iter 42000/84739
	0:09:52.359476	done with iter 44000/84739
	0:10:14.69

# Classifier

In [12]:
from pathlib import Path
import pickle
from dataset import build_dataset

In [13]:
# collect the head lexemes of every phrase that was parsed into exactly
# one construction; the last 'head' role of that construction is used
good_heads = {
    F.lex.v(list(cx_data[0].getsuccroles('head'))[-1])
    for cx_data in phrase2cxs.values()
    if len(cx_data) == 1
}

# tag the cxs with classifications
sp = SinglePhrase(phrase2cxs.values(), good_heads, A)
sp.label_cxs()

In [14]:
# store the phrase -> constructions mapping on disk as a pickle
with data_path.joinpath('function_cxs.pickle').open('wb') as outfile:
    pickle.dump(phrase2cxs, outfile)

## Build CSV Dataset

In [2]:
# read the phrase -> constructions mapping back from the pickle.
# NB: pickle.load can execute arbitrary code; only load files that
# were written by this project.
with data_path.joinpath('function_cxs.pickle').open('rb') as infile:
    phrase2cxs = pickle.load(infile)

In [3]:
# flatten the per-phrase lists into one stream of constructions
all_cxs = (cx for cxs in phrase2cxs.values() for cx in cxs)

# keep only the constructions whose node is in Hebrew, then build the table
constructions = [cx for cx in all_cxs if F.language.v(int(cx)) == 'Hebrew']
dataset = build_dataset(constructions, A.api)

In [4]:
# write the dataset to function_data.csv inside data_path
dataset.to_csv(data_path.joinpath('function_data.csv'))

In [5]:
# size check: (number of rows, number of columns)
dataset.shape

(84357, 43)

In [6]:
# list the column names of the dataset
dataset.columns

Index(['function', 'ref', 'book', 'ph_type', 'head', 'text', 'token', 'clause',
       'sentence', 'classi', 'head_node', 'head_voc', 'head_etcbc', 'head_pos',
       'head_type', 'plural', 'suffix', 'preposition', 'leading_prep',
       'trailing_prep', 'tokenized_prep', 'extended_prep', 'ø', 'øanchor',
       'genitive', 'definite', 'quantified', 'quant_str', 'cardinal',
       'qualitative', 'qual_str', 'demonstrative', 'demon_str', 'demon_dist',
       'ordinal', 'ord_str', 'cl_kind', 'verb', 'tense', 'verb_lex',
       'book_sbl', 'lang', 'genre'],
      dtype='object')

In [7]:
# preview the first rows of the dataset
dataset.head()

Unnamed: 0_level_0,function,ref,book,ph_type,head,text,token,clause,sentence,classi,...,demon_dist,ordinal,ord_str,cl_kind,verb,tense,verb_lex,book_sbl,lang,genre
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
651542,Time,Gen 1:1,Genesis,prep_ph,ראשׁית,בְּרֵאשִׁ֖ית,ב.ראשׁית,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,single.prep.bare.øanchor,...,,False,,VC,True,qtl,ברא,Gen,Hebrew,prose
651544,Subj,Gen 1:1,Genesis,cont,אלהים,אֱלֹהִ֑ים,אלהים,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,single.øanchor,...,,False,,VC,True,qtl,ברא,Gen,Hebrew,prose
651547,Subj,Gen 1:2,Genesis,defi_ph,ארץ,הָאָ֗רֶץ,ה.ארץ,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ,single.definite,...,,False,,VC,True,qtl,היה,Gen,Hebrew,prose
651551,Subj,Gen 1:2,Genesis,cont,חשׁך,חֹ֖שֶׁךְ,חשׁך,וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום,וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום,single.bare.øanchor,...,,False,,NC,False,,,Gen,Hebrew,prose
651554,Subj,Gen 1:2,Genesis,geni_ph,רוח,ר֣וּחַ אֱלֹהִ֔ים,רוח.אלהים,וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמּ...,וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמּ...,single.genitive,...,,False,,VC,True,ptcp,רחף,Gen,Hebrew,prose


In [8]:
# show every field of the first row
dataset.iloc[0]

function                                                       Time
ref                                                         Gen 1:1
book                                                        Genesis
ph_type                                                     prep_ph
head                                                         ראשׁית
text                                                  בְּרֵאשִׁ֖ית 
token                                                      ב.ראשׁית
clause            בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...
sentence          בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...
classi                                     single.prep.bare.øanchor
head_node                                                         2
head_voc                                                   רֵאשִׁית
head_etcbc                                                   R>CJT/
head_pos                                                       subs
head_type                                       

In [9]:
# rows flagged as cardinal whose function is anything other than Time
is_cardinal = dataset['cardinal']
not_time = dataset['function'] != 'Time'
dataset[is_cardinal & not_time]

Unnamed: 0_level_0,function,ref,book,ph_type,head,text,token,clause,sentence,classi,...,demon_dist,ordinal,ord_str,cl_kind,verb,tense,verb_lex,book_sbl,lang,genre
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
651632,Cmpl,Gen 1:9,Genesis,prep_ph,מקום,אֶל־מָקֹ֣ום אֶחָ֔ד,אל.מקום.אחד,יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֨יִם֙ אֶ...,יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֨יִם֙ אֶ...,single.prep.øanchor.quantified.cardinal,...,,False,,VC,True,yqtl,קוה,Gen,Hebrew,prose
651988,Cmpl,Gen 2:7,Genesis,prep_ph,אף,בְּאַפָּ֖יו,ב.אף,וַיִּפַּ֥ח בְּאַפָּ֖יו נִשְׁמַ֣ת חַיִּ֑ים,וַיִּפַּ֥ח בְּאַפָּ֖יו נִשְׁמַ֣ת חַיִּ֑ים,single.prep.suffix,...,,False,,VC,True,wyqtl,נפח,Gen,Hebrew,prose
652246,Subj,Gen 3:5,Genesis,cont,עין,עֵֽינֵיכֶ֑ם,עין,וְנִפְקְח֖וּ עֵֽינֵיכֶ֑ם,כִּ֚י יֹדֵ֣עַ אֱלֹהִ֔ים כִּ֗י בְּיֹום֙ אֲכָלְכ...,single.suffix,...,,False,,VC,True,wqtl,פקח,Gen,Hebrew,prose
652262,Adju,Gen 3:6,Genesis,prep_ph,עין,לָעֵינַ֗יִם,ל.ה.עין,וְכִ֧י תַֽאֲוָה־ה֣וּא לָעֵינַ֗יִם,וַתֵּ֣רֶא הָֽאִשָּׁ֡ה כִּ֣י טֹוב֩ הָעֵ֨ץ לְמַא...,single.prep.definite,...,,False,,NC,False,,,Gen,Hebrew,prose
652280,Subj,Gen 3:7,Genesis,geni_ph,עין,עֵינֵ֣י שְׁנֵיהֶ֔ם,עין.שׁנים,וַתִּפָּקַ֨חְנָה֙ עֵינֵ֣י שְׁנֵיהֶ֔ם,וַתִּפָּקַ֨חְנָה֙ עֵינֵ֣י שְׁנֵיהֶ֔ם,single.geni_cardinal,...,,False,,VC,True,wyqtl,פקח,Gen,Hebrew,prose
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904250,Objc,2 Chr 35:8,2_Chronicles,numb_ph,בקר,בָקָ֖ר שְׁלֹ֥שׁ מֵאֹֽות׃,בקר.שׁלשׁ.מאה,חִלְקִיָּ֨ה וּזְכַרְיָ֜הוּ וִֽיחִיאֵ֗ל נְגִידֵ...,חִלְקִיָּ֨ה וּזְכַרְיָ֜הוּ וִֽיחִיאֵ֗ל נְגִידֵ...,single.component.øanchor.quantified.cardinal,...,,False,,VC,True,qtl,נתן,2 Chr,Hebrew,prose
904256,Objc,2 Chr 35:9,2_Chronicles,numb_ph,בקר,בָקָ֖ר חֲמֵ֥שׁ מֵאֹֽות׃,בקר.חמשׁ.מאה,וְ֠כָֽנַנְיָהוּ וּשְׁמַֽעְיָ֨הוּ וּנְתַנְאֵ֜ל ...,וְ֠כָֽנַנְיָהוּ וּשְׁמַֽעְיָ֨הוּ וּנְתַנְאֵ֜ל ...,single.component.øanchor.quantified.cardinal,...,,False,,VC,True,qtl,רום,2 Chr,Hebrew,prose
904545,Cmpl,2 Chr 36:6,2_Chronicles,prep_ph,נחשׁת,בַּֽנְחֻשְׁתַּ֔יִם,ב.ה.נחשׁת,וַיַּֽאַסְרֵ֨הוּ֙ בַּֽנְחֻשְׁתַּ֔יִם,וַיַּֽאַסְרֵ֨הוּ֙ בַּֽנְחֻשְׁתַּ֔יִם לְהֹלִיכֹ...,single.prep.definite,...,,False,,VC,True,wyqtl,אסר,2 Chr,Hebrew,prose
904582,Adju,2 Chr 36:9,2_Chronicles,prep_ph,עין,בְּעֵינֵ֥י יְהוָֽה׃,ב.עין.יהוה,הָרַ֖ע בְּעֵינֵ֥י יְהוָֽה׃,וַיַּ֥עַשׂ הָרַ֖ע בְּעֵינֵ֥י יְהוָֽה׃,single.prep.genitive,...,,False,,NC,False,,,2 Chr,Hebrew,prose


In [10]:
# number of rows per phrase type (ph_type column)
dataset.ph_type.value_counts()

prep_ph       42360
cont          16943
name           6834
geni_ph        5503
prps           3624
defi_ph        3622
numb_ph        2197
prde            717
adjv_ph         580
prin            551
advb            499
attrib_ph       333
qquant          248
card            220
prep             48
card_chain       41
appo_name        21
ordn              9
demon_ph          6
intj              1
Name: ph_type, dtype: int64

In [11]:
# number of rows per genre
dataset.genre.value_counts()

prose          39061
prophetic      19524
poetry         14994
instruction     8180
list            2598
Name: genre, dtype: int64