# Test Phrase Grammar on Many Functions

In [1]:
import pickle
import pandas as pd
import numpy as np
import sys
from datetime import datetime
import collections
import networkx as nx
from pathlib import Path

# import custom Construction builders and rules
from cx_analysis.search import SearchCX
from cx_analysis.cx import Construction
from cx_analysis.build import CXbuilder, CXbuilderTF
import cx_analysis.graph_nav as nav
from dataset import build_dataset

# add grammar to path
sys.path.append('../cxs')
from word_grammar import Words
from phrase_grammar import Subphrases
from phrase_classes import SinglePhrase

# import Text-fabric data
import tf_tools.formatting as form
from tf_tools.load import load_tf
from tf_tools.tokenizers import tokenize_surface
from tf_tools.formatting import book2sbl

# directory that later cells use for the pickled constructions and the CSV dataset
data_path = Path('../all_function_cxs')

# load Text-Fabric together with the extra `nhead` and `mother` features
# (accessed below as E.nhead and E.mother);
# NOTE(review): hoist=globals() presumably injects the TF API shortcuts
# (F, E, L, T) used by later cells into the notebook namespace — confirm
TF, API, A = load_tf('nhead mother', hoist=globals())

This is Text-Fabric 8.1.1
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

122 features found and 5 ignored
  0.00s loading features ...
   |     0.00s Dataset without structure sections in otext:no structure functions in the T-API
  6.39s All features loaded/computed - for details use loadLog()


In [2]:
# bind the SearchCX display/search helpers to short module-level names
cx_show = SearchCX(A)
pretty = cx_show.pretty
prettyconds = cx_show.prettyconds
showcx = cx_show.showcx
search = cx_show.search

In [3]:
# phrase functions that are included in the test set
test_functs = {
    'Time',
    'Subj',
    'Objc',
    'Loca',
    'Adju',
    'Cmpl',
}

In [4]:
# Assemble the test set: phrases with one of the tested functions that
# consist of a single phrase atom, have exactly one head word, take part
# in no mother/daughter relations, and belong to one of the listed types.

test_phrases = []

for phrase in F.otype.s('phrase'):

    # gather the structural facts about this phrase
    function = F.function.v(phrase)
    n_atoms = len(L.d(phrase, 'phrase_atom'))
    heads = E.nhead.t(phrase)
    daughters = E.mother.t(phrase)
    mothers = E.mother.f(phrase)
    n_heads, n_daughters, n_mothers = len(heads), len(daughters), len(mothers)

    # every criterion must hold for the phrase to be selected
    criteria = (
        function in test_functs,
        n_atoms == 1,
        n_heads == 1,
        n_daughters == 0,
        n_mothers == 0,
        F.typ.v(phrase) in {'PP', 'AdvP', 'NP', 'PrNP', 'AdjP'},
    )
    if all(criteria):
        test_phrases.append(phrase)

print(len(test_phrases), 'phrases selected...')

80150 phrases selected...


In [6]:
# Flatten the sampled phrases into the list of words that the
# word-level analysis starts from.

phrase_sample = test_phrases

phrase_words = []
for sample_phrase in phrase_sample:
    phrase_words.extend(L.d(sample_phrase, 'word'))

len(phrase_words)

153949

# Word Analysis

In [7]:
# word CX builder, working in the context of the phrase
words = Words(A, context='phrase')

# run the builder over all sampled words, keep the result as a dict,
# and report how long it took
start = datetime.now()
print('Beginning word construction analysis...')
wordcxs = words.cxdict(phrase_words)
elapsed = datetime.now() - start
print(f'\t{elapsed} COMPLETE \t[ {len(wordcxs)} ] words loaded')

Beginning word construction analysis...
	0:01:29.201074 COMPLETE 	[ 153949 ] words loaded


# Subphrase Analysis

In [8]:
# subphrase CX builder, constructed from the word constructions computed above;
# its analyzestretch() method is what the cells below call per phrase
subphrases = Subphrases(wordcxs, A, context='phrase')

### Troubleshoot

In [9]:
# test_small = subphrases.prep(227481)
# showcx(test_small, conds=True, condenseType='sentence')

In [10]:
# A.search('''

# phrase function=Objc|Adju
#     word pdp=prep
#     <: word pdp=prep prs#n/a|absent
    
# ''')

In [11]:
# test = subphrases.analyzestretch(L.d(783909, 'word'), debug=True)

# for res in test:
#     showcx(res, conds=True, condenseType='phrase',)

### Run on all phrases

In [12]:
# Run the subphrase analysis over every sampled phrase.
#
# Results:
#   phrase2cxs -- phrase node -> constructions returned by analyzestretch
#   nocxs      -- phrases that raised an error or matched no construction
#   cx_errors  -- phrase node -> repr of the exception raised during analysis

silent = True  # set to False to echo every failing phrase to stderr

phrase2cxs = collections.defaultdict(list)
nocxs = []
cx_errors = {}

# time it
start = datetime.now()

print(f'{datetime.now()-start} beginning subphrase analysis...')

for i, phrase in enumerate(phrase_sample):

    # analyze the full stretch of words in the phrase
    elements = L.d(phrase, 'word')

    # Catch Exception only: a bare `except:` also traps KeyboardInterrupt
    # and SystemExit, which would make this long-running loop impossible
    # to interrupt from the kernel.
    try:
        cxs = subphrases.analyzestretch(elements)
    except Exception as error:
        # keep the failure for later inspection instead of discarding it;
        # the repr is stored (not the exception) to avoid retaining tracebacks
        cx_errors[phrase] = repr(error)
        cxs = None

        if not silent:
            sys.stderr.write(f'error on {phrase}\t{T.text(phrase)}\n')

        # to troubleshoot a single failure, rerun it with
        # subphrases.analyzestretch(elements, debug=True)

    # save those phrases that have no matching constructions
    if cxs:
        phrase2cxs[phrase] = cxs
    else:
        nocxs.append(phrase)

    # report status; this is reached for failing phrases too, so a
    # progress line is never skipped
    if i % 2000 == 0 and i:
        print(f'\t{datetime.now()-start}\tdone with iter {i}/{len(phrase_sample)}')

print(f'{datetime.now()-start}\tCOMPLETE')
print('-'*20)
print(f'{len(phrase2cxs)} phrases matched with Constructions...')
print(f'{len(nocxs)} phrases not yet matched with Constructions...')

0:00:00.000075 beginning subphrase analysis...
	0:00:27.904240	done with iter 2000/80150
	0:00:51.844710	done with iter 4000/80150
	0:01:21.048740	done with iter 6000/80150
	0:01:52.349809	done with iter 8000/80150
	0:02:25.819672	done with iter 10000/80150
	0:03:00.125173	done with iter 12000/80150
	0:03:32.140493	done with iter 14000/80150
	0:03:58.780991	done with iter 16000/80150
	0:04:25.900385	done with iter 18000/80150
	0:04:52.120074	done with iter 20000/80150
	0:05:27.151098	done with iter 22000/80150
	0:05:56.954264	done with iter 24000/80150
	0:06:27.749558	done with iter 26000/80150
	0:06:55.227143	done with iter 28000/80150
	0:07:21.359782	done with iter 30000/80150
	0:07:48.812653	done with iter 32000/80150
	0:08:20.793298	done with iter 34000/80150
	0:08:48.896591	done with iter 36000/80150
	0:09:18.183210	done with iter 38000/80150
	0:09:44.926422	done with iter 40000/80150
	0:10:08.348771	done with iter 42000/80150
	0:10:32.491075	done with iter 44000/80150
	0:11:01.26

# Classifier

In [13]:
from pathlib import Path
import pickle
from dataset import build_dataset

In [14]:
# Acceptable head lexemes: the lexeme of the last 'head' role in every
# phrase that resolved to exactly one construction.
good_heads = {
    F.lex.v(list(cx_data[0].getsuccroles('head'))[-1])
    for cx_data in phrase2cxs.values()
    if len(cx_data) == 1
}

# tag the collected cxs with classifications
sp = SinglePhrase(phrase2cxs.values(), good_heads, A)
sp.label_cxs()

In [21]:
# persist the phrase -> constructions mapping to disk
cx_pickle = data_path / 'function_cxs.pickle'
with cx_pickle.open('wb') as handle:
    pickle.dump(phrase2cxs, handle)

## Build CSV Dataset

In [2]:
# reload the pickled phrase -> constructions mapping
# (unpickling can execute arbitrary code; only load files written by this notebook)
cx_pickle = data_path / 'function_cxs.pickle'
with cx_pickle.open('rb') as handle:
    phrase2cxs = pickle.load(handle)

In [3]:
# flatten the per-phrase construction lists into one list,
# then build the dataset from it
constructions = []
for phrase_cxs in phrase2cxs.values():
    constructions.extend(phrase_cxs)

dataset = build_dataset(constructions, A.api)

In [15]:
# export the dataset as CSV into the data directory
dataset.to_csv(data_path / 'function_data.csv')

In [4]:
# (rows, columns) of the dataset;
# NOTE(review): presumably one row per construction rather than per phrase — confirm
dataset.shape

(80948, 38)

In [5]:
# preview the first rows of the dataset
dataset.head()

Unnamed: 0_level_0,function,ref,book,ph_type,head,text,token,clause,sentence,classi,...,demonstrative,demon_str,demon_dist,ordinal,ord_str,cl_kind,verb,tense,verb_lex,book_sbl
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
651542,Time,Gen 1:1,Genesis,prep_ph,ראשׁית,בְּרֵאשִׁ֖ית,ב.ראשׁית,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,single.prep.bare.øanchor,...,False,,,False,,VC,True,qtl,ברא,Gen
651544,Subj,Gen 1:1,Genesis,cont,אלהים,אֱלֹהִ֑ים,אלהים,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,single.øanchor,...,False,,,False,,VC,True,qtl,ברא,Gen
651547,Subj,Gen 1:2,Genesis,defi_ph,ארץ,הָאָ֗רֶץ,ה.ארץ,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ,single.definite,...,False,,,False,,VC,True,qtl,היה,Gen
651551,Subj,Gen 1:2,Genesis,cont,חשׁך,חֹ֖שֶׁךְ,חשׁך,וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום,וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום,single.bare.øanchor,...,False,,,False,,NC,False,,,Gen
651554,Subj,Gen 1:2,Genesis,geni_ph,רוח,ר֣וּחַ אֱלֹהִ֔ים,רוח.אלהים,וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמּ...,וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמּ...,single.genitive,...,False,,,False,,VC,True,ptcp,רחף,Gen


In [9]:
# rows flagged as cardinal whose function is anything other than Time
is_cardinal = dataset['cardinal']
not_time = dataset['function'] != 'Time'
dataset[is_cardinal & not_time]

Unnamed: 0_level_0,function,ref,book,ph_type,head,text,token,clause,sentence,classi,...,demonstrative,demon_str,demon_dist,ordinal,ord_str,cl_kind,verb,tense,verb_lex,book_sbl
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
651632,Cmpl,Gen 1:9,Genesis,prep_ph,מקום,אֶל־מָקֹ֣ום אֶחָ֔ד,אל.מקום.אחד,יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֨יִם֙ אֶ...,יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֨יִם֙ אֶ...,single.prep.øanchor.quantified.cardinal,...,False,,,False,,VC,True,yqtl,קוה,Gen
652735,Objc,Gen 4:19,Genesis,numb_ph,אשׁה,שְׁתֵּ֣י נָשִׁ֑ים,שׁנים.אשׁה,וַיִּֽקַּֽח־לֹ֥ו לֶ֖מֶךְ שְׁתֵּ֣י נָשִׁ֑ים,וַיִּֽקַּֽח־לֹ֥ו לֶ֖מֶךְ שְׁתֵּ֣י נָשִׁ֑ים,single.øanchor.quantified.cardinal,...,False,,,False,,VC,True,wyqtl,לקח,Gen
653961,Cmpl,Gen 9:22,Genesis,prep_ph,אח,לִשְׁנֵֽי־אֶחָ֖יו,ל.שׁנים.אח,וַיַּגֵּ֥ד לִשְׁנֵֽי־אֶחָ֖יו בַּחֽוּץ׃,וַיַּגֵּ֥ד לִשְׁנֵֽי־אֶחָ֖יו בַּחֽוּץ׃,single.prep.quantified.cardinal,...,False,,,False,,VC,True,wyqtl,נגד,Gen
654159,Subj,Gen 10:25,Genesis,numb_ph,בן,שְׁנֵ֣י בָנִ֑ים,שׁנים.בן,וּלְעֵ֥בֶר יֻלַּ֖ד שְׁנֵ֣י בָנִ֑ים,וּלְעֵ֥בֶר יֻלַּ֖ד שְׁנֵ֣י בָנִ֑ים,single.øanchor.quantified.cardinal,...,False,,,False,,VC,True,qtl,ילד,Gen
654266,Subj,Gen 11:6,Genesis,numb_ph,עם,עַ֤ם אֶחָד֙,עם.אחד,הֵ֣ן עַ֤ם אֶחָד֙,הֵ֣ן עַ֤ם אֶחָד֙,single.øanchor.quantified.cardinal,...,False,,,False,,NC,False,,,Gen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
903465,Loca,2 Chr 33:5,2_Chronicles,prep_ph,חצר,בִּשְׁתֵּ֖י חַצְרֹ֥ות בֵּית־יְהוָֽה׃,ב.שׁנים.חצר.בית.יהוה,וַיִּ֥בֶן מִזְבְּחֹ֖ות לְכָל־צְבָ֣א הַשָּׁמָ֑י...,וַיִּ֥בֶן מִזְבְּחֹ֖ות לְכָל־צְבָ֣א הַשָּׁמָ֑י...,single.prep.genitive.quantified.cardinal,...,False,,,False,,VC,True,wyqtl,בנה,2 Chr
904238,Objc,2 Chr 35:7,2_Chronicles,numb_ph,בקר,בָקָ֖ר שְׁלֹ֣שֶׁת אֲלָפִ֑ים,בקר.שׁלשׁ.אלף,וּבָקָ֖ר שְׁלֹ֣שֶׁת אֲלָפִ֑ים,וַיָּ֣רֶם יֹאשִׁיָּ֣הוּ לִבְנֵ֪י הָעָ֟ם צֹ֞אן ...,single.øanchor.quantified.cardinal,...,False,,,False,,WP,False,,,2 Chr
904250,Objc,2 Chr 35:8,2_Chronicles,numb_ph,בקר,בָקָ֖ר שְׁלֹ֥שׁ מֵאֹֽות׃,בקר.שׁלשׁ.מאה,חִלְקִיָּ֨ה וּזְכַרְיָ֜הוּ וִֽיחִיאֵ֗ל נְגִידֵ...,חִלְקִיָּ֨ה וּזְכַרְיָ֜הוּ וִֽיחִיאֵ֗ל נְגִידֵ...,single.component.øanchor.quantified.cardinal,...,False,,,False,,VC,True,qtl,נתן,2 Chr
904256,Objc,2 Chr 35:9,2_Chronicles,numb_ph,בקר,בָקָ֖ר חֲמֵ֥שׁ מֵאֹֽות׃,בקר.חמשׁ.מאה,וְ֠כָֽנַנְיָהוּ וּשְׁמַֽעְיָ֨הוּ וּנְתַנְאֵ֜ל ...,וְ֠כָֽנַנְיָהוּ וּשְׁמַֽעְיָ֨הוּ וּנְתַנְאֵ֜ל ...,single.component.øanchor.quantified.cardinal,...,False,,,False,,VC,True,qtl,רום,2 Chr


In [10]:
# frequency of each value in the ph_type column, most common first
dataset.ph_type.value_counts()

prep_ph       42899
cont          17769
name           6867
geni_ph        5562
defi_ph        3629
numb_ph        2268
adjv_ph         588
advb            359
attrib_ph       333
qquant          258
card            228
prep             48
card_chain       42
demon_ph         38
ordn             35
appo_name        21
prde              3
intj              1
Name: ph_type, dtype: int64