# Generate phrase parsings for multiple functions

In [None]:
import pickle
import pandas as pd
import numpy as np
import sys
from datetime import datetime
import collections
import networkx as nx
from pathlib import Path

# import custom Construction builders and rules
from cx_analysis.search import SearchCX
from cx_analysis.cx import Construction
from cx_analysis.build import CXbuilder, CXbuilderTF
import cx_analysis.graph_nav as nav
from dataset import build_dataset

# add grammar to path
sys.path.append('../cxs')
from word_grammar import Words
from phrase_grammar import Subphrases
from phrase_classes import SinglePhrase

# import Text-fabric data
import tf_tools.formatting as form
from tf_tools.load import load_tf
from tf_tools.tokenizers import tokenize_surface
from tf_tools.formatting import book2sbl

# project locations: repository root and the folder that receives
# this notebook's outputs (pickle + csv written further down)
repo = Path('/Users/cody/github/CambridgeSemiticsLab/time_collocations')
data_path = repo / 'data/advb_article'

# load Text-Fabric; the `nhead` and `mother` edge features are requested
# because the selection cell below reads E.nhead and E.mother
# NOTE(review): hoist=globals() presumably injects the TF shortcuts
# (F, E, L, T) into the notebook namespace; they are used below
# without any other definition. Confirm in tf_tools.load
TF, API, A = load_tf('nhead mother', hoist=globals())

In [2]:
# configure visualizers for Construction objects
cx_show = SearchCX(A)

# expose the viewer's methods under short notebook-level names
pretty = cx_show.pretty
prettyconds = cx_show.prettyconds
showcx = cx_show.showcx
search = cx_show.search

In [3]:
# phrase function labels that are sampled for parsing
test_functs = {
    'Adju',
    'Cmpl',
    'Loca',
    'Objc',
    'Subj',
    'Time',
}

In [4]:
# build testset: we only want to collect single-atom phrases
# with no mother/daughter relations and exactly 1 head word

test_phrases = []

for phrase in F.otype.s('phrase'):

    # The conditions are chained with `and`, so evaluation stops at the
    # first failing check and the edge lookups are skipped for phrases
    # that were already rejected. No head is indexed here: a phrase
    # without any `nhead` edge is simply filtered out by the length
    # check instead of raising IndexError.
    if (
        F.function2.v(phrase) in test_functs
        and F.typ.v(phrase) not in {'InrP', 'NegP', 'InjP', 'CP', 'VP'}
        # exactly one phrase atom
        and len(L.d(phrase, 'phrase_atom')) == 1
        # exactly one head word (nhead edges arriving at the phrase)
        and len(E.nhead.t(phrase)) == 1
        # no daughters (mother edges arriving at the phrase)
        and len(E.mother.t(phrase)) == 0
        # no mothers (mother edges leaving the phrase)
        and len(E.mother.f(phrase)) == 0
    ):
        test_phrases.append(phrase)

print(len(test_phrases), 'phrases selected...')

84739 phrases selected...


In [5]:
# collect words to begin analysis with

# the full selection is used as the sample
phrase_sample = test_phrases

# flatten the sample into one list holding the word nodes of every phrase
phrase_words = []
for sampled_phrase in phrase_sample:
    phrase_words.extend(L.d(sampled_phrase, 'word'))

len(phrase_words)

158671

# Word Analysis

In [6]:
# word CX builder
words = Words(A, context='phrase')

# analyze all matches; return as dict
# (timed, since the recorded run of this cell took about a minute and a half)
start = datetime.now()
print(f'Beginning word construction analysis...')
wordcxs = words.cxdict(phrase_words)
elapsed = datetime.now() - start
n_loaded = len(wordcxs)
print(f'\t{elapsed} COMPLETE \t[ {n_loaded} ] words loaded')

Beginning word construction analysis...
	0:01:35.870065 COMPLETE 	[ 158671 ] words loaded


# Subphrase Analysis

In [7]:
# subphrase CX builder; it receives the word constructions (`wordcxs`)
# produced by the word analysis, plus `A` as returned by load_tf
subphrases = Subphrases(wordcxs, A, context='phrase')

### Troubleshoot

In [8]:
# test_small = subphrases.prep(227481)
# showcx(test_small, conds=True, condenseType='sentence')

In [9]:
# A.search('''

# phrase function=Objc|Adju
#     word pdp=prep
#     <: word pdp=prep prs#n/a|absent
    
# ''')

In [10]:
# test = subphrases.analyzestretch(L.d(783909, 'word'), debug=True)

# for res in test:
#     showcx(res, conds=True, condenseType='phrase',)

### Run on all phrases

In [11]:
# set to False to report every failing phrase on stderr
silent = True

# phrase node -> constructions returned by the subphrase analysis
phrase2cxs = collections.defaultdict(list)
# phrases that either raised during analysis or yielded no constructions
nocxs = []

# time it
start = datetime.now()

print(f'{datetime.now()-start} beginning subphrase analysis...')

for i, phrase in enumerate(phrase_sample):

    # analyze all known relas
    elements = L.d(phrase, 'word')

    # analyze with debug exceptions
    # Only `Exception` is caught: a bare `except:` would also swallow
    # KeyboardInterrupt / SystemExit, which would make this long-running
    # loop impossible to interrupt from the kernel.
    try:
        cxs = subphrases.analyzestretch(elements)
    except Exception:
        nocxs.append(phrase)

        if not silent:
            sys.stderr.write(f'error on {phrase}\t{T.text(phrase)}\n')

        continue

        # To inspect a failure, replace the handling above with:
#         sys.stderr.write(f'\nFAIL...running with debug...\n')
#         pretty(phrase)
#         subphrases.analyzestretch(elements, debug=True)
#         raise Exception('...debug complete...')

    # save those phrases that have no matching constructions
    if not cxs:
        nocxs.append(phrase)
    else:
        phrase2cxs[phrase] = cxs

    # report status every 2000 phrases (skipping the very first iteration)
    if i % 2000 == 0 and i:
        print(f'\t{datetime.now()-start}\tdone with iter {i}/{len(phrase_sample)}')

print(f'{datetime.now()-start}\tCOMPLETE')
print('-'*20)
print(f'{len(phrase2cxs)} phrases matched with Constructions...')
print(f'{len(nocxs)} phrases not yet matched with Constructions...')

0:00:00.000042 beginning subphrase analysis...
	0:00:26.189508	done with iter 2000/84739
	0:00:52.341568	done with iter 4000/84739
	0:01:18.599864	done with iter 6000/84739
	0:01:45.953206	done with iter 8000/84739
	0:02:16.074562	done with iter 10000/84739
	0:02:48.669258	done with iter 12000/84739
	0:03:16.536969	done with iter 14000/84739
	0:03:45.172717	done with iter 16000/84739
	0:04:12.636798	done with iter 18000/84739
	0:04:39.394427	done with iter 20000/84739
	0:05:08.933871	done with iter 22000/84739
	0:05:37.976384	done with iter 24000/84739
	0:06:02.594041	done with iter 26000/84739
	0:06:28.701193	done with iter 28000/84739
	0:06:52.988728	done with iter 30000/84739
	0:07:17.070217	done with iter 32000/84739
	0:07:45.440738	done with iter 34000/84739
	0:08:11.897446	done with iter 36000/84739
	0:08:36.632683	done with iter 38000/84739
	0:09:03.497871	done with iter 40000/84739
	0:09:30.464239	done with iter 42000/84739
	0:09:52.359476	done with iter 44000/84739
	0:10:14.69

# Classifier

In [12]:
from pathlib import Path
import pickle
from dataset import build_dataset

In [13]:
# compile acceptable head lexemes from single-phrased CXs,
# i.e. phrases that were parsed into exactly one construction
good_heads = set()
for cx_list in phrase2cxs.values():
    if len(cx_list) != 1:
        continue
    # take the last of the construction's 'head' roles
    head_nodes = list(cx_list[0].getsuccroles('head'))
    good_heads.add(F.lex.v(head_nodes[-1]))

# tag the cxs with classifications
sp = SinglePhrase(phrase2cxs.values(), good_heads, A)
sp.label_cxs()

In [14]:
# persist the phrase -> constructions mapping so the dataset cells
# can reload it without re-running the analysis
cxs_pickle_path = data_path.joinpath('function_cxs.pickle')
with cxs_pickle_path.open('wb') as outfile:
    pickle.dump(phrase2cxs, outfile)

## Build CSV Dataset

In [2]:
# reload the phrase -> constructions mapping pickled by the classifier
# section of this notebook (function_cxs.pickle under data_path).
# NOTE: unpickling executes code embedded in the file, so only load a
# pickle that this notebook itself produced.
# NOTE(review): the execution counts restart at this cell, so it appears
# to have been run in a fresh kernel; it still needs the imports/setup
# cell (pickle, data_path) to have been run first.
with open(data_path.joinpath('function_cxs.pickle'), 'rb') as infile:
    phrase2cxs = pickle.load(infile)

In [3]:
# flatten the per-phrase construction lists, keeping only constructions
# whose node (int(cx)) has the language feature value 'Hebrew'
constructions = []
for cx_group in phrase2cxs.values():
    constructions.extend(
        cx for cx in cx_group
        if F.language.v(int(cx)) == 'Hebrew'
    )

# compile the tabular dataset from the retained constructions
dataset = build_dataset(constructions, A.api)

In [4]:
# export the compiled dataset as function_data.csv under data_path
dataset.to_csv(data_path.joinpath('function_data.csv'))

In [5]:
# sanity check: (number of rows, number of columns) of the dataset
dataset.shape

(84357, 45)

In [6]:
# list the column names available in the dataset
dataset.columns

Index(['function', 'ref', 'book', 'ph_type', 'head', 'text', 'token', 'clause',
       'sentence', 'classi', 'head_node', 'head_voc', 'head_etcbc', 'head_pos',
       'head_type', 'plural', 'suffix', 'preposition', 'leading_prep',
       'trailing_prep', 'tokenized_prep', 'extended_prep', 'ø', 'øanchor',
       'genitive', 'definite', 'quantified', 'quant_str', 'cardinal',
       'qualitative', 'qual_str', 'demonstrative', 'demon_str', 'demon_dist',
       'ordinal', 'ord_str', 'cl_kind', 'verb', 'tense', 'verb_lex',
       'book_sbl', 'lang', 'genre', 'nom_marks', 'has_nom'],
      dtype='object')

In [7]:
# preview the first rows of the dataset
dataset.head()

Unnamed: 0_level_0,function,ref,book,ph_type,head,text,token,clause,sentence,classi,...,ord_str,cl_kind,verb,tense,verb_lex,book_sbl,lang,genre,nom_marks,has_nom
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
651542,Time,Gen 1:1,Genesis,prep_ph,ראשׁית,בְּרֵאשִׁ֖ית,ב.ראשׁית,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,single.prep.bare.øanchor,...,,VC,True,qtl,ברא,Gen,Hebrew,prose,0,False
651544,Subj,Gen 1:1,Genesis,cont,אלהים,אֱלֹהִ֑ים,אלהים,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖י...,single.øanchor,...,,VC,True,qtl,ברא,Gen,Hebrew,prose,1,True
651547,Subj,Gen 1:2,Genesis,defi_ph,ארץ,הָאָ֗רֶץ,ה.ארץ,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ,וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ,single.definite,...,,VC,True,qtl,היה,Gen,Hebrew,prose,1,True
651551,Subj,Gen 1:2,Genesis,cont,חשׁך,חֹ֖שֶׁךְ,חשׁך,וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום,וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום,single.bare.øanchor,...,,NC,False,,,Gen,Hebrew,prose,0,False
651554,Subj,Gen 1:2,Genesis,geni_ph,רוח,ר֣וּחַ אֱלֹהִ֔ים,רוח.אלהים,וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמּ...,וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמּ...,single.genitive,...,,VC,True,ptcp,רחף,Gen,Hebrew,prose,1,True


In [13]:
# number of rows per value of the `ph_type` column, most frequent first
dataset.ph_type.value_counts()

prep_ph       42360
cont          16943
name           6834
geni_ph        5503
prps           3624
defi_ph        3622
numb_ph        2197
prde            717
adjv_ph         580
prin            551
advb            499
attrib_ph       333
qquant          248
card            220
prep             48
card_chain       41
appo_name        21
ordn              9
demon_ph          6
intj              1
Name: ph_type, dtype: int64

In [14]:
# number of rows per value of the `genre` column, most frequent first
dataset.genre.value_counts()

prose          39061
prophetic      19524
poetry         14994
instruction     8180
list            2598
Name: genre, dtype: int64

In [15]:
# number of rows per value of the `nom_marks` column, most frequent first
dataset.nom_marks.value_counts()

1    41187
0    28002
2    13750
3     1346
4       72
Name: nom_marks, dtype: int64