# Testing Subphrase Parsing

In [1]:
import sys
import collections
import pickle
import random
import re
import copy
import networkx as nx
from datetime import datetime
import matplotlib.pyplot as plt
from Levenshtein import distance as lev_dist
from pprint import pprint

# local packages
from tf_tools.load import load_tf
from tf_tools.ling import is_disjoint
from positions import Positions, PositionsTF, Walker, Dummy
from cx_analysis.search import SearchCX
from cx_analysis.cx import Construction
from cx_analysis.build import CXbuilder, CXbuilderTF
from cx_analysis.word_grammar import Words
from cx_analysis.phrase_grammar import Subphrases

# load semantic vectors
# `semvector` (imported from the `locations` module) is the path passed to open() below
from locations import semvector
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point `semvector` at a file produced by a trusted source.
with open(semvector, 'rb') as infile: 
    semdist = pickle.load(infile)
    
# load and configure Text-Fabric
TF, api, A = load_tf()
# short aliases for the API members (F, E, T, L) referenced throughout the notebook
F, E, T, L = api.F, api.E, api.T, api.L
# display options: condense to 'phrase', show nodes, and include the extra feature 'st'
A.displaySetup(condenseType='phrase', withNodes=True, extraFeatures='st')

# load visualizers
# `se` is used by later cells to display constructions (se.showcx)
se = SearchCX(A)

# load timephrases
# every node whose otype is 'timephrase', materialised as a list
alltimes = list(F.otype.s('timephrase'))
# keep only the time phrases for which is_disjoint(ph, A) is false
timephrases = [ph for ph in alltimes if not is_disjoint(ph, A)]

# load word CXs
words = Words(A) # word CX builder

# analyze all matches; return as dict
start = datetime.now()  # wall-clock start, used only for the elapsed-time report below
print('Beginning word construction analysis...')
# feed the builder every word node contained in the selected time phrases
wordcxs = words.cxdict(
    word for tp in timephrases
        for word in L.d(tp, 'word')
)
# report elapsed time and the number of entries in the resulting dict
print(f'\t{datetime.now() - start} COMPLETE \t[ {len(wordcxs)} ] words loaded')

This is Text-Fabric 7.8.12
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

119 features found and 6 ignored
  0.00s loading features ...
   |     0.00s No structure info in otext, the structure part of the T-API cannot be used
  5.22s All features loaded/computed - for details use loadLog()


Beginning word construction analysis...
	0:00:06.267461 COMPLETE 	[ 12887 ] words loaded


In [2]:
# time phrase CX builder
# built from the word CX dict (wordcxs), the unpickled semantic data (semdist),
# and the Text-Fabric app handle (A); later cells call its pattern methods via `spc`
spc = Subphrases(wordcxs, semdist, A)

### Small Tests

In [14]:
# run the `geni` pattern at node 991 (presumably a word node — confirm)
test_small = spc.geni(991)
# display the resulting construction; conds=True also prints each pattern's condition results
se.showcx(test_small, conds=True)

{   '__cx__': 'geni_ph',
    'geni': {'992': 992, '993': 993, '__cx__': 'clause'},
    'head': {'__cx__': 'cont', 'head': 991}}

-- CX geni_ph (991, 992, 993) --
pattern: geni_ph
P(-1, st) == c                                          False
P(-1).name not in {qquant,card}                           True
P(-1).name != prep                                      False

pattern: clause
genitive edge found                                      True

-- CX clause (992, 993) --
pattern: clause
genitive edge relation found                             True

-- CX cont (991,) --
pattern: cont
bool(F.pdp.v(991))                                       True



### Stretch Tests

In [7]:
# NOTE: disabled cell. When uncommented it runs spc.analyzestretch over the
# words under node 1450075 and shows every returned construction.
# test = spc.analyzestretch(L.d(1450075, 'word'), debug=False)

# for res in test:
#     se.showcx(res, conds=True)

### Pattern Searches

In [3]:
# NOTE: disabled cell. Uncommenting it rebinds `words` (the Words builder
# created in the first cell) to a plain list of word nodes, and defines the
# `results` that the later disabled cells read.
# words = [w for ph in timephrases for w in L.d(ph, 'word')]

# results = se.search(words, spc.appo_name, pattern='entity_name', show=100, shuffle=False)

### Analyze Results

In [4]:
# NOTE: disabled cell. Requires `results` from the (also disabled) pattern
# search cell; prints the head/appo lexemes of each result and their semdist entry.
# for res in results:
#     head, appo = list(res.getsuccroles('head'))[-1], list(res.getsuccroles('appo'))[-1]
#     hlex, alex = F.lex.v(int(head)), F.lex.v(int(appo))
    
#     se.showcx(res)
#     print()
#     print(f'lexs: {hlex} x {alex}')
#     print(f'dist: {semdist[hlex][alex]}')
#     print()

### Stretch Tests on Results

In [45]:
# NOTE: disabled cell. Requires `results` from the (also disabled) pattern
# search cell; re-analyzes the full word stretch of each result's timephrase.
# elements = sorted(set(L.u(res.element, 'timephrase')[0] for res in results))

# for el in elements:
    
#     stretch = L.d(el, 'word')
#     test = spc.analyzestretch(stretch)
    
#     for res in test:
#         se.showcx(res)