# Data access with STAM

Now that we have an "isomorphic" copy of the BHSA in STAM, let's see how we can access the data
by means of STAM methods.

In [1]:
# NOTE(review): `os` and `dirMake` are not used in the visible part of this notebook;
# they may be needed further down — confirm before removing.
import os
from itertools import chain

from tf.app import use
from tf.core.files import dirMake

from memutil import memUsage
# First memory reading, taken before any corpus is loaded.
# The readings printed by later cells show how much each load step adds.
memUsage()

Current:  0.13 GB
Delta:    0.13 GB


# Load Text-Fabric

For comparison and checks, we load the Text-Fabric BHSA.

In [2]:
# Load the BHSA through Text-Fabric. `hoist=globals()` puts the TF API handles
# (the `F`, `L` and `E` objects used in later cells) into the notebook namespace.
# NOTE(review): the ":clone" / checkout="clone" specifiers presumably select a local
# clone of the data rather than a download — confirm against the TF `use` docs.
A = use("ETCBC/bhsa:clone", checkout="clone", hoist=globals())
# Second memory reading: shows what loading Text-Fabric costs.
memUsage()

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


Current:  2.81 GB
Delta:    2.68 GB


# Load STAM

In [3]:
# `stam` is imported here rather than at the top of the notebook so that the
# memory readings around it are not mixed into the Text-Fabric measurements.
# `memUsage` is already imported in the first cell, so it is not imported again.
import stam

# Reading taken after importing stam but before loading the store, so that
# the next reading reflects the store alone.
memUsage()

# The STAM copy of the BHSA lives in a `stam` folder under Text-Fabric's temp
# directory; the store is read from its CSV file.
workDir = f"{A.tempDir}/stam"
storeC = stam.AnnotationStore(file=f"{workDir}/bhsa.store.stam.csv")
memUsage()

Current:  2.81 GB
Delta:    0.00 GB
Current: 14.38 GB
Delta:   11.57 GB


## Find `VP` phrases containing just one verb word

In TF, by means of a query:

In [4]:
# Search template for the task in the heading above: VP phrases that consist
# of a single verb word. Each result is a (phrase, word) tuple.
# NOTE(review): `:=` and `=:` are TF relational operators; presumably they pin the
# word to the first and the last slot of the phrase — confirm against the TF search docs.
query = """
phrase typ=VP
    := word sp=verb
    =:
"""

# Run the template against the loaded corpus; TF reports timing and result count.
resultsQ = A.search(query)

  0.47s 62770 results


In TF, by means of hand-coding:

In [5]:
# Hand-coded counterpart of the query: walk over all word nodes, keep the
# verbs, and retain those whose phrase is a VP occupying exactly one slot.
# Results are stored as (phrase, word) tuples, the same shape as the query results.
resultsH = []

for wordNode in F.otype.s("word"):
    if F.sp.v(wordNode) == "verb":
        # the first phrase node that TF returns as embedder of this word
        phraseNode = L.u(wordNode, otype="phrase")[0]
        isVP = F.typ.v(phraseNode) == "VP"
        # a one-slot phrase that contains this word contains nothing else
        if isVP and len(E.oslots.s(phraseNode)) == 1:
            resultsH.append((phraseNode, wordNode))

len(resultsH)

62770

In [6]:
# Order-insensitive check that the query and the hand-coded loop
# deliver exactly the same (phrase, word) tuples.
set(resultsQ) == set(resultsH)

True

In STAM

We proceed in a sequence of steps (this may not be the most efficient way to do it in STAM).

In [9]:
# The store contains exactly one annotation dataset:
# list them all and take the first (and only) one.

dataSets = list(storeC.annotationsets())
aDataSet = dataSets[0]

In [10]:
# Look up the data item for the key/value pair otype=word.
# Only one such item exists, so the first match is the one we want.

otypeWordMatches = aDataSet.find_data("otype", "word")
wordData = otypeWordMatches[0]

In [12]:
# The annotations that carry otype=word are the counterparts of the TF word nodes.
# Besides the annotations themselves we keep the set of their ids,
# which later cells use for membership tests.

wordAnnos = wordData.annotations()
wordIds = set(anno.id() for anno in wordAnnos)
len(wordAnnos)

426590

In [13]:
# The same recipe for phrases: data item otype=phrase, the annotations
# that carry it, and the set of ids of those annotations.

otypePhraseMatches = aDataSet.find_data("otype", "phrase")
phraseData = otypePhraseMatches[0]
phraseAnnos = phraseData.annotations()
phraseIds = set(anno.id() for anno in phraseAnnos)
len(phraseAnnos)

253203

In [15]:
# Collect the annotations that carry the data item sp=verb.

spVerbMatches = aDataSet.find_data("sp", "verb")
verbData = spVerbMatches[0]
verbAnnos = verbData.annotations()
len(verbAnnos)

75451

In [16]:
# The sp=verb annotations are feature annotations, not the nodes themselves.
# Each of them targets a word annotation or a lex annotation.
# We follow them to their targets, keep only the targets that are
# word annotations, and collect the ids of those targets as well.

verbWordAnnos = [
    target
    for verbAnno in verbAnnos
    for target in verbAnno.annotations()
    if target.id() in wordIds
]
verbWordIds = set(anno.id() for anno in verbWordAnnos)
len(verbWordAnnos)

73710

Intermediate check with TF:

In [17]:
# TF cross-check: count the nodes with sp=verb that are of type word.
sum(1 for node in F.sp.s("verb") if F.otype.v(node) == "word")

73710

In [19]:
# Collect the annotations that carry the data item typ=VP.

typVpMatches = aDataSet.find_data("typ", "VP")
vpData = typVpMatches[0]
vpAnnos = vpData.annotations()
len(vpAnnos)

138048

In [20]:
# In exactly the same way as we picked the verbs out of the words,
# we pick the VPs out of the phrases: follow each typ=VP annotation
# to its targets and keep the targets that are phrase annotations.
# We also need the ids of these phrases.

vpPhraseAnnos = [
    target
    for vpAnno in vpAnnos
    for target in vpAnno.annotations()
    if target.id() in phraseIds
]
vpPhraseIds = set(anno.id() for anno in vpPhraseAnnos)
len(vpPhraseAnnos)

69024

In [21]:
# Now we combine both selections:
# for each verb word we look up the VP phrases that point to it.

# Note: this is the expensive step.

# Note: we get slightly more results than there are VP phrases.
# Explanation: a VP phrase that contains more than one verb
# is delivered once for each of its verbs.

verbPhraseAnnos = [
    pointer
    for verbWordAnno in verbWordAnnos
    for pointer in verbWordAnno.annotations_reverse()
    if pointer.id() in vpPhraseIds
]
len(verbPhraseAnnos)

69028

Now we have to retain those verb-containing phrases that consist of only one word.
That means: the targets of such a phrase annotation should include exactly one word annotation.

In [22]:
# Final step: keep the verb-containing VP phrases whose targets
# include exactly one word annotation, i.e. the single-word phrases.

singleVerbPhraseAnnos = [
    candidate
    for candidate in verbPhraseAnnos
    if len([t for t in candidate.annotations() if t.id() in wordIds]) == 1
]
len(singleVerbPhraseAnnos)


62770