In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
import gzip

from tf.app import use
from tf.fabric import Fabric

In [3]:
# Root directory of the local clones of GitHub repositories
ghBase = os.path.expanduser("~/github")
org = "etcbc"
repo = "dss"
subdir = "parallels"
# Relative paths: the main TF dataset, and the parallels module that this notebook produces
mainpath = f"{org}/{repo}/tf"
path = f"{org}/{repo}/{subdir}/tf"
# Absolute counterparts: `mainlocation` is where the corpus is loaded from,
# `location` is where the new edge feature is saved to
location = f"{ghBase}/{path}"
mainlocation = f"{ghBase}/{mainpath}"
# The module gets the same version as the main dataset it is derived from
version = "0.6"
module = version
# Scratch directory; the cached similarity results are stored below it
tempdir = f"{ghBase}/{org}/{repo}/_temp"

In [7]:
TF = Fabric(locations=mainlocation, modules=module)

This is Text-Fabric 7.8.2
Api reference : https://annotation.github.io/text-fabric/Api/Fabric/

57 features found and 0 ignored


In [8]:
api = TF.load("lex type")
docs = api.makeAvailableIn(globals())

  0.00s loading features ...
   |     0.00s No structure info in otext, the structure part of the T-API cannot be used
  5.51s All features loaded/computed - for details use loadLog()


# Parallels

We make edges between similar lines.

When are lines similar?

If a certain similarity metric is above a certain threshold.

We choose this metric:

* we reduce a line to the set of lexemes in it.
* the similarity between two lines is the size of the intersection divided by the size of the union of their sets, times 100 and rounded to a whole number.

# Preparation

We pre-compute all sets for lines.

But because not all lines are filled with definite material, we exclude lines with fewer than 5 consonants.

In [9]:
CONS = "cons"
# A line counts as contentful when it has at least this many consonant signs
MIN_CONS = 5

# Fetch the line nodes once and reuse them for both the filter and the report
allLines = F.otype.s("line")

TF.indent(reset=True)
# Every line node occurs once in allLines, so no "already seen" check is needed
valid = {
    ln
    for ln in allLines
    if sum(1 for s in L.d(ln, otype="sign") if F.type.v(s) == CONS) >= MIN_CONS
}

TF.info(f"{len(valid)} contentful lines out of {len(allLines)}")

  0.82s 37106 contentful lines out of 52895


In [10]:
def makeSet(ln):
    """Return the set of lexemes of the words in line node `ln`.

    Words whose `lex` feature is missing or empty do not contribute.
    """
    lexemes = (F.lex.v(word) for word in L.d(ln, otype="word"))
    return {lexeme for lexeme in lexemes if lexeme}

In [11]:
TF.indent(reset=True)

# Map each contentful line node to its lexeme set, leaving out lines without lexemes
lexemeSets = ((lineNode, makeSet(lineNode)) for lineNode in valid)
lines = {lineNode: lexemes for (lineNode, lexemes) in lexemeSets if lexemes}

nLines = len(lines)
TF.info(f"{nLines} lines")

  0.88s 37106 lines


# Measure

In [12]:
def sim(lSet, mSet):
    """Similarity of two sets as a whole percentage.

    The similarity is the size of the intersection divided by the size of the
    union, times 100, rounded to an integer.

    Parameters
    ----------
    lSet, mSet: set
        The sets to compare.

    Returns
    -------
    int
        A value between 0 and 100. Two empty sets have no material in common,
        so their similarity is defined as 0 (this also avoids a division by zero).
    """
    union = lSet | mSet
    if not union:
        return 0
    return int(round(100 * len(lSet & mSet) / len(union)))

# Compute all similarities

We are going to perform more than half a billion comparisons, each of which is more than an elementary operation.

Let's measure time.

In [13]:
THRESHOLD = 60


def computeSim(limit=None):
    """Compute the similarity of every pair of lines and keep the similar pairs.

    Uses the module-level `lines` (line node -> set of lexemes), the function
    `sim` and the constant `THRESHOLD`. Progress is reported through `TF.info`
    each time another 1/1000 of all comparisons has been made.

    Parameters
    ----------
    limit: int, optional
        Stop after this many progress chunks (i.e. per mille of all comparisons).
        With `None`, all pairs are compared.

    Returns
    -------
    dict
        Keyed by pairs `(nodeI, nodeJ)` with `nodeI < nodeJ`; the value is the
        similarity of the two lines, which is at least `THRESHOLD`.
    """
    similarity = {}

    lineNodes = sorted(lines.keys())
    nLines = len(lineNodes)

    nComparisons = nLines * (nLines - 1) // 2

    print(f"{nComparisons} comparisons to make")
    # With fewer than 1000 comparisons this is 0 and no intermediate progress is shown
    chunkSize = nComparisons // 1000

    co = 0  # comparisons made so far
    b = 0  # comparisons made within the current chunk
    si = 0  # similar pairs found so far
    p = 0  # chunks completed (per mille of the total)

    TF.indent(reset=True)

    stop = False
    for i in range(nLines):
        nodeI = lineNodes[i]
        lineI = lines[nodeI]
        for j in range(i + 1, nLines):
            nodeJ = lineNodes[j]
            s = sim(lineI, lines[nodeJ])
            co += 1

            # Record the pair before the progress check, so that the pair that
            # completes the last chunk of a limited run is not lost
            if s >= THRESHOLD:
                similarity[(nodeI, nodeJ)] = s
                si += 1

            b += 1
            if b == chunkSize:
                p += 1
                TF.info(f"{p:>3}‰ - {co:>12} comparisons and {si:>10} similarities")
                b = 0
                if limit is not None and p >= limit:
                    stop = True
                    break
        if stop:
            break

    # p counts per mille chunks, so the unit is ‰ here as well
    TF.info(f"{p:>3}‰ - {co:>12} comparisons and {si:>10} similarities")
    return similarity

We first run it for just a few ‰ of the comparisons and then do some checks.

In [14]:
similarity = computeSim(limit=3)

688409065 comparisons to make
  0.99s   1‰ -       688409 comparisons and         12 similarities
  1.98s   2‰ -      1376818 comparisons and         20 similarities
  2.96s   3‰ -      2065227 comparisons and         28 similarities
  2.96s   3% -      2065227 comparisons and         28 similarities


We check the sanity of the results.

In [15]:
print(min(similarity.values()))
print(max(similarity.values()))

60
100


In [16]:
eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 70]

In [17]:
print(len(eq))
print(len(neq))

1
9


In [18]:
print(eq[0])
print(neq[0])

((1552980, 1563775), 100)
((1552973, 1563769), 69)


In [19]:
print(T.text(eq[0][0][0]))
print(T.text(eq[0][0][1]))

את ארצ׳ו ולדשן בטוב אדמת׳ו ׃ ויבינו בעונ׳ם וידעו כי 
את ארצ׳ו ולדשן בטוב אדמת׳ו ׃ ויבינו בעוונ׳מה וידעו כי 


Looks good.

Now the whole computation.

But if we have done this before, and nothing has changed, we load previous results from disk.

If we do not find previous results, we compute them and save the results to disk.

In [20]:
PARA_DIR = f"{tempdir}/parallels"


def writeResults(data, location, name):
    """Pickle `data` into the gzip-compressed file `location/name`.

    Parameters
    ----------
    data: object
        Any picklable value.
    location: str
        Directory to write to; it is created if it does not exist yet.
    name: str
        File name within `location`. The content is gzip-compressed,
        whatever the extension of `name` suggests.
    """
    # exist_ok=True already covers the "directory is there" case, so a separate
    # existence check is redundant
    os.makedirs(location, exist_ok=True)
    path = f"{location}/{name}"
    with gzip.open(path, "wb") as f:
        pickle.dump(data, f)
    TF.info(f"Data written to {path}")


def readResults(location, name):
    """Load pickled data from the gzip-compressed file `location/name`.

    Returns the unpickled data, or `None` (after printing a message) when the
    file does not exist.
    """
    TF.indent(reset=True)
    filePath = f"{location}/{name}"
    if os.path.exists(filePath):
        # NOTE: unpickling executes code from the file; only read files that
        # were written by this notebook itself
        with gzip.open(filePath, "rb") as fh:
            loaded = pickle.load(fh)
        TF.info(f"Data read from {filePath}")
        return loaded
    print(f"File not found: {filePath}")
    return None

In [22]:
# NOTE: the cache file name depends on the version only; after changing
# THRESHOLD or the line sets, remove the file to force a recomputation
simFile = f"sim-{version}.zip"

similarity = readResults(PARA_DIR, simFile)
# readResults returns None when there is no cache file; an empty cached result
# is still a valid result and must not trigger the expensive recomputation
if similarity is None:
    similarity = computeSim()
    writeResults(similarity, PARA_DIR, simFile)

  0.06s Data read from /Users/dirk/github/etcbc/dss/_temp/parallels/sim-0.6.zip


In [23]:
len(similarity)

51862

So, just over 50,000 pairs of similar lines.

# Add parallels to the TF dataset

We can add this information to the DSS dataset as an *edge feature*.

An edge feature links two nodes and may annotate that link with a value.

For parallels, we link each line to each of its parallel lines and we annotate that link with the similarity between
the two lines. The similarity is a percentage, and we round it to integer values.

If *n1* is similar to *n2*, then *n2* is similar to *n1*.
In order to save space, we only add such links once.

We can then use
[`E.sim.b(node)`](https://annotation.github.io/text-fabric/tf/core/edgefeature.html#tf.core.edgefeature)
to find all nodes that are parallel to node.


In [24]:
# Metadata that is passed to TF.save together with the edge feature `sim`
metaData = {
    # presumably the empty key holds metadata shared by all saved features — TODO confirm
    "": {
        "acronym": "dss",
        "description": "parallel lines in the DSS (computed)",
        "createdBy": "Dirk Roorda",
        "createdDate": "2019-05-09",
        "sourceCreatedDate": "2015",
        "sourceCreatedBy": "Martin G. Abegg, Jr., James E. Bowley, and Edward M. Cook",
        "convertedBy": "Jarod Jacobs, Martijn Naaijer and Dirk Roorda",
        "source": "Martin Abegg's data files, personal communication",
        "license": "Creative Commons Attribution-NonCommercial 4.0 International License",
        "licenseUrl": "http://creativecommons.org/licenses/by-nc/4.0/",
        "sourceDescription": "Dead Sea Scrolls: biblical and non-biblical scrolls",
    },
    # Metadata specific to the feature `sim`: its edges carry integer values (the similarity)
    "sim": {
        "valueType": "int",
        "edgeValues": True,
        "description": "similarity between lines, as a percentage of the common material wrt the combined material",
    },
}

In [25]:
# Regroup the flat {(source, target): similarity} mapping into the nested
# {source: {target: similarity}} mapping that is saved as edge feature data
simData = {}

for (pair, score) in similarity.items():
    (source, target) = pair
    if source not in simData:
        simData[source] = {}
    simData[source][target] = score

In [26]:
TF.save(
    edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module
)

  0.00s Exporting 0 node and 1 edge and 0 config features to /Users/dirk/github/etcbc/dss/parallels/tf/0.6:
   |     0.10s T sim                  to /Users/dirk/github/etcbc/dss/parallels/tf/0.6
  0.10s Exported 0 node features and 1 edge features and 0 config features to /Users/dirk/github/etcbc/dss/parallels/tf/0.6


True

# Turn the parallels feature into a module

Here we show how to turn the new feature `sim` into a module, so that users can easily load it in a Jupyter notebook or in the TF browser.

In [27]:
%%bash
text-fabric-zip 'etcbc/dss/parallels/tf'

True
Create release data for etcbc/dss/parallels/tf
Found 4 versions
zip files end up in /Users/dirk/Downloads/etcbc-release/dss
zipping etcbc/dss                  0.4 with   1 features ==> parallels-tf-0.4.zip
zipping etcbc/dss                 0.4.1 with   1 features ==> parallels-tf-0.4.1.zip
zipping etcbc/dss                  0.5 with   1 features ==> parallels-tf-0.5.zip
zipping etcbc/dss                  0.6 with   1 features ==> parallels-tf-0.6.zip


I have added this file to a new release of the DSS Github repo.

# Use the parallels module

We load the DSS corpus again, but now with the parallels module.

In [30]:
A = use("dss:clone", checkout="clone", hoist=globals())

Using TF-app in /Users/dirk/github/annotation/app-dss/code:
	repo clone offline under ~/github (local github)
Using data in /Users/dirk/github/etcbc/dss/tf/0.6:
	repo clone offline under ~/github (local github)
Using data in /Users/dirk/github/etcbc/dss/parallels/tf/0.6:
	repo clone offline under ~/github (local github)
   |     0.00s No structure info in otext, the structure part of the T-API cannot be used


Lo and behold: you see the parallels module listed with one feature: `sim`. It is in *italics*, which indicates
it is an edge feature.

We just do a quick check here and in another notebook we study parallels a bit more, using the feature `sim`.

We count how many similar pairs there are, and how many 100% similar pairs there are.

In [31]:
query = """
line
-sim> line
"""
results = A.search(query)
refNode = results[20000][0]
refNode

  0.25s 51862 results


1565737

In [32]:
query = """
line
-sim=100> line
"""
results = A.search(query)

  0.14s 3646 results


Let's show a few of the pairs that are 100 percent similar.

In [33]:
A.table(results, start=1, end=10, withNodes=True)

n,p,line,line.1
1,4Q268 f1:15,את ארצ׳ו ולדשן בטוב אדמת׳ו ׃ ויבינו בעונ׳ם וידעו כי 1552980,את ארצ׳ו ולדשן בטוב אדמת׳ו ׃ ויבינו בעוונ׳מה וידעו כי 1563775
2,4Q267 f2:4,ובקץ חרבן הארץ עמדו מסיגי הגבול ויתעו את ישראל ׃ 1553076,ובקץ חורבן הארץ עמדו מסיגי גבול ויתעו את ישראל 1563610
3,4Q266 f3iii:19,אשר בזה ישראל את דברי׳הם ׃ והכוכב הוא דורש התורה 1553116,אשר בזה ישראל את דברי׳הם ׃ והכוכב הוא דורש התורה 1563157
4,4Q266 f9i:3,אל יעל׳ה איש בסולם וחבל וכלי ׃ אל יעל איש למזבח בשבת 1553203,בסולם וחבל וכלי ׃ אל יעל איש למזבח בשבת 1563345
5,4Q464b f1:1,ε אמר ל׳הם 1553296,ε לאמר ל ε ׃ 1575440
6,PAM43663 f43:1,ε אמר ל׳הם 1553296,ε # אמר ל׳י ε 1588433
7,1Q4 f18:1,ε אמר ל׳הם 1553296,ε לאמור ε ׃ 1589783
8,4Q264 f1:10,חמר קורצ ולעפר תשוקת׳ו ׃ מה ישיב חמר ויוצר יד ולעצת מה יבין ׃ 1553669,קורץ ולעפר תשוקת׳ו ׃ מה ישיב חמר ויוצר יד לעצת מה יבין ׃ 1562924
9,4Q286 f11:3,ε קודש׳ו # ε ׃ 1553730,ε ת קודש ε ׃ 1564740
10,4Q401 f31:3,ε קודש׳ו # ε ׃ 1553730,ε י קודש ε ׃ 1571118


There is also a lower level way to work with edge features.

We can list all edges going out from a reference node.
What we see is a tuple of pairs: the target node and the similarity between the reference node and that target node.

In [34]:
E.sim.f(refNode)

((1565738, 100),
 (1565739, 100),
 (1565740, 100),
 (1565741, 100),
 (1565742, 100),
 (1565744, 67),
 (1565745, 62),
 (1565781, 60),
 (1565782, 60),
 (1565783, 60),
 (1565791, 67),
 (1565792, 67),
 (1565793, 67),
 (1565794, 60),
 (1565796, 60),
 (1565808, 60),
 (1565809, 71),
 (1565811, 71),
 (1565813, 71),
 (1565815, 71),
 (1565819, 67),
 (1565847, 67),
 (1565848, 67),
 (1565854, 75),
 (1565855, 75),
 (1565856, 67),
 (1565857, 67),
 (1565858, 67),
 (1565859, 67),
 (1565887, 71),
 (1565900, 71),
 (1565901, 71),
 (1565902, 71),
 (1565903, 71),
 (1565905, 71),
 (1565906, 71),
 (1565907, 71),
 (1565974, 100),
 (1565976, 67),
 (1565980, 67),
 (1565982, 67),
 (1566001, 67),
 (1566018, 71),
 (1566032, 71),
 (1573353, 67))

Likewise, we can observe the nodes that target the reference node:

In [35]:
E.sim.t(refNode)

((1565730, 100),
 (1565731, 100),
 (1565732, 100),
 (1565733, 100),
 (1565734, 100),
 (1565735, 100),
 (1565736, 100))

Both sets of nodes are similar to the reference node and it is inconvenient to use both `.f()` and `.t()` to get the similar lines.

But there is another way:

In [36]:
E.sim.b(refNode)

((1565730, 100),
 (1565731, 100),
 (1565732, 100),
 (1565733, 100),
 (1565734, 100),
 (1565735, 100),
 (1565736, 100),
 (1565738, 100),
 (1565739, 100),
 (1565740, 100),
 (1565741, 100),
 (1565742, 100),
 (1565744, 67),
 (1565745, 62),
 (1565781, 60),
 (1565782, 60),
 (1565783, 60),
 (1565791, 67),
 (1565792, 67),
 (1565793, 67),
 (1565794, 60),
 (1565796, 60),
 (1565808, 60),
 (1565809, 71),
 (1565811, 71),
 (1565813, 71),
 (1565815, 71),
 (1565819, 67),
 (1565847, 67),
 (1565848, 67),
 (1565854, 75),
 (1565855, 75),
 (1565856, 67),
 (1565857, 67),
 (1565858, 67),
 (1565859, 67),
 (1565887, 71),
 (1565900, 71),
 (1565901, 71),
 (1565902, 71),
 (1565903, 71),
 (1565905, 71),
 (1565906, 71),
 (1565907, 71),
 (1565974, 100),
 (1565976, 67),
 (1565980, 67),
 (1565982, 67),
 (1566001, 67),
 (1566018, 71),
 (1566032, 71),
 (1573353, 67))

Let's make sure that `.b()` gives the combination of `.f()` and `.t()`.

In [37]:
# Keep only the node of each (node, similarity) pair:
# f = nodes reached by edges from refNode, t = nodes with an edge to refNode,
# b = nodes returned by the combined lookup
f = {x[0] for x in E.sim.f(refNode)}
b = {x[0] for x in E.sim.b(refNode)}
t = {x[0] for x in E.sim.t(refNode)}

# Are f and t disjoint? Each pair is stored once, so an empty intersection is expected.

print(f"the intersection of f and t is {f & t}")

# Is b the union of f and t?

print(f"t | f = b ? {f | t == b}")

the intersection of f and t is set()
t | f = b ? True
