In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
import gzip

from tf.app import use
from tf.fabric import Fabric

In [3]:
ghBase = os.path.expanduser("~/github")
org = "etcbc"
repo = "dss"
subdir = "parallels"
mainpath = f"{org}/{repo}/tf"
path = f"{org}/{repo}/{subdir}/tf"
location = f"{ghBase}/{path}"
mainlocation = f"{ghBase}/{mainpath}"
version = "1.3"
module = version
tempdir = f"{ghBase}/{org}/{repo}/_temp"

In [4]:
TF = Fabric(locations=mainlocation, modules=module)

This is Text-Fabric 9.4.1
Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html

68 features found and 0 ignored


In [5]:
api = TF.load("lex type")
docs = api.makeAvailableIn(globals())

  0.30s Dataset without structure sections in otext:no structure functions in the T-API
  2.61s All features loaded/computed - for details use TF.isLoaded()


# Parallels

We make edges between similar lines.

When are lines similar?

If a certain distance metric is above a certain threshold.

We choose this metric:

* we reduce a line to the set of lexemes in it.
* the similarity between two lines is the length of the intersection divided by the length of the union of their sets times 100.

# Preparation

We pre-compute all sets for lines.

But because not all lines are filled with definite material, we exclude lines with 5 or less consonants.

In [6]:
CONS = "cons"

valid = set()

allLines = F.otype.s("line")

TF.indent(reset=True)
for ln in F.otype.s("line"):
    if ln in valid:
        continue
    if sum(1 for s in L.d(ln, otype="sign") if F.type.v(s) == CONS) >= 5:
        valid.add(ln)

TF.info(f"{len(valid)} contentful lines out of {len(allLines)}")

  0.47s 37106 contentful lines out of 52895


In [7]:
def makeSet(ln):
    lineSet = set()
    for s in L.d(ln, otype="word"):
        r = F.lex.v(s)
        if r:
            lineSet.add(r)
    return lineSet

In [8]:
lines = {}

TF.indent(reset=True)
for ln in valid:
    lineSet = makeSet(ln)
    if lineSet:
        lines[ln] = lineSet

nLines = len(lines)
TF.info(f"{nLines} lines")

  0.34s 37106 lines


# Measure

In [9]:
def sim(lSet, mSet):
    return int(round(100 * len(lSet & mSet) / len(lSet | mSet)))

# Compute all similarities

We are going to perform more than half a billion of comparisons, each of which is more than an elemetary operation.

Let's measure time.

In [10]:
THRESHOLD = 60


def computeSim(limit=None):
    similarity = {}

    lineNodes = sorted(lines.keys())
    nLines = len(lineNodes)

    nComparisons = nLines * (nLines - 1) // 2

    print(f"{nComparisons} comparisons to make")
    chunkSize = nComparisons // 1000

    co = 0
    b = 0
    si = 0
    p = 0

    TF.indent(reset=True)

    stop = False
    for i in range(nLines):
        nodeI = lineNodes[i]
        lineI = lines[nodeI]
        for j in range(i + 1, nLines):
            nodeJ = lineNodes[j]
            lineJ = lines[nodeJ]
            s = sim(lineI, lineJ)
            co += 1
            b += 1
            if b == chunkSize:
                p += 1
                TF.info(f"{p:>3}‰ - {co:>12} comparisons and {si:>10} similarities")
                b = 0
                if limit is not None and p >= limit:
                    stop = True
                    break

            if s < THRESHOLD:
                continue
            similarity[(nodeI, nodeJ)] = sim(lineI, lineJ)
            si += 1
        if stop:
            break

    TF.info(f"{p:>3}% - {co:>12} comparisons and {si:>10} similarities")
    return similarity

We are going to run it to several ‰ first and do some checks then.

In [11]:
similarity = computeSim(limit=3)

688409065 comparisons to make
  0.74s   1‰ -       688409 comparisons and         12 similarities
  1.50s   2‰ -      1376818 comparisons and         20 similarities
  2.26s   3‰ -      2065227 comparisons and         28 similarities
  2.26s   3% -      2065227 comparisons and         28 similarities


We check the sanity of the results.

In [12]:
print(min(similarity.values()))
print(max(similarity.values()))

60
100


In [13]:
eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 70]

In [14]:
print(len(eq))
print(len(neq))

1
9


In [15]:
print(eq[0])
print(neq[0])

((1552980, 1563775), 100)
((1552973, 1563769), 69)


In [16]:
print(T.text(eq[0][0][0]))
print(T.text(eq[0][0][1]))

את ארצ׳ו ולדשן בטוב אדמת׳ו ׃ ויבינו בעונ׳ם וידעו כי 
את ארצ׳ו ולדשן בטוב אדמת׳ו ׃ ויבינו בעוונ׳מה וידעו כי 


Looks good.

Now the whole computation.

But if we have done this before, and nothing has changed, we load previous results from disk.

If we do not find previous results, we compute them and save the results to disk.

In [17]:
PARA_DIR = f"{tempdir}/parallels"


def writeResults(data, location, name):
    if not os.path.exists(location):
        os.makedirs(location, exist_ok=True)
    path = f"{location}/{name}"
    with gzip.open(path, "wb") as f:
        pickle.dump(data, f)
    TF.info(f"Data written to {path}")


def readResults(location, name):
    TF.indent(reset=True)
    path = f"{location}/{name}"
    if not os.path.exists(path):
        print(f"File not found: {path}")
        return None
    with gzip.open(path, "rb") as f:
        data = pickle.load(f)
    TF.info(f"Data read from {path}")
    return data

In [18]:
similarity = readResults(PARA_DIR, f"sim-{version}.zip")
if not similarity:
    similarity = computeSim()
    writeResults(similarity, PARA_DIR, f"sim-{version}.zip")

File not found: C:\Users\geitb/github/etcbc/dss/_temp/parallels/sim-1.3.zip
688409065 comparisons to make
  0.72s   1‰ -       688409 comparisons and         12 similarities
  1.47s   2‰ -      1376818 comparisons and         20 similarities
  2.20s   3‰ -      2065227 comparisons and         28 similarities
  2.95s   4‰ -      2753636 comparisons and         34 similarities
  3.67s   5‰ -      3442045 comparisons and         40 similarities
  4.40s   6‰ -      4130454 comparisons and         57 similarities
  5.15s   7‰ -      4818863 comparisons and         70 similarities
  5.89s   8‰ -      5507272 comparisons and         81 similarities
  6.64s   9‰ -      6195681 comparisons and         92 similarities
  7.39s  10‰ -      6884090 comparisons and        107 similarities
  8.13s  11‰ -      7572499 comparisons and        122 similarities
  8.87s  12‰ -      8260908 comparisons and        135 similarities
  9.61s  13‰ -      8949317 comparisons and        148 similarities
    10s  1

 1m 31s 120‰ -     82609080 comparisons and       3424 similarities
 1m 32s 121‰ -     83297489 comparisons and       3528 similarities
 1m 33s 122‰ -     83985898 comparisons and       3612 similarities
 1m 33s 123‰ -     84674307 comparisons and       3612 similarities
 1m 34s 124‰ -     85362716 comparisons and       3629 similarities
 1m 35s 125‰ -     86051125 comparisons and       3659 similarities
 1m 35s 126‰ -     86739534 comparisons and       3660 similarities
 1m 36s 127‰ -     87427943 comparisons and       3663 similarities
 1m 37s 128‰ -     88116352 comparisons and       3689 similarities
 1m 38s 129‰ -     88804761 comparisons and       3764 similarities
 1m 38s 130‰ -     89493170 comparisons and       3772 similarities
 1m 39s 131‰ -     90181579 comparisons and       3792 similarities
 1m 40s 132‰ -     90869988 comparisons and       3811 similarities
 1m 41s 133‰ -     91558397 comparisons and       3811 similarities
 1m 41s 134‰ -     92246806 comparisons and     

 2m 54s 241‰ -    165906569 comparisons and      10095 similarities
 2m 55s 242‰ -    166594978 comparisons and      10171 similarities
 2m 56s 243‰ -    167283387 comparisons and      10426 similarities
 2m 57s 244‰ -    167971796 comparisons and      10426 similarities
 2m 57s 245‰ -    168660205 comparisons and      10432 similarities
 2m 58s 246‰ -    169348614 comparisons and      10518 similarities
 2m 59s 247‰ -    170037023 comparisons and      10537 similarities
 2m 59s 248‰ -    170725432 comparisons and      10570 similarities
 3m 00s 249‰ -    171413841 comparisons and      10637 similarities
 3m 01s 250‰ -    172102250 comparisons and      10644 similarities
 3m 01s 251‰ -    172790659 comparisons and      10653 similarities
 3m 02s 252‰ -    173479068 comparisons and      10660 similarities
 3m 03s 253‰ -    174167477 comparisons and      10667 similarities
 3m 04s 254‰ -    174855886 comparisons and      10667 similarities
 3m 04s 255‰ -    175544295 comparisons and     

 4m 20s 362‰ -    249204058 comparisons and      14757 similarities
 4m 21s 363‰ -    249892467 comparisons and      14794 similarities
 4m 22s 364‰ -    250580876 comparisons and      14802 similarities
 4m 23s 365‰ -    251269285 comparisons and      14813 similarities
 4m 23s 366‰ -    251957694 comparisons and      14824 similarities
 4m 24s 367‰ -    252646103 comparisons and      14834 similarities
 4m 25s 368‰ -    253334512 comparisons and      14853 similarities
 4m 25s 369‰ -    254022921 comparisons and      15021 similarities
 4m 26s 370‰ -    254711330 comparisons and      15152 similarities
 4m 27s 371‰ -    255399739 comparisons and      15165 similarities
 4m 27s 372‰ -    256088148 comparisons and      15223 similarities
 4m 28s 373‰ -    256776557 comparisons and      15322 similarities
 4m 29s 374‰ -    257464966 comparisons and      15323 similarities
 4m 30s 375‰ -    258153375 comparisons and      15408 similarities
 4m 30s 376‰ -    258841784 comparisons and     

 5m 46s 483‰ -    332501547 comparisons and      23171 similarities
 5m 47s 484‰ -    333189956 comparisons and      23181 similarities
 5m 47s 485‰ -    333878365 comparisons and      23294 similarities
 5m 48s 486‰ -    334566774 comparisons and      23325 similarities
 5m 49s 487‰ -    335255183 comparisons and      23354 similarities
 5m 50s 488‰ -    335943592 comparisons and      23414 similarities
 5m 50s 489‰ -    336632001 comparisons and      23452 similarities
 5m 51s 490‰ -    337320410 comparisons and      23459 similarities
 5m 52s 491‰ -    338008819 comparisons and      23480 similarities
 5m 53s 492‰ -    338697228 comparisons and      23612 similarities
 5m 53s 493‰ -    339385637 comparisons and      23626 similarities
 5m 54s 494‰ -    340074046 comparisons and      23721 similarities
 5m 55s 495‰ -    340762455 comparisons and      23760 similarities
 5m 55s 496‰ -    341450864 comparisons and      23777 similarities
 5m 56s 497‰ -    342139273 comparisons and     

 7m 09s 604‰ -    415799036 comparisons and      34182 similarities
 7m 09s 605‰ -    416487445 comparisons and      34207 similarities
 7m 10s 606‰ -    417175854 comparisons and      34243 similarities
 7m 11s 607‰ -    417864263 comparisons and      34256 similarities
 7m 11s 608‰ -    418552672 comparisons and      34296 similarities
 7m 12s 609‰ -    419241081 comparisons and      34349 similarities
 7m 13s 610‰ -    419929490 comparisons and      34400 similarities
 7m 13s 611‰ -    420617899 comparisons and      34505 similarities
 7m 14s 612‰ -    421306308 comparisons and      34619 similarities
 7m 15s 613‰ -    421994717 comparisons and      34746 similarities
 7m 15s 614‰ -    422683126 comparisons and      34881 similarities
 7m 16s 615‰ -    423371535 comparisons and      35046 similarities
 7m 16s 616‰ -    424059944 comparisons and      35142 similarities
 7m 17s 617‰ -    424748353 comparisons and      35236 similarities
 7m 18s 618‰ -    425436762 comparisons and     

 8m 30s 725‰ -    499096525 comparisons and      43700 similarities
 8m 31s 726‰ -    499784934 comparisons and      43748 similarities
 8m 31s 727‰ -    500473343 comparisons and      43822 similarities
 8m 32s 728‰ -    501161752 comparisons and      43836 similarities
 8m 33s 729‰ -    501850161 comparisons and      43916 similarities
 8m 34s 730‰ -    502538570 comparisons and      43938 similarities
 8m 34s 731‰ -    503226979 comparisons and      43962 similarities
 8m 35s 732‰ -    503915388 comparisons and      44003 similarities
 8m 36s 733‰ -    504603797 comparisons and      44016 similarities
 8m 36s 734‰ -    505292206 comparisons and      44031 similarities
 8m 37s 735‰ -    505980615 comparisons and      44035 similarities
 8m 38s 736‰ -    506669024 comparisons and      44062 similarities
 8m 38s 737‰ -    507357433 comparisons and      44094 similarities
 8m 39s 738‰ -    508045842 comparisons and      44129 similarities
 8m 40s 739‰ -    508734251 comparisons and     

 9m 55s 846‰ -    582394014 comparisons and      46720 similarities
 9m 56s 847‰ -    583082423 comparisons and      46735 similarities
 9m 57s 848‰ -    583770832 comparisons and      46776 similarities
 9m 57s 849‰ -    584459241 comparisons and      46842 similarities
 9m 58s 850‰ -    585147650 comparisons and      46869 similarities
 9m 59s 851‰ -    585836059 comparisons and      46907 similarities
10m 00s 852‰ -    586524468 comparisons and      46922 similarities
10m 01s 853‰ -    587212877 comparisons and      46942 similarities
10m 01s 854‰ -    587901286 comparisons and      46985 similarities
10m 02s 855‰ -    588589695 comparisons and      46993 similarities
10m 03s 856‰ -    589278104 comparisons and      47010 similarities
10m 04s 857‰ -    589966513 comparisons and      47032 similarities
10m 05s 858‰ -    590654922 comparisons and      47058 similarities
10m 05s 859‰ -    591343331 comparisons and      47080 similarities
10m 06s 860‰ -    592031740 comparisons and     

11m 32s 967‰ -    665691503 comparisons and      50072 similarities
11m 32s 968‰ -    666379912 comparisons and      50105 similarities
11m 33s 969‰ -    667068321 comparisons and      50113 similarities
11m 34s 970‰ -    667756730 comparisons and      50126 similarities
11m 35s 971‰ -    668445139 comparisons and      50135 similarities
11m 36s 972‰ -    669133548 comparisons and      50150 similarities
11m 36s 973‰ -    669821957 comparisons and      50161 similarities
11m 37s 974‰ -    670510366 comparisons and      50176 similarities
11m 38s 975‰ -    671198775 comparisons and      50179 similarities
11m 39s 976‰ -    671887184 comparisons and      50183 similarities
11m 39s 977‰ -    672575593 comparisons and      50191 similarities
11m 40s 978‰ -    673264002 comparisons and      50195 similarities
11m 41s 979‰ -    673952411 comparisons and      50199 similarities
11m 42s 980‰ -    674640820 comparisons and      50248 similarities
11m 42s 981‰ -    675329229 comparisons and     

In [19]:
len(similarity)

51862

So, just over 50,000 pairs of similar lines.

# Add parallels to the TF dataset

We can add this information to the DSS dataset as an *edge feature*.

An edge feature links two nodes and may annotate that link with a value.

For parallels, we link each line to each of its parallel lines and we annotate that link with the similarity between
the two lines. The similarity is a percentage, and we round it to integer values.

If *n1* is similar to *n2*, then *n2* is similar to *n1*.
In order to save space, we only add such links once.

We can then use
[`E.sim.b(node)`](https://annotation.github.io/text-fabric/tf/core/edgefeature.html#tf.core.edgefeature)
to find all nodes that are parallel to node.


In [20]:
metaData = {
    "": {
        "acronym": "dss",
        "description": "parallel lines in the DSS (computed)",
        "createdBy": "Dirk Roorda",
        "createdDate": "2022-05-09",
        "sourceCreatedDate": "2015",
        "sourceCreatedBy": "Martin G. Abegg, Jr., James E. Bowley, and Edward M. Cook",
        "convertedBy": "Jarod Jacobs, Martijn Naaijer and Dirk Roorda",
        "source": "Martin Abegg's data files, personal communication",
        "license": "Creative Commons Attribution-NonCommercial 4.0 International License",
        "licenseUrl": "http://creativecommons.org/licenses/by-nc/4.0/",
        "sourceDescription": "Dead Sea Scrolls: biblical and non-biblical scrolls",
    },
    "sim": {
        "valueType": "int",
        "edgeValues": True,
        "description": "similarity between lines, as a percentage of the common material wrt the combined material",
    },
}

In [21]:
simData = {}

for ((f, t), d) in similarity.items():
    simData.setdefault(f, {})[t] = d

In [22]:
TF.save(
    edgeFeatures=dict(sim=simData), metaData=metaData, module=module
)

  0.00s Exporting 0 node and 1 edge and 0 config features to ./1.3:
   |     0.06s T sim                  to ./1.3
  0.06s Exported 0 node features and 1 edge features and 0 config features to ./1.3


True

# Turn the parallels feature into a module

Here we show how to turn the new feature `sim` into a module, so that users can easily load it in a Jupyter notebook or in the TF browser.

In [27]:
%%bash
text-fabric-zip 'etcbc/dss/parallels/tf'

-bash: line 1: text-fabric-zip: command not found


CalledProcessError: Command 'b"text-fabric-zip 'etcbc/dss/parallels/tf'\n"' returned non-zero exit status 127.

I have added this file to a new release of the DSS Github repo.

# Use the parallels module

We load the DSS corpus again, but now with the parallels module.

In [30]:
A = use("dss:clone", checkout="clone", hoist=globals())

Using TF-app in /Users/dirk/github/annotation/app-dss/code:
	repo clone offline under ~/github (local github)
Using data in /Users/dirk/github/etcbc/dss/tf/0.6:
	repo clone offline under ~/github (local github)
Using data in /Users/dirk/github/etcbc/dss/parallels/tf/0.6:
	repo clone offline under ~/github (local github)
   |     0.00s No structure info in otext, the structure part of the T-API cannot be used


Lo and behold: you see the parallels module listed with one feature: `sim`. It is in *italics*, which indicates
it is an edge feature.

We just do a quick check here and in another notebook we study parallels a bit more, using the feature `sim`.

We count how many similar pairs their are, and how many 100% similar pairs there are.

In [31]:
query = """
line
-sim> line
"""
results = A.search(query)
refNode = results[20000][0]
refNode

  0.25s 51862 results


1565737

In [32]:
query = """
line
-sim=100> line
"""
results = A.search(query)

  0.14s 3646 results


Let's show a few of the pairs are 100 percent similar.

In [33]:
A.table(results, start=1, end=10, withNodes=True)

n,p,line,line.1
1,4Q268 f1:15,את ארצ׳ו ולדשן בטוב אדמת׳ו ׃ ויבינו בעונ׳ם וידעו כי 1552980,את ארצ׳ו ולדשן בטוב אדמת׳ו ׃ ויבינו בעוונ׳מה וידעו כי 1563775
2,4Q267 f2:4,ובקץ חרבן הארץ עמדו מסיגי הגבול ויתעו את ישראל ׃ 1553076,ובקץ חורבן הארץ עמדו מסיגי גבול ויתעו את ישראל 1563610
3,4Q266 f3iii:19,אשר בזה ישראל את דברי׳הם ׃ והכוכב הוא דורש התורה 1553116,אשר בזה ישראל את דברי׳הם ׃ והכוכב הוא דורש התורה 1563157
4,4Q266 f9i:3,אל יעל׳ה איש בסולם וחבל וכלי ׃ אל יעל איש למזבח בשבת 1553203,בסולם וחבל וכלי ׃ אל יעל איש למזבח בשבת 1563345
5,4Q464b f1:1,ε אמר ל׳הם 1553296,ε לאמר ל ε ׃ 1575440
6,PAM43663 f43:1,ε אמר ל׳הם 1553296,ε # אמר ל׳י ε 1588433
7,1Q4 f18:1,ε אמר ל׳הם 1553296,ε לאמור ε ׃ 1589783
8,4Q264 f1:10,חמר קורצ ולעפר תשוקת׳ו ׃ מה ישיב חמר ויוצר יד ולעצת מה יבין ׃ 1553669,קורץ ולעפר תשוקת׳ו ׃ מה ישיב חמר ויוצר יד לעצת מה יבין ׃ 1562924
9,4Q286 f11:3,ε קודש׳ו # ε ׃ 1553730,ε ת קודש ε ׃ 1564740
10,4Q401 f31:3,ε קודש׳ו # ε ׃ 1553730,ε י קודש ε ׃ 1571118


There is also a lower level way to work with edge features.

We can list all edges going out from a reference node.
What we see is tuple of pairs: the target node and the similarity between the reference node and that target node.

In [34]:
E.sim.f(refNode)

((1565738, 100),
 (1565739, 100),
 (1565740, 100),
 (1565741, 100),
 (1565742, 100),
 (1565744, 67),
 (1565745, 62),
 (1565781, 60),
 (1565782, 60),
 (1565783, 60),
 (1565791, 67),
 (1565792, 67),
 (1565793, 67),
 (1565794, 60),
 (1565796, 60),
 (1565808, 60),
 (1565809, 71),
 (1565811, 71),
 (1565813, 71),
 (1565815, 71),
 (1565819, 67),
 (1565847, 67),
 (1565848, 67),
 (1565854, 75),
 (1565855, 75),
 (1565856, 67),
 (1565857, 67),
 (1565858, 67),
 (1565859, 67),
 (1565887, 71),
 (1565900, 71),
 (1565901, 71),
 (1565902, 71),
 (1565903, 71),
 (1565905, 71),
 (1565906, 71),
 (1565907, 71),
 (1565974, 100),
 (1565976, 67),
 (1565980, 67),
 (1565982, 67),
 (1566001, 67),
 (1566018, 71),
 (1566032, 71),
 (1573353, 67))

Likewise, we can observe the nodes that target the reference node:

In [35]:
E.sim.t(refNode)

((1565730, 100),
 (1565731, 100),
 (1565732, 100),
 (1565733, 100),
 (1565734, 100),
 (1565735, 100),
 (1565736, 100))

Both sets of nodes are similar to the reference node and it is inconvenient to use both `.f()` and `.t()` to get the similar lines.

But there is another way:

In [36]:
E.sim.b(refNode)

((1565730, 100),
 (1565731, 100),
 (1565732, 100),
 (1565733, 100),
 (1565734, 100),
 (1565735, 100),
 (1565736, 100),
 (1565738, 100),
 (1565739, 100),
 (1565740, 100),
 (1565741, 100),
 (1565742, 100),
 (1565744, 67),
 (1565745, 62),
 (1565781, 60),
 (1565782, 60),
 (1565783, 60),
 (1565791, 67),
 (1565792, 67),
 (1565793, 67),
 (1565794, 60),
 (1565796, 60),
 (1565808, 60),
 (1565809, 71),
 (1565811, 71),
 (1565813, 71),
 (1565815, 71),
 (1565819, 67),
 (1565847, 67),
 (1565848, 67),
 (1565854, 75),
 (1565855, 75),
 (1565856, 67),
 (1565857, 67),
 (1565858, 67),
 (1565859, 67),
 (1565887, 71),
 (1565900, 71),
 (1565901, 71),
 (1565902, 71),
 (1565903, 71),
 (1565905, 71),
 (1565906, 71),
 (1565907, 71),
 (1565974, 100),
 (1565976, 67),
 (1565980, 67),
 (1565982, 67),
 (1566001, 67),
 (1566018, 71),
 (1566032, 71),
 (1573353, 67))

Let's make sure that `.b()` gives the combination of `.f()` and `.t()`.

In [37]:
f = {x[0] for x in E.sim.f(refNode)}
b = {x[0] for x in E.sim.b(refNode)}
t = {x[0] for x in E.sim.t(refNode)}

# are f and t disjoint ?

print(f"the intersection of f and t is {f & t}")

# is b the union of f and t ?

print(f"t | f = b ? {f | t == b}")

the intersection of f and t is set()
t | f = b ? True
