In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import collections
import pickle
import gzip

from tf.app import use

In [3]:
A = use('dss:clone', checkout='clone', hoist=globals())

Using TF-app in /Users/dirk/github/annotation/app-dss/code:
	repo clone offline under ~/github (local github)
Using data in /Users/dirk/github/etcbc/dss/tf/0.4:
	repo clone offline under ~/github (local github)


# Parallels

We make edges between similar lines.

When are lines similar?

If a certain distance metric is above a certain threshold.

We choose this metric:

* we reduce a line to the set of lexemes in it.
* the similarity between two lines is the length of the intersection divided by the length of the union of their sets times 100.

# Preparation

We pre-compute all sets for lines.

But because not all lines are filled with definite material, we exclude lines with 5 or less consonants.

In [5]:
CONS = 'cons'

valid = set()

allLines = F.otype.s('line')

for l in F.otype.s('line'):
  if l in valid:
    continue
  if sum(1 for s in L.d(l, otype='sign') if F.type.v(s) == CONS) >= 5:
    valid.add(l)
      
info(f'{len(valid)} contentful lines out of {len(allLines)}')

 3m 41s 37106 contentful lines out of 52895


In [7]:
def makeSet(l):
  lineSet = set()
  for s in L.d(l, otype='word'):
    r = F.lex.v(s)
    if r:
      lineSet.add(r)
  return lineSet

In [8]:
lines = {}

for l in valid:
  lineSet = makeSet(l)
  if lineSet:
    lines[l] = lineSet
    
nLines = len(lines)
print(f'{nLines} lines')

37106 lines


# Measure

In [9]:
def sim(lSet, mSet):
  return int(round(100 * len(lSet & mSet) / len(lSet | mSet)))

# Compute all similarities

We are going to perform more than half a billion of comparisons, each of which is more than an elemetary operation.

Let's measure time.

In [10]:
THRESHOLD = 60

def computeSim(limit=None):
  similarity = {}

  lineNodes = sorted(lines.keys())
  nLines = len(lineNodes)

  nComparisons = nLines * (nLines - 1) // 2

  print(f'{nComparisons} comparisons to make')
  chunkSize = nComparisons // 1000

  co = 0
  b = 0
  si = 0
  p = 0

  indent(reset=True)

  stop = False
  for i in range(nLines):
    nodeI = lineNodes[i]
    lineI = lines[nodeI]
    for j in range(i + 1, nLines):
      nodeJ = lineNodes[j]
      lineJ = lines[nodeJ]
      s = sim(lineI, lineJ)
      co += 1
      b += 1
      if b == chunkSize:
        p += 1
        info(f'{p:>3}‰ - {co:>12} comparisons and {si:>10} similarities')
        b = 0
        if limit is not None and p >= limit:
          stop = True
          break

      if s < THRESHOLD:
        continue
      similarity[(nodeI, nodeJ)] = sim(lineI, lineJ)
      si += 1
    if stop:
      break

  info(f'{p:>3}% - {co:>12} comparisons and {si:>10} similarities')
  return similarity

We are going to run it to several ‰ first and do some checks then.

In [11]:
similarity = computeSim(limit=10)

688409065 comparisons to make
  1.03s   1‰ -       688409 comparisons and         12 similarities
  2.08s   2‰ -      1376818 comparisons and         20 similarities
  3.10s   3‰ -      2065227 comparisons and         28 similarities
  4.13s   4‰ -      2753636 comparisons and         34 similarities
  5.17s   5‰ -      3442045 comparisons and         40 similarities
  6.21s   6‰ -      4130454 comparisons and         57 similarities
  7.26s   7‰ -      4818863 comparisons and         70 similarities
  8.26s   8‰ -      5507272 comparisons and         81 similarities
  9.31s   9‰ -      6195681 comparisons and         92 similarities
    10s  10‰ -      6884090 comparisons and        107 similarities
    10s  10% -      6884090 comparisons and        107 similarities


We check the sanity of the results.

In [12]:
print(min(similarity.values()))
print(max(similarity.values()))

60
100


In [13]:
eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 70]

In [14]:
print(len(eq))
print(len(neq))

3
38


In [15]:
print(eq[0])
print(neq[0])

((1552976, 1563771), 100)
((1552969, 1563765), 69)


In [16]:
A.plain(eq[0][0][0])
A.plain(eq[0][0][1])

In [17]:
A.plain(eq[0][0][0], fmt='text-trans-full')
A.plain(eq[0][0][1], fmt='text-trans-full')

Looks good.

Now the whole computation.

But if we have done this before, and nothing has changed, we load previous results from disk.

If we do not find previous results, we compute them and save the results to disk.

In [18]:
PARA_DIR = f'{A.tempDir}/parallels'

def writeResults(data, location, name):
  if not os.path.exists(location):
    os.makedirs(location, exist_ok=True)
  path = f'{location}/{name}'
  with gzip.open(path, 'wb') as f:
    pickle.dump(data, f)
  print(f'Data written to {path}')
  
def readResults(location, name):
  path = f'{location}/{name}'
  if not os.path.exists(path):
    print(f'File not found: {path}')
    return None
  with gzip.open(path, 'rb') as f:
    data = pickle.load(f)
  print(f'Data read from {path}')
  return data

In [19]:
similarity = readResults(PARA_DIR, f'sim-{A.version}.zip')
if not similarity:
  similarity = computeSim()
  writeResults(similarity, PARA_DIR, f'sim-{A.version}.zip')

File not found: /Users/dirk/github/etcbc/dss/_temp/parallels/sim-0.4.zip
688409065 comparisons to make
  1.03s   1‰ -       688409 comparisons and         12 similarities
  2.06s   2‰ -      1376818 comparisons and         20 similarities
  3.14s   3‰ -      2065227 comparisons and         28 similarities
  4.16s   4‰ -      2753636 comparisons and         34 similarities
  5.18s   5‰ -      3442045 comparisons and         40 similarities
  6.20s   6‰ -      4130454 comparisons and         57 similarities
  7.23s   7‰ -      4818863 comparisons and         70 similarities
  8.24s   8‰ -      5507272 comparisons and         81 similarities
  9.26s   9‰ -      6195681 comparisons and         92 similarities
    10s  10‰ -      6884090 comparisons and        107 similarities
    11s  11‰ -      7572499 comparisons and        122 similarities
    12s  12‰ -      8260908 comparisons and        135 similarities
    13s  13‰ -      8949317 comparisons and        148 similarities
    14s  14‰ 

In [20]:
len(similarity)

51860

So, just over 50,000 pairs of similar lines.

Let's find out which lines have the most correspondences.

In [21]:
parallels = {}

for (l, m) in similarity:
  parallels.setdefault(l, set()).add(m)
  parallels.setdefault(m, set()).add(l)
  
print(f'{len(parallels)} out of {nLines} lines have at least one similar line')

16112 out of 37106 lines have at least one similar line


In [22]:
rankedParallels = sorted(
  parallels.items(),
  key=lambda x: (-len(x[1]), x[0]),
)

In [23]:
for (l, paras) in rankedParallels[0:10]:
  print(f'{len(paras):>4} siblings of {l} = {T.text(l)} = {T.text(l, fmt="text-source-full", descend=True)}')

 317 siblings of 1554663 = ε  # ם והב #  # ל #  #  #   #  #  #  #  #  #  ε  = -- \M whb\\l\\\ \\\\\\ -- 
 291 siblings of 1565606 = ε ותי׳כם ε  = -- wty/kM -- 
 291 siblings of 1569615 = ε  # ותי׳הם  #  ε ׃  = -- \wty/hM \ -- . 
 291 siblings of 1578905 = ε  #   #   # ותי׳כה ε ׃  = -- \ \ \wty/kh -- . 
 291 siblings of 1579077 = ε  # ותי׳נו ε  = -- \wty/nw -- 
 190 siblings of 1555317 = ε ירים למ #  ε  = -- yryM lm\ -- 
 190 siblings of 1577058 = ε ות׳ם לה #  ε  = -- wt/M lh\ -- 
 190 siblings of 1582367 = ε  # ין ל׳הון ε  = -- \yN l/hwN -- 
 181 siblings of 1554552 = ε  #  #  #  # ם וכול  #  #    #  #   = -- \\\\M wkwl \\ □\\ 
 181 siblings of 1559971 = ε ין וכל ε  = -- yN wkl -- 


In [24]:
for (l, paras) in rankedParallels[100:110]:
  print(f'{len(paras):>4} siblings of {T.text(l)} = {T.text(l, fmt="text-source-full", descend=True)}')

 102 siblings of ε ם והפריח ε  = -- M whpryj -- 
 102 siblings of וצואהוא  #  ε  = wxwahwa \ -- 
 102 siblings of ε  #  כרם וה   ׃  = -- \ krM wh □ . 
 102 siblings of יחדו וית #  ε  = yjdw wyt\ -- 
 102 siblings of וכ # ל ε ׳כה  = wk\l -- /kh 
 102 siblings of ε ים ואיכה  = -- yM waykh 
 102 siblings of ε  #  ותוצאת ε  = -- \ wtwxat -- 
 102 siblings of וי #   # חוץ ו #  ε  = wy\ \jwX w\ -- 
 102 siblings of ε י׳הם ודב ε  = -- y/hM wdb -- 
 102 siblings of ε ת ומנינ #  ε ׃  = -- t wmnyn\ -- . 


In [25]:
for (l, paras) in rankedParallels[500:510]:
  print(f'{len(paras):>4} siblings of {T.text(l)} = {T.text(l, fmt="text-source-full", descend=True)}')

  45 siblings of ε ב׳כה ובתורה ε  = -- b/kh wbtwrh -- 
  45 siblings of ε ים אשׁר ε  = -- yM aCr -- 
  45 siblings of אלוהים לכול ε  = alwhyM lkwl -- 
  45 siblings of ובבינת ε  = wbbynt -- 
  45 siblings of ε ית׳כה אשׁר ε  = -- yt/kh aCr -- 
  45 siblings of ε  # י׳כה אשׁר ε  = -- \y/kh aCr -- 
  45 siblings of ובעשׁרין ε  = wboCryN -- 
  45 siblings of ε ים אשׁר ε ׃ ╱  = -- yM aCr -- . ╱ 
  44 siblings of ε ובעדת׳נו   ε ׃  = -- wbodt/nw □ -- . 
  44 siblings of ε לכול עולמים ε ׃  = -- lkwl owlmyM -- . 


And how many lines have just one correspondence?

We look at the tail of rankedParallels.

In [26]:
pairs = [(x, list(paras)[0]) for (x, paras) in rankedParallels if len(paras) == 1]
print(f'There are {len(pairs)} exclusively parallel pairs of lines')

There are 7424 exclusively parallel pairs of lines


In [27]:
from tf.applib.helpers import dm

In [28]:
for (x, y) in pairs[0:10]:
  dm('---\n')
  print(f'similarity {similarity[(x,y)]}')
  A.plain(x, fmt='layout-orig-full')
  A.plain(y, fmt='layout-orig-full')

---


similarity 69


---


similarity 85


---


similarity 83


---


similarity 67


---


similarity 83


---


similarity 73


---


similarity 62


---


similarity 64


---


similarity 79


---


similarity 79


Why not make an overview of exactly how wide-spread parallel lines are?

We count how many lines have how many parallels.

In [30]:
parallelCount = collections.Counter()

buckets = (2, 10, 20, 50, 100)

bucketRep = {}
prevBucket = None
for bucket in buckets:
  if prevBucket is None:
    bucketRep[bucket] = f'       n <= {bucket:>3}'
  elif bucket == buckets[-1]:
    bucketRep[bucket] = f'       n >  {bucket:>3}'
  else:
    bucketRep[bucket] = f'{prevBucket:>3} <  n <= {bucket:>3}'
  prevBucket = bucket

for (l, paras) in rankedParallels:
  clusterSize = len(paras) + 1
  if clusterSize > buckets[-1]:
    theBucket = buckets[-1]
  else:
    for bucket in buckets:
      if clusterSize <= bucket:
        theBucket = bucket
        break
  parallelCount[theBucket] += 1
  
for (bucket, amount) in sorted(
  parallelCount.items(),
  key=lambda x: (-x[0], x[1]),
):
  print(f'{amount:>4} lines have {bucketRep[bucket]} sisters')

 445 lines have        n >  100 sisters
 720 lines have  20 <  n <=  50 sisters
1047 lines have  10 <  n <=  20 sisters
6476 lines have   2 <  n <=  10 sisters
7424 lines have        n <=   2 sisters


# Add parallels to the TF dataset

We can add this information to the DSS dataset as an *edge feature*.

An edge feature links two nodes and may annotate that link with a value.

For parallels, we link each line to each of its parallel lines and we annotate that link with the similarity between
the two lines. The similarity is a percentage, and we round it to integer values.

If *n1* is similar to *n2*, then *n2* is similar to *n1*.
In order to save space, we only add such links once.

We can then use
[`E.sim.b(node)`](https://annotation.github.io/text-fabric/Api/Features/#edge-features)
to find all nodes that are parallel to node.


In [31]:
metaData = {
  '': {
    'acronym': 'dss',
    'description': 'parallel lines in the DSS (computed)',
    'createdBy': 'Dirk Roorda',
    'createdDate': '2019-05-09',
    'sourceCreatedDate': '2015',
    'sourceCreatedBy': 'Martin G. Abegg, Jr., James E. Bowley, and Edward M. Cook',
    'convertedBy': 'Jarod Jacobs, Martijn Naaijer and Dirk Roorda',
    'source': "Martin Abegg's data files, personal communication",
    'license': 'Creative Commons Attribution-NonCommercial 4.0 International License',
    'licenseUrl': 'http://creativecommons.org/licenses/by-nc/4.0/',
    'sourceDescription': 'Dead Sea Scrolls: biblical and non-biblical scrolls',
  },
  'sim': {
    'valueType': 'int',
    'edgeValues': True,
    'description': 'similarity between lines, as a percentage of the common material wrt the combined material',
  },
}

In [34]:
simData = {}

for ((f, t), d) in similarity.items():
  simData.setdefault(f, {})[t] = d

In [35]:
ghBase = os.path.expanduser('~/github')
subdir = 'parallels'
path = f'{A.org}/{A.repo}/{subdir}/tf'
location = f'{ghBase}/{path}'
module = A.version

In [36]:
TF.save(edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module)

   |     0.11s T sim                  to /Users/dirk/github/etcbc/dss/parallels/tf/0.4


True

# Turn the parallels feature into a module

Here we show how to turn the new feature `sim` into a module, so that users can easily load it in a Jupyter notebook or in the TF browser.

In [37]:
%%bash
text-fabric-zip 'etcbc/dss/parallels/tf'

True
Create release data for etcbc/dss/parallels/tf
Found 1 versions
zip files end up in /Users/dirk/Downloads/etcbc-release/dss
zipping etcbc/dss                  0.4 with   1 features ==> parallels-tf-0.4.zip


I have added this file to a new release of the DSS Github repo.

# Use the parallels module

We load the DSS corpus again, but now with the parallels module.

In [38]:
A = use('dss:clone', checkout='clone', hoist=globals(), mod='etcbc/dss/parallels/tf:clone')

Using TF-app in /Users/dirk/github/annotation/app-dss/code:
	repo clone offline under ~/github (local github)
Using data in /Users/dirk/github/etcbc/dss/tf/0.4:
	repo clone offline under ~/github (local github)
Using data in /Users/dirk/github/etcbc/dss/parallels/tf/0.4:
	repo clone offline under ~/github (local github)


Lo and behold: you see the parallels module listed with one feature: `sim`. It is in *italics*, which indicates
it is an edge feature.

We just do a quick check here and in another notebook we study parallels a bit more, using the feature `sim`.

We count how many similar pairs their are, and how many 100% similar pairs there are.

In [39]:
query = '''
line
-sim> line
'''
results = A.search(query)

  0.26s 51860 results


In [40]:
query = '''
line
-sim=100> line
'''
results = A.search(query)

  0.12s 3647 results


Let's show a few of the pairs are 100 percent similar.

In [41]:
A.table(results, start=1, end=10, withNodes=True)

n,p,line,line.1
1,4Q268 f1:15,את ארצ׳ו ולדשׁן בטוב אדמת׳ו ׃ ויבינו בעונ׳ם וידעו כי 1552976,את ארצ׳ו ולדשׁן בטוב אדמת׳ו ׃ ויבינו בעוונ׳מה וידעו כי 1563771
2,4Q267 f2:4,ובקץ חרבן הארץ עמדו מסיגי הגבול ויתעו את ישׁראל ׃ 1553072,ובקץ חורבן הארץ עמדו מסיגי גבול ויתעו את ישׁראל 1563606
3,4Q266 f3iii:19,אשׁר בזה ישׁראל את דברי׳הם ׃ והכוכב הוא דורשׁ התורה 1553112,אשׁר בזה ישׁראל את דברי׳הם ׃ והכוכב הוא דורשׁ התורה 1563153
4,4Q266 f9i:3,אל יעל׳ה אישׁ בסולם וחבל וכלי ׃ אל יעל אישׁ למזבח בשׁבת 1553199,בסולם וחבל וכלי ׃ אל יעל אישׁ למזבח בשׁבת 1563341
5,4Q464b f1:1,ε אמר ל׳הם 1553292,ε לאמר ל ε ׃ 1575436
6,PAM43663 f43:1,ε אמר ל׳הם 1553292,ε # אמר ל׳י ε 1588429
7,1Q4 f18:1,ε אמר ל׳הם 1553292,ε לאמור ε ׃ 1589779
8,4Q264 f1:10,חמר קורצ ולעפר תשׁוקת׳ו ׃ מה ישׁיב חמר ויוצר יד ולעצת מה יבין ׃ 1553665,קורץ ולעפר תשׁוקת׳ו ׃ מה ישׁיב חמר ויוצר יד לעצת מה יבין ׃ 1562920
9,4Q286 f11:3,ε קודשׁ׳ו # ε ׃ 1553726,ε ת קודשׁ ε ׃ 1564736
10,4Q401 f31:3,ε קודשׁ׳ו # ε ׃ 1553726,ε י קודשׁ ε ׃ 1571114


There is also a lower level way to work with edge features.

We can list all edges going out from a reference node.
What we see is tuple of pairs: the target node and the similarity between the reference node and that target node.

In [42]:
refNode = 1589779

E.sim.f(refNode)

((1592677, 67), (1592988, 75))

Likewise, we can observe the nodes that target the reference node:

In [43]:
E.sim.t(refNode)

((1553292, 100),
 (1556965, 60),
 (1557940, 60),
 (1558853, 67),
 (1564995, 60),
 (1567451, 60),
 (1568541, 75),
 (1569896, 60),
 (1569966, 75),
 (1570382, 75),
 (1573003, 67),
 (1575170, 60),
 (1575178, 75),
 (1575436, 100),
 (1577905, 75),
 (1579952, 75),
 (1588429, 100),
 (1588886, 75))

Both sets of nodes are similar to the reference node and it is inconvenient to use both `.f()` and `.t()` to get the similar lines.

But there is another way:

In [44]:
E.sim.b(refNode)

((1553292, 100),
 (1556965, 60),
 (1557940, 60),
 (1558853, 67),
 (1564995, 60),
 (1567451, 60),
 (1568541, 75),
 (1569896, 60),
 (1569966, 75),
 (1570382, 75),
 (1573003, 67),
 (1575170, 60),
 (1575178, 75),
 (1575436, 100),
 (1577905, 75),
 (1579952, 75),
 (1588429, 100),
 (1588886, 75),
 (1592677, 67),
 (1592988, 75))

Let's make sure that `.b()` gives the combination of `.f()` and `.t()`.

In [45]:
f = {x[0] for x in E.sim.f(refNode)}
b = {x[0] for x in E.sim.b(refNode)}
t = {x[0] for x in E.sim.t(refNode)}

# are f and t disjoint ?

print(f'the intersection of f and t is {f & t}')

# is b the union of f and t ?

print(f't | f = b ? {f | t == b}')

the intersection of f and t is set()
t | f = b ? True
