In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import collections
import pickle
import gzip

from tf.app import use

In [3]:
# Load the corpus through the Text-Fabric app API.
# NOTE(review): hoist=globals() presumably injects the API handles used further
# down (F, L, T, E, TF) into this notebook's namespace — confirm in the TF docs.
A = use(
    "hermans/works:clone",
    checkout="clone",
    backend="gitlab.huc.knaw.nl",
    hoist=globals(),
)

# Parallels

We make edges between similar sentences.

When are sentences similar?

If a certain similarity measure is at or above a certain threshold.

We choose this metric:

* we reduce a sentence to the set of words in it, excluding punctuation.
* the similarity between two sentences is the size of the intersection of their sets divided by the size of the union of their sets, times 100 (i.e. the Jaccard index expressed as a percentage).

# Preparation

We pre-compute all sets for all sentences in the base text.
But we weed out the sentences that do not start with a capital letter.

In [4]:
def makeSet(sentence):
    """Collect the distinct word texts of a sentence.

    The `transb` value of every word in `sentence` is gathered into a set,
    skipping empty values. The sentence is rejected (an empty set is returned)
    when its first word is empty or does not begin with an upper-case letter.

    Parameters
    ----------
    sentence: int
        A sentence node.

    Returns
    -------
    set
        The non-empty word texts, or an empty set for a rejected sentence.
    """
    collected = set()
    for (position, word) in enumerate(L.d(sentence, otype="word")):
        wordText = F.transb.v(word)
        if position == 0:
            startsWithCapital = (
                wordText != ""
                and wordText[0].isalpha()
                and wordText[0].upper() == wordText[0]
            )
            if not startsWithCapital:
                # nothing has been collected yet, so this is the empty set
                return collected
        if wordText:
            collected.add(wordText)

    return collected

In [5]:
# Text-Fabric search template with a /with/ ... /or/ ... /-/ quantifier block.
# NOTE(review): presumably this keeps sentences that either have no `wit` value
# or have wit=base (the "base text" mentioned in the prose) — confirm against
# the Text-Fabric search documentation.
query = """
sentence
/with/
.. wit#
/or/
.. wit=base
/-/
"""
# With shallow=True the results are consumed below as a collection of
# sentence nodes (each result is passed directly to makeSet).
results = A.search(query, shallow=True)

  0.01s 13163 results


Weed out the sentences that do not start with a capital letter.

In [6]:
# Pair every candidate sentence node with its word set, lazily.
candidateSets = ((node, makeSet(node)) for node in results)

# Keep only the sentences whose word set is non-empty; makeSet returns an
# empty set for sentences that do not start with a capital letter.
sentences = {node: wordSet for (node, wordSet) in candidateSets if wordSet}

nSentences = len(sentences)
print(f"{nSentences} sentences")

10741 sentences


# Measure

In [7]:
def sim(lSet, mSet):
    """Similarity of two sets as an integer percentage.

    The measure is the size of the intersection divided by the size of the
    union (the Jaccard index), multiplied by 100 and rounded to an integer.

    Parameters
    ----------
    lSet, mSet: set
        The two sets to compare.

    Returns
    -------
    int
        A value between 0 and 100. Two empty sets have no material in common,
        so their similarity is defined as 0 instead of raising
        ZeroDivisionError.
    """
    union = lSet | mSet
    if not union:
        return 0
    # round() with a single argument already yields an int in Python 3
    return round(100 * len(lSet & mSet) / len(union))

# Compute all similarities

We are going to perform several millions of comparisons, each of which is more than an elementary operation.

Let's measure time.

In [8]:
# Minimum similarity (as a percentage) for a pair of sentences to be recorded.
THRESHOLD = 50


def computeSim(limit=None):
    """Compare all pairs of sentences and keep the sufficiently similar ones.

    Every unordered pair of nodes in `sentences` is compared with `sim`.
    Pairs whose similarity is at least `THRESHOLD` are recorded.
    Progress is reported through `A.info` after each 1% of the comparisons.

    Parameters
    ----------
    limit: int, optional
        If given, stop after this many percent of the comparisons have been
        made. Useful for a quick trial run.

    Returns
    -------
    dict
        Keyed by `(nodeI, nodeJ)` with `nodeI < nodeJ`, valued by the
        similarity of the two sentences as an integer percentage.
    """
    similarity = {}

    sentenceNodes = sorted(sentences.keys())
    nSentences = len(sentenceNodes)

    nComparisons = nSentences * (nSentences - 1) // 2

    print(f"{nComparisons} comparisons to make")
    # One progress message per chunk; a chunk is 1% of the work.
    # With fewer than 100 comparisons the chunk size is 0 and no intermediate
    # progress is reported (and `limit` never triggers).
    chunkSize = nComparisons // 100

    co = 0  # comparisons made so far
    b = 0  # comparisons made within the current chunk
    si = 0  # similar pairs found so far
    p = 0  # percentage of the work done

    A.indent(reset=True)

    stop = False
    for i in range(nSentences):
        nodeI = sentenceNodes[i]
        sentenceI = sentences[nodeI]
        for j in range(i + 1, nSentences):
            nodeJ = sentenceNodes[j]
            sentenceJ = sentences[nodeJ]
            s = sim(sentenceI, sentenceJ)
            co += 1
            b += 1

            # Record the pair before reporting progress, so that the reported
            # counts include this comparison and the pair is not lost when the
            # limit is reached on exactly this comparison.
            if s >= THRESHOLD:
                # reuse the value computed above instead of calling sim again
                similarity[(nodeI, nodeJ)] = s
                si += 1

            if b == chunkSize:
                p += 1
                A.info(f"{p:>3}% - {co:>12} comparisons and {si:>10} similarities")
                b = 0
                if limit is not None and p >= limit:
                    stop = True
                    break
        if stop:
            break

    A.info(f"{p:>3}% - {co:>12} comparisons and {si:>10} similarities")
    return similarity

We are going to run it to a few % first and do some checks then.

In [9]:
# Trial run: computeSim stops once 10% of the comparisons have been made.
similarity = computeSim(limit=10)

57679170 comparisons to make
  0.66s   1% -       576791 comparisons and          1 similarities
  1.27s   2% -      1153582 comparisons and          1 similarities
  1.88s   3% -      1730373 comparisons and          2 similarities
  2.47s   4% -      2307164 comparisons and          2 similarities
  3.06s   5% -      2883955 comparisons and          7 similarities
  3.65s   6% -      3460746 comparisons and          7 similarities
  4.22s   7% -      4037537 comparisons and         11 similarities
  4.80s   8% -      4614328 comparisons and         12 similarities
  5.38s   9% -      5191119 comparisons and         16 similarities
  6.00s  10% -      5767910 comparisons and         19 similarities
  6.00s  10% -      5767910 comparisons and         19 similarities


We check the sanity of the results.

In [10]:
# Lowest and highest similarity found; 0 when nothing was found at all.
print(min(similarity.values(), default=0))
print(max(similarity.values(), default=0))

50
100


In [11]:
# eq: the pairs that are 100% similar.
eq = [(pair, score) for (pair, score) in similarity.items() if score >= 100]
# neq: the pairs that only just made it, i.e. with a similarity at the threshold.
neq = [(pair, score) for (pair, score) in similarity.items() if score <= THRESHOLD]

In [12]:
# Number of 100% pairs, followed by the number of pairs at the threshold.
print(len(eq))
print(len(neq))

1
11


In [13]:
# Show one example of each kind, or 0 when there is none.
print(eq[0] if eq else 0)
print(neq[0] if neq else 0)

((255776, 271017), 100)
((256084, 266565), 50)


Looks good.

Now the whole computation.

But if we have done this before, and nothing has changed, we load previous results from disk.

If we do not find previous results, we compute them and save the results to disk.

In [14]:
# Directory under the app's temp directory where computed similarities are cached.
PARA_DIR = f"{A.tempDir}/parallels"


def writeResults(data, location, name):
    """Pickle `data` and write it gzip-compressed to `location/name`.

    The directory `location` is created when it does not exist yet.

    Parameters
    ----------
    data: object
        Any picklable object.
    location: string
        Directory to write to.
    name: string
        File name within `location`.

    Returns
    -------
    None
        A confirmation with the full path is printed.
    """
    # exist_ok=True already tolerates an existing directory, so a separate
    # existence check beforehand is redundant (and racy).
    os.makedirs(location, exist_ok=True)
    path = f"{location}/{name}"
    with gzip.open(path, "wb") as f:
        pickle.dump(data, f)
    print(f"Data written to {path}")


def readResults(location, name):
    """Load a gzip-compressed pickle from `location/name`.

    Parameters
    ----------
    location: string
        Directory to read from.
    name: string
        File name within `location`.

    Returns
    -------
    object | None
        The unpickled data, or None when the file does not exist.
        In both cases a message with the full path is printed.
    """
    path = f"{location}/{name}"
    if os.path.exists(path):
        # NOTE: unpickling executes code from the file; only read files that
        # were written by this notebook.
        with gzip.open(path, "rb") as f:
            data = pickle.load(f)
        print(f"Data read from {path}")
        return data

    print(f"File not found: {path}")
    return None

In [15]:
# The cache file is specific to the version of the dataset.
simFile = f"sim-{A.version}.zip"

similarity = readResults(PARA_DIR, simFile)
# readResults returns None when there is no cached file. Test for None
# explicitly: a cached result that happens to be empty is still a valid
# result and must not trigger a full recomputation.
if similarity is None:
    similarity = computeSim()
    writeResults(similarity, PARA_DIR, simFile)

File not found: /Users/me/gitlab.huc.knaw.nl/hermans/works/_temp/parallels/sim-0.4.zip
57679170 comparisons to make
  0.68s   1% -       576791 comparisons and          1 similarities
  1.30s   2% -      1153582 comparisons and          1 similarities
  1.90s   3% -      1730373 comparisons and          2 similarities
  2.50s   4% -      2307164 comparisons and          2 similarities
  3.10s   5% -      2883955 comparisons and          7 similarities
  3.69s   6% -      3460746 comparisons and          7 similarities
  4.26s   7% -      4037537 comparisons and         11 similarities
  4.87s   8% -      4614328 comparisons and         12 similarities
  5.48s   9% -      5191119 comparisons and         16 similarities
  6.12s  10% -      5767910 comparisons and         19 similarities
  6.69s  11% -      6344701 comparisons and         19 similarities
  7.25s  12% -      6921492 comparisons and         24 similarities
  7.84s  13% -      7498283 comparisons and         24 similarities


In [16]:
# Number of sentence pairs with a similarity of at least THRESHOLD.
len(similarity)

261

So, not too many similarities.

Let's find out which sentences have the most correspondences.

In [17]:
# For every sentence, the set of sentences it is similar to.
parallels = {}

for (left, right) in similarity:
    # each pair is stored once, so register it in both directions
    for (node, other) in ((left, right), (right, left)):
        parallels.setdefault(node, set()).add(other)

print(f"{len(parallels)} out of {nSentences} lines have at least one similar line")

339 out of 10741 lines have at least one similar line


In [18]:
def _byCountThenNode(item):
    """Sort key: most parallels first, ties broken by ascending node number."""
    (node, siblings) = item
    return (-len(siblings), node)


rankedParallels = sorted(parallels.items(), key=_byCountThenNode)

In [19]:
# Sentences that have already been printed, either as head of a group or as sibling.
seen = set()


def getPos(node):
    """Format the section reference and node number of `node` for display.

    The section string is left-aligned in 15 positions, the node number is
    right-aligned in 5 positions.
    """
    return f"{A.sectionStrFromNode(node):<15} @ {node:>5}"


# Print each sentence with its similar sentences, largest groups first.
# A sentence that has already been shown (as head or as sibling) is not used
# as the head of a new group.
for (sentence, paras) in rankedParallels:
    if sentence in seen:
        continue
    plural = " " if len(paras) == 1 else "s"
    prefix = f"{len(paras):>4} sibling{plural} of "
    blank = " " * len(prefix)
    print(f"{prefix}{getPos(sentence)} ====== {T.text(sentence).strip()}")
    for para in paras:
        # every pair is stored only once, in one of the two orientations
        pair = (sentence, para) if (sentence, para) in similarity else (para, sentence)
        # Do not call this variable `sim`: at the top level of the notebook that
        # would overwrite the function sim() and break a later computeSim() run.
        simValue = similarity[pair]
        print(f"{blank}{getPos(para)} ={simValue:>3}%= {T.text(para).strip()}")
        seen.add(para)
    print("")
    seen.add(sentence)

                 nms 17:p861.0   @ 268992 = 50%= Ik gaap.
                 nms 20:p1036.0  @ 269794 = 50%= Ik spring.
                 nms 34:p1734.0  @ 272887 = 50%= Ik zeg:
                 paranoia 6:p59.0 @ 258630 = 50%= Ik sliep.
                 paranoia 6:p32.0 @ 258278 = 50%= Ik wachtte.
                 nms 7:p284.0    @ 266699 = 50%= Ik niet.
                 paranoia 2:p40.0 @ 256084 = 50%= Ik las...
                 nms 20:p1027.0  @ 269751 = 50%= Ik niet...
                 paranoia 2:p92.0 @ 256537 = 50%= Ik viel.
                 nms 31:p1575.0  @ 272090 = 50%= Ik kijk.
                 nms 34:p1738.0  @ 272891 =100%= Ik:
                 nms 34:p1741.0  @ 272895 =100%= Ik:

                 nms 18:p965.0   @ 269409 = 50%= Hij glimlacht.
                 nms 5:p228.0    @ 266435 = 50%= Hij wijst.
                 paranoia 7:p135.0 @ 259556 = 50%= Hij lachte.
                 nms 5:p228.0    @ 266437 = 50%= Hij spreekt.
                 nms 20:p1036.0  @ 269797 = 50%= Hij

And how many lines have just one correspondence?

We look at the tail of rankedParallels.

Why not make an overview of exactly how wide-spread parallel lines are?

We count how many lines have how many parallels.

# Add parallels to the TF dataset

We can add this information to the `hermans/works` dataset as an *edge feature*.

An edge feature links two nodes and may annotate that link with a value.

For parallels, we link each sentence to each of its parallel sentences and we annotate that link with the similarity between
the two sentences. The similarity is a percentage, and we round it to integer values.

If *n1* is similar to *n2*, then *n2* is similar to *n1*.
In order to save space, we only add such links once.

We can then use
[`E.sim.b(node)`](https://annotation.github.io/text-fabric/Api/Features/#edge-features)
to find all nodes that are parallel to node.


In [20]:
from tfFromTei import SETTINGS

# Metadata for the feature that is exported below.
metaData = {
    # NOTE(review): the "" key presumably holds metadata shared by all
    # features — confirm against the TF.save documentation.
    "": SETTINGS["generic"],
    "sim": {
        # similarities are integer percentages
        "valueType": "int",
        # each edge is annotated with a value: the similarity
        "edgeValues": True,
        "description": (
            "similarity between sentences "
            " as a percentage of the common material wrt. the combined material"
        ),
    },
}

In [21]:
# Reshape the flat {(from, to): value} mapping into the nested
# {from: {to: value}} form that is passed on as edge feature data.
simData = {}
for ((source, target), score) in similarity.items():
    simData.setdefault(source, {})[target] = score

In [22]:
# Build the target directory for the new module inside the local clone:
# ~/<backend>/<org>/<repo>/parallels/tf
backendBase = os.path.expanduser(f"~/{A.backend}")
mod = "parallels"
path = f"{A.context.org}/{A.context.repo}/{mod}/tf"
location = f"{backendBase}/{path}"
# the module is stored under the same version as the main dataset
module = A.version

In [24]:
# Export `sim` as the only (edge) feature of the parallels module.
TF.save(
    edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module, silent="auto"
)

  0.00s Exporting 0 node and 1 edge and 0 config features to ~/gitlab.huc.knaw.nl/hermans/works/parallels/tf/0.4:
   |     0.00s T sim                  to ~/gitlab.huc.knaw.nl/hermans/works/parallels/tf/0.4
  0.00s Exported 0 node features and 1 edge features and 0 config features to ~/gitlab.huc.knaw.nl/hermans/works/parallels/tf/0.4


True

# Use the parallels module

We load the `hermans/works` corpus again, but now with the parallels module.

In [25]:
# Same corpus as before, now with the freshly written parallels module
# (passed via `mod`) loaded alongside it.
A = use(
    "hermans/works:clone",
    hoist=globals(),
    checkout="clone",
    mod="hermans/works/parallels/tf:clone",
    backend="gitlab.huc.knaw.nl",
)

Lo and behold: you see the parallels module listed with one feature: `sim`. It is in *italics*, which indicates
it is an edge feature.

We just do a quick check here.
In another notebook we can study parallels more, using this feature `sim`.

We count how many similar pairs there are, and how many 100% similar pairs there are.

In [26]:
# All pairs of sentences that are connected by a `sim` edge, whatever its value.
query = """
sentence
-sim> sentence
"""
results = A.search(query)

  0.01s 261 results


In [27]:
# Only the pairs of sentences whose `sim` edge has the value 100.
query = """
sentence
-sim=100> sentence
"""
results = A.search(query)

  0.02s 52 results


Let's show the 100% pairs. Note that 100% does not mean *identical*,
because the similarity is based on sentences as sets of words.
So we did not take word order and multiplicity of words into account.

In [28]:
# Tabulate the 100% pairs, showing the node number with each sentence.
A.table(results, withNodes=True, standardFeatures=False, plainGaps=False)

n,p,sentence,sentence.1
1,paranoia 2:p5.0,255776Goed.,271017Goed.
2,paranoia 6:p5.0,258120Ik heb dorst.,270471Ik heb dorst.
3,paranoia 6:p22.0,258237Hij lachte.,259556Hij lachte.
4,paranoia 6:p43.1,258464Niets.,273232Niets.
5,paranoia 7:p18.0,258900Er was niets aan te zien.,259819Er was niets aan te zien.
6,paranoia 7:p51.0,259065Dat kan ik mij niet voorstellen!,272551Dat kan ik mij niet voorstellen.
7,paranoia 7:p105.0,259404Het geheime wapen.,259573Het geheime wapen.
8,paranoia 7:p179.0,259751Boven.,267813Boven.
9,paranoia 7:p179.0,259751Boven.,273702Boven.
10,paranoia 8:p61b.0,260257Glas...,260303Glas...


There is also a lower level way to work with edge features.

We can list all edges going out from a reference node.
What we see is a tuple of pairs: the target node and the similarity between the reference node and that target node.

In [31]:
# First member of the last 100% pair: the node the edge starts from.
refNode1 = results[-1][0]
print(f"{refNode1=}")

# Edges going out from the reference node, as (target node, similarity) pairs.
E.sim.f(refNode1)

refNode1=275393


((275407, 100),)

Likewise, we can observe the nodes that target the reference node:

In [32]:
# Second member of the last 100% pair: the node the edge points to.
refNode2 = results[-1][1]
print(f"{refNode2=}")

# Edges coming in to the reference node, as (source node, similarity) pairs.
E.sim.t(refNode2)

refNode2=275407


((275393, 100),)

Both sets of nodes are similar to the reference node and it is inconvenient to use both `.f()` and `.t()` to get the similar sentences.

But there is another way:

In [33]:
# .b() finds the similar sentence although the edge goes out from refNode1.
E.sim.b(refNode1)

((275407, 100),)

In [34]:
# .b() finds the similar sentence although the edge comes in to refNode2.
E.sim.b(refNode2)

((275393, 100),)