# coding: utf-8
# <img align="right" src="images/dans-small.png"/>
# <img align="right" src="images/tf-small.png"/>
# <img align="right" src="images/etcbc.png"/>
#
#
# # Parallel Passages in the MT
#
# # 0. Introduction
#
# ## 0.1 Motivation
# We want to make a list of **all** parallel passages in the Masoretic Text (MT) of the Hebrew Bible.
#
# Here is a quote that triggered Dirk to write this notebook:
#
# > Finally, the Old Testament Parallels module in Accordance is a helpful resource that enables the researcher to examine 435 sets of parallel texts, or in some cases very similar wording in different texts, in both the MT and translation, but the large number of sets of texts in this database should not fool one to think it is complete or even nearly complete for all parallel writings in the Hebrew Bible.
#
# Robert Rezetko and Ian Young.
# Historical linguistics & Biblical Hebrew. Steps Toward an Integrated Approach.
# *Ancient Near East Monographs, Number 9*. SBL Press Atlanta. 2014.
# [PDF Open access available](https://www.google.nl/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&ved=0CCgQFjAB&url=http%3A%2F%2Fwww.sbl-site.org%2Fassets%2Fpdfs%2Fpubs%2F9781628370461_OA.pdf&ei=2QSdVf-vAYSGzAPArJeYCg&usg=AFQjCNFA3TymYlsebQ0MwXq2FmJCSHNUtg&sig2=LaXuAC5k3V7fSXC6ZVx05w&bvm=bv.96952980,d.bGQ)
# <img align="right" width="50%" src="parallel.png"/>
#
# ## 0.3 Open Source
# This is an IPython notebook.
# It contains a working program to carry out the computations needed to obtain the results reported here.
#
# You can download this notebook and run it on your computer, provided you have
# [Text-Fabric](https://github.com/Dans-labs/text-fabric) installed.
#
# It is a pity that we cannot compare our results with the Accordance resource mentioned above,
# since that resource has not been published in an accessible manner.
# Nor do we have information about how that resource was constructed from the raw data.
# By contrast, we present our results in a completely reproducible manner.
# This notebook itself can serve as the method of replication,
# provided you have obtained the necessary resources.
# See [sources](https://github.com/ETCBC/shebanq/wiki/Sources), which are all Open Access.
#
# ## 0.4 What are parallel passages?
# The notion of *parallel passage* is not a simple, straightforward one.
# On the one hand there are parallels based on the lexical content of passages;
# on the other hand there are also correspondences in certain syntactic structures,
# or even similarities in overall text structure.
#
# In this notebook we select a straightforward notion of parallel, based on lexical content only.
# We investigate two measures of similarity, one that ignores word order completely,
# and one that takes word order into account.
#
# Two kinds of shortcomings of this approach must be mentioned:
#
# 1. We will not find parallels based on non-lexical criteria (unless they are also lexical parallels)
# 1. We will find too many parallels: certain short sentences ("and he said") or formula-like passages ("and the word of God came to Moses") occur so often that they have only a subtle bearing on whether there is a common text history.
#
# For a fuller treatment of parallel passages, see
#
# **Wido Th. van Peursen and Eep Talstra**:
# Computer-Assisted Analysis of Parallel Texts in the Bible -
# The Case of 2 Kings xviii-xix and its Parallels in Isaiah and Chronicles.
# *Vetus Testamentum* 57, pp. 45-72.
# 2007, Brill, Leiden.
#
# Note that our method fails to identify any parallels with Chronica_II 32.
# Van Peursen and Talstra state about this chapter and 2 Kings 18:
#
# > These chapters differ so much, that it is sometimes impossible to establish
# which verses should be considered parallel.
#
# In this notebook we produce a set of *cliques*,
# a clique being a set of passages that are *quite* similar, based on lexical information.
#
#
# ## 0.5 Authors
# This notebook is by Dirk Roorda and owes a lot to discussions with Martijn Naaijer.
#
# [Dirk Roorda](mailto:dirk.roorda@dans.knaw.nl) wrote it while discussing ideas with
# [Martijn Naaijer](mailto:m.naaijer@vu.nl).
#
#
# ## 0.6 Status
#
# * **modified: 2017-09-28** Is now part of a pipeline for transferring data from the ETCBC to Text-Fabric.
# * **modified: 2016-03-03** Added experiments based on chapter chunks and lower similarities.
#
# 165 experiments have been carried out, of which 18 yielded promising results.
# All results can be easily inspected, just by clicking in your browser.
# One of the experiments has been chosen as the basis for
# [crossref](https://shebanq.ancient-data.org/hebrew/note?version=4b&id=Mnxjcm9zc3JlZg__&tp=txt_tb1&nget=v)
# annotations in SHEBANQ.
#
# # 1. Results
#
# Click in a green cell to see interesting results. The numbers in the cell indicate
#
# * the number of passages that have a variant elsewhere
# * the number of *cliques* they form (cliques are sets of similar passages)
# * the number of passages in the biggest clique
#
# Below the results is an account of the method that we used, followed by the actual code to produce these results.
# # Pipeline
# See [operation](https://github.com/ETCBC/pipeline/blob/master/README.md#operation)
# for how to run this script in the pipeline.
#
# The pipeline comes in action in Section [6a](#6a) below: TF features.
# # Caveat
#
# This notebook makes use of a new feature of text-fabric, first present in 2.3.15.
# Make sure to upgrade first.
#
# ```
# sudo -H pip3 install --upgrade text-fabric
# ```
# In[1]:
if 'SCRIPT' not in locals():
    # SCRIPT = False
    SCRIPT = False
    FORCE = True
    FORCE_MATRIX = False
    LANG_FEATURE = 'languageISO'
    OCC_FEATURE = 'g_cons'
    LEX_FEATURE = 'lex'
    TEXT_FEATURE = 'g_word_utf8'
    TRAILER_FEATURE = 'trailer_utf8'
    CORE_NAME = 'bhsa'
    NAME = 'parallels'
    VERSION = '4b'


def stop(good=False):
    if SCRIPT: sys.exit(0 if good else 1)
# In[2]:
# run this cell after all other cells
if False and not SCRIPT: HTML(other_exps)
# # 2. Experiments
#
# We have conducted 165 experiments, all corresponding to a specific choice of parameters.
# Every experiment is an attempt to identify variants and collect them in *cliques*.
#
# The table gives an overview of the experiments conducted.
#
# Every *row* corresponds to a particular way of chunking and a method of measuring the similarity.
#
# There are *columns* for each similarity *threshold* that we have tried.
# The idea is that chunks are similar if their similarity is above the threshold.
#
# The outcomes of one experiment have been added to SHEBANQ as the note set
# [crossref](https://shebanq.ancient-data.org/hebrew/note?version=4b&id=Mnxjcm9zc3JlZg__&tp=txt_tb1&nget=v).
# The experiment chosen for this is currently
#
# * *chunking*: **object verse**
# * *similarity method*: **SET**
# * *similarity threshold*: **65**
#
#
# ## 2.1 Assessing the outcomes
#
# Not all experiments lead to useful results.
# We have indicated the value of a result by a color coding, based on objective characteristics,
# such as the number of parallel passages, the number of cliques, the size of the greatest clique, and the way of chunking.
# These numbers are shown in the cells.
#
# ### 2.1.1 Assessment criteria
#
# If the method is based on *fixed* chunks, we deprecate the method and its results,
# because two perfectly similar verses can be missed when a 100-word window that shifts over the text aligns differently with each of them, which will usually be the case.
#
# Otherwise, we consider *ll*, the length of the longest clique, and *nc*, the number of cliques.
# We set three quality parameters:
# * `REC_CLIQUE_RATIO` = 5 : recommended clique ratio
# * `DUB_CLIQUE_RATIO` = 15 : dubious clique ratio
# * `DEP_CLIQUE_RATIO` = 25 : deprecated clique ratio
#
# where the *clique ratio* is $100\,(ll/nc)$,
# i.e. the length of the longest clique divided by the number of cliques, expressed as a percentage.
#
# An experiment is *recommended* if its clique ratio is between the recommended and dubious clique ratios.
#
# It is *dubious* if its clique ratio is between the dubious and deprecated clique ratios.
#
# It is *deprecated* if its clique ratio is above the deprecated clique ratio.
#
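# For example (with invented numbers): an experiment with $nc = 300$ cliques whose longest clique has
# $ll = 30$ members has a clique ratio of $100 \cdot 30 / 300 = 10$, which falls in the *recommended* band;
# $ll = 60$ gives 20 (*dubious*) and $ll = 120$ gives 40 (*deprecated*).
# A minimal sketch of the computation, using the thresholds defined above:
#
# ```python
# def clique_ratio(ll, nc):
#     # ll = length of the longest clique, nc = number of cliques
#     return 100 * ll / nc
#
# clique_ratio(30, 300)    # 10.0: between REC_CLIQUE_RATIO and DUB_CLIQUE_RATIO -> recommended
# clique_ratio(60, 300)    # 20.0: between DUB_CLIQUE_RATIO and DEP_CLIQUE_RATIO -> dubious
# clique_ratio(120, 300)   # 40.0: above DEP_CLIQUE_RATIO -> deprecated
# ```
#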
# ## 2.2 Inspecting results
# If you click on the hyperlink in the cell, you are taken to a page that gives you
# all the details of the results:
#
# 1. A link to a file with all *cliques* (which are the sets of similar passages)
# 1. A list of links to chapter-by-chapter diff files (for cliques with just two members), and only for
# experiments with outcomes that are labeled as *promising* or *unassessed quality* or *mixed results*.
#
# To get into the variants quickly, inspect the list (2) and click through
# to see the actual variant material in chapter context.
#
# Not all variants occur here, so continue with (1) to see the remaining cliques.
#
# Sometimes a chapter diff file in (2) does not clearly indicate the relevant common part of both chapters.
# In that case you have to consult the big list (1).
#
# All these results can be downloaded from the
# [SHEBANQ github repo](https://github.com/ETCBC/shebanq/tree/master/static/docs/tools/parallel/files).
# After downloading the whole directory, open ``experiments.html`` in your browser.
# # 3. Method
#
# Here we discuss the method we used to arrive at a list of parallel passages
# in the Masoretic Text (MT) of the Hebrew Bible.
#
# ## 3.1 Similarity
#
# We have to find passages in the MT that are *similar*.
# Therefore we *chunk* the text in some way, and then compute the similarities between pairs of chunks.
#
# There are many ways to define and compute similarity between texts.
# Here, we have tried two methods ``SET`` and ``LCS``.
# Both methods define similarity as the fraction of common material with respect to the total material.
#
# ### 3.1.1 SET
#
# The ``SET`` method reduces textual chunks to *sets* of *lexemes*.
# This method abstracts from the order and number of occurrences of words in chunks.
#
# We use as measure for the similarity of chunks $C_1$ and $C_2$ (taken as sets):
#
# $$ s_{\rm set}(C_1, C_2) = {\vert C_1 \cap C_2\vert \over \vert C_1 \cup C_2 \vert} $$
#
# where $\vert X \vert$ is the number of elements in set $X$.
#
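# As a small worked example (the two chunks are invented and already reduced to sets of lexemes,
# as described in section 4.2 below): if the chunks share 3 lexemes and their union has 5, the similarity is 60.
#
# ```python
# # minimal sketch of the SET measure on two invented chunks
# c1 = frozenset(['MLK', 'DBR', 'MR', 'BJT'])
# c2 = frozenset(['MLK', 'DBR', 'MR', 'CLX'])
# u = len(c1 | c2)                              # 5 lexemes in the union
# sim = 100 * len(c1 & c2) / u if u else 0      # 3 common lexemes: 60.0
# ```
#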
# ### 3.1.2 LCS
#
# The ``LCS`` method is less reductive: chunks are *strings* of *lexemes*,
# so the order and number of occurrences of words is retained.
#
# We use as measure for the similarity of chunks $C_1$ and $C_2$ (taken as strings):
#
# $$ s_{\rm lcs}(C_1, C_2) = {\vert {\rm LCS}(C_1,C_2)\vert \over \vert C_1\vert + \vert C_2 \vert -
# \vert {\rm LCS}(C_1,C_2)\vert} $$
#
# where ${\rm LCS}(C_1, C_2)$ is the
# [longest common subsequence](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem)
# of $C_1$ and $C_2$ and
# $\vert X\vert$ is the length of sequence $X$.
#
# It remains to be seen whether we need the extra sophistication of ``LCS``.
# The risk is that ``LCS`` could fail to spot related passages when there is a large amount of transposition going on.
# The results should have the last word.
#
# We need to compute the LCS efficiently, and for this we used the python ``Levenshtein`` module:
#
# ``pip install python-Levenshtein``
#
# whose documentation is
# [here](http://www.coli.uni-saarland.de/courses/LT1/2011/slides/Python-Levenshtein.html).
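#
# To make the formula concrete, here is a small, self-contained sketch of the LCS-based measure
# (computed over lexeme tokens of two invented chunks for readability; the notebook itself delegates
# the heavy lifting to ``Levenshtein.ratio``, which works on the character level):
#
# ```python
# def lcs_len(a, b):
#     # classic dynamic-programming computation of the longest-common-subsequence length
#     cur = [0] * (len(b) + 1)
#     for x in a:
#         prev = cur[:]
#         for (j, y) in enumerate(b):
#             cur[j + 1] = prev[j] + 1 if x == y else max(cur[j], prev[j + 1])
#     return cur[-1]
#
# c1 = 'MLK DBR MR BJT'.split()
# c2 = 'MLK MR DBR BJT'.split()
# l = lcs_len(c1, c2)                           # 3
# sim = 100 * l / (len(c1) + len(c2) - l)       # 100 * 3 / (4 + 4 - 3) = 60.0
# ```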
#
# ## 3.2 Performance
#
# Similarity computation is the part where the heavy lifting occurs.
# It is basically quadratic in the number of chunks, so if you have verses as chunks (~ 23,000),
# you need to do ~ 270,000,000 similarity computations, and if you use sentences (~ 64,000),
# you need ~ 2,000,000,000!
# The computation of a single similarity should be *really* fast.
#
# Besides that, we use two ways to economize:
#
# * after having computed a matrix for a specific set of parameter values, we save the matrix to disk;
# new runs can load the matrix from disk in a matter of seconds;
# * we do not store low similarity values in the matrix, low being < ``MATRIX_THRESHOLD``.
#
# The ``LCS`` method is more complicated.
# We have tried the ``ratio`` method from the ``difflib`` package that is present in the standard python distribution.
# This is unbearably slow for our purposes.
# The ``ratio`` method in the ``Levenshtein`` package is much quicker.
#
# See the table for an indication of the amount of work to create the similarity matrix
# and the performance per similarity method.
#
# The *matrix threshold* is the lower bound of similarities that are stored in the matrix.
# If a pair of chunks has a lower similarity, no entry will be made in the matrix.
#
# The computing has been done on a Macbook Air (11", mid 2012, 1.7 GHz Intel Core i5, 8GB RAM).
#
# |chunk type |chunk size|similarity method|matrix threshold|# of comparisons|size of matrix (KB)|computing time (min)|
# |:----------|---------:|----------------:|---------------:|---------------:|------------------:|-------------------:|
# |fixed |100 |LCS |60 | 9,003,646| 7| ? |
# |fixed |100 |SET |50 | 9,003,646| 7| ? |
# |fixed |50 |LCS |60 | 36,197,286| 37| ? |
# |fixed |50 |SET |50 | 36,197,286| 18| ? |
# |fixed |20 |LCS |60 | 227,068,705| 2,400| ? |
# |fixed |20 |SET |50 | 227,068,705| 113| ? |
# |fixed |10 |LCS |60 | 909,020,841| 59,000| ? |
# |fixed |10 |SET |50 | 909,020,841| 1,800| ? |
# |object |verse |LCS |60 | 269,410,078| 2,300| 31|
# |object |verse |SET |50 | 269,410,078| 509| 14|
# |object |half_verse|LCS |60 | 1,016,396,241| 40,000| 50|
# |object |half_verse|SET |50 | 1,016,396,241| 3,600| 41|
# |object |sentence |LCS |60 | 2,055,975,750| 212,000| 68|
# |object |sentence |SET |50 | 2,055,975,750| 82,000| 63|
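#
# For example, the number of comparisons in the *verse* rows follows directly from the number of chunks:
# with $n = 23{,}213$ verse chunks there are $n(n-1)/2 = 23{,}213 \cdot 23{,}212 / 2 = 269{,}410{,}078$ pairs,
# the figure shown in the table above.
#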
# # 4. Workflow
#
# ## 4.1 Chunking
#
# There are several ways to chunk the text:
#
# * fixed chunks of approximately ``CHUNK_SIZE`` words
# * by object, such as verse, sentence and even chapter
#
# After chunking, we prepare the chunks for similarity measuring.
#
# ### 4.1.1 Fixed chunking
# Fixed chunking is unnatural, but if the chunk size is small, it can yield fair results.
# The results are somewhat difficult to inspect, because they generally do not respect constituent boundaries.
# It is to be expected that fixed chunks in variant passages will be mutually *out of phase*,
# meaning that the chunks involved in these passages are not aligned with each other.
# So they will have a lower similarity than they could have if they were aligned.
# This is a source of artificial noise in the outcome and/or missed cases.
#
# If the chunking respects "natural" boundaries in the text, there is far less misalignment.
#
# ### 4.1.2 Object chunking
# We can also chunk by object, such as verse, half_verse or sentence.
#
# Chunking by *verse* is very much like chunking in fixed chunks of size 20, performance-wise.
#
# Chunking by *half_verse* is comparable to fixed chunks of size 10.
#
# Chunking by *sentence* will generate an enormous number of
# false positives, because there are very many very short sentences (down to one word) in the text.
# Besides that, the performance overhead is huge.
#
# The *half_verses* seem to be a very interesting candidate.
# They are smaller than verses, but there are fewer *degenerate cases* than with sentences.
# The table above shows that half verses require only half as many similarity computations as sentences.
#
#
# ## 4.2 Preparing
#
# We prepare the chunks for the application of the chosen method of similarity computation (``SET`` or ``LCS``).
#
# In both cases we reduce the text to a sequence of transliterated consonantal *lexemes* without disambiguation.
# In fact, we go one step further: we remove the consonants (aleph, waw, yod) that are often silent.
#
# For ``SET``, we represent each chunk as the set of its reduced lexemes.
#
# For ``LCS``, we represent each chunk as the string obtained by joining its reduced lexemes separated by white spaces.
#
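# For illustration, this is roughly what the reduction looks like for a few lexemes
# (``reduce_lex`` is just an illustrative name; the actual code, built on ``EXCLUDED_PAT`` and
# ``LEX_FEATURE``, follows in section 5.5):
#
# ```python
# import re
#
# EXCLUDED_PAT = re.compile(r'[>WJ=/\[]')   # aleph (>), waw (W), yod (J) and the marks = / [
#
# def reduce_lex(lex):
#     # replace ayin (<) by O first, then strip the weak consonants and homonym markers
#     return EXCLUDED_PAT.sub('', lex.replace('<', 'O'))
#
# [reduce_lex(lx) for lx in ('>MR[', 'DBR/', 'JRDN/')]   # -> ['MR', 'DBR', 'RDN']
# ```
#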
# ## 4.3 Cliques
#
# After having computed a sufficient part of the similarity matrix, we set a value for ``SIMILARITY_THRESHOLD``.
# All pairs of chunks having at least that similarity are deemed *interesting*.
#
# We organize the members of such pairs in *cliques*, groups of chunks of which each member is
# similar (*similarity* > ``SIMILARITY_THRESHOLD``) to at least one other member.
#
# We start with no cliques and walk through the pairs whose similarity is above ``SIMILARITY_THRESHOLD``,
# and try to put each member into a clique.
#
# If there is not yet a clique, we make the member in question into a new singleton clique.
#
# If there are cliques, we find the cliques that have a member similar to the member in question.
# If we find several, we merge them all into one clique.
#
# If there is no such clique, we put the member in a new singleton clique.
#
# NB: Cliques may *drift*, meaning that they contain members that are completely different from each other.
# They are in the same clique, because there is a path of pairwise similar members leading from the one chunk to the other.
#
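# A minimal sketch of this clique-growing step (not the actual implementation, which follows in
# section 5.7; it assumes ``chunk_dist``, ``passages`` and ``SIMILARITY_THRESHOLD`` as defined there):
#
# ```python
# def is_similar(i, j):
#     # the matrix stores each pair only once, with the smaller index first
#     return chunk_dist.get((i, j) if i < j else (j, i), 0) >= SIMILARITY_THRESHOLD
#
# cliques = []
# for p in passages:
#     hits = [c for c in cliques if any(is_similar(p, m) for m in c)]
#     if not hits:
#         cliques.append({p})                  # no familiar clique: start a new singleton
#     else:
#         merged = set.union({p}, *hits)       # familiar with several cliques: merge them all
#         cliques = [c for c in cliques if c not in hits] + [merged]
# ```
#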
# ### 4.3.1 Organizing the cliques
# In order to handle cases where there are many corresponding verses in corresponding chapters, we produce
# chapter-by-chapter diffs in the following way.
#
# We make a list of all chapters that are involved in cliques.
# This yields a list of chapter cliques.
# For all *binary* chapter cliques, we generate a colorful diff rendering (as HTML) of the two complete chapters.
#
# We only do this for *promising* experiments.
#
# ### 4.3.2 Evaluating clique sets
#
# Not all clique sets are equally worthwhile.
# For example, if we set the ``SIMILARITY_THRESHOLD`` too low, we might get one gigantic clique, especially
# in combination with a fine-grained chunking. In other words: we suffer from *clique drifting*.
#
# We detect clique drifting by looking at the size of the largest clique.
# If that is large compared to the total number of chunks, we deem the results unsatisfactory.
#
# On the other hand, when the ``SIMILARITY_THRESHOLD`` is too high, you might miss a lot of correspondences,
# especially when chunks are large, or when we have fixed-size chunks that are out of phase.
#
# We deem the results of experiments based on a partitioning into fixed-length chunks unsatisfactory, although it
# might be interesting to inspect what exactly the damage is.
#
# At the moment, we have not yet analyzed the relative merits of the similarity methods ``SET`` and ``LCS``.
# # 5. Implementation
#
#
# The rest is code. From here we fire up the engines and start computing.
# In[3]:
import sys, os, re, collections, pickle, math, difflib, glob
from difflib import SequenceMatcher

if not SCRIPT:
    from IPython.display import HTML, display
    import matplotlib.pyplot as plt
    get_ipython().run_line_magic('matplotlib', 'inline')

PICKLE_PROTOCOL = 3

# sudo -H pip3 install python-Levenshtein
from Levenshtein import ratio

import utils
from tf.fabric import Fabric
# # Setting up the context: source file and target directories
#
# The conversion is executed in an environment of directories, so that sources, temp files and
# results are in convenient places and do not have to be shifted around.
# In[4]:
repoBase = os.path.expanduser('~/github/etcbc')
coreRepo = '{}/{}'.format(repoBase, CORE_NAME)
thisRepo = '{}/{}'.format(repoBase, NAME)
coreTf = '{}/tf/{}'.format(coreRepo, VERSION)
allTemp = '{}/_temp'.format(thisRepo)
thisTemp = '{}/_temp/{}'.format(thisRepo, VERSION)
thisTempTf = '{}/tf'.format(thisTemp)
thisTf = '{}/tf/{}'.format(thisRepo, VERSION)
thisNotes = '{}/shebanq/{}'.format(thisRepo, VERSION)
# In[5]:
notesFile = 'crossrefNotes.csv'
if not os.path.exists(thisNotes):
    os.makedirs(thisNotes)
# # Test
#
# Check whether this conversion is needed in the first place.
# Only when run as a script.
# In[6]:
if SCRIPT:
    (good, work) = utils.mustRun(None, '{}/.tf/{}.tfx'.format(thisTf, 'crossref'), force=FORCE)
    if not good: stop(good=False)
    if not work: stop(good=True)
# ## 5.1 Loading the feature data
#
# We load the features we need from the BHSA core database.
# In[7]:
utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=coreTf, modules=[''])
# In[8]:
api = TF.load('''
otype
{} {} {}
book chapter verse number
'''.format(
LEX_FEATURE,
TEXT_FEATURE,
TRAILER_FEATURE,
))
api.makeAvailableIn(globals())
# ## 5.2 Configuration
#
# Here are the parameters on which the results crucially depend.
#
# There are also parameters that control the reporting of the results, such as file locations.
# In[9]:
# chunking
CHUNK_LABELS = {True: 'fixed', False: 'object'}
CHUNK_LBS = {True: 'F', False: 'O'}
CHUNK_SIZES = (100, 50, 20, 10)
CHUNK_OBJECTS = ('chapter', 'verse','half_verse','sentence')
# preparing
EXCLUDED_CONS = r'[>WJ=/\[]' # weed out weak consonants (aleph, waw, yod) and disambiguation marks
EXCLUDED_PAT = re.compile(EXCLUDED_CONS)
# similarity
MATRIX_THRESHOLD = 50
SIM_METHODS = ('SET', 'LCS')
SIMILARITIES = (100, 95, 90, 85, 80, 75, 70, 65, 60, 55, 50, 45, 40, 35, 30)
# printing
DEP_CLIQUE_RATIO = 25
DUB_CLIQUE_RATIO = 15
REC_CLIQUE_RATIO = 5
LARGE_CLIQUE_SIZE = 50
CLIQUES_PER_FILE = 50
# assessing results
VALUE_LABELS = dict(
mis='no results available',
rec='promising results: recommended',
dep='messy results: deprecated',
dub='mixed quality: take care',
out='method deprecated',
nor='unassessed quality: inspection needed',
lr='this experiment is the last one run',
)
# Note that TF_TABLE and LOCAL_BASE_COMP are deliberately located
# in the version-independent part of the temp directory.
# Here the results of expensive calculations are stored, to be used by all versions.
# crossrefs for TF
TF_TABLE = '{}/parallelTable.tsv'.format(allTemp)
# crossrefs for SHEBANQ
SHEBANQ_MATRIX = (False, 'verse', 'SET')
SHEBANQ_SIMILARITY = 65
SHEBANQ_TOOL = 'parallel'
CROSSREF_STATUS = '!'
CROSSREF_KEYWORD = 'crossref'
# progress indication
VERBOSE = False
MEGA = 1000000
KILO = 1000
SIMILARITY_PROGRESS = 5 * MEGA
CLIQUES_PROGRESS = 1 * KILO
# locations and hyperlinks
LOCAL_BASE_COMP = '{}/calculus'.format(allTemp)
LOCAL_BASE_OUTP = 'files'
EXPERIMENT_DIR = 'experiments'
EXPERIMENT_FILE = 'experiments'
EXPERIMENT_PATH = '{}/{}.txt'.format(LOCAL_BASE_OUTP, EXPERIMENT_FILE)
EXPERIMENT_HTML = '{}/{}.html'.format(LOCAL_BASE_OUTP, EXPERIMENT_FILE)
NOTES_FILE = 'crossref'
NOTES_PATH = '{}/{}.csv'.format(LOCAL_BASE_OUTP, NOTES_FILE)
STORED_CLIQUE_DIR = 'stored/cliques'
STORED_MATRIX_DIR = 'stored/matrices'
STORED_CHUNK_DIR = 'stored/chunks'
CHAPTER_DIR = 'chapters'
CROSSREF_DB_FILE = 'crossrefdb.csv'
CROSSREF_DB_PATH = '{}/{}'.format(LOCAL_BASE_OUTP, CROSSREF_DB_FILE)
# ## 5.3 Experiment settings
#
# For each experiment we have to adapt the configuration settings to the parameters that define the experiment.
# In[10]:
def reset_params():
    global CHUNK_FIXED, CHUNK_SIZE, CHUNK_OBJECT, CHUNK_LB, CHUNK_DESC
    global SIMILARITY_METHOD, SIMILARITY_THRESHOLD, MATRIX_THRESHOLD
    global meta

    meta = collections.OrderedDict()

    # chunking
    CHUNK_FIXED = None           # kind of chunking: fixed size or by object
    CHUNK_SIZE = None            # only relevant for CHUNK_FIXED = True
    CHUNK_OBJECT = None          # only relevant for CHUNK_FIXED = False; see CHUNK_OBJECTS in next cell
    CHUNK_LB = None              # computed from CHUNK_FIXED, CHUNK_SIZE, CHUNK_OBJ
    CHUNK_DESC = None            # computed from CHUNK_FIXED, CHUNK_SIZE, CHUNK_OBJ

    # similarity
    MATRIX_THRESHOLD = None      # minimal similarity used to fill the matrix of similarities
    SIMILARITY_METHOD = None     # see SIM_METHODS in next cell
    SIMILARITY_THRESHOLD = None  # minimal similarity used to put elements together in cliques

    meta = collections.OrderedDict()
def set_matrix_threshold(sim_m=None, chunk_o=None):
    global MATRIX_THRESHOLD

    the_sim_m = SIMILARITY_METHOD if sim_m == None else sim_m
    the_chunk_o = CHUNK_OBJECT if chunk_o == None else chunk_o
    MATRIX_THRESHOLD = 50 if the_sim_m == 'SET' else 60
    if the_sim_m == 'SET':
        if the_chunk_o == 'chapter': MATRIX_THRESHOLD = 30
        else: MATRIX_THRESHOLD = 50
    else:
        if the_chunk_o == 'chapter': MATRIX_THRESHOLD = 55
        else: MATRIX_THRESHOLD = 60
def do_params_chunk(chunk_f, chunk_i):
    global CHUNK_FIXED, CHUNK_SIZE, CHUNK_OBJECT, CHUNK_LB, CHUNK_DESC

    do_chunk = False
    if chunk_f != CHUNK_FIXED or (chunk_f and chunk_i != CHUNK_SIZE) or (not chunk_f and chunk_i != CHUNK_OBJECT):
        do_chunk = True
        CHUNK_FIXED = chunk_f
        if chunk_f: CHUNK_SIZE = chunk_i
        else: CHUNK_OBJECT = chunk_i
    CHUNK_LB = CHUNK_LBS[CHUNK_FIXED]
    CHUNK_DESC = CHUNK_SIZE if CHUNK_FIXED else CHUNK_OBJECT
    for p in (
        '{}/{}'.format(LOCAL_BASE_OUTP, EXPERIMENT_DIR),
        '{}/{}'.format(LOCAL_BASE_COMP, STORED_CHUNK_DIR),
    ):
        if not os.path.exists(p): os.makedirs(p)
    return do_chunk
def do_params(chunk_f, chunk_i, sim_m, sim_thr):
    global CHUNK_FIXED, CHUNK_SIZE, CHUNK_OBJECT, CHUNK_LB, CHUNK_DESC
    global SIMILARITY_METHOD, SIMILARITY_THRESHOLD, MATRIX_THRESHOLD
    global meta

    do_chunk = False
    do_prep = False
    do_sim = False
    do_clique = False
    meta = collections.OrderedDict()
    if chunk_f != CHUNK_FIXED or (chunk_f and chunk_i != CHUNK_SIZE) or (not chunk_f and chunk_i != CHUNK_OBJECT):
        do_chunk = True
        do_prep = True
        do_sim = True
        do_clique = True
        CHUNK_FIXED = chunk_f
        if chunk_f: CHUNK_SIZE = chunk_i
        else: CHUNK_OBJECT = chunk_i
    if sim_m != SIMILARITY_METHOD:
        do_prep = True
        do_sim = True
        do_clique = True
        SIMILARITY_METHOD = sim_m
    if sim_thr != SIMILARITY_THRESHOLD:
        do_clique = True
        SIMILARITY_THRESHOLD = sim_thr
    set_matrix_threshold()
    if SIMILARITY_THRESHOLD < MATRIX_THRESHOLD: return (False, False, False, False, True)
    CHUNK_LB = CHUNK_LBS[CHUNK_FIXED]
    CHUNK_DESC = CHUNK_SIZE if CHUNK_FIXED else CHUNK_OBJECT
    meta['CHUNK TYPE'] = 'FIXED {}'.format(CHUNK_SIZE) if CHUNK_FIXED else 'OBJECT {}'.format(CHUNK_OBJECT)
    meta['MATRIX THRESHOLD'] = MATRIX_THRESHOLD
    meta['SIMILARITY METHOD'] = SIMILARITY_METHOD
    meta['SIMILARITY THRESHOLD'] = SIMILARITY_THRESHOLD
    for p in (
        '{}/{}'.format(LOCAL_BASE_OUTP, EXPERIMENT_DIR),
        '{}/{}'.format(LOCAL_BASE_OUTP, CHAPTER_DIR),
        '{}/{}'.format(LOCAL_BASE_COMP, STORED_CLIQUE_DIR),
        '{}/{}'.format(LOCAL_BASE_COMP, STORED_MATRIX_DIR),
        '{}/{}'.format(LOCAL_BASE_COMP, STORED_CHUNK_DIR),
    ):
        if not os.path.exists(p): os.makedirs(p)
    return (do_chunk, do_prep, do_sim, do_clique, False)
reset_params()
# ## 5.4 Chunking
#
# We divide the text into chunks to be compared. The result is ``chunks``,
# which is a list of lists.
# Every chunk is a list of word nodes.
# In[11]:
def chunking(do_chunk):
    global chunks, book_rank

    if not do_chunk:
        info('CHUNKING ({} {}): already chunked into {} chunks'.format(CHUNK_LB, CHUNK_DESC, len(chunks)))
        meta['# CHUNKS'] = len(chunks)
        return
    chunk_path = '{}/{}/chunk_{}_{}'.format(
        LOCAL_BASE_COMP, STORED_CHUNK_DIR,
        CHUNK_LB, CHUNK_DESC,
    )
    if os.path.exists(chunk_path):
        with open(chunk_path, 'rb') as f: chunks = pickle.load(f)
        info('CHUNKING ({} {}): Loaded: {:>5} chunks'.format(
            CHUNK_LB, CHUNK_DESC,
            len(chunks),
        ))
    else:
        info('CHUNKING ({} {})'.format(CHUNK_LB, CHUNK_DESC))
        chunks = []
        book_rank = {}
        for b in F.otype.s('book'):
            book_name = F.book.v(b)
            book_rank[book_name] = b
            words = L.d(b, otype='word')
            nwords = len(words)
            if CHUNK_FIXED:
                # divide the words of this book into roughly CHUNK_SIZE-sized pieces,
                # distributing the remainder over the first chunks
                nchunks = nwords // CHUNK_SIZE
                if nchunks == 0:
                    nchunks = 1
                    common_incr = nwords
                    special_incr = 0
                else:
                    rem = nwords % CHUNK_SIZE
                    common_incr = rem // nchunks
                    special_incr = rem % nchunks
                word_in_chunk = -1
                cur_chunk = -1
                these_chunks = []
                for w in words:
                    word_in_chunk += 1
                    if word_in_chunk == 0 or (word_in_chunk >= CHUNK_SIZE + common_incr + (1 if cur_chunk < special_incr else 0)):
                        word_in_chunk = 0
                        these_chunks.append([])
                        cur_chunk += 1
                    these_chunks[-1].append(w)
            else:
                these_chunks = [L.d(c, otype='word') for c in L.d(b, otype=CHUNK_OBJECT)]
            chunks.extend(these_chunks)
            chunkvolume = sum(len(c) for c in these_chunks)
            if VERBOSE:
                info('CHUNKING ({} {}): {:<20s} {:>5} words; {:>5} chunks; sizes {:>5} to {:>5}; {:>5}'.format(
                    CHUNK_LB, CHUNK_DESC,
                    book_name, nwords, len(these_chunks),
                    min(len(c) for c in these_chunks),
                    max(len(c) for c in these_chunks),
                    'OK' if chunkvolume == nwords else 'ERROR',
                ))
        with open(chunk_path, 'wb') as f: pickle.dump(chunks, f, protocol=PICKLE_PROTOCOL)
        info('CHUNKING ({} {}): Made {} chunks'.format(CHUNK_LB, CHUNK_DESC, len(chunks)))
    meta['# CHUNKS'] = len(chunks)
# ## 5.5 Preparing
#
# In order to compute similarities between chunks, we have to compile each chunk into the information that really matters for the comparison. This is dependent on the chosen method of similarity computing.
#
# ### 5.5.1 Preparing for SET comparison
#
# We reduce words to their lexemes (dictionary entries) and from them we also remove the aleph, waw, and yod.
# The lexeme feature also contains characters (`/ [ =`) to disambiguate homonyms. We also remove these.
# If we end up with something empty, we skip it.
# Finally, we take the set of these reduced word lexemes, so that we effectively ignore order and multiplicity of words. In other words: the resulting similarity will be based on lexeme content.
#
# ### 5.5.2 Preparing for LCS comparison
#
# Again, we reduce words to their lexemes as for the SET preparation, and we do the same weeding of consonants and empty strings. But then we concatenate everything, separated by a space. So we preserve order and multiplicity.
# In[12]:
def preparing(do_prepare):
    global chunk_data

    if not do_prepare:
        info('PREPARING ({} {} {}): Already prepared'.format(CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD))
        return
    info('PREPARING ({} {} {})'.format(CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD))
    chunk_data = []
    if SIMILARITY_METHOD == 'SET':
        for c in chunks:
            words = (EXCLUDED_PAT.sub('', Fs(LEX_FEATURE).v(w).replace('<', 'O')) for w in c)
            clean_words = (w for w in words if w != '')
            this_data = frozenset(clean_words)
            chunk_data.append(this_data)
    else:
        for c in chunks:
            words = (EXCLUDED_PAT.sub('', Fs(LEX_FEATURE).v(w).replace('<', 'O')) for w in c)
            clean_words = (w for w in words if w != '')
            this_data = ' '.join(clean_words)
            chunk_data.append(this_data)
    info('PREPARING ({} {} {}): Done {} chunks.'.format(CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, len(chunk_data)))
# ## 5.6 Similarity computation
#
# Here we implement our two ways of similarity computation.
# Both need a massive amount of work, especially for experiments with many small chunks.
# The similarities are stored in a ``matrix``, a data structure that stores a similarity number for each pair of chunk indexes.
# Most pairs of chunks will be dissimilar. In order to save space, we do not store similarities below a certain threshold.
# We store matrices for re-use.
#
# ### 5.6.1 SET similarity
# The core is an operation on the sets associated with the chunks by the prepare step. We take the cardinality of the intersection divided by the cardinality of the union.
# Intuitively, we compute the proportion of what two chunks have in common against their total material.
#
# In case the union is empty (both chunks have yielded an empty set), we deem the chunks not to be interesting as a parallel pair, and we set the similarity to 0.
#
# ### 5.6.2 LCS similarity
# The core is the method `ratio()`, taken from the Levenshtein module.
# Remember that the preparation step yielded a space-separated string of lexemes, and these strings are compared on the basis of edit distance.
# In[13]:
def similarity_post():
    nequals = len({x for x in chunk_dist if chunk_dist[x] >= 100})
    cmin = min(chunk_dist.values()) if len(chunk_dist) else '!empty set!'
    cmax = max(chunk_dist.values()) if len(chunk_dist) else '!empty set!'
    meta['LOWEST AVAILABLE SIMILARITY'] = cmin
    meta['HIGHEST AVAILABLE SIMILARITY'] = cmax
    meta['# EQUAL COMPARISONS'] = nequals
    info('SIMILARITY ({} {} {} M>{}): similarities between {} and {}. {} are 100%'.format(
        CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD,
        cmin, cmax, nequals,
    ))


def similarity(do_sim):
    global chunk_dist

    total_chunks = len(chunks)
    total_distances = total_chunks * (total_chunks - 1) // 2
    meta['# SIMILARITY COMPARISONS'] = total_distances
    SIMILARITY_PROGRESS = total_distances // 100
    if SIMILARITY_PROGRESS >= MEGA:
        sim_unit = MEGA
        sim_lb = 'M'
    else:
        sim_unit = KILO
        sim_lb = 'K'
    if not do_sim:
        info('SIMILARITY ({} {} {} M>{}): Using {:>5} {} ({}) comparisons with {} entries in matrix'.format(
            CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD,
            total_distances // sim_unit, sim_lb, total_distances, len(chunk_dist),
        ))
        meta['# STORED SIMILARITIES'] = len(chunk_dist)
        similarity_post()
        return
    matrix_path = '{}/{}/matrix_{}_{}_{}_{}'.format(
        LOCAL_BASE_COMP, STORED_MATRIX_DIR,
        CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD,
    )
    if os.path.exists(matrix_path):
        with open(matrix_path, 'rb') as f: chunk_dist = pickle.load(f)
        info('SIMILARITY ({} {} {} M>{}): Loaded: {:>5} {} ({}) comparisons with {} entries in matrix'.format(
            CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD,
            total_distances // sim_unit, sim_lb, total_distances, len(chunk_dist),
        ))
        meta['# STORED SIMILARITIES'] = len(chunk_dist)
        similarity_post()
        return
    info('SIMILARITY ({} {} {} M>{}): Computing {:>5} {} ({}) comparisons and saving entries in matrix'.format(
        CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD,
        total_distances // sim_unit, sim_lb, total_distances
    ))
    chunk_dist = {}
    wc = 0
    wt = 0
    if SIMILARITY_METHOD == 'SET':
        # method SET: all chunks have been reduced to sets, ratio between lengths of intersection and union
        for i in range(total_chunks):
            c_i = chunk_data[i]
            for j in range(i + 1, total_chunks):
                c_j = chunk_data[j]
                u = len(c_i | c_j)
                # HERE COMES THE SIMILARITY COMPUTATION
                d = 100 * len(c_i & c_j) / u if u != 0 else 0
                # HERE WE STORE THE OUTCOME
                if d >= MATRIX_THRESHOLD:
                    chunk_dist[(i,j)] = d
                wc += 1
                wt += 1
                if wc == SIMILARITY_PROGRESS:
                    wc = 0
                    info('SIMILARITY ({} {} {} M>{}): Computed {:>5} {} comparisons and saved {} entries in matrix'.format(
                        CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD,
                        wt // sim_unit, sim_lb, len(chunk_dist),
                    ))
    elif SIMILARITY_METHOD == 'LCS':
        # method LCS: chunks are sequence aligned, ratio between length of all common parts and total length
        for i in range(total_chunks):
            c_i = chunk_data[i]
            for j in range(i + 1, total_chunks):
                c_j = chunk_data[j]
                # HERE COMES THE SIMILARITY COMPUTATION
                d = 100 * ratio(c_i, c_j)
                # HERE WE STORE THE OUTCOME
                if d >= MATRIX_THRESHOLD:
                    chunk_dist[(i,j)] = d
                wc += 1
                wt += 1
                if wc == SIMILARITY_PROGRESS:
                    wc = 0
                    info('SIMILARITY ({} {} {} M>{}): Computed {:>5} {} comparisons and saved {} entries in matrix'.format(
                        CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD,
                        wt // sim_unit, sim_lb, len(chunk_dist),
                    ))
    with open(matrix_path, 'wb') as f: pickle.dump(chunk_dist, f, protocol=PICKLE_PROTOCOL)
    info('SIMILARITY ({} {} {} M>{}): Computed {:>5} {} ({}) comparisons and saved {} entries in matrix'.format(
        CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD,
        wt // sim_unit, sim_lb, wt, len(chunk_dist),
    ))
    meta['# STORED SIMILARITIES'] = len(chunk_dist)
    similarity_post()
# ## 5.7 Cliques
#
# Based on the value for the ``SIMILARITY_THRESHOLD`` we use the similarity matrix to pick the *interesting*
# similar pairs out of it. From these pairs we lump together our cliques.
#
# Our list of experiments will select various values for ``SIMILARITY_THRESHOLD``, which will result
# in various types of clique behavior.
#
# We store computed cliques for re-use.
#
# ### 5.7.1 Selecting passages
#
# We take all pairs from the similarity matrix which are above the threshold, and add both members to a list of passages.
#
# ### 5.7.2 Growing cliques
# We inspect all passages in our set, and try to add them to the cliques we are growing.
# We start with an empty set of cliques.
# Each passage is added to a clique with which it has *enough familiarity*, otherwise it is added to a new clique.
# *Enough familiarity* means: the passage is similar to at least one member of the clique, and the similarity is at least ``SIMILARITY_THRESHOLD``.
# It is possible that a passage is thus added to more than one clique. In that case, those cliques are merged.
# This may lead to growing very large cliques if ``SIMILARITY_THRESHOLD`` is too low.
# In[14]:
def key_chunk(i):
    c = chunks[i]
    w = c[0]
    return (-len(c), L.u(w, otype='book')[0], L.u(w, otype='chapter')[0], L.u(w, otype='verse')[0])


def meta_clique_pre():
    global similars, passages

    info('CLIQUES ({} {} {} M>{} S>{}): inspecting the similarity matrix'.format(
        CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD, SIMILARITY_THRESHOLD,
    ))
    similars = {x for x in chunk_dist if chunk_dist[x] >= SIMILARITY_THRESHOLD}
    passage_set = set()
    for (i,j) in similars:
        passage_set.add(i)
        passage_set.add(j)
    passages = sorted(passage_set, key=key_chunk)
    meta['# SIMILAR COMPARISONS'] = len(similars)
    meta['# SIMILAR PASSAGES'] = len(passages)


def meta_clique_pre2():
    info('CLIQUES ({} {} {} M>{} S>{}): {} relevant similarities between {} passages'.format(
        CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, MATRIX_THRESHOLD, SIMILARITY_THRESHOLD,
        len(similars), len(passages),
    ))


def meta_clique_post():
    global l_c_l

    meta['# CLIQUES'] = len(cliques)
    scliques = collections.Counter()
    for c in cliques:
        scliques[len(c)] += 1
    l_c_l = max(scliques.keys()) if len(scliques) > 0 else 0
    totmn = 0
    totcn = 0