<a href="https://colab.research.google.com/github/Clear-Bible/Alignments/blob/main/notebooks/Alignments.ipynb" target="_parent">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Bible Alignments

This notebook shows a simple example of loading the alignment data and displaying it for Mark 1:1-2.

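It assumes you have a local copy of the [Clear-Bible/Alignments](https://github.com/Clear-Bible/Alignments) repository, because the data files are loaded from its `data` directory.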

In [1]:
%%capture
# quietly install the code and dependencies
# comment out the capture line for debugging information
!pip install "bible-alignments>=0.3"

In [2]:
import os
from pathlib import Path

import pandas as pd
# Don't truncate cell contents or hide columns when displaying DataFrames.
# Option names are spelled out in full: pandas resolves shorthand such as
# "max_colwidth" by pattern matching, which can break if a similarly named
# option is added in a later release.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

from bible_alignments import config, grapecity

# Root of the data files, located under the package's configured ROOT.
DATAPATH = config.ROOT / "data"
print(f"DATAPATH = {DATAPATH}")

# Point each of the package's path settings at its subdirectory of DATAPATH:
# config.ALIGNMENTS -> data/alignments, config.SOURCES -> data/sources, etc.
for _subdir in ("alignments", "sources", "targets", "names"):
    setattr(config, _subdir.upper(), DATAPATH / _subdir)


DATAPATH = /Users/sboisen/git/Clear-Bible/Alignments/data


In [3]:
# Print the source-side tokens of Mark 1:1, one row per token.
# The source text itself is omitted because of copyright, but the lemmas
# indicate which words are meant.
# The configuration selects Young's Literal Translation as the target so that
# target text can be included later on.
from bible_alignments import gcsource

cfg = config.Configuration(
    sourceid="NA27",
    targetid="YLT",
    targetlanguage="eng",
    processid="manual",
)
sourcerd = gcsource.Reader(cfg)

_fields: tuple = ("identifier", "text", "lemma", "pos", "morph", "gloss")
# header row first, then every token whose identifier begins with the Mark 1:1 prefix
print("\t     ".join(_fields))
for tokenid in sourcerd.keys():
    if not tokenid.startswith("41001001"):
        continue
    token = sourcerd[tokenid]
    columns = [f"{getattr(token, fieldname):10}" for fieldname in _fields]
    print("\t     ".join(columns))

identifier	     text	     lemma	     pos	     morph	     gloss
410010010011	     --        	     ἀρχή      	     noun      	     n- -nsf-  	     [the] beginning
410010010021	     --        	     ὁ         	     det       	     ra -gsn-  	     of the    
410010010031	     --        	     εὐαγγέλιον	     noun      	     n- -gsn-  	     good news 
410010010041	     --        	     Ἰησοῦς    	     Name      	     nr -gsm-  	     of Jesus  
410010010051	     --        	     Χριστός   	     Name      	     nr -gsm-  	     Christ    
410010010061	     --        	     υἱός      	     noun      	     n- -gsm-  	     [the] son 
410010010071	     --        	     θεός      	     noun      	     n- -gsm-  	     of God.   


In [4]:
# Print the target-side tokens of Mark 1:1 in the same tabular layout.
# Caveat: the isPunc attribute is not correct for several tokens.
from bible_alignments import gctarget

targetrd = gctarget.Reader(cfg)

_fields: tuple = ("identifier", "text", "transType", "isPunc", "isPrimary")
print("\t     ".join(_fields))
# keep only the identifiers that begin with the Mark 1:1 prefix
for tokenid in (key for key in targetrd.keys() if key.startswith("41001001")):
    token = targetrd[tokenid]
    row = "\t     ".join(f"{getattr(token, name):10}" for name in _fields)
    print(row)

identifier	     text	     transType	     isPunc	     isPrimary
41001001001	     A         	     m         	              0	              0
41001001002	     beginning 	     k         	              0	              1
41001001003	     of        	     m         	              0	              0
41001001004	     the       	     k         	              0	              1
41001001005	     good      	     k         	              0	              1
41001001006	     news      	     k         	              0	              0
41001001007	     of        	     m         	              0	              0
41001001008	     Jesus     	     k         	              0	              1
41001001009	     Christ    	     k         	              0	              1
41001001010	     ,         	               	              0	              0
41001001011	     Son       	     k         	              0	              1
41001001012	     of        	     m         	              0	              0
41001001013	     God     

In [5]:
# Build the alignment reader from the same configuration (cfg) used for the
# source and target readers; the cells below query and display it through `rd`.
rd = grapecity.Reader(cfg)

## Displaying Alignment Data

In [6]:
# show an overview of the loaded alignment data
# note there are roughly 60% more English tokens than Greek ones: many of these are likely to be punctuation.
rd.display()

Source:	NA27	(138013 words)
Target:	YLT	(223879 words)
Process:	manual
127902 alignments


In [7]:
# Show every alignment group in Mark 1:1, i.e. each key that begins
# with the verse prefix "41001001".
for groupid in filter(lambda key: key.startswith("41001001"), rd.keys()):
    rd[groupid].display()

41001001.1: ['ἀρχή']	['beginning', 'A']
41001001.2: ['ὁ']	['the']
41001001.3: ['εὐαγγέλιον']	['good', 'of', 'news']
41001001.4: ['Ἰησοῦς']	['Jesus', 'of']
41001001.5: ['Χριστός']	['Christ']
41001001.6: ['υἱός']	['Son']
41001001.7: ['θεός']	['God', 'of']


In [8]:
# Show every alignment group in Mark 1:2, i.e. each key that begins
# with the verse prefix "41001002".
for groupid in rd.keys():
    if not groupid.startswith("41001002"):
        continue
    group = rd[groupid]
    group.display()

41001002.1: ['καθώς']	['As']
41001002.2: ['γράφω']	['written', 'it', 'hath', 'been']
41001002.3: ['ἐν']	['in']
41001002.4: ['ὁ']	['the']
41001002.5: ['προφήτης']	['prophets']
41001002.6: ['ὁράω']	['Lo']
41001002.7: ['ἀποστέλλω']	['send', 'I']
41001002.8: ['ἄγγελος']	['messenger']
41001002.9: ['ἐγώ']	['My']
41001002.10: ['πρό']	['before']
41001002.11: ['πρόσωπον']	['face']
41001002.12: ['σύ']	['thy']
41001002.13: ['ὅς']	['who']
41001002.14: ['κατασκευάζω']	['prepare', 'shall']
41001002.15: ['ὁδός']	['way']
41001002.16: ['σύ']	['thy']


In [9]:
# Look up the alignment groups whose lemma is 'προφήτης' (prophet),
# keep the first 20, and display each one.
prophet_groups = rd.source_concordance("προφήτης")
for group in prophet_groups[:20]:
    group.display()

40001022.12: ['προφήτης']	['prophet']
40002005.13: ['προφήτης']	['prophet']
40002015.15: ['προφήτης']	['prophet']
40002017.8: ['προφήτης']	['prophet']
40002023.13: ['προφήτης']	['prophets']
40003003.9: ['προφήτης']	['prophet']
40004014.7: ['προφήτης']	['prophet']
40005012.15: ['προφήτης']	['prophets']
40005017.10: ['προφήτης']	['prophets']
40007012.20: ['προφήτης']	['prophets']
40008017.7: ['προφήτης']	['prophet']
40010041.3: ['προφήτης']	['prophet', 'a']
40010041.6: ['προφήτης']	['prophet', 'of', 'a']
40010041.8: ['προφήτης']	['prophet’s', 'a']
40011009.5: ['προφήτης']	['prophet', 'a']
40011009.11: ['προφήτης']	['prophet', 'a']
40011013.4: ['προφήτης']	['prophets']
40012017.7: ['προφήτης']	['prophet']
40012039.21: ['προφήτης']	['prophet']
40013017.7: ['προφήτης']	['prophets']


In [11]:
# Show an alignment matrix for MRK 1:1 (verse id "41001001"), displaying Greek lemmas
# -G- indicates gold standard alignments
#
# Notes:
# - The following applies to MRK 1:2 (verse id "41001002"), not the verse shown here:
#   YLT has a different textual basis, so doesn't include any alignable text for "in Isaiah"
rd.dataframe("41001001")

Unnamed: 0,A,beginning,of,the,good,news,of.1,Jesus,Christ,",",Son,of.2,God,.
ἀρχή,-G-,-G-,,,,,,,,,,,,
ὁ,,,,-G-,,,,,,,,,,
εὐαγγέλιον,,,-G-,,-G-,-G-,,,,,,,,
Ἰησοῦς,,,,,,,-G-,-G-,,,,,,
Χριστός,,,,,,,,,-G-,,,,,
υἱός,,,,,,,,,,,-G-,,,
θεός,,,,,,,,,,,,-G-,-G-,
