## Imports

In [133]:
# Requires lxml 4.9.1.
from lxml import etree
import os
import pandas as pd

## Constants

In [134]:
# CSV file of word-sense data; read by getSenseDataDict().
WORD_SENSES = "SensesNT.csv"

# Input directories containing the XML files to scan.
# NOTE(review): NODES_SOURCE is identical to LOWFAT_SOURCE (both end in
# "lowfat") -- confirm this is intended and not a copy-paste slip.
LOWFAT_SOURCE = "../../../Nestle1904/lowfat"
NODES_SOURCE = "../../../Nestle1904/lowfat"

# Presumably output directory names; nothing visible in this notebook writes
# to them yet -- TODO confirm.
LOWFAT_DEST = "lowfat"
NODES_DEST = "nodes"

# Fully qualified name of the xml:id attribute in "{namespace-uri}local-name"
# form; used to read the id of each `w` element.
NAMESPACE = "{http://www.w3.org/XML/1998/namespace}id"

## Methods

In [135]:
def getSenseDataDict(path=None):
    """Build a lookup of sense numbers from the word-sense CSV.

    The CSV must provide the columns 'StrongNumber', 'SenseNumber' and
    'Instances'. 'Instances' is a whitespace-separated list of instance ids;
    rows whose 'Instances' cell is empty are skipped.

    Args:
        path: CSV file to read. Defaults to the module-level WORD_SENSES.

    Returns:
        dict of the form {strongNumber: {instanceId: senseNumber}}, with every
        value kept as a string (the file is read with dtype=str).

    Raises:
        KeyError: if one of the required columns is absent. (Previously a
            missing 'Instances' column was swallowed and produced an empty
            dict.)
    """
    if path is None:
        path = WORD_SENSES
    df = pd.read_csv(path, dtype=str)

    senseDict = {}
    rows = zip(df['StrongNumber'], df['SenseNumber'], df['Instances'])
    for strong, sense, instances in rows:
        # With dtype=str an empty cell is read as NaN: there are no instances
        # to register for this sense.
        if pd.isna(instances):
            continue
        strongSenses = senseDict.setdefault(strong, {})
        for inst in instances.split():
            strongSenses[inst] = sense
    return senseDict

In [136]:
def addZeros(strong):
    """Left-pad `strong` with '0' characters up to a width of four.

    Strings that already have four or more characters come back unchanged.
    """
    missing = max(0, 4 - len(strong))
    return "0" * missing + strong

In [147]:
def addWordSenseData(source, destination, verbose=True):
    """Look up a sense number for every word in the first 27 files of `source`.

    For each `w` element the lookup key is built from its xml:id (without the
    leading one-character prefix) plus the suffix '0010', and from its
    `strong` attribute zero-padded to four characters. Both are looked up in
    the dictionary returned by getSenseDataDict().

    Args:
        source: directory containing the XML files to scan.
        destination: currently unused; kept so existing callers keep working.
        verbose: when True (the default), print "<id> <senseNumber>" for each
            word that has a sense entry.

    Returns:
        dict mapping each zero-padded Strong's number to a space-separated
        string of the word ids (in encounter order) that have no sense entry.
    """
    senseDict = getSenseDataDict()
    # Only the first 27 directory entries (sorted by name) are processed --
    # presumably the 27 NT books; this assumes no other file sorts before them.
    files = sorted(os.listdir(source))[:27]
    missingIds = {}

    for filename in files:
        root = etree.parse(os.path.join(source, filename)).getroot()

        for element in root.iter('w'):
            rawId = element.attrib.get(NAMESPACE)
            strong = element.attrib.get('strong')
            # A word lacking an xml:id or a Strong's number cannot be looked
            # up at all; skip it instead of failing on None.
            if rawId is None or strong is None:
                continue

            # Don't include the id prefix 'n'.
            wordId = rawId[1:]
            # '0010' is at the end of nodes in the sense file.
            idPadded = wordId + '0010'
            strongPadded = addZeros(strong)

            senses = senseDict.get(strongPadded)
            if senses is not None and idPadded in senses:
                if verbose:
                    print(wordId, senses[idPadded])
            else:
                missingIds.setdefault(strongPadded, []).append(wordId)

    # Callers expect one space-separated string of ids per Strong's number.
    return {strongNumber: " ".join(ids) for strongNumber, ids in missingIds.items()}

In [148]:
def getTotalWordNodes(source):
    """Count the `w` elements across the first 27 files found in `source`.

    Directory entries are sorted by name and only the first 27 are parsed.
    """
    total = 0
    for name in sorted(os.listdir(source))[:27]:
        document = etree.parse(os.path.join(source, name))
        total += sum(1 for _ in document.getroot().iter('w'))
    return total

# print(getTotalWordNodes(LOWFAT_SOURCE))

In [None]:
# Scan the lowfat XML files: prints each word that has a sense entry and
# returns {zero-padded Strong's number: space-separated ids of words without one}.
d = addWordSenseData(LOWFAT_SOURCE, LOWFAT_DEST)

In [None]:
from collections import OrderedDict

# Order the words without sense data by Strong's number.
a = OrderedDict(sorted(d.items()))

# Emit the result as two-column CSV: header first, then one row per number.
print('strong,instances')
for strongNumber, wordIds in a.items():
    print(",".join((strongNumber, wordIds)))

In [151]:
# Total number of word ids that have no sense entry, summed over all
# Strong's numbers.
total = sum(len(wordIds.split()) for wordIds in a.values())
print(total)

83209


In [142]:
# Number of instances in the sense data whose id ends in "0010"; any id with
# a different ending is printed instead of being counted.
total = 0
for instanceSenses in getSenseDataDict().values():
    for instanceId in instanceSenses:
        if instanceId.endswith("0010"):
            total += 1
        else:
            print(instanceId)
print(total)

60574
