## Imports

In [199]:
# require lxml 4.9.1
from lxml import etree
import os
import pandas as pd
from collections import OrderedDict

## Constants

In [200]:
# CSV of word-sense data; read by getSenseDataDict using the columns
# 'StrongNumber', 'SenseNumber' and 'Instances'.
WORD_SENSES = "SensesNT.csv"

# Input directories holding the XML files to annotate.
# NOTE(review): NODES_SOURCE is identical to LOWFAT_SOURCE, so both runs read
# the same files -- confirm whether NODES_SOURCE should point elsewhere.
LOWFAT_SOURCE = "../../../Nestle1904/lowfat"
NODES_SOURCE = "../../../Nestle1904/lowfat"

# Output directories (relative to the notebook) for the annotated XML files.
LOWFAT_DEST = "lowfat"
NODES_DEST = "nodes"

# Namespace-qualified name of the xml:id attribute, used to read each
# word element's id.
NAMESPACE = "{http://www.w3.org/XML/1998/namespace}id"

# Name of the attribute written onto every 'w' element.
SENSE_ATTR = "sense"

## Methods

In [201]:
def getSenseDataDict(csvSource=None):
    """Load the word-sense CSV into a nested lookup dictionary.

    Parameters
    ----------
    csvSource : str, path-like or file-like, optional
        CSV to read; anything ``pd.read_csv`` accepts. Defaults to the
        module-level ``WORD_SENSES`` file.

    Returns
    -------
    dict
        ``{StrongNumber: {instance: SenseNumber}}``, with every value kept as
        a string (the CSV is read with ``dtype=str`` so leading zeros survive).
        Rows whose 'Instances' cell is empty contribute nothing.

    Raises
    ------
    KeyError
        If the 'StrongNumber', 'SenseNumber' or 'Instances' column is absent.
    """
    if csvSource is None:
        csvSource = WORD_SENSES
    df = pd.read_csv(csvSource, dtype=str)

    senseDict = {}
    rows = zip(df['StrongNumber'], df['SenseNumber'], df['Instances'])
    for strong, sense, instances in rows:
        # An empty 'Instances' cell is read as a missing value; skip the row
        # explicitly rather than swallowing every exception.
        if pd.isna(instances):
            continue
        instanceSenses = senseDict.setdefault(strong, {})
        # Create a key for each whitespace-separated instance.
        for inst in instances.split():
            instanceSenses[inst] = sense
    return senseDict

In [202]:
# Add leading 0s when len(strong) < width (4 by default).
def addZeros(strong, width=4):
    """Left-pad ``strong`` with '0' characters up to ``width`` characters.

    Parameters
    ----------
    strong : str
        The Strong's number as text.
    width : int, optional
        Minimum length of the result (default 4).

    Returns
    -------
    str
        ``strong`` padded on the left with zeros; returned unchanged when it
        is already at least ``width`` characters long.
    """
    return strong.rjust(width, "0")

In [211]:
def addWordSenseData(source, destination, maxFiles=27):
    """Annotate every 'w' element with a sense attribute and write the files out.

    Each XML file in ``source`` is parsed, every 'w' element receives a
    ``SENSE_ATTR`` attribute (the sense number from the sense data, or 'TBD'
    when no entry exists), and the tree is written under the same filename in
    ``destination``.

    Every 'w' element is expected to carry both an xml:id and a 'strong'
    attribute; a missing one raises while the id or Strong's number is built.

    Parameters
    ----------
    source : str
        Directory containing the XML files to read.
    destination : str
        Directory the annotated files are written to; created if missing.
    maxFiles : int or None, optional
        Only the first ``maxFiles`` filenames (sorted) are processed.
        Defaults to 27 -- the 27 books of the NT -- so other files are ignored.

    Returns
    -------
    dict
        Maps each zero-padded Strong's number to a space-separated string of
        the word ids for which no sense was found.
    """
    senseDict = getSenseDataDict()
    missingIds = {}

    if destination:
        os.makedirs(destination, exist_ok=True)

    for filename in sorted(os.listdir(source))[:maxFiles]:
        readpath = os.path.join(source, filename)
        writepath = os.path.join(destination, filename)
        tree = etree.parse(readpath)

        for element in tree.getroot().iter('w'):
            # Don't include the id prefix 'n'.
            nodeId = element.attrib.get(NAMESPACE)[1:]
            # '0010' is at the end of nodes in Sense file.
            idPadded = nodeId + '0010'
            strongPadded = addZeros(element.attrib.get('strong'))

            senseNumber = senseDict.get(strongPadded, {}).get(idPadded)
            # No entry (or a non-text sense value) counts as missing data.
            if not isinstance(senseNumber, str):
                missingIds.setdefault(strongPadded, []).append(nodeId)
                senseNumber = 'TBD'
            element.set(SENSE_ATTR, senseNumber)

        # Write updated xml tree to file, closing the handle afterwards.
        with open(writepath, 'wb') as outfile:
            tree.write(outfile)

    return {strong: " ".join(ids) for strong, ids in missingIds.items()}

In [212]:
# Count total number of word nodes in Greek NT.
def getTotalWordNodes(source):
    """Return how many 'w' elements occur in the first 27 files of ``source``.

    Filenames are taken in sorted order and each file is parsed as XML.
    """
    total = 0
    for name in sorted(os.listdir(source))[:27]:
        root = etree.parse(os.path.join(source, name)).getroot()
        total += sum(1 for _ in root.iter('w'))
    return total

In [213]:
# Pass in result from addWordSenseData
def printMissingNodes(data):
    """Print ``data`` as CSV lines, sorted by key, under a header line.

    ``data`` maps a Strong's number (str) to a string of instances.
    """
    header = ('strong', 'instances')
    print(",".join(header))
    for strong, instances in sorted(data.items()):
        print(",".join((strong, instances)))

In [214]:
def getTotalNodes(data):
    """Return the total count of whitespace-separated entries over all values of ``data``."""
    return sum(len(instances.split()) for instances in data.values())

In [215]:
# Number of instances in sense data.
def getSenseNodeCount():
    """Count the sense-data instances whose id ends in "0010".

    Instances with any other ending are printed and left out of the count.
    """
    matched = 0
    for instanceSenses in getSenseDataDict().values():
        for nodeId in instanceSenses:
            if nodeId.endswith("0010"):
                matched += 1
            else:
                print(nodeId)
    return matched

## Execute Methods

In [216]:
# Annotate the files under NODES_SOURCE and write them to NODES_DEST; `nodes`
# maps each padded Strong's number to the word ids that had no sense entry.
nodes = addWordSenseData(NODES_SOURCE, NODES_DEST)

In [217]:
# Annotate the files under LOWFAT_SOURCE and write them to LOWFAT_DEST; `lowfat`
# maps each padded Strong's number to the word ids that had no sense entry.
lowfat = addWordSenseData(LOWFAT_SOURCE, LOWFAT_DEST)

In [218]:
# Total number of word ids, across all Strong's numbers, that had no sense
# entry and were therefore marked 'TBD'.
print(getTotalNodes(nodes))

83209
