## Imports

In [186]:
# require lxml 4.9.1
from lxml import etree
import os
import pandas as pd
from collections import OrderedDict

## Constants

In [187]:
# CSV file holding the word-sense data read by getSenseDataDict().
WORD_SENSES = "SensesNT.csv"

# Directories the XML files are read from.
# NOTE(review): NODES_SOURCE is identical to LOWFAT_SOURCE -- confirm whether
# it should point at a separate "nodes" directory instead.
LOWFAT_SOURCE = "../../../Nestle1904/lowfat"
NODES_SOURCE = "../../../Nestle1904/lowfat"

# Directories the annotated XML files are written to.
LOWFAT_DEST = "lowfat"
NODES_DEST = "nodes"

# Fully qualified name of the xml:id attribute; despite the constant's name
# this is an attribute key (namespace + "id"), used to read each <w> node id.
NAMESPACE = "{http://www.w3.org/XML/1998/namespace}id"

# Name of the attribute that addWordSenseData() sets on each <w> element.
SENSE_ATTR = "sensenumber"

## Methods

In [None]:
def getSenseDataDict(source=None):
    """Build a lookup of sense numbers keyed by Strong number and node id.

    Args:
        source: Path or file-like object handed to ``pandas.read_csv``.
            Defaults to the module-level ``WORD_SENSES`` file.

    Returns:
        A dict of the form ``{strong: {instance: sense}}`` where all values
        are the strings read from the ``StrongNumber``, ``Instances``
        (whitespace separated) and ``SenseNumber`` columns.

    Raises:
        KeyError: If one of the three expected columns is absent.
    """
    if source is None:
        source = WORD_SENSES
    df = pd.read_csv(source, dtype=str)

    senseDict = {}
    columns = zip(df['StrongNumber'], df['SenseNumber'], df['Instances'])
    for strong, sense, instances in columns:
        # An empty 'Instances' cell is read as NaN: the sense has no
        # instances, so there is nothing to record for this row.
        if pd.isna(instances):
            continue
        # Create a key for each instance.
        sensesForStrong = senseDict.setdefault(strong, {})
        for inst in instances.split():
            sensesForStrong[inst] = sense
    return senseDict

In [None]:
# Add leading 0s when len(strong) < 4.
def addZeros(strong):
    """Return ``strong`` left-padded with '0' to a width of four characters.

    Strings that are already four or more characters long are returned
    unchanged.
    """
    # rjust (rather than zfill) pads purely on the left, exactly like
    # repeatedly prepending "0".
    return strong.rjust(4, "0")

In [None]:
def addWordSenseData(source, destination):
    """Annotate each <w> element of the source XML files with a sense number.

    Every file is parsed, each ``w`` element gets a ``SENSE_ATTR`` attribute
    (the sense number found in the sense data, or ``'TBD'`` when none is
    found), and the updated tree is written under the same file name in
    ``destination``.

    Args:
        source: Directory containing the XML files to read.
        destination: Existing directory the annotated files are written to.

    Returns:
        A dict mapping each padded Strong number that had at least one
        unmatched node to a space-separated string of those node ids
        (without the 'n' prefix and without the '0010' suffix).
    """
    senseDict = getSenseDataDict()
    missingData = {}
    # Only use the 27 manuscripts of the NT -- ignore other files.
    for filename in sorted(os.listdir(source))[:27]:

        readpath = os.path.join(source, filename)
        writepath = os.path.join(destination, filename)
        tree = etree.parse(readpath)

        for element in tree.getroot().iter('w'):
            # Don't include the id prefix 'n'.
            nodeId = element.attrib.get(NAMESPACE)[1:]
            # '0010' is at the end of nodes in Sense file.
            idPadded = nodeId + '0010'
            strongPadded = addZeros(element.attrib.get('strong'))

            senseNumber = senseDict.get(strongPadded, {}).get(idPadded)
            # Anything that is not a string (no entry, or an empty sense
            # cell read as NaN) counts as missing sense data.
            if isinstance(senseNumber, str):
                element.set(SENSE_ATTR, senseNumber)
                continue

            print(idPadded, strongPadded)
            if strongPadded in missingData:
                missingData[strongPadded] += " " + nodeId
            else:
                missingData[strongPadded] = nodeId
            element.set(SENSE_ATTR, 'TBD')

        # Write updated xml tree to file; the context manager guarantees the
        # handle is flushed and closed before the next file is processed.
        with open(writepath, 'wb') as outfile:
            tree.write(outfile)

    return missingData

In [None]:
# Count total number of word nodes in Greek NT.
def getTotalWordNodes(source):
    """Return the number of ``w`` elements in the first 27 files of ``source``.

    Files are taken in sorted name order; anything after the 27th entry is
    ignored.
    """
    total = 0
    for name in sorted(os.listdir(source))[:27]:
        document = etree.parse(os.path.join(source, name))
        # Tally the word elements of this file in one pass.
        total += sum(1 for _ in document.getroot().iter('w'))
    return total

In [None]:
# Pass in result from addWordSenseData
def printMissingNodes(data):
    """Print ``data`` as CSV lines (``strong,instances``) sorted by key.

    ``data`` is expected to map strings to strings, as returned by
    addWordSenseData.
    """
    print("strong,instances")
    for strongNumber in sorted(data):
        print(strongNumber + "," + data[strongNumber])

In [None]:
def getTotalNodes(data):
    """Return the total count of whitespace-separated ids across all values."""
    return sum(len(instances.split()) for instances in data.values())

In [None]:
# Number of instances in sense data. 
def getSenseNodeCount():
    """Count the sense-data instances whose id ends in '0010'.

    Any instance id with a different ending is printed instead of counted.
    """
    count = 0
    for instances in getSenseDataDict().values():
        for node in instances:
            if node.endswith("0010"):
                count += 1
            else:
                print(node)
    return count

## Execute Methods

In [None]:
# Annotate the files in NODES_SOURCE, write them to NODES_DEST, and keep the
# mapping of nodes for which no sense number was found.
nodes = addWordSenseData(NODES_SOURCE, NODES_DEST)

400010010010010 0976
400010010020010 1078
400010010030010 2424
400010010040010 5547
400010010050010 5207
400010010060010 1138
400010010070010 5207
400010010080010 0011
400010020010010 0011
400010020020010 1080
400010020030010 3588
400010020040010 2464
400010020060010 1161
400010020050010 2464
400010020070010 1080
400010020080010 3588
400010020090010 2384
400010020110010 1161
400010020100010 2384
400010020120010 1080
400010020130010 3588
400010020140010 2455
400010020150010 2532
400010020160010 3588
400010020170010 0080
400010020180010 0846
400010030020010 1161
400010030010010 2455
400010030030010 1080
400010030040010 3588
400010030050010 5329
400010030060010 2532
400010030070010 3588
400010030080010 2196
400010030090010 1537
400010030100010 3588
400010030110010 2283
400010030130010 1161
400010030120010 5329
400010030140010 1080
400010030150010 3588
400010030160010 2074
400010030180010 1161
400010030170010 2074
400010030190010 1080
400010030200010 3588
400010030210010 0689
4000100400200

KeyboardInterrupt: 

In [None]:
# Annotate the files in LOWFAT_SOURCE, write them to LOWFAT_DEST, and keep the
# mapping of nodes for which no sense number was found.
lowfat = addWordSenseData(LOWFAT_SOURCE, LOWFAT_DEST)

In [None]:
# Total number of node ids recorded as missing sense data in `nodes`.
print(getTotalNodes(nodes))

137779
