Skip to content

Commit

Permalink
Script to add the idents to the metadata records.
Browse files Browse the repository at this point in the history
  • Loading branch information
thvitt committed Jan 12, 2017
1 parent 9fccaf5 commit e8b3f06
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 1 deletion.
File renamed without changes.
8 changes: 7 additions & 1 deletion sigil-tools/README.md → utils/README.md
@@ -1,6 +1,6 @@
## Metadata Tools

Features tools (well, currently one hack) to mass-maintain metadata.
Features tools (well, currently one hack) to mass-maintain, e.g., metadata.


### sigils-table.py
Expand All @@ -10,3 +10,9 @@ documents, and to write new sigils created in that table back into the
metadata.

Currently requires pandas and python3. Run with `--help` for help.

### add-gsa-numbers.py

Hack to add the GSA 'Idents' to the metadata records as additional `idno` elements.

Cf. faustedition/faust-gen-html#36.
68 changes: 68 additions & 0 deletions utils/add-gsa-numbers.py
@@ -0,0 +1,68 @@
#!/usr/bin/env python3
# coding: utf-8
# Script that reads GSA 'Ident' values from an Excel mapping and adds them to
# the metadata XML files as additional idno elements.

import pandas as pd
from lxml import etree
# Prefix-to-namespace map handed to the XPath queries in this script.
ns=dict(f='http://www.faustedition.net/ns')
import os
import sys
import logging
# Log INFO and above (so logger.debug output is suppressed), prefixing each
# message with its level and the name of the emitting function.
logging.basicConfig(level=logging.INFO,
format='%(levelname)s:%(funcName)s:%(message)s')
# When run as a script, name the logger after the script path rather than
# '__main__'; when imported, use the module name.
logger = logging.getLogger(__name__ if __name__ != '__main__' else sys.argv[0])


def read_table(mapping_filename,
               transcript_filename='../target/faust-transcripts.xml'):
    """Load the mapping sheet and resolve each signature to a document path.

    The Excel file ``mapping_filename`` is read with its ``Signatur`` column
    as the index. Every signature is then looked up in the XML file
    ``transcript_filename``: the first element that has an ``f:idno`` child
    equal to ``'GSA <signature>'`` provides its ``document`` attribute.

    Returns:
        The table with an added ``docpath`` column. The value stays ``None``
        for signatures that have no match (a warning is logged for each).

    Raises:
        KeyError: if a matching element has no ``document`` attribute.
    """
    table = pd.read_excel(mapping_filename, index_col="Signatur")
    table['docpath'] = None
    idno_xml = etree.parse(transcript_filename)

    for idno_gsa in table.index:
        # Pass the value as an XPath variable instead of splicing it into the
        # expression, so a quote character in a signature cannot break the
        # query.
        transcript_el = idno_xml.xpath("//*[f:idno = $idno]",
                                       namespaces=ns,
                                       idno='GSA %s' % idno_gsa)
        if not transcript_el:
            # logger.warn is a deprecated alias; warning() is the supported
            # spelling.
            logger.warning("No file found for signature %s", idno_gsa)
            continue
        logger.debug("Found an entry for signature %s", idno_gsa)

        docpath = transcript_el[0].attrib['document']
        # NOTE(review): with a unique index every signature is visited once,
        # so oldpath should always be None here; with duplicate signatures
        # .loc would presumably return a Series -- confirm the sheet's
        # 'Signatur' column is unique.
        oldpath = table.loc[idno_gsa, "docpath"]
        if oldpath is not None and oldpath != docpath:
            logger.warning("Overwriting %s with %s for signature %s", oldpath,
                           docpath, idno_gsa)
        table.loc[idno_gsa, "docpath"] = docpath
    return table


def write_idnos(table, rootdir='../data/xml'):
    """Insert a ``gsa_ident`` idno into the metadata file of each table row.

    For every row with a ``docpath``, the XML file ``rootdir/docpath`` is
    parsed, the first ``f:idno`` whose content equals ``'GSA <index>'`` is
    located, and a new ``f:idno`` element with ``type="gsa_ident"`` and the
    row's ``Ident`` value as text is added as its next sibling. The file is
    then rewritten in place as UTF-8 with an XML declaration.

    Rows without a ``docpath`` and files without a matching idno are skipped
    with a warning.

    Args:
        table: the table returned by ``read_table``; it must carry an
            ``Ident`` column and the ``docpath`` column.
        rootdir: directory that the ``docpath`` values are relative to.
    """
    for entry in table.itertuples():
        if entry.docpath is None:
            logger.warning("Skipping %s", entry)
            continue
        fullpath = os.path.join(rootdir, entry.docpath)
        md_xml = etree.parse(fullpath)
        # The signature is passed as an XPath variable so quote characters in
        # it cannot break the expression.
        matches = md_xml.xpath("//f:idno[. = $idno]", namespaces=ns,
                               idno='GSA %s' % entry.Index)
        if not matches:
            # Previously an IndexError aborted the whole run here, leaving
            # the remaining files unprocessed.
            logger.warning("No idno 'GSA %s' found in %s, skipping",
                           entry.Index, fullpath)
            continue
        first_idno = matches[0]
        # Remember the tail text so it can be re-assigned after the insertion
        # (kept from the original code).
        tail = first_idno.tail
        new_idno = etree.Element("{http://www.faustedition.net/ns}idno")
        new_idno.attrib['type'] = 'gsa_ident'
        new_idno.text = str(entry.Ident)
        first_idno.addnext(new_idno)
        first_idno.tail = tail
        md_xml.write(fullpath, encoding='utf-8', xml_declaration=True)


def main():
    """Command-line entry point: ``excel-file [xml-dir]``.

    Logs a usage message and exits with status 1 when the Excel file argument
    is missing. Otherwise reads the mapping table and writes the idents,
    using the second argument as the XML root directory when it is given.
    """
    args = sys.argv[1:]
    if not args:
        logger.error("Usage: %s excel-file [xml-dir]", sys.argv[0])
        sys.exit(1)

    table = read_table(args[0])
    # args[1:2] holds the optional xml-dir (or nothing), so write_idnos falls
    # back to its own default when the argument is absent.
    write_idnos(table, *args[1:2])


if __name__ == '__main__':
    main()
File renamed without changes.

0 comments on commit e8b3f06

Please sign in to comment.