Skip to content

Commit

Permalink
(Excel) table to consumable XML conversion script
Browse files Browse the repository at this point in the history
  • Loading branch information
thvitt committed Sep 15, 2021
1 parent 2fc56f2 commit ce212b9
Show file tree
Hide file tree
Showing 2 changed files with 232 additions and 0 deletions.
12 changes: 12 additions & 0 deletions utils/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,15 @@ Cf. faustedition/faust-gen-html#36.
## detect_pages.py

Tries to find the bounding box of the page in each facsimile image and writes the boxes to a JSON file. Requires scikit-image; see `--help` for usage.

## table2xml.py

Configuration-driven tool to convert tables (as in Excel) to an XML consumable by XSLTs.

Run `table2xml.py -o foo.xml foo.xlsx` to convert the table in foo.xlsx to a standard XML representation. If you wish to customize the transformation:

1. Generate a configuration file by calling `table2xml.py -C foo-config.yml foo.xlsx`
2. Edit `foo-config.yml` to your liking. It is a documented YAML file.
3. Run `table2xml.py -c foo-config.yml -o foo.xml foo.xlsx` to create the customized XML file.

You can use `-c` and `-C` at the same time to adjust an existing configuration to new table headers.
220 changes: 220 additions & 0 deletions utils/table2xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
#!/usr/bin/env python3

import argparse
from email.policy import default
import re
from ast import arg
from dataclasses import dataclass
from io import StringIO
from os import fspath
from pathlib import Path
from typing import Iterable, Optional

import pandas as pd
from lxml import etree, objectify
from lxml.builder import E, ElementMaker
from ruamel.yaml import YAML

# Program description; getargparser() passes it to argparse as the
# ``description`` of the command-line interface, so it is user-facing text.
DOC = """
An opinionated and configurable converter from tables (Excel, csv) to XML input.
The script will parse a table (using pandas) and create an XML document that
contains <record> element for each table row which will contain a <value>
element for each non-nan value of that row. Many aspects are configurable: use
``table2xml.py -C config.yml input.xlsx`` to create a starting point config
file for the given table, modify config.yml and use
``table2xml.py -c config.yml -o doc.xml input.xlsx`` to generate an XML file
according to this configuration file.
"""

# Built-in configuration as YAML text. Converter.__init__ parses it as the
# starting configuration, and the comments inside the string document the
# options for anyone editing a configuration file.
#
# The entries below ``default:`` must be indented: they form the nested mapping
# that the converter reads as config["default"]. Without the indentation
# ``default`` would be null and element/key/skipna/skipre would become
# unrelated top-level keys.
DEFAULT_CONFIG = """
# Namespace of the XML document. "" = no namespace
namespace: ""
# Name of the root element
root: table
# Name of the element for each record
row: record
# If true, transpose the table, i.e. one record is one column instead of one row
transpose: false
# Default column handling. The default generates <value key="column header">cell contents</value>
default:
  element: value
  key: key
  skipna: true
  skipre: false
# Each column specification can have the following keys:
#
# header: the name of the table header. Required except for the default (see below)
# element: Name of the element to generate. If this is missing, fall back to the default and ignore key and value.
# key: If present, an attribute that will receive the key (i.e. the column header name)
# value: If present, an attribute that will receive the value. If missing, the value will be the element’s text content
# skip: If present and true, this column will never be included with the conversion.
# skipna: If present and true, missing (nan) values will be skipped in the xml
# skipre: If present and a non-empty regular expression, the cell will be skipped if it matches the expression
# Configuration for each column. If a column is not in this list, or if a column’s entry only has the header key,
# it will be handled according to the default column specification above.
columns: []
"""


def getargparser() -> argparse.ArgumentParser:
    """
    Build the command-line parser.

    The parser takes one positional ``table`` path plus three optional file
    arguments; argparse opens each of those in binary mode while parsing.
    """
    parser = argparse.ArgumentParser(description=DOC)
    parser.add_argument("table", type=Path, help="Table to read")

    # (short flag, long flag, open mode, help text) for every file option
    file_options = (
        ("-C", "--write-config", "wb", "Write a configuration file"),
        ("-c", "--config", "rb", "Read the configuration file"),
        ("-o", "--output", "wb", "Convert to XML and write file"),
    )
    for short_flag, long_flag, mode, help_text in file_options:
        parser.add_argument(
            short_flag,
            long_flag,
            type=argparse.FileType(mode),
            help=help_text,
        )
    return parser


def read_table(path: Path) -> pd.DataFrame:
    """
    Read the table at *path* into a DataFrame, choosing the reader by suffix.

    ``.xlsx`` / ``.xls`` files are read with ``pandas.read_excel``, ``.csv``
    files with ``pandas.read_csv``, and everything else with
    ``pandas.read_table``. The suffix is compared case-insensitively so that
    names such as ``DATA.XLSX`` or ``export.CSV`` are not silently handed to
    the fallback reader.
    """
    suffix = path.suffix.lower()
    filename = fspath(path)
    if suffix in {".xlsx", ".xls"}:
        return pd.read_excel(filename)
    if suffix == ".csv":
        return pd.read_csv(filename)
    return pd.read_table(filename)


def without_keys(dictionary: dict, keys: Iterable) -> dict:
    """Return a shallow copy of *dictionary* with every key in *keys* removed.

    Keys that are not present are ignored; the input dictionary is not modified.
    """
    remaining = dict(dictionary)
    for unwanted in keys:
        remaining.pop(unwanted, None)
    return remaining


class Converter:
    """
    Convert a pandas table to an XML tree according to a YAML configuration.

    The configuration starts as DEFAULT_CONFIG and can be overridden with
    load_config(). Column headers are always handled as strings: add_columns()
    stores ``str(label)`` in the configuration, and table2xml() looks column
    specs up by ``str(label)`` as well.
    """

    def __init__(self) -> None:
        yaml = YAML()
        self.config = yaml.load(StringIO(DEFAULT_CONFIG))
        # lazily built {header: spec} cache; reset whenever the config changes
        self._columns = None

    def load_config(self, config_file):
        """
        Update the configuration from the given open YAML file and close it.

        Top-level keys in the file replace the corresponding current entries.
        """
        yaml = YAML()
        with config_file:
            loaded = yaml.load(config_file)
        # An empty document loads as None; keep the current config in that case
        # instead of failing in update().
        if loaded is not None:
            self.config.update(loaded)
        self._columns = None

    def save_config(self, config_file):
        """Dump the current configuration to the given open file and close it."""
        yaml = YAML()
        with config_file:
            yaml.dump(self.config, config_file)

    def add_columns(self, columns: Iterable):
        """
        Add a ``{"header": ...}`` entry for every header not yet configured.

        Headers are converted to ``str`` first. Existing entries are left
        untouched, so this can be called repeatedly.
        """
        specs = self.config.get("columns")
        if specs is None:
            # ``columns:`` without a value (or no such key) in a user config
            specs = self.config["columns"] = []
        known = set(self.columns)
        added = False
        for header in map(str, columns):
            if header not in known:
                specs.append({"header": header})
                known.add(header)
                added = True
        if added:
            self._columns = None

    @property
    def columns(self):
        """
        The columns cache. A dictionary {header: spec}, where spec is a columns
        spec dictionary with values from the config defaults filled in.
        """
        if self._columns is None:
            self._columns = self._get_columns_cache()
        return self._columns

    def _get_columns_cache(self):
        """Build the {header: full spec} dictionary from the configuration."""
        columns = {}
        for col in self.config.get("columns") or []:
            colspec = dict(col)
            # a colspec w/o header doesn't really make sense, but if it has an
            # element name, we expect a like-named column.
            if "header" not in colspec:
                if "element" in colspec:
                    colspec["header"] = colspec["element"]
                else:
                    continue  # skip broken config
            # YAML may parse a header such as ``2021`` as a number; headers are
            # matched as strings everywhere else, so normalize here.
            colspec["header"] = str(colspec["header"])

            # copy the defaults and update them with the given config.
            if "element" in colspec:
                # if an element is specified, key and value will not be taken
                # from the default config
                spec = without_keys(self.config["default"], ["key", "value", "skip"])
            else:
                spec = dict(self.config["default"])
            spec.update(colspec)

            # now make the full spec accessible via header
            columns[colspec["header"]] = spec
        return columns

    def table2xml(self, table: pd.DataFrame):
        """
        Convert *table* to XML and return the root element.

        One row element is generated per record, containing one child element
        per cell that is not skipped by its column spec (``skip``, ``skipna``,
        ``skipre``). The element tree is also stored in ``self.xml``.
        """
        if self.config.get("transpose", False):
            table = table.T
        self.add_columns(table.columns)  # just in case something is missing

        namespace = self.config["namespace"]
        if namespace:
            maker = ElementMaker(namespace=namespace, nsmap={None: namespace})
        else:
            maker = ElementMaker()

        root_el = maker(self.config["root"])
        for row in table.to_dict(orient="records"):
            row_el = maker(self.config["row"])
            for key, value in row.items():
                # Column labels need not be strings (e.g. the integer labels
                # of a transposed table), but specs are keyed by str(label)
                # and XML attribute values must be strings.
                header = str(key)
                spec = self.columns[header]
                if spec.get("skip", False):
                    continue
                if spec.get("skipna", False) and pd.isna(value):
                    continue
                skipre = spec.get("skipre", False)
                if skipre and re.match(skipre, str(value)):
                    continue
                item_el = maker(spec["element"])
                if spec.get("key"):
                    item_el.set(spec["key"], header)
                if spec.get("value"):
                    # cell values are often numbers; attributes must be text
                    item_el.set(spec["value"], str(value))
                else:
                    item_el.text = str(value)
                row_el.append(item_el)
            root_el.append(row_el)
        self.xml = root_el.getroottree()
        return root_el


def _main():
    """
    Command-line entry point.

    Optionally loads a configuration, reads the table, registers the table's
    headers in the configuration, and then writes the configuration and/or
    the XML document, depending on the options given.
    """
    options = getargparser().parse_args()
    converter = Converter()
    if options.config:
        converter.load_config(options.config)
    table = read_table(options.table)
    # With ``transpose`` the records are the table's columns, so the headers
    # to configure are the index labels.
    if converter.config.get("transpose"):
        converter.add_columns(table.index)
    else:
        converter.add_columns(table.columns)
    if options.write_config:
        converter.save_config(options.write_config)
    if options.output:
        # argparse opened the stream; close it here so the document is flushed
        # deterministically, like the config streams in load/save_config.
        with options.output as output:
            et = converter.table2xml(table).getroottree()
            et.write(output, encoding="utf-8", pretty_print=True)


if __name__ == "__main__":
    _main()

0 comments on commit ce212b9

Please sign in to comment.