(Excel) table to consumable XML conversion script

faustedition · Sep 15, 2021 · ce212b9 · ce212b9
1 parent 2fc56f2
commit ce212b9
Show file tree

Hide file tree

Showing 2 changed files with 232 additions and 0 deletions.
diff --git a/utils/README.md b/utils/README.md
@@ -20,3 +20,15 @@ Cf. faustedition/faust-gen-html#36.
 ## detect_pages.py
 
 Tries to find the bounding box of the page in the facsimile images and writes them to a json file. Requires scikit-image, use `--help`.
+
+## table2xml.py
+
+Configuration-driven tool to convert tables (as in Excel) to an XML consumable by XSLTs.
+
+Run `table2xml.py -o foo.xml foo.xlsx` to convert the table in foo.xlsx to a standard XML representation. If you wish to customize the transformation,
+
+1. Generate a configuration file by calling `table2xml.py -C foo-config.yml foo.xlsx`
+2. Edit `foo-config.yml` to your liking. It is a documented YAML file.
+3. Run `table2xml.py -c foo-config.yml -o foo.xml foo.xlsx` to create the customized XML file.
+
+You can use `-c` and `-C` at the same time to adjust an existing configuration to new table headers.
diff --git a/utils/table2xml.py b/utils/table2xml.py
@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+
+import argparse
+from email.policy import default
+import re
+from ast import arg
+from dataclasses import dataclass
+from io import StringIO
+from os import fspath
+from pathlib import Path
+from typing import Iterable, Optional
+
+import pandas as pd
+from lxml import etree, objectify
+from lxml.builder import E, ElementMaker
+from ruamel.yaml import YAML
+
+DOC = """
+An opinionated and configurable converter from tables (Excel, csv) to XML input.
+
+The script will parse a table (using pandas) and create an XML document that
+contains <record> element for each table row which will contain a <value>
+element for each non-nan value of that row. Many aspects are configurable: use
+``table2xml.py -C config.yml input.xlsx`` to create a starting point config
+file for the given table, modify config.yml and use 
+``table2xml.py -c config.yml -o doc.xml input.xlsx`` to generate an XML file
+according to this configuration file.
+"""
+
+DEFAULT_CONFIG = """
+# Namespace of the XML document. "" = no namespace
+namespace: ""
+# Name of the root element
+root: table
+# Name of the element for reach record
+row: record
+# If true, transpose the table, i.e. one record is one column instead of one row
+transpose: false
+
+# Default column handling. The default generates <value key="column header">cell contents</value>
+default:
+  element: value 
+  key: key
+  skipna: true
+  skipre: false
+
+# Each column specification can have the following keys:
+#
+# header: the name of the table header. Required except for the default (see below)
+# element: Name of the element to generate. If this is missing, fall back to the default and ignore key and value.
+# key: If present, an attribute that will receive the key (i.e. the column header name)
+# value: If present, an attribute that will receive the value. If missing, the value will be the element’s text content
+# skip: If present and true, this column will never be included with the conversion.
+# skipna: If present and true, missing (nan) values will be skipped in the xml
+# skipre: If present and a non-empty regular expression, the cell will be skipped if it matches the expression
+
+# Configuration for each column. If a column is not in this list, or if a column’s entry only has the header key,
+# it will be handled according to the default column specification above.
+columns: []
+"""
+
+
+def getargparser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(description=DOC)
+    p.add_argument("table", type=Path, help="Table to read")
+    p.add_argument(
+        "-C",
+        "--write-config",
+        type=argparse.FileType("wb"),
+        help="Write a configuration file",
+    )
+    p.add_argument(
+        "-c",
+        "--config",
+        type=argparse.FileType("rb"),
+        help="Read the configuration file",
+    )
+    p.add_argument(
+        "-o",
+        "--output",
+        type=argparse.FileType("wb"),
+        help="Convert to XML and write file",
+    )
+    return p
+
+
+def read_table(path: Path) -> pd.DataFrame:
+    if path.suffix in {".xlsx", ".xls"}:
+        return pd.read_excel(fspath(path))
+    if path.suffix == ".csv":
+        return pd.read_csv(fspath(path))
+    return pd.read_table(fspath(path))
+
+
+def without_keys(dictionary: dict, keys: Iterable) -> dict:
+    """Returns a copy of the given dictionary without the given keys."""
+    result = dict(dictionary)
+    for key in keys:
+        if key in result:
+            del result[key]
+    return result
+
+
+class Converter:
+    def __init__(self) -> None:
+        yaml = YAML()
+        self.config = yaml.load(StringIO(DEFAULT_CONFIG))
+        self._columns = None
+
+    def load_config(self, config_file):
+        yaml = YAML()
+        with config_file:
+            self.config.update(yaml.load(config_file))
+        self._columns = None
+
+    def save_config(self, config_file):
+        yaml = YAML()
+        with config_file:
+            yaml.dump(self.config, config_file)
+
+    def add_columns(self, columns: Iterable):
+        """Add column specs from the given list to the spec"""
+        for header in map(str, columns):
+            if header not in self.columns:
+                self.config["columns"].append({"header": header})
+        self._columns = None
+
+    @property
+    def columns(self):
+        """
+        The columns cache. A dictionary {header: spec}, where spec is a columns
+        spec dictionary with values from the config defaults filled in.
+        """
+        if self._columns is None:
+            self._columns = self._get_columns_cache()
+        return self._columns
+
+    def _get_columns_cache(self):
+        columns = {}
+        for col in self.config["columns"] or []:
+            colspec = dict(col)
+            # a colspec w/o header doesn’t really make sense, but if it has an element name, we expect a like-named column.
+            if "header" not in colspec:
+                if "element" in colspec:
+                    colspec["header"] = colspec["element"]
+                else:
+                    continue  # skip broken config
+
+            # copy the defaults and update them with the given config.
+            if "element" in colspec:
+                # if an element is specified, key and value will not be taken from the default config
+                defaults = without_keys(
+                    self.config["default"], ["key", "value", "skip"]
+                )
+            else:
+                defaults = dict(self.config["default"])
+            defaults.update(colspec)
+
+            # now make the full spec accessable via header
+            columns[colspec["header"]] = defaults
+        return columns
+
+    def table2xml(self, table: pd.DataFrame):
+        if self.config.get("transpose", False):
+            table = table.T
+        self.add_columns(table.columns)  # just in case something is missing
+        records = table.to_dict(orient="records")
+
+        if self.config["namespace"]:
+            nsargs = dict(
+                namespace=self.config["namespace"],
+                nsmap={None: self.config["namespace"]},
+            )
+        else:
+            nsargs = {}
+
+        E = ElementMaker(**nsargs)
+        root_el = E(self.config["root"])
+        for row in records:
+            row_el = E(self.config["row"])
+            for key, value in row.items():
+                spec = self.columns[key]
+                if spec.get("skip", False):
+                    continue
+                if spec.get("skipna", False) and pd.isna(value):
+                    continue
+                if spec.get("skipre", False) and re.match(spec["skipre"], str(value)):
+                    continue
+                item_el = E(spec["element"])
+                if spec.get("key"):
+                    item_el.set(spec["key"], key)
+                if spec.get("value"):
+                    item_el.set(spec["value"], value)
+                else:
+                    item_el.text = str(value)
+                row_el.append(item_el)
+            root_el.append(row_el)
+        self.xml = root_el.getroottree()
+        return root_el
+
+
+def _main():
+    options = getargparser().parse_args()
+    converter = Converter()
+    if options.config:
+        converter.load_config(options.config)
+    table = read_table(options.table)
+    if converter.config.get("transpose"):
+        converter.add_columns(table.index)
+    else:
+        converter.add_columns(table.columns)
+    if options.write_config:
+        converter.save_config(options.write_config)
+    if options.output:
+        et = converter.table2xml(table).getroottree()
+        et.write(options.output, encoding="utf-8", pretty_print=True)
+
+
+if __name__ == "__main__":
+    _main()