In [None]:
import os
import re #Regex library
import xml.etree.ElementTree as ET #Element Tree XML parsing library
from collections import OrderedDict

import yaml
from yaml import load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

In [None]:
def yaml_creator(filename, path = "../xml-files/", destination="../yaml-files/"):
    """Write a YAML skeleton listing every column of one XML form.

    Each column name returned by ``xmlParse`` becomes a key mapped to
    ``{'constraints': None, 'distribution': None, 'type': None}``. The
    mapping is dumped in block style and written, after a ``---`` /
    ``rows:`` header, to ``<destination>/<filename without .xml>.yaml``.

    @params
      filename: name of the XML file; must end in '.xml'
      path: directory that contains the XML file
      destination: directory that receives the YAML file (must already exist)

    Raises:
      AssertionError: filename does not end in '.xml'.
      MemoryError: the target YAML file already exists; it is left untouched.
    """
    if not filename.endswith('.xml'):
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError("filename must end in '.xml', got " + repr(filename))

    # os.path.join accepts directories with or without a trailing separator.
    target = os.path.join(destination, filename[:-4] + ".yaml")
    parsed_fields = xmlParse(os.path.join(path, filename))

    keyvalue_pairs = {
        field: {'constraints': None, 'distribution': None, 'type': None}
        for field in parsed_fields
    }
    output = yaml.dump(keyvalue_pairs, explicit_start=True, default_flow_style=False, sort_keys=False)
    # Drop the first line of the dump (the document-start marker); the header
    # written below supplies its own '---'.
    output = "\n".join(output.split("\n", 2)[1:])

    try:
        # Mode "x" creates the file exclusively: it fails if the file already
        # exists, so an existing file can never be overwritten, and the
        # `with` block closes the handle even if a write fails.
        with open(target, "x") as file_object:
            file_object.write("---\nrows:\n")
            # NOTE(review): the dumped keys start at column 0, so they end up
            # as top-level siblings of `rows` rather than nested under it --
            # confirm this is the layout consumers expect.
            file_object.write(output)
    except FileExistsError:
        # NOTE(review): MemoryError is kept only for backward compatibility
        # with existing callers; FileExistsError would describe this better.
        raise MemoryError('File ' + target + ' already exists. To avoid overwriting, aborting process.') from None



In [None]:
def dfs(root, columns):
    """Collect the leaf elements beneath ``root`` by depth-first traversal.

    @params
      root: ElementTree element to walk
      columns: list that receives the leaf elements; it is extended in place

    Elements without children are appended to ``columns`` in document order.
    If ``root`` itself has no children, ``root`` is the element appended.
    The Element objects are stored, not their tag strings.
    """
    # Iterating an Element yields its direct children. This replaces
    # Element.getchildren(), which was removed in Python 3.9.
    children = list(root)
    if not children:
        columns.append(root)
    for child in children:
        dfs(child, columns)

def findInstanceTag(root):
    """Return the first element, in pre-order, whose tag contains 'instance'.

    @param root: ElementTree element at which the search starts

    ``root`` itself is checked first, then each child subtree in document
    order. Returns None when no element matches.
    """
    # Substring test against the full tag string, which for namespaced
    # elements includes the '{namespace-uri}' prefix.
    if 'instance' in root.tag:
        return root
    # Recurse into each actual child. Looking the child up again by tag would
    # always give the first sibling with that tag and skip later siblings that
    # share it. Iterating the element also avoids Element.getchildren(),
    # which was removed in Python 3.9.
    for child in root:
        match = findInstanceTag(child)
        if match is not None:
            return match
    return None
    
def cleanColumnNames(columns):
    """Return the tags of ``columns`` with their xmlns prefixes removed.

    @param columns: iterable of objects exposing a ``tag`` string
                    (e.g. ElementTree elements)

    Outputs a new list, in the same order, where every ``{...}`` span has
    been deleted from each tag.
    """
    namespace_pattern = re.compile("({.+})")
    stripped_names = []
    for element in columns:
        stripped_names.append(namespace_pattern.sub('', element.tag))
    return stripped_names

def xmlParse(fileName):
    """Parse an XML form file and return its column names.

    @param fileName: path of the XML file, passed to ``ET.parse``

    The element located by ``findInstanceTag`` is treated as the start of the
    data form. Its leaf elements are gathered with ``dfs`` and their tags are
    cleaned by ``cleanColumnNames``.

    Returns the list of cleaned column names.
    Raises ValueError if the document has no element whose tag contains
    'instance'.
    """
    tree = ET.parse(fileName)  # parse the whole document
    root = tree.getroot()  # top-level element (presumably <html> for these forms)
    form = findInstanceTag(root)  # start of the data form
    if form is None:
        # Fail with a clear message rather than passing None on to dfs.
        raise ValueError("No element with 'instance' in its tag was found in " + repr(fileName))
    columns = []  # dfs fills this with the leaf elements
    dfs(form, columns)
    return cleanColumnNames(columns)

import os 

def dirIterator(directory = "../xml-files"):
    """Run ``yaml_creator`` on every '.xml' file directly inside ``directory``.

    @param directory: directory whose entries are scanned; other entries
                      are skipped

    Any exception raised by ``yaml_creator`` propagates and stops the scan.
    """
    # Forward the scanned directory so yaml_creator reads from it instead of
    # its own default. The trailing separator keeps the path valid even when
    # it is joined to the filename by plain string concatenation.
    source_path = os.path.join(directory, "")
    for filename in os.listdir(directory):
        if filename.endswith('.xml'):
            yaml_creator(filename, path=source_path)

# Build a YAML skeleton for each '.xml' form found in ../xml-files.
dirIterator(directory='../xml-files')


  
  


In [None]:
# Single-file run. yaml_creator refuses to overwrite, so this raises
# MemoryError when ../yaml-files/death_report.yaml is already present
# (for instance if a directory-wide run has already produced it).
yaml_creator(filename="death_report.xml")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=59003b3a-4258-4703-b30d-75642543bba1' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>