In [1]:
import xml.etree.ElementTree as ET #Element Tree XML parsing library
import re #Regex library
import yaml
from yaml import load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

In [2]:
def dfs(root, columns):
    """Collect every leaf element below ``root`` (depth-first, document order).

    @param root: an ElementTree element to walk; if it has no children it is
                 itself treated as a leaf
    @param columns: list that is extended in place with the leaf *elements*
                    (not their names -- read ``.tag`` to get a name)
    Returns nothing; the result is delivered through ``columns``.
    """
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, and an Element is already an iterable of its direct children.
    children = list(root)
    if not children:
        columns.append(root)
    for child in children:
        dfs(child, columns)

def findInstanceTag(root):
    """Return the first element, in depth-first pre-order, whose tag contains 'instance'.

    @param root: element at which the search starts (it is checked first)
    Returns the matching element, or None when no element in the subtree matches.
    """
    # NOTE(review): this is a substring test on the full tag, so it also
    # matches names such as 'instanceID' and any namespace URI that happens to
    # contain 'instance' -- confirm that is acceptable for the forms parsed here.
    if 'instance' in root.tag:
        return root
    # Recurse into each child itself. Looking the child up again with
    # root.find(child.tag) always yields the *first* sibling carrying that tag,
    # so later siblings sharing a tag name would never be searched.
    for child in root:
        found = findInstanceTag(child)
        if found is not None:
            return found
    return None
    
def cleanColumnNames(columns):
    """Return the tag of each element with its ``{...}`` namespace part removed.

    @param columns: iterable of objects exposing a string ``.tag`` attribute
    Returns a new list of strings, one per input element, in the same order;
    e.g. a tag of '{uri}name' becomes 'name', and a tag without braces is
    returned unchanged.
    """
    namespace_pattern = re.compile("({.+})")
    stripped_names = []
    for element in columns:
        stripped_names.append(namespace_pattern.sub('', element.tag))
    return stripped_names

def xmlParse(fileName):
    """Parse an XML form file and return the names of its data columns.

    @param fileName: path of the XML file handed to ElementTree's parser
    Returns the list produced by cleanColumnNames for the leaf elements found
    under the node that findInstanceTag selects (the start of the data form).
    """
    # Root of the parsed document (presumably the <html> tag of the form).
    document_root = ET.parse(fileName).getroot()
    instance_node = findInstanceTag(document_root)

    # dfs fills this list in place with the leaf elements below the instance node.
    leaf_elements = []
    dfs(instance_node, leaf_elements)

    return cleanColumnNames(leaf_elements)

# Example run on death_report.xml; the returned list of column names is shown as the cell output.
xmlParse('death_report.xml')

  
  


['lat',
 'long',
 'error',
 'message',
 'source',
 'source_id',
 '_id',
 'name',
 'short_name',
 'patient_id',
 'date_of_birth',
 'sex',
 'name',
 'phone',
 'patient_age_in_years',
 'patient_age_in_months',
 'patient_age_in_days',
 'patient_uuid',
 'patient_id',
 'patient_name',
 'patient_short_name',
 'patient_display_name',
 'date_of_death',
 'place_of_death',
 'place_of_death_other',
 'death_information',
 'submit',
 'r_summary_details',
 'c_patient_age',
 'r_patient_details',
 'r_death_info',
 'r_key_instruction',
 'blank_note',
 'r_referral',
 'r_undo',
 '__date_of_death',
 '__place_of_death',
 '__place_of_death_other',
 '__death_information',
 '__patient_uuid',
 '__patient_id',
 '__household_uuid',
 '__source',
 '__source_id',
 'instanceID']

In [3]:
def yaml_creator(filename, path = "yaml-files-spencer/"):
    """Write a YAML column template for an XML form, never overwriting a file.

    The names returned by ``xmlParse(filename)`` are dumped as a YAML list of
    ``{'column': <name>, 'type': None}`` mappings to the file
    ``path + <filename without its '.xml' suffix> + '.yaml'``.

    @param filename: name of the XML file; must end in '.xml'
    @param path: prefix concatenated verbatim in front of the output file name
                 (so it needs its own trailing separator)
    Returns None.
    Raises AssertionError if ``filename`` does not end in '.xml'.
    Raises MemoryError if the target YAML file already exists.
    Errors raised by ``open`` itself (e.g. a missing output directory) propagate.
    """
    assert filename.endswith('.xml'), "filename must end in '.xml'"
    target = path + filename[:-4] + ".yaml"

    keyvalue_pairs = [{'column': name, 'type': None} for name in xmlParse(filename)]
    output = yaml.dump(keyvalue_pairs, explicit_start=True, default_flow_style=False)

    try:
        # Mode 'x' creates the file only if it does not exist yet, so the
        # existence check and the creation are one step (no window in which
        # another process could create the file between check and write).
        # The with-block closes the handle even if the write fails.
        with open(target, "x") as file_object:
            file_object.write(output)
    except FileExistsError as err:
        # NOTE(review): MemoryError is kept only so existing callers that catch
        # it keep working; FileExistsError would be the fitting exception type.
        raise MemoryError('File ' + target + ' already exists. To avoid overwriting, aborting process.') from err



  
  


MemoryError: File death_report.yaml already exists in the current working directory. To avoid overwriting, aborting process.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=59003b3a-4258-4703-b30d-75642543bba1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>