In [13]:
import xml.etree.ElementTree as ET
import pandas as pd
import glob
import yaml

class XMLReader:
    """Flatten XML files into a pandas DataFrame, driven by a YAML config.

    The configuration is a mapping with three optional keys:

    * ``head``            - tags read from the ``<head>`` element of each file
    * ``tracker_element`` - tag of the (possibly nested) tracker element,
      ``'tracker'`` when absent
    * ``item_elements``   - child tags of a tracker that are copied into a row
    """

    def __init__(self, config_file="config.yml"):
        """
        Initializes the XMLReader with a configuration file.

        A missing, malformed or non-mapping config is reported on stdout and
        leaves ``self.config`` as ``None``; the other attributes keep their
        defaults so the instance is always fully initialised.

        Args:
          config_file: The path to the YAML configuration file.
        """
        # Set defaults first: every attribute must exist even if loading fails,
        # otherwise later method calls die with AttributeError.
        self.config = None
        self.head_elements = []
        self.tracker_element = 'tracker'
        self.item_elements = []

        try:
            with open(config_file, 'r') as f:
                loaded = yaml.safe_load(f)
        except FileNotFoundError:
            print(f"Error: Config file '{config_file}' not found.")
            return
        except yaml.YAMLError:
            print(f"Error: Invalid YAML format in '{config_file}'.")
            return

        if not isinstance(loaded, dict):
            # An empty document loads as None and a top-level list/scalar has
            # no .get(); neither is a usable configuration.
            print(f"Error: Config file '{config_file}' does not contain a YAML mapping.")
            return

        self.config = loaded
        # `or` also covers keys that are present but null in the YAML.
        self.head_elements = loaded.get('head') or []
        self.tracker_element = loaded.get('tracker_element') or 'tracker'
        self.item_elements = loaded.get('item_elements') or []

    @staticmethod
    def _clean_text(element):
        """Return the element's text without surrounding whitespace, or None if it has no text."""
        return element.text.strip() if element.text else None

    def process_tracker(self, data, tracker_element, head_data, parent_data=None):
        """
        Appends one row per tracker element (recursively) to ``data``.

        Children are visited in document order. Item values collected so far
        are handed down to a nested tracker as its starting values, so a nested
        tracker only inherits the items that precede it. Because a tracker's
        own row is appended after its children were visited, rows of nested
        trackers come before the row of their parent.

        Args:
          data: List that receives the row dictionaries (mutated in place).
          tracker_element: The XML element whose children are inspected.
          head_data: Values from the file's head, replicated into every row.
          parent_data: Item values inherited from the enclosing tracker, if any.
        """
        tracker_data = {} if parent_data is None else parent_data.copy()
        for child in tracker_element:
            if child.tag == self.tracker_element:
                # Recursive call for nested tracker elements
                self.process_tracker(data=data, tracker_element=child,
                                     head_data=head_data, parent_data=tracker_data)
            elif child.tag in self.item_elements:
                tracker_data[child.tag] = self._clean_text(child)
        if tracker_data:  # Only add a row when at least one item value exists
            data.append({**head_data, **tracker_data})  # tracker values win on key clashes

    def read_xml_files(self, filenames):
        """
        Reads multiple XML files and stores the data in a pandas DataFrame.

        Files that are missing or are not well-formed XML are reported on
        stdout and skipped. A file without a ``<head>`` element contributes no
        rows, because the tracker is only read when a head exists.

        Args:
          filenames: A list of filenames, or a single string that is treated
            as a glob pattern (a plain path is a pattern matching itself).

        Returns:
          pandas.DataFrame: A DataFrame containing the extracted data from all
          files, or None when no valid configuration is loaded.
        """
        if self.config is None:
            print("Error: No valid configuration loaded.")
            return None

        if isinstance(filenames, str):
            # Iterating a bare string would yield single characters. Expand it
            # as a glob pattern instead; if nothing matches, keep the literal
            # name so the "not found" message below is still printed.
            filenames = sorted(glob.glob(filenames)) or [filenames]

        all_data = []
        for filename in filenames:
            try:
                root = ET.parse(filename).getroot()
            except FileNotFoundError:
                print(f"Error: File '{filename}' not found.")
                continue
            except ET.ParseError:
                print(f"Error: Invalid XML format in '{filename}'.")
                continue

            # Find and process the 'head' element
            head = root.find("head")
            if head is None:
                continue

            head_data = {}
            for element in self.head_elements:
                child = head.find(element)
                if child is not None:
                    head_data[element] = self._clean_text(child)

            tracker = root.find(self.tracker_element)
            if tracker is not None:
                self.process_tracker(all_data, tracker, head_data)

        # Create a pandas DataFrame from the extracted data
        return pd.DataFrame(all_data)


In [14]:

# Example usage with multiple files and a config file.
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the resulting DataFrame reproducible between runs.
filenames = sorted(glob.glob("data/*.xml"))  # Replace with your XML files

print(filenames)
reader = XMLReader()  # Uses the default config file "config.yml"
df = reader.read_xml_files(filenames)
if df is not None:  # None means no valid configuration was loaded
    print(df)

['data\\xml1.xml', 'data\\xml2.xml']
              info        date    item value
0   First XML file  2024-10-02  Item B    25
1   First XML file  2024-10-02  Item A    15
2  Second XML file  2024-10-03  Item E    55
3  Second XML file  2024-10-03  Item D    45
4  Second XML file  2024-10-03  Item C    35


In [9]:
import os

def list_files_in_current_directory():
  """Print the name of every entry in the current working directory.

  The names come from ``os.listdir`` and are written one per line, in the
  order it returns them, below a fixed header line. Sub-directories are
  listed alongside regular files.
  """
  entry_names = os.listdir(os.getcwd())
  print("Files in the current directory:")
  for entry_name in entry_names:
    print(entry_name)

# Show what the notebook's working directory contains.
list_files_in_current_directory()

Files in the current directory:
config.yml
data
xml1.html
xml2.html
xml3.txt
xml_testing.ipynb
