# Creating XES data from XML data

1. Installing relevant packages
2. Data cleaning
3. "kleine Anfragen"
4. Other Drucksache types


## Installing relevant packages

First install and import all relevant packages

In [1]:
#pip install pandas
#pip install pm4py
#pip install opyenxes
# xml.etree.ElementTree is part of the Python standard library and needs no installation

In [2]:
import pandas as pd
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.objects.log.util import dataframe_utils
import xml.etree.ElementTree as ET
import csv
import argparse
import os
import pickle
import opyenxes
import xml.etree.ElementTree as ET
import xml.dom.minidom
import re
from datetime import datetime

## Duplicating all elements with multiple vorgangsbezug (<4)

**"1_all_files.xml"** is a combination of all files (40.147) as they come directly from importing them.

Before working with the data, we need to wrap its content in the following root element, as otherwise the format will not work:
```
<anfrage>
    ...
</anfrage>
```


In [3]:
# Parse the combined input file; `root` is the element the following cell iterates over.
tree = ET.parse('.//Input_Data/1_all_files.xml')
root = tree.getroot()

In [4]:
# Children that are not copied verbatim: <vorgangsbezug> is re-attached one at
# a time below, and <text> is left out of the output entirely.
skipped_tags = ('vorgangsbezug', 'text')

new_root = ET.Element(root.tag)

# Emit one <document> per <vorgangsbezug> found inside each source document.
# Documents without any <vorgangsbezug> therefore do not appear in the output.
for document in root.findall('.//document'):
    shared_children = [child for child in document if child.tag not in skipped_tags]

    for vorgangsbezug in document.findall('.//vorgangsbezug'):
        new_document = ET.SubElement(new_root, 'document')
        # Every copy carries the same shared children plus exactly one vorgangsbezug
        new_document.extend(shared_children)
        new_document.append(vorgangsbezug)

output_file_path = './/Input_Data/1_file_per_vorgangsbezug.xml'
new_tree = ET.ElementTree(new_root)
new_tree.write(output_file_path, encoding='utf-8', xml_declaration=True)

### Append extra answers to "kleine Anfragen"

This was done manually, but could also be automated.

## Extrabehandlung Drucksachetyp "Entschließungsantrag" - Bezug anpassen

The events of the type "Entschließungsantrag" all refer only to their own type ("Entschließungsantrag BT"), resulting in a very bland process. Looking at their titles, it was clear that most of them refer to other Drucksache types across the whole spectrum and should therefore be sorted into those. By searching for the reference in their title (19/XXX), they were assigned the related <vorgangsbezug> information, resulting in a complex map. These events can now be sorted into the right traces and then put into their respective groups ("kleine Anfrage", "Antrag", ...).

In [5]:
def replace_vorgangsbezug(xml_data, output_file):
    """Re-link "Entschließungsantrag" documents to the Vorgang of the document they cite.

    The first reference of the form ``NN/NNN`` (two digits, a slash, one to six
    digits) found in the title of an "Entschließungsantrag" is looked up among
    the ``<dokumentnummer>`` values of all documents in the file. If a document
    with that number exists, its ``<vorgangsbezug>`` replaces the one of the
    "Entschließungsantrag". Documents without a usable title reference, or
    whose reference is unknown, are left untouched.

    Args:
        xml_data: Path (or file object) of the XML file to read.
        output_file: Path (or file object) the modified XML is written to.
    """
    tree = ET.parse(xml_data)
    root = tree.getroot()

    # Map each dokumentnummer to the <vorgangsbezug> of a document carrying it.
    # If several documents share a number, the last one in file order wins.
    document_dict = {}
    for doc in root.findall('.//document'):
        doc_number = doc.findtext('./dokumentnummer')
        vorgangsbezug = doc.find('./vorgangsbezug')
        if doc_number and vorgangsbezug is not None:
            document_dict[doc_number] = vorgangsbezug

    reference_pattern = re.compile(r'\b\d{2}/\d+\b')

    for doc in root.findall('.//document[drucksachetyp="Entschließungsantrag"]'):
        # A missing or empty <titel> simply means there is nothing to look up
        title = doc.findtext('./titel') or ''

        # Only the first reference in the title is considered
        match = reference_pattern.search(title)
        if match is None:
            continue
        document_number = match.group()

        # References with more than six digits after the slash are ignored
        if len(document_number.partition('/')[2]) > 6:
            continue

        replacement = document_dict.get(document_number)
        if replacement is None:
            continue

        # Swap the existing vorgangsbezug for the one of the cited document
        existing_vorgangsbezug = doc.find('./vorgangsbezug')
        if existing_vorgangsbezug is not None:
            doc.remove(existing_vorgangsbezug)
        doc.append(replacement)

    # Written the same way as the per-vorgangsbezug file so umlauts stay readable
    tree.write(output_file, encoding='utf-8', xml_declaration=True)

# NOTE(review): the duplication cell earlier in this notebook writes
# "1_file_per_vorgangsbezug.xml" (singular "file"), while this cell reads
# "1_files_per_vorgangsbezug.xml". Confirm that the plural name is the manually
# extended copy and not a typo; otherwise ET.parse cannot find the input file.
xml_file = ".//Input_Data/1_files_per_vorgangsbezug.xml"
output_file = ".//Input_Data/1_Bezug_Entschl.xml"
replace_vorgangsbezug(xml_file, output_file)

## Make separate files for all Vorgangsbezüge

## CONVERT DATA

Approach that is based on the specific context of kleine Anfragen and Answers.

### Define methods: 

In [6]:
def _add_attribute(parent, tag, key, value):
    """Append ``<tag key=... value=...>`` to *parent* and return the new element."""
    return ET.SubElement(parent, tag, {'key': key, 'value': value})


def _resource_role(urheber_names):
    """Return the org:role derived from the urheber titles (first matching keyword wins)."""
    for role in ('Fraktion', 'Ausschuss', 'Deutsche Delegation'):
        if any(role in name for name in urheber_names):
            return role
    return 'Sonstiges'


def _add_resource_attributes(event, document_element, urheber_names):
    """Attach org:resource, org:role and the nested resource_details to *event*."""
    if document_element.find('ressort') is not None:
        # A ressort takes precedence over the urheber when naming the resource
        _add_attribute(event, 'string', 'org:resource', document_element.find('.//ressort/titel').text)
        _add_attribute(event, 'string', 'org:role', 'Ressort')
        details = _add_attribute(event, 'string', 'resource_details', 'Ressort')
        federfuehrend = document_element.find('.//ressort/federfuehrend')
        if federfuehrend is not None:
            _add_attribute(details, 'string', 'federfuehrend', federfuehrend.text)
        _add_attribute(details, 'string', 'herkunft', 'Ressort')
    else:
        resource_name = ' zusammen mit '.join(urheber_names)
        role = _resource_role(urheber_names)

        # The child order of each branch is kept as it has always been emitted
        if role == 'Sonstiges':
            details = _add_attribute(event, 'string', 'resource_details', role)
            _add_attribute(event, 'string', 'org:resource', resource_name)
        else:
            _add_attribute(event, 'string', 'org:resource', resource_name)
            details = _add_attribute(event, 'string', 'resource_details', role)
        _add_attribute(event, 'string', 'org:role', role)

        if role == 'Fraktion':
            # einbringer is only recorded when the document provides it
            einbringer = document_element.find('.//urheber/einbringer')
            if einbringer is not None:
                _add_attribute(details, 'string', 'einbringer', einbringer.text)
        _add_attribute(details, 'string', 'herkunft', 'Urheber')

    # Every urheber is listed below resource_details, whatever the role
    for urheber_name in urheber_names:
        _add_attribute(details, 'string', 'urheber', urheber_name)


def _org_group(urheber_names):
    """Return the org:group value for the given urheber titles."""
    # 'Bundesregierung' must be an exact title; any title that merely contains
    # 'Fraktion' (e.g. "Fraktion der ...") counts as opposition.
    if 'Bundesregierung' in urheber_names:
        return 'Bundesregierung'
    if any('Fraktion' in name for name in urheber_names):
        return 'Opposition'
    return 'Bundestag'


def convert_to_xes(document_element):
    """Convert one ``<document>`` element into an XES ``<event>`` element.

    The event gets ``concept:name`` (the drucksachetyp), ``time:timestamp``
    (the ``datum`` at midnight), the plain document fields, the organisational
    attributes (``org:resource``, ``org:role``, ``org:group`` plus nested
    ``resource_details``), the nested vorgangsbezug, and, if present, the
    authors and the fundstelle.

    Args:
        document_element: ``xml.etree.ElementTree.Element`` of a document.

    Returns:
        The newly created ``<event>`` element (not attached to any tree).

    Raises:
        AttributeError: If a mandatory child (e.g. ``drucksachetyp``, ``datum``,
            ``vorgangsbezug/id``, ``titel``) is missing.
        ValueError: If ``datum`` is not formatted as ``YYYY-MM-DD``.
    """
    concept_name = document_element.find('drucksachetyp').text
    vorgangsbezug_id = document_element.find('.//vorgangsbezug/id').text
    date_str = document_element.find('datum').text
    timestamp = datetime.strptime(date_str, "%Y-%m-%d").strftime("%Y-%m-%dT%H:%M:%S.%fZ")

    event = ET.Element('event')
    _add_attribute(event, 'string', 'concept:name', concept_name)
    # NOTE(review): the key 'vorgangsbezug' is emitted twice per event (here with
    # the id, further down with the title) - confirm this duplication is intended.
    _add_attribute(event, 'string', 'vorgangsbezug', vorgangsbezug_id)
    # This must stay the first <date> child: it is what event.find('date') returns
    _add_attribute(event, 'date', 'time:timestamp', timestamp)

    # Plain document fields, copied one-to-one (XES type, child tag == key)
    plain_fields = (
        ('int', 'wahlperiode'),
        ('string', 'dokumentnummer'),
        ('string', 'typ'),
        ('string', 'dokumentart'),
        ('string', 'herausgeber'),
        ('int', 'id'),
        ('string', 'titel'),
    )
    for xes_type, field in plain_fields:
        _add_attribute(event, xes_type, field, document_element.find(field).text)

    # Organisational attributes derived from ressort / urheber
    urheber_names = [urheber.find('titel').text for urheber in document_element.findall('urheber')]
    _add_resource_attributes(event, document_element, urheber_names)

    # Vorgangsbezug
    vorgangsbezug_element = _add_attribute(
        event, 'string', 'vorgangsbezug', document_element.find('.//vorgangsbezug/titel').text)
    _add_attribute(vorgangsbezug_element, 'string', 'vorgangsbezug_typ',
                   document_element.find('.//vorgangsbezug/vorgangstyp').text)
    _add_attribute(vorgangsbezug_element, 'int', 'vorgangsbezug_id', vorgangsbezug_id)
    _add_attribute(vorgangsbezug_element, 'int', 'vorgangsbezug_anzahl',
                   document_element.find('vorgangsbezug_anzahl').text)

    # Autoren
    if document_element.find('autoren_anzahl') is not None:
        autoren_list = ET.SubElement(event, 'list', {'key': 'autoren'})
        for autoren_anzeige in document_element.findall('.//autoren_anzeige'):
            author_id = autoren_anzeige.find('id').text
            author_title_element = autoren_anzeige.find('titel')
            author_title = author_title_element.text if author_title_element is not None else ''
            _add_attribute(autoren_list, 'string', 'autor', f"{author_title}, {author_id}")

    # Fundstelle
    fundstelle = document_element.find('.//fundstelle')
    if fundstelle is not None:
        fundstelle_element = _add_attribute(event, 'string', 'fundstelle', fundstelle.findtext('pdf_url', ''))
        verteildatum = fundstelle.find('verteildatum')
        if verteildatum is not None:
            _add_attribute(fundstelle_element, 'date', 'verteildatum', verteildatum.text)
        # pdf_hash is read from the document itself, not from the fundstelle
        _add_attribute(fundstelle_element, 'string', 'pdf_hash', document_element.findtext('pdf_hash', ''))

    # Urheber
    _add_attribute(event, 'string', 'org:group', _org_group(urheber_names))

    _add_attribute(event, 'string', 'aktualisiert', document_element.find('aktualisiert').text)

    return event

In [7]:
def create_and_save_xes(root, output_file):
    """Build an XES log from all ``<document>`` elements below *root* and save it.

    Each document becomes one event (via ``convert_to_xes``). Events sharing the
    same ``vorgangsbezug/id`` form one trace, ordered by their first ``<date>``
    attribute value. The log is pretty-printed and written as UTF-8; afterwards
    "all done!" is printed.

    Args:
        root: Element whose descendant ``<document>`` elements are converted.
        output_file: Path of the ``.xes`` file to write.
    """
    # Group the events by vorgangsbezug id; every id becomes one trace
    traces_dict = {}
    for document_element in root.findall('.//document'):
        event = convert_to_xes(document_element)
        vorgangsbezug = document_element.find('.//vorgangsbezug/id').text
        traces_dict.setdefault(vorgangsbezug, []).append(event)

    # Log header: extensions, globals and classifier precede the traces
    xes_log = ET.Element('log', {'xes.version': '1.0', 'xes.features': 'nested-attributes'})
    extensions = (
        ('Concept', 'concept', 'http://www.xes-standard.org/concept.xesext'),
        ('Organizational', 'org', 'http://www.xes-standard.org/org.xesext'),
        ('Time', 'time', 'http://www.xes-standard.org/time.xesext'),
    )
    for name, prefix, uri in extensions:
        ET.SubElement(xes_log, 'extension', {'name': name, 'prefix': prefix, 'uri': uri})

    global_scope_trace = ET.SubElement(xes_log, 'global', {'scope': 'trace'})
    ET.SubElement(global_scope_trace, 'string', {'key': 'concept:name', 'value': ''})

    global_scope_event = ET.SubElement(xes_log, 'global', {'scope': 'event'})
    ET.SubElement(global_scope_event, 'string', {'key': 'concept:name', 'value': ''})
    ET.SubElement(global_scope_event, 'date', {'key': 'time:timestamp', 'value': '1970-01-01T00:00:00.000+00:00'})

    ET.SubElement(xes_log, 'classifier', {'name': 'drucksachetyp', 'keys': 'concept:name'})

    for trace_key, trace_events in traces_dict.items():
        trace = ET.SubElement(xes_log, 'trace')

        # The trace name keeps only the digits of the vorgangsbezug id
        trace_id = ''.join(filter(str.isdigit, trace_key))
        ET.SubElement(trace, 'string', {'key': 'concept:name', 'value': trace_id})

        # Timestamps are compared as strings; the sort is stable, so events with
        # equal timestamps keep their document order.
        trace.extend(sorted(trace_events, key=lambda item: item.find('date').attrib['value']))

    xml_string = ET.tostring(xes_log, encoding='utf-8', xml_declaration=True).decode()

    # Prettifying
    dom = xml.dom.minidom.parseString(xml_string)
    pretty_xml_string = dom.toprettyxml(indent="  ")

    # Save (the handle gets its own name instead of shadowing the path parameter)
    with open(output_file, 'w', encoding='utf-8') as xes_file:
        xes_file.write(pretty_xml_string)
    print("all done!")

## For kleine Anfrage

In [8]:
# Parse the "Kleine Anfrage" documents and set the path of the XES log to write
tree = ET.parse('.//XML files/Kleine_Anfrage.xml')
root = tree.getroot()
output_file = ".//Event Logs/Kleine_Anfrage.xes"

In [9]:
# Convert the documents under `root` and write the log to `output_file`
create_and_save_xes(root, output_file)

all done!


Manual Changes:
- Change order of the events in trace: key="concept:name" value="281447", so that the Answer is after the inquiry 
- Delete traces with only "ergänzende Antwort" referring to "17/XXX"

## Gesetzgebung

Use same definition as for "kleine Anfrage"

In [10]:
# Parse the "Gesetzgebung" documents; note the log is written as "Gesetzentwurf.xes"
tree = ET.parse('.//XML files/Gesetzgebung.xml')
root = tree.getroot()
output_file = './/Event Logs/Gesetzentwurf.xes'

In [11]:
# Convert the documents under `root` and write the log to `output_file`
create_and_save_xes(root, output_file)

all done!


## Antrag

In [12]:
# Parse the "Antrag" documents and set the path of the XES log to write
tree = ET.parse('.//XML files/Antrag.xml')
root = tree.getroot()
output_file = './/Event Logs/Antrag.xes'

In [13]:
# Convert the documents under `root` and write the log to `output_file`
create_and_save_xes(root, output_file)

all done!


## Geschäftsordnung

In [15]:
# Parse the "Geschäftsordnung" documents and set the path of the XES log to write
tree = ET.parse('.//XML files/Geschäftsordnung.xml')
root = tree.getroot()
output_file = './/Event Logs/Geschäftsordnung.xes'

In [16]:
# Convert the documents under `root` and write the log to `output_file`
create_and_save_xes(root, output_file)

all done!


## Große Anfrage

In [17]:
# Parse the "Große Anfrage" documents and set the path of the XES log to write
tree = ET.parse('.//XML files/Große_Anfrage.xml')
root = tree.getroot()
output_file = './/Event Logs/Große_Anfrage.xes'

In [18]:
# Convert the documents under `root` and write the log to `output_file`
create_and_save_xes(root, output_file)

all done!


## Rechtsverordnung

In [19]:
# Parse the "Rechtsverordnung" documents and set the path of the XES log to write
tree = ET.parse('.//XML files/Rechtsverordnung.xml')
root = tree.getroot()
output_file = './/Event Logs/Rechtsverordnung.xes'

In [20]:
# Convert the documents under `root` and write the log to `output_file`
create_and_save_xes(root, output_file)

all done!
