In [1]:
from pathlib import Path

In [2]:
# Folder with the XML files to compare, resolved against the current working directory.
path = Path.cwd().joinpath("input")

In [3]:
# NOTE: despite the ``dir_`` prefix, both names point at XML files inside ``path``.
dir_1 = path.joinpath("test1.xml")
dir_2 = path.joinpath("test2.xml")

In [4]:
import pandas as pd

import xml.etree.ElementTree as ET


def get_df(dir_):
    """Parse an XML file and tabulate its ``timeSeriesSet`` elements.

    Every ``timeSeriesSet`` descendant in the ``http://www.wldelft.nl/fews``
    namespace becomes one row. All fields are taken from the text of the
    equally named child element, except ``timeStep``, which is taken from the
    ``unit`` attribute of the ``timeStep`` child.

    Parameters
    ----------
    dir_ : str or path-like
        Location of the XML file (handed to ``ET.parse``).

    Returns
    -------
    tuple
        ``(root, df)`` where ``root`` is the parsed root element and ``df`` is
        a DataFrame indexed by ``("moduleInstanceId", "parameterId")``. A file
        without any ``timeSeriesSet`` yields an empty frame with the same
        index names and columns.

    Raises
    ------
    AttributeError
        If a ``timeSeriesSet`` lacks one of the expected child elements.
    """
    ns = "{http://www.wldelft.nl/fews}"
    # Children whose element text is the value of interest.
    text_fields = (
        "moduleInstanceId",
        "valueType",
        "parameterId",
        "locationSetId",
        "timeSeriesType",
        "readWriteMode",
    )
    # Explicit column order; also keeps ``set_index`` working when no rows were found.
    columns = [
        "moduleInstanceId",
        "valueType",
        "parameterId",
        "locationSetId",
        "timeSeriesType",
        "timeStep",
        "readWriteMode",
    ]

    root = ET.parse(dir_).getroot()
    data = []
    for timeSeriesSet in root.findall(f".//{ns}timeSeriesSet"):
        row = {name: timeSeriesSet.find(f"{ns}{name}").text for name in text_fields}
        # The time step is stored as an attribute, not as element text.
        row["timeStep"] = timeSeriesSet.find(f"{ns}timeStep").get("unit")
        data.append(row)

    df = pd.DataFrame(data, columns=columns)
    return root, df.set_index(["moduleInstanceId", "parameterId"])

In [5]:
# Parse both files; each call yields the XML root and the indexed DataFrame.
root_1, df_1 = get_df(dir_1)
root_2, df_2 = get_df(dir_2)

In [6]:
# Print the distinct values of every non-index column, one array per line.
for col in df_1:
    print(df_1[col].unique())

['sample']
['LocationSet']
['external historical']
['nonequidistant']
['add originals']


In [7]:
# Report how many timeSeriesSet rows each of the two files contains.
print(f"in de versie 1 versie zitten {len(df_1)}, en in de {len(df_2)} versie 2")

in de versie 1 versie zitten 24466, en in de 26406 versie 2


In [8]:
# Left-join version 2 onto version 1 on the shared index, then drop every row
# containing NaN, i.e. rows of df_1 without a counterpart in df_2 (or with
# missing values of their own).
both = (
    df_1
    .join(df_2, how="left", lsuffix="_ont", rsuffix="_prepro")
    .dropna()
)
len(both)

24424

In [9]:
# Index keys of df_1 that did not survive the join into ``both``.
diff_1 = set(df_1.index) - set(both.index)

In [10]:
# Report the number of df_1 index keys that are absent from ``both``.
print(f"In Versie zitten {len(diff_1)} timeseries die niet in versie 2 zitten")

In Versie zitten 42 timeseries die niet in versie 2 zitten


In [11]:
# Display the df_1 rows for the keys in diff_1. The set is converted to a list
# for ``.loc``, so the row order follows the set's iteration order.
df_1.loc[list(diff_1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,valueType,locationSetId,timeSeriesType,timeStep,readWriteMode
moduleInstanceId,parameterId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Import_1,WNS14708,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS14704,sample,LocationSet,external historical,nonequidistant,add originals
Import_2,WNS14700,sample,LocationSet,external historical,nonequidistant,add originals
Import_2,WNS14704,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS15028,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS14705,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS14605,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS14710,sample,LocationSet,external historical,nonequidistant,add originals
Import_2,WNS14715,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS14694,sample,LocationSet,external historical,nonequidistant,add originals


In [12]:
# Module-level output name; the output_to_xml calls visible in this notebook
# pass their own ``output_file`` explicitly.
output_file = "filtered_timeseries.xml"

def output_to_xml(root, diff_df, output_file):
    """Write the selected ``timeSeriesSet`` elements to a validationRuleSets XML file.

    The output consists of a ``validationRuleSets`` root holding one
    ``validationRuleSet`` (id ``ExtremeValue``) with a fixed ``logLevel`` and
    ``extremeValuesFunctions`` header, followed by a copy of every
    ``timeSeriesSet`` from ``root`` whose ``(moduleInstanceId, parameterId)``
    pair is contained in ``diff_df``. Matching elements are written once each,
    in document order, and both the text and the attributes of the copied
    children are preserved (``timeStep`` carries its value in the ``unit``
    attribute).

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root of the parsed source document; its descendants in the
        ``http://www.wldelft.nl/fews`` namespace are searched.
    diff_df : container of (moduleInstanceId, parameterId) tuples
        Membership is tested with ``in``. Despite the name, the notebook passes
        a ``set`` of index tuples; a DataFrame would be tested against its
        column labels instead.
    output_file : str or path-like
        Destination handed to ``ElementTree.write``.

    Raises
    ------
    AttributeError
        If a ``timeSeriesSet`` lacks one of the expected child elements.
    """
    ns = "{http://www.wldelft.nl/fews}"
    child_names = [
        "moduleInstanceId",
        "valueType",
        "parameterId",
        "locationSetId",
        "timeSeriesType",
        "timeStep",
        "readWriteMode",
    ]

    # Namespace declarations are emitted as plain attributes on the root element.
    root_attrib = {
        "version": "1.1",
        "xmlns": "http://www.wldelft.nl/fews",
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:schemaLocation": "http://www.wldelft.nl/fews http://fews.wldelft.nl/schemas/version1.0/validationRuleSets.xsd",
    }
    new_root = ET.Element("validationRuleSets", root_attrib)

    validation_rule_set = ET.SubElement(
        new_root, "validationRuleSet", validationRuleSetId="ExtremeValue", timeZone="CET"
    )
    ET.SubElement(validation_rule_set, "logLevel").text = "DEBUG"
    extreme_values_functions = ET.SubElement(validation_rule_set, "extremeValuesFunctions")
    for tag, placeholder in (
        ("hardMax", "@H_MAX@"),
        ("hardMin", "@H_MIN@"),
        ("softMax", "@S_MAX@"),
        ("softMin", "@S_MIN@"),
    ):
        ET.SubElement(extreme_values_functions, tag, constantLimit=placeholder)

    # Single pass over the source: no re-scan per key, and an element whose key
    # occurs several times in the document is copied exactly once.
    for timeSeriesSet in root.findall(f".//{ns}timeSeriesSet"):
        key = (
            timeSeriesSet.find(f"{ns}moduleInstanceId").text,
            timeSeriesSet.find(f"{ns}parameterId").text,
        )
        if key not in diff_df:
            continue
        time_series_set = ET.SubElement(validation_rule_set, "timeSeriesSet")
        for name in child_names:
            source = timeSeriesSet.find(f"{ns}{name}")
            # Copy attributes as well as text, otherwise timeStep's ``unit`` is lost.
            copied = ET.SubElement(time_series_set, name, dict(source.attrib))
            copied.text = source.text

    new_tree = ET.ElementTree(new_root)
    # An empty default_namespace is falsy, so tags are written exactly as named above.
    new_tree.write(output_file, default_namespace="")

In [13]:
# Write the timeSeriesSets of file 1 whose keys are in diff_1 to diff_1.xml.
output_to_xml(root_1, diff_1, output_file="diff_1.xml")

In [14]:
# Index keys of df_2 that do not occur in ``both``.
diff_2 = set(df_2.index) - set(both.index)

In [15]:
# Report the number of df_2 index keys that are absent from ``both``.
print(f"In versie 2 zitten {len(diff_2)} timeseries die niet in versie 1 zitten")

In versie 2 zitten 1982 timeseries die niet in versie 1 zitten


In [16]:
# Display the raw set of (moduleInstanceId, parameterId) keys collected in diff_2.
diff_2

{('Import_2', 'WNS14140'),
 ('Import_2', 'TTSWDE[n]'),
 ('Import_2', 'WNS13733'),
 ('Import_1', 'WNS13895'),
 ('Import_1', 'WNS13978'),
 ('Import_1', 'WNS13716'),
 ('Import_1', 'WNS13502'),
 ('Import_1', 'WNS13712'),
 ('Import_2', 'WNS13159'),
 ('Import_2', 'WNS14102'),
 ('Import_1', 'WNS13998'),
 ('Import_1', 'WNS13248'),
 ('Import_1', 'WNS13923'),
 ('Import_2', 'WNS13292'),
 ('Import_2', 'WNS13151'),
 ('Import_2', 'WNS14131'),
 ('Import_1', 'WNS13549'),
 ('Import_1', 'WNS13990'),
 ('Import_2', 'WNS13906'),
 ('Import_2', 'WNS13351'),
 ('Import_2', 'WNS13734'),
 ('Import_1', 'WNS14172'),
 ('Import_1', 'WNS13420'),
 ('Import_2', 'WNS13787'),
 ('Import_1', 'WNS13166'),
 ('Import_2', 'WNS13433'),
 ('Import_1', 'WNS13230'),
 ('Import_1', 'WNS13480'),
 ('Import_1', 'WNS14010'),
 ('Import_2', 'WNS13797'),
 ('Import_1', 'WNS13596'),
 ('Import_2', 'TTSWDE[n/ha]'),
 ('Import_1', 'WNS13818'),
 ('Import_1', 'WNS13873'),
 ('Import_2', 'TSTNDWDE[ug/l]'),
 ('Import_1', 'WNS14005'),
 ('Import_2', 'TS

In [17]:
# Display the df_2 rows for the keys in diff_2. The set is converted to a list
# for ``.loc``, so the row order follows the set's iteration order.
df_2.loc[list(diff_2)]

Unnamed: 0_level_0,Unnamed: 1_level_0,valueType,locationSetId,timeSeriesType,timeStep,readWriteMode
moduleInstanceId,parameterId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Import_2,WNS14140,sample,LocationSet,external historical,nonequidistant,add originals
Import_2,TTSWDE[n],sample,LocationSet,external historical,nonequidistant,add originals
Import_2,WNS13733,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS13895,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS13978,sample,LocationSet,external historical,nonequidistant,add originals
...,...,...,...,...,...,...
Import_2,WNS13664,sample,LocationSet,external historical,nonequidistant,add originals
Import_1,WNS13194,sample,LocationSet,external historical,nonequidistant,add originals
Import_2,WNS13202,sample,LocationSet,external historical,nonequidistant,add originals
Import_2,WNS13244,sample,LocationSet,external historical,nonequidistant,add originals


In [18]:
# Write the timeSeriesSets of file 2 whose keys are in diff_2 to diff_2.xml.
output_to_xml(root_2, diff_2, output_file="diff_2.xml")

Format the generated XML files with the VS Code extension `Pretty XML`,
using the following VS Code user settings:

```json
"prettyxml.settings.addSpaceBeforeSelfClosingTag": false,
"prettyxml.settings.attributesInNewlineThreshold": 9,
"prettyxml.settings.positionAllAttributesOnFirstLine": true,
"prettyxml.settings.preserveWhiteSpacesInComment": true,
"editor.insertSpaces": true,
"[xml]": {
    "editor.defaultFormatter": "PrateekMahendrakar.prettyxml"
},
```

