In [0]:
%pip install databricks=0.2 databricks-langchain==0.5.1

In [0]:
# A `from __future__` import has to be the first statement of the unit it is
# compiled in, so it leads the cell instead of trailing the other imports.
from __future__ import annotations

import os
import sys
import json
import xml.etree.ElementTree as ET
from typing import Any, Dict, List

from schemas.nifi_metadata import processor_schema, processor_properties_schema, processor_connections_schema
from tools.xml_tools import parse_nifi_template, list_xml_files

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, current_date, lit
from pyspark.sql import functions as F

# Catalog and schema: the first two parts of the three-part names
# (`catalog.schema.table`) that the output tables are saved under.
catalog = "sdickey"
schema = "nifi_workflow_metadata"



In [0]:
# The single NiFi template export that is ingested, and the directory that is
# listed for reference.
xml_file_path = "/Volumes/sdickey/nxp_nifi_flows_xml/raw_files/ICN8_NiFi_flows_2025-05-06.xml"
xml_volumes_path = "/Volumes/sdickey/nxp_nifi_flows_xml/raw_files"

spark = SparkSession.builder.getOrCreate()

# The listing is only printed; the ingest below reads just `xml_file_path`.
xml_file_paths = list_xml_files(xml_volumes_path)
print(xml_file_paths)

# Read the XML file contents. Only the read needs the file handle, so the
# `with` block ends here rather than staying open across the Spark work.
with open(xml_file_path, "r", encoding="utf-8") as f:
    xml_content = f.read()

# parse_nifi_template hands back JSON text; once decoded, the "processors" and
# "connections" entries are the ones consumed below.
response = json.loads(parse_nifi_template(xml_content))

print(response.keys())

# Schemas are inferred by Spark from the decoded records.
raw_processors_df = spark.createDataFrame(response["processors"])
raw_connections_df = spark.createDataFrame(response["connections"])

# 1. Base processors table
processors_df = (
    raw_processors_df
    .selectExpr(
        "id",
        "name",
        "type",
        "parentGroupId as parent_group_id",
        "parentGroupName as parent_group_name"
    )
    .withColumn("created_date", F.current_date())
    .withColumn("last_updated_date", F.current_date())
)

# 2. Properties table (explode the map into rows)
# posexplode_outer yields one row per entry together with its position, and
# keeps a row of nulls for a processor whose `properties` is null or empty.
processor_properties_df = (
    raw_processors_df
    .select(
        F.col("id").alias("processor_id"),
        F.posexplode_outer("properties").alias("pos", "property_name", "property_value")
    )
    .select(
        "processor_id",
        F.col("property_name").alias("property_name"),
        F.col("property_value").alias("property_value"),
        # Position is zero-based; the stored rank is one-based and kept as a string.
        (F.col("pos") + F.lit(1)).cast("string").alias("property_rank"),
        F.current_date().alias("created_date"),
        F.current_date().alias("last_updated_date"),
    )
)

# 3. Connections table
connections_df = (
    raw_connections_df
    .selectExpr(
        "source as source_processor_id",
        "destination as destination_processor_id",
        "relationships"
    )
    .withColumn("created_date", F.current_date())
    .withColumn("last_updated_date", F.current_date())
)

# Each run replaces the full contents of the three target tables.
processors_df.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema}.nifi_processors")
processor_properties_df.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema}.nifi_processor_properties")
connections_df.write.format("delta").mode("overwrite").saveAsTable(f"{catalog}.{schema}.nifi_connections")


# The session is deliberately left running: it was obtained with
# getOrCreate(), so it may be shared with other cells or notebooks, and
# stopping it would break re-running this notebook on the same kernel.