In [None]:
from dataclasses import dataclass
import re
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


def device_schema() -> StructType:
    """Build the Spark schema used to read the device JSON files.

    The schema has two top-level fields: ``node_id`` (string) and ``info``,
    a struct holding:

    * ``system``     - a struct of scalar device attributes,
    * ``status``     - a string-keyed map of status structs,
    * ``interface``  - a string-keyed map of interface structs,
    * ``tempStatus`` - a string-keyed map of temperature-state structs.

    Every ``StructField`` is declared with ``nullable=False``.
    """

    def required(name, data_type):
        # All fields in this schema are declared non-nullable.
        return StructField(name, data_type, False)

    def string_struct(*names):
        # Struct whose members are all non-nullable strings, in the given order.
        return StructType([required(name, StringType()) for name in names])

    def string_keyed_map(*value_field_names):
        # valueContainsNull is intentionally left at MapType's default.
        return MapType(StringType(), string_struct(*value_field_names))

    # The only two integer members of ``system``; everything else is a string.
    integer_system_fields = ("sysUpTimeSec", "intfCollect")
    system_field_names = (
        "sysName",
        "sysUpTimeSec",
        "serialNum",
        "sysObjectName",
        "intfTotal",
        "intfCollect",
        "snmpdown",
        "name",
        "nodedown",
        "nodeType",
        "region",
        "location",
        "location_id",
        "city",
        "state",
        "country",
        "tier",
        # NOTE(review): spelling kept exactly as found; presumably it mirrors
        # the key in the incoming JSON - confirm before "correcting" it.
        "divison",
    )
    system = StructType(
        [
            required(
                name,
                IntegerType() if name in integer_system_fields else StringType(),
            )
            for name in system_field_names
        ]
    )

    info = StructType(
        [
            required("system", system),
            required(
                "status",
                string_keyed_map("event", "element", "index", "status", "value"),
            ),
            required(
                "interface",
                string_keyed_map(
                    "collect", "ifHighSpeed", "ifType", "ifOperStatus", "interface"
                ),
            ),
            required("tempStatus", string_keyed_map("TemperatureStateName")),
        ]
    )

    return StructType([required("node_id", StringType()), required("info", info)])


@dataclass
class Json_Data_Loader:
    """Load device JSON dumps into a parquet dataset and archive the inputs.

    For every directory in ``read_dirs`` the loader lists
    ``<read_dir>/unprocessed``, reads each ``*.json`` file using ``schema``,
    keeps only records whose ``info.system.nodeType`` is ``router`` or
    ``switch``, appends them to the parquet dataset at ``target`` and then
    moves the file to the corresponding ``/processed`` path.
    """

    # Session used to read the JSON files.
    spark: SparkSession
    # Only ``fs.ls`` and ``fs.mv`` are used (presumably the Databricks dbutils).
    dbutils: object
    # Base directories; ``<dir>/unprocessed`` is listed for each of them.
    read_dirs: list[str]
    # Parquet path that the loaded records are appended to.
    target: str
    # Evaluated once at class-definition time and shared by all instances
    # that do not pass their own schema.
    schema: StructType = device_schema()

    def get_file_timestamp(self, file_path: str) -> Column:
        """Return the timestamp embedded in ``file_path`` as a Spark column.

        The file name is expected to look like
        ``.../servername-<YYYYmmddHHMMSS>.json``. The digits between the last
        hyphen and the ``.json`` suffix are parsed as the timestamp, so server
        names that themselves contain hyphens or dots are accepted.

        Raises:
            ValueError: if the path does not end in ``-<digits>.json`` or the
                digits are not a valid ``%Y%m%d%H%M%S`` timestamp.
        """
        # re.search + explicit check: re.sub would silently hand back the whole
        # path on a non-match and strptime would then fail with a confusing
        # "time data '<entire path>' does not match format" error.
        match = re.search(r"-(\d+)\.json$", file_path)
        if match is None:
            raise ValueError(
                f"Cannot find a '-<timestamp>.json' suffix in file path {file_path!r}"
            )
        timestamp = datetime.strptime(match.group(1), "%Y%m%d%H%M%S")
        return to_timestamp(lit(timestamp))

    def read_json(self, file_path: str) -> DataFrame:
        """Read one multi-line JSON file into a DataFrame.

        Only router/switch records are kept. Two columns are added:
        ``event_timestamp`` (taken from the file name) and ``id`` (a
        ``uuid()`` expression evaluated by Spark).

        Raises:
            ValueError: if no timestamp can be extracted from ``file_path``.
        """
        # Resolve the timestamp first so a malformed file name fails before
        # any Spark work is set up.
        event_timestamp = self.get_file_timestamp(file_path)
        return (
            self.spark.read.schema(self.schema)
            .json(file_path, multiLine=True)
            .filter(col("info.system.nodeType").isin(["router", "switch"]))
            .withColumn("event_timestamp", event_timestamp)
            .withColumn("id", expr("uuid()"))
        )

    def process_json_file(self, file_path: str):
        """Load ``file_path``, append it to ``target`` and archive the file.

        The file is moved only after the parquet write has returned; an
        exception from the read or write leaves it under ``/unprocessed``.
        """
        print(f"Processing json file {file_path}")
        df = self.read_json(file_path)
        # count() is a Spark action of its own, so the file is read once here
        # and once more by the write below.
        print(f"Loaded {df.count()} records")
        df.write.mode("append").parquet(self.target)
        # Same path with "/unprocessed" swapped for "/processed".
        self.dbutils.fs.mv(file_path, file_path.replace("/unprocessed", "/processed"))

    def load_files(self) -> None:
        """Process every ``*.json`` file under ``<read_dir>/unprocessed``."""
        for read_dir in self.read_dirs:
            print(f"Processing read dir: {read_dir}")
            files_raw = self.dbutils.fs.ls(f"{read_dir}/unprocessed")
            # Non-JSON entries in the listing are ignored.
            files = [file.path for file in files_raw if file.name.endswith(".json")]
            for file_path in files:
                self.process_json_file(file_path)

In [None]:
# Ingest the router and switch dumps into one parquet dataset.
# `spark` and `dbutils` are not defined in this file; presumably they are the
# globals provided by the notebook runtime.
Json_Data_Loader(
    spark=spark,
    dbutils=dbutils,
    read_dirs=[
        "mnt/edl/raw/nmis_dc_logs/routers",
        "mnt/edl/raw/nmis_dc_logs/switches",
    ],
    target="mnt/edl/raw/nmis_dc_logs/device_events.parquet",
).load_files()