In [None]:
from dataclasses import dataclass
import re
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *


@dataclass
class ServerInfo:
    """Plain record holding a server's name together with its base URL."""

    # Identifier of the server.
    name: str
    # Root URL of the server.
    base_url: str


@dataclass
class JsonDataLoader:
    """Append timestamped JSON dumps to a Parquet dataset, then archive them.

    For every directory in ``read_dirs`` the loader lists
    ``<read_dir>/unprocessed``, reads each ``*.json`` file with the supplied
    schema, appends the rows to ``target`` and finally moves the file into the
    sibling ``processed`` directory.

    The constructor is the one ``@dataclass`` generates from the fields below:
    ``JsonDataLoader(spark, dbutils, read_dirs, target)``.

    Annotations that name pyspark classes are quoted so that defining the
    class does not require those names to be resolvable at definition time.
    """

    # Spark session used to read the JSON files.
    spark: "SparkSession"
    # Utilities handle; the loader calls ``dbutils.fs.ls`` and ``dbutils.fs.mv``.
    dbutils: object
    # Directories that each contain an ``unprocessed`` sub-directory.
    read_dirs: list[str]
    # Parquet path that rows are appended to.
    target: str

    # Un-annotated on purpose: these are class constants, not dataclass fields.
    _FILE_NAME_PATTERN = re.compile(r".*/\w+-(\d+)\.json")
    _TIMESTAMP_FORMAT = "%Y%m%d%H%M%S"
    _UNPROCESSED_SEGMENT = "/unprocessed/"

    @classmethod
    def _parse_file_timestamp(cls, file_path: str) -> datetime:
        """Return the timestamp encoded in a ``.../<name>-<digits>.json`` path.

        Raises:
            ValueError: if the path does not have that shape, or the digits do
                not parse with ``%Y%m%d%H%M%S``.
        """
        match = cls._FILE_NAME_PATTERN.fullmatch(file_path)
        if match is None:
            raise ValueError(
                f"Cannot extract a timestamp from {file_path!r}; expected a "
                "file named '<name>-<YYYYmmddHHMMSS>.json'"
            )
        return datetime.strptime(match.group(1), cls._TIMESTAMP_FORMAT)

    @classmethod
    def _processed_path(cls, file_path: str) -> str:
        """Return the archive destination for a file under ``unprocessed``.

        Only the last ``/unprocessed/`` directory segment is rewritten, so a
        parent directory whose name merely starts with ``unprocessed`` is left
        alone.
        """
        head, segment, tail = file_path.rpartition(cls._UNPROCESSED_SEGMENT)
        if not segment:
            # No exact directory segment: keep the historical substring rewrite.
            return file_path.replace("/unprocessed", "/processed")
        return f"{head}/processed/{tail}"

    def get_file_timestamp(self, file_path: str) -> "Column":
        """Return a timestamp literal column derived from the file name.

        Raises:
            ValueError: if no timestamp can be parsed from ``file_path``.
        """
        timestamp = self._parse_file_timestamp(file_path)
        return to_timestamp(lit(timestamp))

    def read_json(self, file_path: str, schema: "StructType") -> "DataFrame":
        """Read one multi-line JSON file and add the derived columns.

        ``event_timestamp`` is set from the file name and ``id`` from Spark's
        ``uuid()`` expression.
        """
        return (
            self.spark.read.schema(schema)
            .json(file_path, multiLine=True)
            .withColumn("event_timestamp", self.get_file_timestamp(file_path))
            .withColumn("id", expr("uuid()"))
        )

    def process_json_file(self, file_path: str, schema: "StructType") -> None:
        """Load one file into ``target`` and move it to the processed folder."""
        print(f"Processing json file {file_path}")
        df = self.read_json(file_path, schema)
        # count() is a separate action from the write below.
        print(f"Loaded {df.count()} records")
        df.write.mode("append").parquet(self.target)
        # Moved only after the write returns, so a failed write leaves the
        # file in place for the next run.
        self.dbutils.fs.mv(file_path, self._processed_path(file_path))

    def load_files(self, schema: "StructType") -> None:
        """Process every ``*.json`` file found under each read directory."""
        for read_dir in self.read_dirs:
            print(f"Processing read dir: {read_dir}")
            entries = self.dbutils.fs.ls(f"{read_dir}/unprocessed")
            json_paths = [
                entry.path for entry in entries if entry.name.endswith(".json")
            ]
            for file_path in json_paths:
                self.process_json_file(file_path, schema)


# Schema applied when reading device JSON files. Every field is declared with
# nullable=False. Runs of sibling leaf fields are generated from
# (name, type) tuples; tuple order is the column order.
device_schema = StructType(
    [
        StructField("_id", StringType(), False),
        StructField(
            "catchall",
            StructType(
                [
                    StructField(
                        "data",
                        StructType(
                            [
                                StructField(field_name, field_type, False)
                                for field_name, field_type in (
                                    ("sysName", StringType()),
                                    ("sysUpTimeSec", IntegerType()),
                                    ("serialNum", StringType()),
                                    ("sysObjectName", StringType()),
                                    ("intfTotal", StringType()),
                                    ("intfCollect", IntegerType()),
                                    ("snmpdown", StringType()),
                                    ("name", StringType()),
                                    ("nodedown", StringType()),
                                    ("netType", StringType()),
                                    ("region", StringType()),
                                    ("location", StringType()),
                                    ("city", StringType()),
                                    ("state", StringType()),
                                    ("country", StringType()),
                                    ("siteId", StringType()),
                                    ("stratum", StringType()),
                                    ("roleType", StringType()),
                                    ("host", StringType()),
                                    ("nmc_last_seen", StringType()),
                                    ("SNOW_assignmentGroup", StringType()),
                                    ("uuid", StringType()),
                                )
                            ]
                        ),
                        False,
                    ),
                    StructField("server_name", StringType(), False),
                ]
            ),
            False,
        ),
        # latest_data.subconcepts.health.data holds the health metrics.
        StructField(
            "latest_data",
            StructType(
                [
                    StructField(
                        "subconcepts",
                        StructType(
                            [
                                StructField(
                                    "health",
                                    StructType(
                                        [
                                            StructField(
                                                "data",
                                                StructType(
                                                    [
                                                        StructField(
                                                            metric_name,
                                                            StringType(),
                                                            False,
                                                        )
                                                        for metric_name in (
                                                            "cpuHealth",
                                                            "memHealth",
                                                            "intHealth",
                                                            "intfUp",
                                                            "availability",
                                                            "reachability",
                                                        )
                                                    ]
                                                ),
                                                False,
                                            ),
                                        ]
                                    ),
                                    False,
                                ),
                            ]
                        ),
                        False,
                    ),
                ]
            ),
            False,
        ),
        # JsonDataLoader.read_json assigns these two columns via withColumn.
        StructField("event_timestamp", TimestampType(), False),
        StructField("id", StringType(), False),
    ]
)

# Schema applied when reading interface JSON files. Every field is declared
# with nullable=False. Runs of sibling string fields are generated from name
# tuples; tuple order is the column order.
interface_schema = StructType(
    [
        StructField("_id", StringType(), False),
        StructField(
            "catchall",
            StructType([StructField("node_uuid", StringType(), False)]),
            False,
        ),
        StructField(
            "inventory",
            StructType(
                [
                    StructField(
                        "data",
                        StructType(
                            [
                                StructField(field_name, StringType(), False)
                                for field_name in (
                                    "interface",
                                    "ifDescr",
                                    "Description",
                                    "ifSpeed",
                                    "collect",
                                    "ifLastChange",
                                    "ifOperStatus",
                                    "ifType",
                                    "ifIndex",
                                )
                            ]
                        ),
                        False,
                    ),
                ]
            ),
            False,
        ),
        # latest_data.subconcepts.interface.derived_data holds the metrics.
        StructField(
            "latest_data",
            StructType(
                [
                    StructField(
                        "subconcepts",
                        StructType(
                            [
                                StructField(
                                    "interface",
                                    StructType(
                                        [
                                            StructField(
                                                "derived_data",
                                                StructType(
                                                    [
                                                        StructField(
                                                            metric_name,
                                                            StringType(),
                                                            False,
                                                        )
                                                        for metric_name in (
                                                            "availability",
                                                            "inputUtil",
                                                            "outputUtil",
                                                            "totalUtil",
                                                        )
                                                    ]
                                                ),
                                                False,
                                            ),
                                        ]
                                    ),
                                    False,
                                ),
                            ]
                        ),
                        False,
                    ),
                ]
            ),
            False,
        ),
        # JsonDataLoader.read_json assigns these two columns via withColumn.
        StructField("event_timestamp", TimestampType(), False),
        StructField("id", StringType(), False),
    ]
)

In [None]:
# Append pending files from the nmis9 directory to the device events path.
device_loader = JsonDataLoader(
    spark=spark,
    dbutils=dbutils,
    read_dirs=["mnt/edl/raw/nmis_dc_logs/nmis9"],
    target="mnt/edl/raw/nmis_dc_logs/nmis9_device_events.parquet",
)
device_loader.load_files(device_schema)

# Append pending files from the nmis9_interfaces directory to the interface
# events path.
interface_loader = JsonDataLoader(
    spark=spark,
    dbutils=dbutils,
    read_dirs=["mnt/edl/raw/nmis_dc_logs/nmis9_interfaces"],
    target="mnt/edl/raw/nmis_dc_logs/nmis9_interface_events.parquet",
)
interface_loader.load_files(interface_schema)