# In [None]:
from dataclasses import dataclass, field
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
from datetime import datetime


def create_apic_devices(df: DataFrame) -> DataFrame:
    """Project the raw APIC frame onto the typed column set for ``apic_data``.

    ``date_time`` is selected as-is, the remaining columns are cast to string
    or float, and a literal ``load_time`` column holding the current UTC date
    (formatted ``YYYY-MM-DD``) is appended.

    Args:
        df: Raw DataFrame exposing every column named below.

    Returns:
        The DataFrame produced by ``df.select`` over those columns plus
        ``load_time``.
    """
    # Local import: the file-level import only brings in ``datetime`` itself.
    from datetime import timezone

    # ``datetime.utcnow()`` is deprecated since Python 3.12; an aware UTC
    # "now" formats to exactly the same date string.
    load_time = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Column order below matches the order of the selected output columns.
    string_columns = (
        "environment",
        "location",
        "node_name",
        "tcam_current_util",
        "tcam_max_capacity",
        "total_tcam_percentage",
        "cpu_kernel",
        "cpu_user",
    )
    float_columns = (
        "used_avg",
        "total_avg",
        "total_capacity_percentage",
        "lpm_current_util",
        "lpm_max_capacity",
    )

    return df.select(
        df.date_time,
        *(getattr(df, name).cast(StringType()) for name in string_columns),
        *(getattr(df, name).cast(FloatType()) for name in float_columns),
        lit(load_time).alias("load_time"),
    )


@dataclass
class Table:
    """Describe one output table and the function that builds its DataFrame."""

    # Unqualified table name.
    name: str
    # Schema identifier string, e.g. "...enhance.apic_data@1.0.1".
    schema_env: str
    # Callable that turns the raw DataFrame into this table's DataFrame.
    df_builder: Callable[[DataFrame], DataFrame]
    # Built DataFrame; stays None until a caller assigns the builder's result.
    # Written as a string annotation so it is not evaluated at class creation.
    df: "DataFrame | None" = None


def tables():
    """Return a new list holding the ``Table`` definitions for this job.

    A fresh list (and fresh ``Table`` instances) is created on every call.
    """
    apic_data = Table(
        name="apic_data",
        schema_env="com.deere.enterprise.datalake.enhance.apic_data@1.0.1",
        df_builder=create_apic_devices,
    )
    return [apic_data]


@dataclass
class APIC_Enhancer:
    """Read raw APIC parquet data, build each configured table and save it.

    ``enhance()`` is the entry point: it ensures the database exists, reads
    ``raw_table``, runs every table's ``df_builder`` and then writes each
    result with ``write_data_to_table``.
    """

    # Spark session used for every read, write and SQL statement.
    spark: SparkSession
    # Path handed to ``spark.read.parquet``.
    raw_table: str
    # Base directory for table data; None skips the explicit "path" option.
    target_dir: "str | None"
    # Tables to build and write; defaults to a fresh list from ``tables()``.
    tables: list[Table] = field(default_factory=tables)
    # Database that receives the tables.
    db_name: str = "edl_stage"
    # Value written to the 'edl_datatype' table property.
    dataset_env: str = (
        "com.deere.enterprise.datalake.enhance.iit_apic_enhanced"  # Change
    )

    def create_db_if_not_exists(self) -> None:
        """Issue ``CREATE DATABASE IF NOT EXISTS`` for ``db_name``."""
        self.spark.sql(f"CREATE DATABASE IF NOT EXISTS {self.db_name}")

    def write_data_to_table(self, table: Table) -> None:
        """Save ``table.df`` as ``<db_name>.<table.name>`` and set its properties.

        The data is written as parquet with ``mode="overwrite"``; afterwards
        the ``edl_*`` table properties are set through ``ALTER TABLE``.

        Args:
            table: Table whose ``df`` has already been built.
        """
        qualified_name = f"{self.db_name}.{table.name}"

        # NOTE(review): the message says "Appending" but the write below uses
        # mode="overwrite" -- confirm which of the two is intended.
        print(
            f"Appending {table.df.count()} records to table {qualified_name}"
        )
        writer = table.df.write

        # Allow opt out of a path when running locally or for tests.
        if self.target_dir is not None:
            writer = writer.option("path", f"{self.target_dir}/{table.name}")

        writer.saveAsTable(qualified_name, format="parquet", mode="overwrite")
        self.spark.sql(
            f"""
            ALTER TABLE {qualified_name}
            SET TBLPROPERTIES (
                'edl_datatype' = '{self.dataset_env}',
                'edl_representation' = '{table.schema_env}',
                'edl_state' = 'edl_ready',
                'edl_fullLoad' = 'True'
            )
            """
        )

    def enhance(self) -> None:
        """Run the whole job: create the database, read, build and write."""
        self.create_db_if_not_exists()

        raw_df = self.spark.read.parquet(self.raw_table)

        print(f"Found {raw_df.count()} new record to process...")

        # All builders run before the first write; an exception raised by a
        # builder therefore stops the run before any table is overwritten.
        for table in self.tables:
            table.df = table.df_builder(raw_df)

        for table in self.tables:
            self.write_data_to_table(table)

# In [None]:
# Run the job: read the raw TCAM parquet file and write the resulting tables
# under the sandbox staging directory.
# `spark` is not defined in this file; presumably it is the session object
# supplied by the notebook runtime -- confirm before running elsewhere.
APIC_Enhancer(
    spark=spark,
    raw_table="/mnt/edl/raw/iit_apic_raw/TCAM.parquet",
    target_dir="/mnt/sandbox/AWS-EDL-INFRA-INTEL-DATA/enhance/staging/apic_enhanced",
).enhance()