In [0]:
# Importing the required modules and functions
from pyspark.sql.functions import col, current_date, current_timestamp
from delta.tables import DeltaTable

In [0]:
class SparkUtils:
    """Helpers for loading bronze-layer parquet data into a Delta archive table.

    Paths are passed without a scheme; every method prefixes them with
    ``abfss://`` before handing them to Spark / Delta.
    """

    # Method to read the parquet files from bronze staging layer & return the pyspark dataframe
    def read_parquet_file(self, spark, file_path: str, columns=None):
        """Read a parquet location and project it onto the requested columns.

        Args:
            spark: Spark session used for the read (``spark.read.parquet``).
            file_path: Location of the parquet data, without the ``abfss://``
                prefix.
            columns: Columns to keep, unpacked into ``DataFrame.select``.
                ``None`` keeps every column; a single string is treated as
                one column name.

        Returns:
            The dataframe restricted to ``columns``.
        """
        frame = spark.read.parquet(f"abfss://{file_path}")

        if columns is None:
            return frame

        # A bare string would otherwise be unpacked character by character,
        # selecting columns named after its individual letters.
        if isinstance(columns, str):
            columns = [columns]

        return frame.select(*columns)

    # Method to add load_date audit column to the pyspark dataframe returned by read_parquet_file method
    def add_audit_column(self, df):
        """Return ``df`` with a ``load_date`` column set to the current timestamp.

        ``load_date`` is the column the archive clean-up predicate filters on.
        """
        return df.withColumn("load_date", current_timestamp())

    @staticmethod
    def _archive_delete_condition(retention_days: int) -> str:
        """Build the SQL predicate selecting archive rows to delete.

        Matches rows whose ``load_date`` is more than ``retention_days`` days
        before today, plus rows whose ``load_date`` is today.

        Raises:
            TypeError: If ``retention_days`` is not an ``int``.
            ValueError: If ``retention_days`` is negative.
        """
        # The value is interpolated into SQL text, so accept nothing but a
        # plain non-negative integer (bool is an int subclass; reject it too).
        if isinstance(retention_days, bool) or not isinstance(retention_days, int):
            raise TypeError("retention_days must be an int")
        if retention_days < 0:
            raise ValueError("retention_days must be non-negative")

        return (
            f"CAST(load_date AS DATE) < date_sub(current_date(), {retention_days}) "
            "OR CAST(load_date AS DATE) = current_date()"
        )

    # Method to delete expired and current day records if any, with the date functions
    def del_archive_records(self, spark, basePath: str, retention_days: int = 365):
        """Delete expired rows and today's rows from the archive Delta table.

        Args:
            spark: Spark session used to open the Delta table.
            basePath: Location of the Delta table, without the ``abfss://``
                prefix.
            retention_days: Rows with a ``load_date`` more than this many days
                before the current date are deleted. Defaults to 365.

        Raises:
            TypeError: If ``retention_days`` is not an ``int``.
            ValueError: If ``retention_days`` is negative.
        """
        # Validate the argument before touching storage.
        condition = self._archive_delete_condition(retention_days)

        # Read the archive delta table
        archive_table = DeltaTable.forPath(spark, f"abfss://{basePath}")

        # Deleting the records
        archive_table.delete(condition)

    # Method to append the Delta archive table with the bronze pyspark dataframe
    def load_archive_table(self, df, basePath: str) -> bool:
        """Append ``df`` to the Delta table stored at ``basePath``.

        Args:
            df: Dataframe to append.
            basePath: Location of the Delta table, without the ``abfss://``
                prefix.

        Returns:
            ``True`` once the write call has returned; a failed write
            surfaces as whatever exception the writer raises.
        """
        # Writing data to the table in the specified basepath
        df.write.format("delta").mode("append").save(f"abfss://{basePath}")

        return True