## 1. Defining functions to monitor processed files

In [0]:
# Design assumption for the incremental load in this notebook: files are
# tracked by path only, so a file whose path is already logged as processed
# is never read again, even if its contents change.
''' Assuming there are no modifications to existing files & only new files will be added with new data '''

In [0]:
# defining get_processed_files function

def get_processed_files():
    """Return the set of source file paths already logged as processed.

    Reads the distinct ``source_file`` values from the
    ``workspace.bronze.processed_files`` log table.

    Returns:
        set: The logged ``source_file`` values. Empty when the log table
        does not exist yet, i.e. nothing has been processed so far.

    Raises:
        Exception: Any error Spark raises while checking for or reading the
        log table is propagated instead of being swallowed.
    """
    log_table = "workspace.bronze.processed_files"

    # A missing log table is the only situation that really means "no files
    # processed yet". Treating every failure as an empty log would make the
    # caller consider all files new and append their rows a second time.
    if not spark.catalog.tableExists(log_table):
        return set()

    processed = spark.sql(f"""
                SELECT DISTINCT source_file
                FROM {log_table}
            """).collect()
    return {row.source_file for row in processed}

In [0]:
# defining the logger function
from datetime import datetime

def log_processed_file(filename, row_count):
    """Append one audit row for a processed file to the log table.

    Args:
        filename: Value stored in the ``source_file`` column.
        row_count: Value stored in the ``row_count`` column.

    The ``processed_ts`` column is filled with ``datetime.now()`` taken at
    call time. The row is appended to ``workspace.bronze.processed_files``.
    """
    audit_row = (filename, row_count, datetime.now())
    audit_columns = ["source_file", "row_count", "processed_ts"]

    # Single-row DataFrame; column types are left to Spark's inference.
    audit_df = spark.createDataFrame([audit_row], audit_columns)

    audit_df.write.saveAsTable("workspace.bronze.processed_files", mode="append")

## 2. Define file paths and start batch processing

In [0]:
# List the contents of the CRM and ERP source directories on the volume.
source_crm_files = dbutils.fs.ls("/Volumes/workspace/bronze/sources/source_crm/")
source_erp_files = dbutils.fs.ls("/Volumes/workspace/bronze/sources/source_erp/")

# Keep only the entries whose path ends in ".csv" (case-sensitive match).
csv_crm_files = [entry.path for entry in source_crm_files if entry.path.endswith('.csv')]
csv_erp_files = [entry.path for entry in source_erp_files if entry.path.endswith('.csv')]

# Combined work list: CRM paths first, followed by ERP paths.
all_csv_files = csv_crm_files + csv_erp_files

In [0]:
# Paths that the processed-files log already contains.
processed_files = get_processed_files()

# Whatever is on the volume but not yet in the log still has to be ingested.
new_files = [
    path
    for path in all_csv_files
    if path not in processed_files
]

print(f"Total files: {len(all_csv_files)}, New files: {len(new_files)}")

In [0]:
#start processing new files

for file_path in new_files:
    print(f"processing new file: {file_path}")

    # Read the CSV using its first row as column names and let Spark infer
    # the column types; the row count is what gets written to the log.
    df = spark.read.options(header=True, inferSchema=True).csv(file_path)
    row_count = df.count()

    # The target table is named after the file: the last path segment, cut
    # at its first dot (".../cust_info.csv" -> "cust_info").
    table_name = file_path.rsplit('/', 1)[-1].partition('.')[0]
    df.write.saveAsTable(f"workspace.bronze.{table_name}", mode="append")

    # The file is logged only after its rows have been appended.
    # NOTE(review): the append and the log entry are separate writes, so a
    # failure between them leaves the file unlogged and a later run would
    # append its rows again.
    log_processed_file(file_path, row_count)

    print(f"Processed: {file_path} {row_count} rows")