In [None]:
import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp
from notebookutils import mssparkutils

spark = SparkSession.builder.getOrCreate()

# Source and destination paths.
# NOTE(review): "$STORAGE_ACCOUNT" looks like a placeholder that is presumably
# substituted before the notebook runs -- confirm with the deployment tooling.
storage_account = "$STORAGE_ACCOUNT"
ecr_post_bundle_file_path = f"abfss://bundle-snapshots@{storage_account}.dfs.core.windows.net/post/ecr"
ecr_rerun_file_path = f"abfss://source-data@{storage_account}.dfs.core.windows.net/ecr-rerun"

# Parquet log file: the name embeds the run's start time, and the file is
# written under the "logs" folder of the rerun destination.
timestamp_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
parquet_file_name = f"moved_files_log_{timestamp_str}.parquet"
parquet_file_path = f"{ecr_rerun_file_path}/logs/{parquet_file_name}"

# Schema of the move log (one row per entry listed in the source directory).
moved_files_log_schema = (
    "filename string, source_path string, dest_path string, "
    "timestamp string, file_exists_skip boolean"
)

# Rows are accumulated as plain tuples and turned into a DataFrame once, after
# the loop. Unioning a one-row DataFrame per file makes the logical plan grow
# with every file, which slows planning and can fail on large directories.
moved_files_rows = []

# Get directory contents and move every entry that is not already present at
# the destination.
files = mssparkutils.fs.ls(ecr_post_bundle_file_path)
for file in files:
    src_path = file.path
    dest_path = f"{ecr_rerun_file_path}/{file.name}"

    # Capture the timestamp before moving the file.
    move_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # An entry that already exists at the destination is left where it is and
    # recorded with file_exists_skip = True.
    file_exists = mssparkutils.fs.exists(dest_path)

    if not file_exists:
        mssparkutils.fs.mv(src=src_path, dest=dest_path, create_path=True)

    # Record the outcome for this entry (moved or skipped).
    moved_files_rows.append(
        (file.name, src_path, dest_path, move_timestamp, file_exists)
    )

# Build the log in a single step; an empty source directory yields an empty
# DataFrame with the same schema.
moved_files_log = spark.createDataFrame(
    moved_files_rows, schema=moved_files_log_schema
)

# Add the time at which the log itself is materialized.
moved_files_log = moved_files_log.withColumn("log_timestamp", current_timestamp())

# Write log to parquet.
moved_files_log.write.mode("append").parquet(parquet_file_path)

# Inspect log of moved files.
# NOTE: current_timestamp() is evaluated once per query, so the log_timestamp
# displayed here can differ slightly from the value persisted by the write.
moved_files_log.show()