In [None]:
import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp
from notebookutils import mssparkutils

# Copy every file found under the post-bundle eCR snapshot folder into the
# eCR rerun folder (skipping files already present at the destination) and
# write a parquet log with one row per file describing what happened.
spark = SparkSession.builder.getOrCreate()

# source and destination paths
storage_account = "$STORAGE_ACCOUNT"
ecr_post_bundle_file_path = f"abfss://bundle-snapshots@{storage_account}.dfs.core.windows.net/post/ecr"
# NOTE(review): the double slash before "ecr-rerun" looks unintentional; it is
# kept byte-identical so the destination location does not move — confirm.
ecr_rerun_file_path = f"abfss://source-data@{storage_account}.dfs.core.windows.net//ecr-rerun"

# parquet log file: timestamp, filename, and destination path
timestamp_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
parquet_file_name = f"copied_files_log_{timestamp_str}.parquet"
delta_tables = f"abfss://delta-tables@{storage_account}.dfs.core.windows.net/"
# NOTE(review): delta_tables already ends with "/", so this path also contains
# a double slash; kept as-is for the same reason as above — confirm.
parquet_file_path = f"{delta_tables}/ecr-rerun-logs/{parquet_file_name}"

# Schema of the copy log. It is applied explicitly when the DataFrame is built
# so column names and types do not depend on inference from the row values.
copied_files_log_schema = (
    "filename string, source_path string, dest_path string, "
    "timestamp string, file_exists_skip boolean, success boolean"
)

# One tuple per processed file, in schema order. Rows are collected in plain
# Python and turned into a DataFrame once, instead of creating a one-row
# DataFrame and a union per file (which grows the query plan with every file).
copied_files_rows = []

# Only the listing call is guarded here: a failure to list the source folder
# means there is nothing to copy, and an empty log is still written below.
try:
    files = mssparkutils.fs.ls(ecr_post_bundle_file_path)
except Exception as e:
    print(f"Error retrieving file list: {str(e)}")
    files = []

for file_info in files:
    src_path = file_info.path
    dest_path = f"{ecr_rerun_file_path}/{file_info.name}"

    # capture the timestamp before copying the file
    copy_timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Reset per-file state before anything can fail, so a failure on this file
    # never logs values left over from the previous iteration.
    # file_exists stays None (null in the log) if the existence check itself
    # fails; success is only set once a copy has actually completed.
    file_exists = None
    success = False

    try:
        file_exists = bool(mssparkutils.fs.exists(dest_path))

        # copy the file only if it is not already at the destination;
        # an existing file is skipped and recorded with success=False
        if not file_exists:
            mssparkutils.fs.cp(src=src_path, dest=dest_path)
            success = True
    except Exception as e:
        # keep going with the remaining files; this one is logged as failed
        print(f"Error copying file {file_info.name}: {str(e)}")

    # log the file copy attempt
    copied_files_rows.append(
        (file_info.name, src_path, dest_path, copy_timestamp, file_exists, success)
    )

# dataframe tracking every file that was processed (empty if listing failed)
copied_files_log = spark.createDataFrame(copied_files_rows, schema=copied_files_log_schema)

# add current timestamp
copied_files_log = copied_files_log.withColumn("log_timestamp", current_timestamp())

# write log to parquet
copied_files_log.write.mode("append").parquet(parquet_file_path)

# inspect log of moved files
copied_files_log.show()