In [0]:
import asyncio
import logging
import os
from pathlib import Path

import tqdm
from azure.storage.fileshare import ShareServiceClient
from pyspark.sql import functions as F

In [0]:
# Batch size: used as the SQL LIMIT on the number of pending rows fetched (and copied) per loop iteration.
MAX_CONCURRENT_COPIES = 2
# A previously attempted row is only picked up again once more than this many hours have passed since its last_run_at.
RETRY_INTERVAL_HOURS = 12
# Upper bound on the number of fetch/copy/merge iterations performed by the copy loop in one run.
MAX_ITERATIONS = 2

In [0]:
def getSrcFileClient(src_path):
    """Return a file client for ``src_path`` on the "intfileshare" file share.

    The storage account name and key are read from the "adc_store" secret
    scope via ``dbutils.secrets`` and combined into a connection string, which
    is used to build the service client -> share client -> file client chain.

    Args:
        src_path: Path of the file inside the share.

    Returns:
        The object returned by ``share_client.get_file_client(src_path)``.
    """
    secret_scope = "adc_store"
    acc_name = dbutils.secrets.get(scope=secret_scope, key="pacs_intfileshare_accname")
    acc_key = dbutils.secrets.get(scope=secret_scope, key="pacs_intfileshare_acckey")

    # Connection string assembled from the two secrets.
    connection_string = f"DefaultEndpointsProtocol=https;AccountName={acc_name};AccountKey={acc_key};EndpointSuffix=core.windows.net"

    # Service client -> share client -> file client, in a single chain.
    return (
        ShareServiceClient.from_connection_string(connection_string)
        .get_share_client("intfileshare")
        .get_file_client(src_path)
    )

In [0]:

def _downloadAndWrite(file_client, dst_filepath):
    """Blocking helper: download the whole file and write it to ``dst_filepath``."""
    # Download fully before touching the destination so a failed download
    # does not leave a partial file behind.
    file_bytes = file_client.download_file().readall()

    # Make parent directory if not exist
    Path(dst_filepath).parent.absolute().mkdir(parents=True, exist_ok=True)

    # Write to Databricks
    with open(dst_filepath, "wb") as f:
        f.write(file_bytes)


async def copySrcFileToDst(row):
    """Copy one source file described by ``row`` to ``row["dst_filepath"]``.

    The source path is built from ``src_root``, ``src_proj_dir``,
    ``src_subdirs`` and ``src_filename``. The blocking download and write run
    in the event loop's default executor, so several of these coroutines
    passed to ``asyncio.gather`` actually overlap instead of running one
    after another.

    Args:
        row: Mapping-like record providing ``item_id``, the four ``src_*``
            path components and ``dst_filepath``.

    Returns:
        ``(item_id, 'done')`` on success, ``(item_id, 'failed')`` if any step
        raised. Failures are logged with their traceback rather than being
        silently discarded.
    """
    try:
        # Built inside the try: a NULL/invalid path component must mark this
        # row as failed, not abort the whole batch.
        src_path = os.path.join(row["src_root"], row["src_proj_dir"], row["src_subdirs"], row["src_filename"])

        # The client is created on the calling thread; only the download and
        # the disk write are handed to a worker thread.
        file_client = getSrcFileClient(src_path)
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, _downloadAndWrite, file_client, row["dst_filepath"])
    except Exception:
        logging.getLogger(__name__).exception("File copy failed for item_id=%s", row["item_id"])
        return (row["item_id"], 'failed')

    return (row["item_id"], 'done')


In [0]:

for _ in tqdm.tqdm(range(MAX_ITERATIONS)):
    df = spark.sql(f"""
        SELECT *
        FROM 1_inland.sectra.pacs_file_copy
        WHERE 
            active_ind = 1 
            AND LOWER(status) != 'done' 
            AND (
                TIMEDIFF(HOUR, last_run_at, CURRENT_TIMESTAMP()) > {RETRY_INTERVAL_HOURS}
                OR last_run_at IS NULL)
        LIMIT {MAX_CONCURRENT_COPIES}     
    """)

    if df.count() > 0:
        pass
    else:
        break

    coros = [copySrcFileToDst(row) for row in df.collect()]
    results = await asyncio.gather(*coros)
    result_df = spark.createDataFrame(data=results, schema=["item_id", "status"])
    result_df.createOrReplaceTempView("temp_file_copy_results")

    spark.sql("""
        MERGE INTO 1_inland.sectra.pacs_file_copy AS t USING temp_file_copy_results AS s
        ON t.item_id = s.item_id
        WHEN MATCHED THEN UPDATE SET 
        t.status = s.status,
        last_run_at = CURRENT_TIMESTAMP()
    """)

In [0]:
%sql
-- List the tracking rows whose copy has completed.
-- The status is compared case-insensitively so this check agrees with the
-- copy loop, which selects its work with LOWER(status) != 'done'.
SELECT *
FROM 1_inland.sectra.pacs_file_copy
WHERE LOWER(status) = 'done'