In [0]:
# Notebook parameters: every key below is exposed as a Databricks text widget,
# and its placeholder value is replaced by whatever the widget currently holds.
params = dict(src_proj_dir="")

# Register one text widget per parameter (empty default value, empty label).
for widget_name in params:
    dbutils.widgets.text(widget_name, "", "")

# Read each widget back into `params` and echo the resolved value.
for widget_name in list(params):
    widget_value = dbutils.widgets.get(widget_name)
    params[widget_name] = widget_value
    print(widget_name, ":", widget_value)

In [0]:
from azure.storage.fileshare import ShareServiceClient
import os
import pydicom
from io import BytesIO
from pydicom.fileset import FileSet
import pandas as pd
import tqdm
from pyspark.sql import functions as F
from math import ceil
from pyspark.sql import types as T


In [0]:
# Storage-account credentials come from the Databricks secret scope so that
# they are never written into the notebook itself.
acc_name = dbutils.secrets.get(scope="adc_store", key="pacs_intfileshare_accname")
acc_key = dbutils.secrets.get(scope="adc_store", key="pacs_intfileshare_acckey")

# Connection string
connection_string = f"DefaultEndpointsProtocol=https;AccountName={acc_name};AccountKey={acc_key};EndpointSuffix=core.windows.net"

# File share name
share_name = "intfileshare"

# Get a share client via connection string
share_client = ShareServiceClient.from_connection_string(connection_string).get_share_client(share_name)

# Source folder in the file share
src_root = "sectra"

# Project dir (supplied through the "src_proj_dir" notebook widget)
src_proj_dir = params["src_proj_dir"]

# The widget defaults to an empty string. Without a project directory the
# paths below collapse to the share root ("sectra/") and to the bare
# destination volume, so stop here instead of registering the whole share.
if not src_proj_dir.strip():
    raise ValueError(
        "Notebook parameter 'src_proj_dir' is empty; set it to the project "
        "directory under the source root before running."
    )

# Parent dir at destination; spaces in the project name are replaced with
# underscores for the destination path only (the source path keeps them).
dst_parent_dir = f"/Volumes/1_inland/sectra/vone/{src_proj_dir.replace(' ', '_')}"

In [0]:

# Add DICOMDIR in subdirectories to the table.
# Every entry directly under the project directory, except the project-level
# DICOMDIR file itself, is treated as a sub-directory name.
# NOTE(review): other loose files at this level would be registered too —
# confirm none are expected.
items = list(share_client.list_directories_and_files(f"{src_root}/{src_proj_dir}"))
item_names = [x['name'] for x in items if x['name'] != 'DICOMDIR']
schema = T.StructType([T.StructField("src_subdirs", T.StringType(), True)])

# With an explicit StructType schema each record must itself be row-like
# (tuple/list/dict/Row); Spark only wraps bare scalars into single-field rows
# when the schema is NOT a StructType. Wrap every name in a 1-tuple.
subdir_rows = [(name,) for name in item_names]

df = spark.createDataFrame(data=subdir_rows, schema=schema)
df.createOrReplaceTempView("temp_new_files")

# Insert DICOMDIR at subdir level to table: one row per sub-directory, with
# all statuses set to 'new' and all try counters set to 0.
q = f"""
    INSERT INTO 1_inland.sectra.pacs_file_copy (
        src_root, src_proj_dir, src_subdirs, src_filename,
        dst_filepath, active_ind, copy_status, added_at, num_copy_tries, 
        process_status, num_process_tries, src_delete_status, num_delete_tries
    )
    SELECT
        '{src_root}', '{src_proj_dir}', src_subdirs, 'DICOMDIR',
        CONCAT('{dst_parent_dir}/',src_subdirs,'/DICOMDIR'), 1, 'new', CURRENT_TIMESTAMP(), 0,
        'new', 0, 'new', 0
    FROM temp_new_files
"""
spark.sql(q)

In [0]:
# Fetch the project-level DICOMDIR from the file share into memory.
dicomdir_src_path = f"{src_root}/{src_proj_dir}/DICOMDIR"
file_client = share_client.get_file_client(dicomdir_src_path)
downloader = file_client.download_file()
file_bytes = downloader.readall()

# Make sure the destination folder exists before writing into it.
os.makedirs(dst_parent_dir, exist_ok=True)

# Persist the downloaded bytes as DICOMDIR under the destination folder.
with open(f"{dst_parent_dir}/DICOMDIR", "wb") as dicomdir_out:
    dicomdir_out.write(file_bytes)

In [0]:

# Register the project-level DICOMDIR in the tracking table. Unlike the other
# rows inserted by this notebook, this one is written with copy_status 'done',
# a copy timestamp and one copy try; the process and delete statuses are
# 'pending'.
root_dicomdir_insert = f"""
    INSERT INTO 1_inland.sectra.pacs_file_copy (
        src_root, src_proj_dir, src_subdirs, src_filename,
        dst_filepath, active_ind, copy_status, added_at, last_copy_run_at, num_copy_tries,
        process_status, num_process_tries, src_delete_status, num_delete_tries
    )
    VALUES (
        '{src_root}', '{src_proj_dir}', '.', 'DICOMDIR',
        '{dst_parent_dir}/DICOMDIR', 1, 'done', CURRENT_TIMESTAMP(), CURRENT_TIMESTAMP(), 1,
        'pending', 0, 'pending', 0
    );          
"""
spark.sql(root_dicomdir_insert)

In [0]:

# Schema for the rows extracted from DICOMDIR: two nullable string columns,
# the source sub-directory path and the source file name.
schema = T.StructType(
    [
        T.StructField(column_name, T.StringType(), True)
        for column_name in ("src_subdirs", "src_file")
    ]
)



In [0]:
# Read the project-level DICOMDIR written to the destination volume and
# register every file it references in the copy-tracking table.
dicomdir = pydicom.dcmread(f"{dst_parent_dir}/DICOMDIR")

records = dicomdir.DirectoryRecordSequence
seq_len = len(records)
batch_size = 5000
max_iter = ceil(seq_len/batch_size)

# Use batch processing to avoid "maximum recursion depth exceeded" error
for iter_ind in tqdm.tqdm(range(max_iter)):
    i = iter_ind*batch_size
    j = min(i+batch_size, seq_len)

    data = []

    for record in records[i:j]:

        # Only records carrying a ReferencedFileID point at a dcm file; skip
        # the rest. Catch KeyError only, so that genuine failures (and
        # KeyboardInterrupt) are no longer silently swallowed.
        try:
            ref_file_id = record["ReferencedFileID"].value
        except KeyError:
            continue

        # Assumes ReferencedFileID has at least four components, with [1]/[2]
        # forming the sub-directory and [3] the file name (component [0] is
        # not used) — TODO confirm; a shorter value raises IndexError here.
        data.append((f"{ref_file_id[1]}/{ref_file_id[2]}", ref_file_id[3]))

    # Nothing to register for this batch: skip the Spark round-trip.
    if not data:
        continue

    # Add source subdirs and dcm files
    df = spark.createDataFrame(
        data=data,
        schema=schema
    )

    # Add destination file paths
    df = df.withColumn("dst_filepath", F.concat(F.lit(dst_parent_dir), F.lit("/"), F.col("src_subdirs"), F.lit("/"), F.col("src_file")))

    df.createOrReplaceTempView("temp_new_files")

    # Insert dcm file paths to table
    spark.sql(f"""
    INSERT INTO 1_inland.sectra.pacs_file_copy
    (src_root, src_proj_dir, src_subdirs, src_filename,
    dst_filepath, active_ind, copy_status, added_at, num_copy_tries,
    process_status, num_process_tries, src_delete_status, num_delete_tries)
    SELECT
    '{src_root}','{src_proj_dir}',src_subdirs, src_file,
    dst_filepath, 1, 'new', current_timestamp(), 0,
    'new', 0, 'new', 0
    FROM temp_new_files
    """)