In [0]:
# Notebook parameters
#
# Every key of `params` is exposed as a Databricks text widget with the same
# name; after the cell runs, `params` holds the values entered in the widgets.

params = {"proj_dir": ""}

# Register one text widget per parameter (empty default value, empty label).
for widget_name in params:
    dbutils.widgets.text(widget_name, "", "")

# Read each widget back into `params` and echo it for the run log.
for widget_name in list(params):
    widget_value = dbutils.widgets.get(widget_name)
    params[widget_name] = widget_value
    print(widget_name, ":", widget_value)

In [0]:
from azure.storage.fileshare import ShareServiceClient
import os
import tqdm
from pathlib import Path
from pyspark.sql import functions as F
from pyspark.sql import types as T
import asyncio
from delta.tables import *
import time
import pydicom
from functools import lru_cache
from math import ceil

In [0]:
# Root of the Sectra export volume. The trailing slash is kept on purpose:
# the per-file destination paths later in this notebook are built by plain
# string concatenation starting with this prefix.
sectra_dir = "/Volumes/1_inland/sectra/vone/"

# Directory of the project selected through the `proj_dir` widget.
# The trailing slash of `sectra_dir` is stripped before joining so the result
# does not contain a doubled "//" and has the same shape as the file paths
# written to the tracking table.
proj_dir_path = f"{sectra_dir.rstrip('/')}/{params['proj_dir']}"


In [0]:
# Read DICOMDIR
# The DICOMDIR of the selected project is parsed once; its directory records
# are then registered in 1_inland.sectra.pacs_file_process in fixed-size batches.
dicomdir = pydicom.dcmread(f"{proj_dir_path}/DICOMDIR")

# Resolve the record sequence once instead of on every loop iteration.
directory_records = dicomdir.DirectoryRecordSequence

seq_len = len(directory_records)
batch_size = 5000
max_iter = ceil(seq_len / batch_size)

# Define schema for loading DICOMDIR
schema = T.StructType([
    T.StructField("subdirs", T.StringType(), True),
    T.StructField("filename", T.StringType(), True),
])

print("Start loop")

# Use batch processing to avoid "maximum recursion depth exceeded" error
for iter_ind in tqdm.tqdm(range(max_iter)):
    i = iter_ind * batch_size
    j = min(i + batch_size, seq_len)

    data = []

    for x in directory_records[i:j]:

        # Retrieve ReferencedFileID which contains dcm file path info if exists.
        # Only a missing element means "no file for this record"; any other
        # error is allowed to surface instead of being silently swallowed.
        try:
            refFileID = x["ReferencedFileID"].value
        except KeyError:
            continue

        # The indexing below needs a multi-valued ID with at least four
        # components. A single-valued ID comes back as a plain string, which
        # would be indexed character by character and yield garbage paths,
        # so fail loudly with context instead.
        if refFileID is None or isinstance(refFileID, str) or len(refFileID) < 4:
            raise ValueError(
                f"Unexpected ReferencedFileID {refFileID!r} in DICOMDIR of "
                f"{params['proj_dir']!r}: expected at least 4 path components"
            )

        # Components 1-2 form the sub-directory, component 3 is the file name.
        data.append((f"{refFileID[1]}/{refFileID[2]}", refFileID[3]))

    # Nothing to insert for a batch made only of records without a file.
    if not data:
        continue

    # Add source subdirs and dcm files
    df = spark.createDataFrame(
        data=data,
        schema=schema
    )

    # Carry the project directory as a column so the widget value is never
    # spliced into the SQL text (a quote in it would break or alter the query).
    df = df.withColumn("proj_dir", F.lit(params["proj_dir"]))

    # Add destination file paths
    df = df.withColumn(
        "filepath",
        F.concat(
            F.lit(sectra_dir),
            F.col("proj_dir"),
            F.lit("/"),
            F.col("subdirs"),
            F.lit("/"),
            F.col("filename"),
        ),
    )

    df.createOrReplaceTempView("temp_new_files")

    # Insert dcm file paths to table
    # NOTE(review): this is a plain INSERT; re-running the cell for the same
    # project appends the same paths again unless the table or a downstream
    # step deduplicates - confirm.
    spark.sql("""
    INSERT INTO 1_inland.sectra.pacs_file_process
    (file_path, proj_dir, subdirs, filename,
    added_at, file_status, file_status_updt_dt,
    priority, active_ind
    )
    SELECT
    filepath, proj_dir, subdirs, filename,
    current_timestamp(), 'new', current_timestamp(),
    0, 1
    FROM temp_new_files
    """)

