In [0]:
# Notebook parameters: every key below is registered as a dbutils text widget
# and the widget's current value is then read back into `params`.

params = dict.fromkeys(("proj_dir", "adf_start_time", "adf_pipeline_id"), "")

# Register one text widget per parameter, passing empty strings for the
# widget's other two arguments.
for widget_name in params:
    dbutils.widgets.text(widget_name, "", "")

# Read each widget value into `params` and echo it as "<name> : <value>".
for widget_name in list(params):
    widget_value = dbutils.widgets.get(widget_name)
    params[widget_name] = widget_value
    print(widget_name, ":", widget_value)

In [0]:
import os
import time
from datetime import datetime
from tqdm import tqdm
import pydicom
import asyncio
from functools import lru_cache
from pyspark.sql import types as T


In [0]:
# Base directory; the project folder below it is chosen by the "proj_dir"
# notebook parameter.
ROOT_DIR = "/Volumes/1_inland/sectra/vone"
proj_dir_path = os.path.join(ROOT_DIR, params["proj_dir"])

# Only the first 26 characters of "adf_start_time" are parsed: the date/time
# part plus at most six fractional-second digits, the most that %f accepts.
start_time = datetime.strptime(
    params["adf_start_time"][:26],
    "%Y-%m-%dT%H:%M:%S.%f",
)

In [0]:
@lru_cache(maxsize=128, typed=False)
def retrievePersonId(accession_nbr):
    """Look up the MillPersonId that belongs to an accession number.

    Reads ``4_prod.pacs.all_pacs_ref_nbr``, keeps the rows whose ``refnbr``
    equals ``accession_nbr`` and takes the maximum ``MillPersonId``.

    The accession number is handed to Spark as a comparison value rather than
    being formatted into SQL text, so quotes or backslashes inside it cannot
    alter the query.

    Results are memoized by ``lru_cache`` (up to 128 distinct arguments); this
    includes the ``'unknown'`` fallback returned after a failed lookup.

    Args:
        accession_nbr: Accession number to look up, or ``None``. It must be
            hashable because it is used as the cache key.

    Returns:
        str: The person id converted with ``str``, or ``'unknown'`` when
        ``accession_nbr`` is ``None``, when no row matches, or when the
        lookup raises.
    """
    if accession_nbr is None:
        return 'unknown'

    try:
        # `spark` is the session object provided by the notebook runtime.
        ref_table = spark.table("4_prod.pacs.all_pacs_ref_nbr")
        person_id = (
            ref_table
            # str() mirrors how the value used to be rendered into the query.
            .where(ref_table["refnbr"] == str(accession_nbr))
            .agg({"MillPersonId": "max"})
            # A global aggregate yields one row; its only column is the max
            # (None when nothing matched).
            .first()[0]
        )
    except Exception:
        # Best-effort lookup: any failure maps to 'unknown'. Catching
        # Exception (not everything) lets KeyboardInterrupt / SystemExit
        # propagate so a running cell can still be cancelled.
        person_id = 'unknown'

    if person_id is None:
        person_id = 'unknown'

    return str(person_id)

In [0]:
async def async_getmtime(filepath):
    """Coroutine variant of the modification-time lookup.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        tuple: ``(filepath, modified_at)`` where ``modified_at`` is the file's
        modification timestamp converted with ``datetime.fromtimestamp``.

    Note:
        Nothing is awaited inside; the file-system call is an ordinary
        blocking call.
    """
    modified_ts = os.path.getmtime(filepath)
    modified_at = datetime.fromtimestamp(modified_ts)
    return (filepath, modified_at)

def getmtime(filepath):
    """Return a file's path together with its modification time.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        tuple: ``(filepath, modified_at)`` where ``modified_at`` is the file's
        modification timestamp converted with ``datetime.fromtimestamp``.
    """
    modified_ts = os.path.getmtime(filepath)
    modified_at = datetime.fromtimestamp(modified_ts)
    return (filepath, modified_at)


def _update_dicomdir_pid(filepath):
    """Resolve the person id for a DICOMDIR file.

    Reads the file with pydicom, takes the ``AccessionNumber`` of the first
    directory record that has one, and looks the person id up with
    ``retrievePersonId``. The file is only read: the code that would write the
    id back is commented out.

    Args:
        filepath: Path of the DICOMDIR file.

    Returns:
        tuple: ``(filepath, nbr, pid)`` where ``nbr`` is the accession number
        found (``None`` if no record carries one) and ``pid`` is the string
        returned by ``retrievePersonId``.
    """
    dcm = pydicom.dcmread(filepath)

    # Start from None so that an empty record sequence, or one in which no
    # record has an AccessionNumber, no longer leaves `nbr` unbound below.
    # retrievePersonId maps None to 'unknown', matching _update_dcm_pid.
    nbr = None
    for record in dcm.DirectoryRecordSequence:
        try:
            nbr = record["AccessionNumber"].value
            break
        except Exception:
            # This record has no usable AccessionNumber; try the next one.
            continue

    pid = retrievePersonId(nbr)

    # Write-back of the resolved id is currently disabled:
    # for record in dcm.DirectoryRecordSequence:
    #     try:
    #         record["PatientID"].value = pid
    #     except Exception:
    #         continue
    #
    # dcm.save_as(filepath)

    return (filepath, nbr, pid)


def _update_dcm_pid(filepath):
    """Resolve the person id for a single DICOM file.

    Reads the file with pydicom, takes its ``AccessionNumber`` and looks the
    person id up with ``retrievePersonId``. The file is only read: the code
    that would write the id back is commented out.

    Args:
        filepath: Path of the DICOM file.

    Returns:
        tuple: ``(filepath, nbr, pid)`` where ``nbr`` is the accession number
        (``None`` if it could not be read) and ``pid`` is the string returned
        by ``retrievePersonId``.
    """
    dcm = pydicom.dcmread(filepath)

    try:
        nbr = dcm["AccessionNumber"].value
    except Exception:
        # Best effort: a missing/unreadable AccessionNumber becomes None.
        # Exception (rather than a bare except) keeps KeyboardInterrupt and
        # SystemExit propagating so a long batch can still be cancelled.
        nbr = None

    pid = retrievePersonId(nbr)

    # Write-back of the resolved id is currently disabled:
    # dcm["PatientID"].value = pid
    # dcm.save_as(filepath)

    return (filepath, nbr, pid)



async def update_infile_pid(filepath):
    """Dispatch a file to the DICOMDIR or the plain-DICOM handler.

    The last "/"-separated segment of ``filepath`` is compared
    case-insensitively with "DICOMDIR"; a match goes to
    ``_update_dicomdir_pid``, anything else to ``_update_dcm_pid``.

    Args:
        filepath: Path string of the file to process.

    Returns:
        Whatever the selected handler returns.

    Note:
        Nothing is awaited inside; the handler is called synchronously.
    """
    last_segment = filepath.rsplit("/", 1)[-1]
    if last_segment.upper() != "DICOMDIR":
        return _update_dcm_pid(filepath)
    return _update_dicomdir_pid(filepath)
    


In [0]:
# Define schema for loading DICOMDIR
schema = T.StructType([
        T.StructField("filepath", T.StringType(), True),
        T.StructField("accession_nbr", T.StringType(), True),
        T.StructField("pid", T.StringType(), True)
])

min_batch_size = 400
file_queue = []
for root, subdirs, files in tqdm(os.walk(os.path.join(ROOT_DIR, params["proj_dir"]))):
    #coros = [async_getmtime(os.path.join(root, filename)) for filename in files]
    #results = await asyncio.gather(*coros)
    filepaths = [os.path.join(root, filename) for filename in files]
    ctimes = [datetime.fromtimestamp(os.path.getctime(fp)) for fp in filepaths]
    newfile_flags = [mt >= start_time for mt in mtimes]
    file_queue += [fp for i, fp in enumerate(filepaths) if newfile_flags[i]]

    if len(file_queue) >= min_batch_size:
        coros = [update_infile_pid(fp) for fp in file_queue]
        results = await asyncio.gather(*coros)
        file_queue = []
        break



In [0]:
# Display the value last assigned to `results` by the asyncio.gather call in
# the scan cell.
results

In [0]:
# Read DICOMDIR
# NOTE(review): `proj_dir` is not defined in the cells visible here (only
# `proj_dir_path` and params["proj_dir"] are) — confirm which path is meant.
dicomdir = pydicom.dcmread(f"{proj_dir}/DICOMDIR")


seq_len = len(dicomdir.DirectoryRecordSequence)
batch_size = 100
# NOTE(review): `ceil` is not imported in the visible import cell —
# presumably math.ceil; confirm.
max_iter = ceil(seq_len/batch_size)


# Define schema for loading DICOMDIR
# NOTE(review): this rebinds `schema`; the third field is named "mill_pid"
# here, whereas the scan cell names it "pid".
schema = T.StructType([
        T.StructField("filepath", T.StringType(), True),
        T.StructField("accession_nbr", T.StringType(), True),
        T.StructField("mill_pid", T.StringType(), True)
])

print("Start loop")

# Use batch processing to avoid "maximum recursion depth exceeded" error
for iter_ind in tqdm(range(max_iter)):
    # [i, j) is the slice of directory records covered by this iteration.
    i = iter_ind*batch_size
    j = min(i+batch_size, seq_len)

    # Create an empty data frame
    df = spark.createDataFrame(data = [], schema = schema)

    data = []

    # NOTE(review): the loop below has no body in this cell, so the cell does
    # not parse as it stands — it needs to be completed or removed.
    for x in dicomdir.DirectoryRecordSequence[i:j]:


In [0]:
# Read the DICOMDIR of each numbered sub-folder and collect every
# AccessionNumber found in its directory records.

# Number of zero-padded sub-folders (000000 ... 000288) that are probed.
N_SUBDIRS = 289

accession_nbrs = []

for i in range(N_SUBDIRS):
    # Built outside the try so a misconfigured path variable fails loudly
    # instead of silently yielding an empty result.
    fp = f"{proj_dir_path}/{str(i).zfill(6)}/DICOMDIR"
    try:
        dicomdir = pydicom.dcmread(fp)
    except Exception:
        # No readable DICOMDIR in this sub-folder: skip it. Exception (not a
        # bare except) keeps KeyboardInterrupt working during the scan.
        continue

    for x in dicomdir.DirectoryRecordSequence:
        try:
            accession_nbrs.append(x["AccessionNumber"].value)
        except Exception:
            # Record without a usable AccessionNumber.
            continue

# Build the set once and report the distinct values and their count.
unique_accession_nbrs = set(accession_nbrs)
print(unique_accession_nbrs)
print(len(unique_accession_nbrs))
print(len(set(accession_nbrs)))