In [0]:
# Notebook parameters
#
# Every key of `params` is exposed as a Databricks text widget; the values
# entered in the widgets then overwrite the placeholders defined here.
params = {"proj_dir": ""}

# Register one text widget per parameter (empty default value, empty label).
for name in params:
    dbutils.widgets.text(name, "", "")

# Read the current widget values back into `params` and echo them.
for name in params:
    value = dbutils.widgets.get(name)
    params[name] = value
    print(name, ":", value)

In [0]:
import os
import time
from datetime import datetime
from tqdm import tqdm
import pydicom
import asyncio
from functools import lru_cache
from pyspark.sql import types as T
from pyspark.sql import functions as F
from glob import glob

In [0]:
# Base directory; the project directory is resolved relative to it.
ROOT_DIR = "/Volumes/1_inland/sectra/vone"

# Directory selected through the "proj_dir" notebook parameter.
#
# normpath drops a trailing separator (an empty "proj_dir" would otherwise
# leave one after the join), collapses duplicate separators and resolves ".."
# segments lexically. The glob pattern and the SQL prefix strip further down
# both append "/" to this value, so it must not end with a separator itself;
# otherwise the prefix strip stops matching the globbed paths.
proj_dir_path = os.path.normpath(os.path.join(ROOT_DIR, params["proj_dir"]))

In [0]:
@lru_cache(maxsize=128, typed=False)
def retrievePersonId(accession_nbr):
    """Look up the person id registered for an accession number.

    Reads ``4_prod.pacs.all_pacs_ref_nbr``, keeps the rows whose ``refnbr``
    equals the accession number and takes the maximum ``MillPersonId`` among
    them.

    Args:
        accession_nbr: Accession number to look up. It must be hashable
            because results are memoised with ``lru_cache``. ``None``
            short-circuits to ``'unknown'`` without touching Spark.

    Returns:
        str: The person id converted to a string, or ``'unknown'`` when the
        accession number is ``None``, no row matches, or the lookup fails.

    Note:
        ``lru_cache`` stores whatever is returned, so an ``'unknown'`` caused
        by a failed lookup keeps being served from the cache until
        ``retrievePersonId.cache_clear()`` is called.
    """
    if accession_nbr is None:
        return 'unknown'

    try:
        # Bind the value as a literal through the DataFrame API instead of
        # splicing it into SQL text, so quotes or backslashes in the
        # accession number cannot break or alter the query.
        row = (
            spark.table("4_prod.pacs.all_pacs_ref_nbr")
            .where(F.col("refnbr") == F.lit(str(accession_nbr)))
            .agg(F.max("MillPersonId").alias("MillPersonId"))
            .first()
        )
        person_id = row["MillPersonId"]
    except Exception as exc:
        # Best-effort lookup: report the failure and fall back to the
        # sentinel. KeyboardInterrupt / SystemExit are not caught, so the
        # cell can still be cancelled.
        print(f"retrievePersonId: lookup failed ({type(exc).__name__}: {exc})")
        person_id = None

    if person_id is None:
        return 'unknown'
    return str(person_id)

In [0]:
def extractAccessionNbrFromDicomdir(filepath):
    """Return the first non-empty AccessionNumber listed in a DICOMDIR file.

    Args:
        filepath: Path of the DICOMDIR file, read with ``pydicom.dcmread``.

    Returns:
        The value of the first directory record that carries a non-empty
        ``AccessionNumber`` element, or ``"unknown"`` when the file has no
        ``DirectoryRecordSequence`` or no record provides such a value.

    Raises:
        Whatever ``pydicom.dcmread`` raises for a missing or unreadable file;
        read errors are not swallowed here.
    """
    dcm = pydicom.dcmread(filepath)

    # A file without directory records would raise AttributeError on plain
    # attribute access; treat that the same as "no accession number found".
    records = getattr(dcm, "DirectoryRecordSequence", None) or []

    for record in records:
        try:
            nbr = record["AccessionNumber"].value
        except Exception:
            # This record does not expose the element; try the next one.
            continue
        # Skip blank values so a later record (or the sentinel) is used.
        if nbr:
            return nbr

    return "unknown"

In [0]:
# Pair every DICOMDIR found one directory level below the project directory
# with the accession number read from it.
data = []
for fp in tqdm(glob(f"{proj_dir_path}/*/DICOMDIR")):
    data.append((fp, extractAccessionNbrFromDicomdir(fp)))

# Explicit schema: with column names only, Spark has to infer the column types
# from the rows and fails when `data` is empty (no DICOMDIR matched the
# pattern). Declaring the types lets an empty scan produce an empty view.
accession_nbr = spark.createDataFrame(
    data=data,
    schema="filepath string, accession_nbr string",
)

# Expose the pairs to SQL under the name used by the lookup query.
accession_nbr.createOrReplaceTempView("temp_accession_nbr")

In [0]:
# Resolve every scanned DICOMDIR to a person id.
#
# - One output row per DICOMDIR file (GROUP BY t.filepath).
# - Subdir is the file path with every occurrence of '<proj_dir_path>/' and
#   of '/DICOMDIR' removed.
# - The LEFT JOIN keeps files whose accession number has no match in
#   4_prod.pacs.all_pacs_ref_nbr; MillPersonId is NULL for those rows.
# - When several reference rows match, MAX picks the largest MillPersonId.
person_lookup_sql = f"""
SELECT
    t.filepath,
    MAX(REPLACE(REPLACE(t.filepath, '{proj_dir_path}/',''), '/DICOMDIR', '')) AS Subdir,
    MAX(t.accession_nbr) AS AccessionNbr,
    MAX(s.MillPersonId) AS MillPersonId
FROM temp_accession_nbr AS t
LEFT JOIN 4_prod.pacs.all_pacs_ref_nbr AS s
ON t.accession_nbr = s.RefNbr
GROUP BY t.filepath
"""
df = spark.sql(person_lookup_sql)

In [0]:
# Render the lookup result (filepath, Subdir, AccessionNbr, MillPersonId).
display(df)