In [0]:
# Notebook parameters
#
# Every key of `params` is exposed as a Databricks text widget; the widget's
# current value is then read back into `params` under the same key.

params = {
    "proj_dir": ""
}

# Register one text widget per parameter name (empty default, empty label).
for name in params:
    dbutils.widgets.text(name, "", "")

# Pull the widget values into `params` and echo them for the run log.
for name in params:
    value = dbutils.widgets.get(name)
    params[name] = value
    print(name, ":", value)

In [0]:
import os
import time
from datetime import datetime
from tqdm.notebook import tqdm
import pydicom
import asyncio
from functools import lru_cache
from pyspark.sql import types as T
from pyspark.sql import functions as F
from glob import glob

In [0]:
# Base directory under which the project directories are looked up.
ROOT_DIR = "/Volumes/1_inland/sectra/vone"
# Directory scanned by the cells below. Note that os.path.join returns
# ROOT_DIR + "/" when `proj_dir` is empty, and discards ROOT_DIR entirely when
# `proj_dir` is an absolute path.
proj_dir_path = os.path.join(ROOT_DIR, params["proj_dir"])

In [0]:
def extractAccessionNbrFromDicomdir(filepath):
    """Collect per-record identifiers from a DICOMDIR file.

    The file is read with ``pydicom.dcmread`` and its
    ``DirectoryRecordSequence`` is walked record by record. For each record the
    string form of the ``AccessionNumber``, ``StudyID``, ``StudyDescription``
    and ``ReferencedFileID`` element values is collected. An element that
    cannot be read from a record is stored as ``"unknown"``.

    Args:
        filepath: Path passed to ``pydicom.dcmread``.

    Returns:
        A tuple ``(accession_nbr, study_id, description, fileid)`` of four
        lists, each holding one string per directory record, in record order.

    Raises:
        Any error raised by ``pydicom.dcmread`` or by accessing
        ``DirectoryRecordSequence`` on the result is propagated unchanged.
    """
    def _value_or_unknown(record, keyword):
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop the notebook instead of being swallowed.
        try:
            return str(record[keyword].value)
        except Exception:
            return "unknown"

    dcm = pydicom.dcmread(filepath)

    accession_nbr = []
    study_id = []
    description = []
    fileid = []
    # The dataset and its records are deliberately not printed: they would
    # flood the cell output and may expose record contents in a saved notebook.
    for record in dcm.DirectoryRecordSequence:
        accession_nbr.append(_value_or_unknown(record, "AccessionNumber"))
        study_id.append(_value_or_unknown(record, "StudyID"))
        description.append(_value_or_unknown(record, "StudyDescription"))
        fileid.append(_value_or_unknown(record, "ReferencedFileID"))
    return accession_nbr, study_id, description, fileid

In [0]:
def extractMetadataFromDcm(filepath):
    """Read four header fields from a single DICOM file.

    The file is read with ``pydicom.dcmread`` and the string form of the
    ``AccessionNumber``, ``StudyID``, ``CodeValue`` and ``StudyDescription``
    element values is returned. An element that cannot be read is reported as
    ``"unknown"``.

    Args:
        filepath: Path passed to ``pydicom.dcmread``.

    Returns:
        A tuple ``(accession_nbr, study_id, code_value, description)`` of
        strings.

    Raises:
        Any error raised by ``pydicom.dcmread`` is propagated unchanged.
    """
    # Only header elements are needed, so skip the pixel data when reading.
    dcm = pydicom.dcmread(filepath, stop_before_pixels=True)

    def _value_or_unknown(keyword):
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit still stop the notebook instead of being swallowed.
        try:
            return str(dcm[keyword].value)
        except Exception:
            return "unknown"

    accession_nbr = _value_or_unknown("AccessionNumber")
    study_id = _value_or_unknown("StudyID")
    # NOTE(review): CodeValue is looked up on the top-level dataset only —
    # confirm that is where it lives in these files.
    code_value = _value_or_unknown("CodeValue")
    description = _value_or_unknown("StudyDescription")

    return accession_nbr, study_id, code_value, description

In [0]:
# Build one metadata record per scan directory (two levels below
# `proj_dir_path`) and expose the result as the temp view `temp_accession_nbr`.
data = []
for scandir in tqdm(glob(f"{proj_dir_path}/*/*/")):
    # Prefer the conventional first file name; otherwise fall back to the
    # alphabetically first .dcm file so the choice is deterministic.
    dcm_path = os.path.join(scandir, "000001.dcm")
    if not os.path.exists(dcm_path):
        dcm_files = sorted(glob(os.path.join(scandir, "*.dcm")))
        dcm_path = dcm_files[0] if dcm_files else None

    # Start every record from the "unknown" defaults so that a failure can
    # never reuse values left over from the previous directory.
    item = {
        "filepath": scandir,
        "accession_nbr": "unknown",
        "study_id": "unknown",
        "code_value": "unknown",
        "study_description": "unknown",
    }

    if dcm_path is None:
        # A directory without any .dcm file still gets a row, marked unknown.
        print(f"No .dcm file found in {scandir}")
    else:
        try:
            (
                item["accession_nbr"],
                item["study_id"],
                item["code_value"],
                item["study_description"],
            ) = extractMetadataFromDcm(dcm_path)
        except Exception as exc:
            # Keep going (best effort), but say which file could not be read.
            print(f"Could not read metadata from {dcm_path}: {exc!r}")

    data.append(item)


accession_nbr = spark.createDataFrame(data=data)
accession_nbr.createOrReplaceTempView("temp_accession_nbr")

In [0]:
%sql
-- Show every row of the temp view registered from the collected records.
SELECT *
FROM temp_accession_nbr


In [0]:
%sql
-- Show the rows whose study description fell back to the 'unknown' sentinel.
SELECT *
FROM temp_accession_nbr
WHERE study_description = 'unknown'

In [0]:
# One row per scan directory with its accession number and the MillPersonId
# matched from 4_prod.pacs.imaging_metadata (NULL when nothing matches).
# The view's column is `accession_nbr` (the key used when the records were
# built), so the query must reference that name.
# NOTE: `df` is reassigned further down by a query that also adds NhsNumber.
df = spark.sql(f"""
SELECT
    --t.filepath,
    MAX(REPLACE(REPLACE(t.filepath, '{proj_dir_path}/',''), '/DICOMDIR', '')) AS Subdir,
    MAX(t.accession_nbr) AS AccessionNbr,
    MAX(s.PersonId) AS MillPersonId
FROM temp_accession_nbr AS t
LEFT JOIN 4_prod.pacs.imaging_metadata AS s
ON t.accession_nbr = s.AccessionNbr
GROUP BY t.filepath
ORDER BY Subdir ASC
""")

In [0]:

# Per-directory image summary joined to person identifiers.
# - `nhs` CTE: distinct (MillPersonId, NhsNumber) pairs from
#   4_prod.pacs_dlt.intmd_pacs_patient_alias.
# - Both joins are LEFT JOINs, so directories without a match are kept and get
#   NULL for MillPersonId / NhsNumber.
# - GROUP BY t.filepath with MAX(...) collapses the result to one row per
#   directory; when the joins produce several matches, the MAX value is kept.
# - Subdir is the filepath with the '{proj_dir_path}/' prefix and any
#   '/DICOMDIR' removed.
# - proj_dir_path is interpolated into the SQL text through the f-string.
df = spark.sql(f"""
WITH nhs AS (
    SELECT DISTINCT
        MillPersonId,
        NhsNumber
    FROM 4_prod.pacs_dlt.intmd_pacs_patient_alias
)
SELECT
    --t.filepath,
    MAX(REPLACE(REPLACE(t.filepath, '{proj_dir_path}/',''), '/DICOMDIR', '')) AS Subdir,
    MAX(t.accession_nbr) AS AccessionNbr,
    MAX(s.PersonId) AS MillPersonId,
    MAX(nhs.NhsNumber) AS NhsNumber
FROM temp_accession_nbr AS t
LEFT JOIN 4_prod.pacs.imaging_metadata AS s
ON t.accession_nbr = s.AccessionNbr
LEFT JOIN nhs
ON nhs.MillPersonId = s.PersonId
GROUP BY t.filepath
ORDER BY Subdir ASC
""")


In [0]:
# Export the summary as ImageInfo.csv inside the project directory.
# NOTE: `pd` here is the converted DataFrame, not the pandas module alias.
image_info_csv = os.path.join(proj_dir_path, "ImageInfo.csv")
pd = df.toPandas()
pd.to_csv(image_info_csv, index=False)

In [0]:
# Render the summary DataFrame built by the query above.
display(df)

In [0]:
# Number of summary rows per MillPersonId.
rows_per_person = df.groupBy("MillPersonId").count()
display(rows_per_person)