In [0]:
# Notebook parameters: widget name -> value (values are filled in from the widgets below)
params = {
    "proj_dir": ""
}

# Register one text widget per parameter (empty default value, empty label)
for widget_name in params:
    dbutils.widgets.text(widget_name, "", "")

# Read every widget back into params and echo the value that will be used
for widget_name in list(params):
    widget_value = dbutils.widgets.get(widget_name)
    params[widget_name] = widget_value
    print(widget_name, ":", widget_value)

In [0]:
import os
import time
from datetime import datetime
from tqdm.notebook import tqdm
import pydicom
import asyncio
from functools import lru_cache
from pyspark.sql import types as T
from pyspark.sql import functions as F
from glob import glob

In [0]:
# Directory that contains every project directory.
ROOT_DIR = "/Volumes/1_inland/sectra/vone"

# Project directory to scan, taken from the "proj_dir" widget.
# os.path.join leaves a trailing "/" when proj_dir is empty or ends with "/".
# Later cells append "/" themselves (the glob pattern and the prefix that is
# stripped from each scan directory), so a trailing separator here would yield
# a "//" prefix that never matches the globbed paths. Strip it up front.
full_proj_path = os.path.join(ROOT_DIR, params["proj_dir"]).rstrip("/")

In [0]:
from pyspark.sql import Row

# Every second-level entry under the project directory is one scan directory.
# The trailing "/" in the pattern limits the matches to directories, and
# sorting makes the row order reproducible between runs.
scan_dirs = sorted(glob(f"{full_proj_path}/*/*/"))
if not scan_dirs:
    # Without this guard Spark fails later with a schema-inference error on
    # an empty list, which hides the real cause (wrong or empty project dir).
    raise FileNotFoundError(f"No scan directories found under {full_proj_path!r}")

# One row per scan directory; the explicit schema avoids inferring it from data.
scan_df = spark.createDataFrame(
    [Row(scan_dir=sd) for sd in scan_dirs],
    schema="scan_dir string",
)

def file_01_exists(scan_dir):
    """Pick one DICOM file from ``scan_dir`` and return its path prefixed with ``dbfs:``.

    ``000001.dcm`` is preferred when it exists; otherwise the alphabetically
    first ``*.dcm`` file in the directory is used, so the choice is
    reproducible between runs.

    Args:
        scan_dir: Path of the directory to look in.

    Returns:
        ``"dbfs:" + <file path>``, or ``None`` when the directory contains no
        ``.dcm`` file (instead of raising ``IndexError``).
    """
    preferred = os.path.join(scan_dir, "000001.dcm")
    if os.path.exists(preferred):
        return "dbfs:" + preferred

    # glob already returns paths that include scan_dir, so they are used as-is
    # rather than being joined onto scan_dir a second time.
    candidates = sorted(glob(os.path.join(scan_dir, "*.dcm")))
    if not candidates:
        return None
    return "dbfs:" + candidates[0]


from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType, StringType

from pyspark.sql.functions import expr

# Spark UDF wrapper around file_01_exists; the result is a string column.
file_exists_udf = udf(file_01_exists, StringType())

# SQL expression that removes the project-directory prefix from the full path.
relative_dir_sql = f"replace(full_dir_path, '{full_proj_path}/', '')"

# Resolve one DICOM file per scan directory, keep the absolute directory as
# full_dir_path, and re-create scan_dir as the path relative to the project.
scan_df = (
    scan_df
    .withColumn("dcm_file", file_exists_udf("scan_dir"))
    .withColumnRenamed("scan_dir", "full_dir_path")
    .withColumn("scan_dir", expr(relative_dir_sql))
)
display(scan_df)

In [0]:
# Collect the selected DICOM file paths onto the driver as a Python list.
# dcm_file comes from a StringType UDF and is therefore nullable; null paths
# are skipped because the file reader cannot load them.
file_paths = [
    row["dcm_file"]
    for row in scan_df.select("dcm_file").where(F.col("dcm_file").isNotNull()).collect()
]

# Read the raw bytes of every selected file; the reader's "path" column is
# renamed so it can serve as the join key.
df = (
    spark.read.format("binaryFile")
    .load(file_paths)
    .withColumnRenamed("path", "dcm_file")
)

# Left join: scan directories whose file was not loaded keep a row with null content.
scan_df = scan_df.join(df, on="dcm_file", how="left")

In [0]:
from io import BytesIO
from pydicom.filebase import DicomFileLike

In [0]:
import pandas as pd
from pathlib import Path

def _element_text(ds, keyword):
    """Return ``str(ds[keyword].value)``, or None when the element cannot be read."""
    try:
        return str(ds[keyword].value)
    except Exception:
        return None


def _procedure_code(ds):
    """Return the procedure code of ``ds`` as a string, or None when unavailable.

    Lookup order: the first item of ProcedureCodeSequence, then the first item
    of RequestedProcedureCodeSequence (only if it carries a CodeValue), and
    finally RequestedProcedureID.
    """
    try:
        if "ProcedureCodeSequence" in ds:
            code_value = ds.ProcedureCodeSequence[0].CodeValue
        elif "RequestedProcedureCodeSequence" in ds:
            seq = ds.RequestedProcedureCodeSequence
            if len(seq) > 0 and "CodeValue" in seq[0]:
                code_value = seq[0].CodeValue
            else:
                code_value = None
        else:
            code_value = ds.RequestedProcedureID
    except Exception:
        return None
    # The output column is a string column, so coerce like the other fields.
    return None if code_value is None else str(code_value)


def process_dicom_batch(pdf_iter):
    """Extract study metadata from batches of raw DICOM file contents.

    Intended for ``DataFrame.mapInPandas``.

    Args:
        pdf_iter: Iterator of pandas DataFrames, each with the columns
            ``scan_dir``, ``dcm_file`` and ``content`` (the raw file bytes).

    Yields:
        One pandas DataFrame per input batch with the columns ``dcm_file``,
        ``scan_dir``, ``accession_number``, ``study_id``, ``code_value`` and
        ``description``. A metadata field that cannot be read is None. When
        the file cannot be parsed at all, every metadata field is None but
        ``dcm_file`` and ``scan_dir`` are kept so the failing scan can still
        be identified.
    """
    columns = ["dcm_file", "scan_dir", "accession_number", "study_id", "code_value", "description"]

    for pdf in pdf_iter:
        output_rows = []
        for scan_dir, path, content in zip(pdf['scan_dir'], pdf['dcm_file'], pdf['content']):
            try:
                # force=True lets pydicom attempt files without a valid DICOM header.
                ds = pydicom.dcmread(BytesIO(content), force=True)
            except Exception:
                output_rows.append((path, scan_dir, None, None, None, None))
                continue

            output_rows.append((
                path,
                scan_dir,
                _element_text(ds, "AccessionNumber"),
                _element_text(ds, "StudyID"),
                _procedure_code(ds),
                _element_text(ds, "StudyDescription"),
            ))

        yield pd.DataFrame(output_rows, columns=columns)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, BinaryType

# Schema of the frames yielded by process_dicom_batch: six nullable string
# columns (file path, scan directory and the extracted DICOM header fields).
output_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            "dcm_file",
            "scan_dir",
            "accession_number",
            "study_id",
            "code_value",
            "description",
        )
    ]
)

# Run the metadata extraction batch-wise and preview the first rows.
md_df = scan_df.mapInPandas(process_dicom_batch, schema=output_schema)
display(md_df.limit(10))



In [0]:
import pandas as pd

# Columns that remain after the drop below, in their DataFrame order.
csv_columns = ["scan_dir", "accession_number", "code_value", "description"]

# dcm_file and study_id are not exported; remove them before pulling the
# rows to the driver.
md_df = md_df.drop(*["dcm_file", "study_id"])
md_collected = md_df.collect()

# Rebuild the collected rows as a pandas DataFrame for the CSV export.
md_collected_df = pd.DataFrame(data=md_collected, columns=csv_columns)

In [0]:
# Write the metadata, ordered by scan directory, next to the project data.
csv_path = os.path.join(full_proj_path, 'dcm_metadata.csv')
md_sorted_df = md_collected_df.sort_values(by="scan_dir")
md_sorted_df.to_csv(csv_path, index=False)