In [0]:
# Notebook parameters

params = {
    "input_folder": "",
    "output_folder": "",
}

# Register one text widget per parameter name.
for param_name in params:
    dbutils.widgets.text(param_name, "", "")

# Read the current widget values back into params and echo them.
for param_name in params:
    widget_value = dbutils.widgets.get(param_name)
    params[param_name] = widget_value
    print(param_name, ":", widget_value)

# e.g. "input_folder": "/Volumes/1_inland/sectra/vone/example_ultrasound/000003/000002/"

In [0]:
import pydicom
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import time
import copy
import os
from io import BytesIO
from pydicom.filebase import DicomFileLike

In [0]:
def list_dcm_files(folder):
    """Return the names of the entries in ``folder`` that end with ".dcm".

    Only the direct contents of ``folder`` are inspected (via ``os.listdir``);
    the match is case-sensitive and the returned names carry no directory part.
    """
    dcm_names = []
    for entry_name in os.listdir(folder):
        if entry_name.endswith(".dcm"):
            dcm_names.append(entry_name)
    return dcm_names

In [0]:
# Preview the .dcm file names found directly in the input folder.
dcm_file_names = list_dcm_files(params["input_folder"])
print(dcm_file_names)

In [0]:
# Load every *.dcm file under the input folder as one row per file.
df = spark.read.format("binaryFile").option("pathGlobFilter", "*.dcm").load(params["input_folder"])

# Preview the file metadata only: the "content" column holds the raw, not yet
# redacted DICOM bytes, which should not be pulled into the notebook output.
display(df.drop("content"))

In [0]:
import pandas as pd
def redact_dicom_batch(pdf_iter):
    """Blank a fixed pixel region in every DICOM of each batch and save the result.

    Written to be passed to ``DataFrame.mapInPandas``.

    Parameters
    ----------
    pdf_iter : iterator of pandas.DataFrame
        Batches with a ``path`` column and a ``content`` column holding the
        raw bytes of each DICOM file.

    Yields
    ------
    pandas.DataFrame
        One frame per input batch with the string columns ``input_path`` and
        ``output_path`` (where the redacted copy was written).

    Notes
    -----
    The destination folder is read from the notebook-level
    ``params["output_folder"]``; files keep their original base name.
    """
    # Region to blank in every frame, as [start, stop) rows / columns (example region).
    row_start, row_stop = 0, 80
    col_start, col_stop = 300, 1000

    output_folder = params["output_folder"]
    if output_folder:
        # Make sure the destination exists before the first file is written.
        os.makedirs(output_folder, exist_ok=True)

    for pdf in pdf_iter:
        redacted_rows = []
        for path, content in zip(pdf['path'], pdf['content']):
            ds = pydicom.dcmread(BytesIO(content), force=True)

            # Decompress (and relabel the transfer syntax) only when the pixel
            # data really is compressed. Datasets that are already uncompressed
            # keep the transfer syntax they were read with, so the declared and
            # the actual encoding of the written file stay consistent.
            file_meta = getattr(ds, "file_meta", None)
            transfer_syntax = getattr(file_meta, "TransferSyntaxUID", None)
            if transfer_syntax is not None and transfer_syntax.is_compressed:
                ds.decompress()
                ds.file_meta.TransferSyntaxUID = pydicom.uid.ExplicitVRLittleEndian

            # Modify pixel array (copy so the array is writable)
            pixel_array = ds.pixel_array.copy()
            # NOTE(review): assumes pydicom puts the frame axis first when
            # NumberOfFrames > 1 and any colour-sample axis last - confirm
            # for the installed pydicom version.
            n_frames = int(getattr(ds, "NumberOfFrames", 1) or 1)
            if n_frames > 1:
                # Blank the same rows/columns in every frame.
                pixel_array[:, row_start:row_stop, col_start:col_stop] = 0
            else:
                pixel_array[row_start:row_stop, col_start:col_stop] = 0

            ds.PixelData = pixel_array.tobytes()

            # Construct new file path
            filename = os.path.basename(path)
            new_path = os.path.join(output_folder, filename)

            # Save to disk
            with open(new_path, "wb") as f:
                ds.save_as(f)

            redacted_rows.append((path, new_path))

        yield pd.DataFrame(redacted_rows, columns=["input_path", "output_path"])

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, BinaryType

# Define output schema: source path + path of the redacted file written to disk.
# Both columns are strings: redact_dicom_batch yields file paths, not DICOM bytes.
output_schema = StructType([
    StructField("input_path", StringType(), True),
    StructField("output_path", StringType(), True)
])

# Run mapInPandas redaction
redacted_df = df.mapInPandas(redact_dicom_batch, schema=output_schema)
# mapInPandas is lazy: collect() is the action that makes every batch run,
# i.e. that actually writes the redacted files.
_ = redacted_df.collect()
