In [0]:
# Notebook parameters.
# Every key below is exposed as a Databricks text widget; the placeholder
# value is then replaced by whatever the widget currently holds.

params = {
    "input_folder": "",
    "output_folder": ""
}

# Register one text widget per parameter name.
for widget_name in params:
    dbutils.widgets.text(widget_name, "", "")

# Read each widget back into `params` and echo the resolved value.
for widget_name in list(params):
    widget_value = dbutils.widgets.get(widget_name)
    params[widget_name] = widget_value
    print(widget_name, ":", widget_value)

# e.g. "input_folder": "/Volumes/1_inland/sectra/vone/example_ultrasound/000003/000002/"

In [0]:
import pydicom
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import time
import copy
import os
from io import BytesIO
from pydicom.filebase import DicomFileLike

In [0]:
def list_dcm_files(folder):
    """Return the names of the entries in `folder` whose name ends with ".dcm".

    Only bare names are returned (no directory component), in the order
    `os.listdir` yields them. The suffix match is case-sensitive.
    """
    dcm_names = []
    for entry_name in os.listdir(folder):
        if entry_name.endswith(".dcm"):
            dcm_names.append(entry_name)
    return dcm_names

In [0]:
def redact_pixels(dcm_obj, vx=0, vy=80, hx=300, hy=1000):
    """Black out a rectangular region of a DICOM dataset's image, in place.

    The region covers rows ``vx:vy`` and columns ``hx:hy``. For each axis, when
    BOTH bounds lie in ``[0, 1]`` they are interpreted as fractions of that
    axis' size; otherwise they are interpreted as pixel indices. (Consequently
    a pixel range such as ``0, 1`` is read as "the whole axis".) The start is
    clamped to 0 and the stop to the image size.

    Parameters
    ----------
    dcm_obj : object exposing ``pixel_array`` (e.g. a ``pydicom`` Dataset).
    vx, vy : start / stop of the vertical (row) range.
    hx, hy : start / stop of the horizontal (column) range.

    Returns
    -------
    The same ``dcm_obj``. Its ``PixelData`` is re-encoded from the edited
    array so that the redaction survives when the dataset is written out.
    """
    # Raw array bytes can only be stored under an uncompressed transfer
    # syntax, so compressed datasets are decompressed before editing.
    file_meta = getattr(dcm_obj, "file_meta", None)
    transfer_syntax = getattr(file_meta, "TransferSyntaxUID", None)
    if getattr(transfer_syntax, "is_compressed", False):
        dcm_obj.decompress()

    pixels = dcm_obj.pixel_array
    arr_shape = np.shape(pixels)

    # Multi-frame images are shaped (frames, rows, columns[, samples]), so the
    # row axis is 1 instead of 0 and the region is applied to every frame.
    n_frames = int(getattr(dcm_obj, "NumberOfFrames", 1) or 1)
    row_axis = 1 if (n_frames > 1 and len(arr_shape) >= 3) else 0
    n_rows = arr_shape[row_axis]
    n_cols = arr_shape[row_axis + 1]

    if 0 <= vx <= 1 and 0 <= vy <= 1:
        vx = int(n_rows * vx)
        vy = int(n_rows * vy)
    if 0 <= hx <= 1 and 0 <= hy <= 1:
        hx = int(n_cols * hx)
        hy = int(n_cols * hy)

    vx = max(0, vx)
    vy = min(n_rows, vy)
    hx = max(0, hx)
    hy = min(n_cols, hy)

    if row_axis:
        pixels[:, vx:vy, hx:hy] = 0
    else:
        pixels[vx:vy, hx:hy] = 0

    # `pixel_array` is only a decoded view of the image; writers serialise the
    # PixelData element, so the edited array has to be stored back into it.
    dcm_obj.PixelData = pixels.tobytes()
    # The array keeps colour samples on the last axis, so the bytes written
    # above are colour-by-pixel; keep the header consistent with that.
    if int(getattr(dcm_obj, "SamplesPerPixel", 1) or 1) > 1:
        dcm_obj.PlanarConfiguration = 0
    return dcm_obj

In [0]:
# Sanity check: show which .dcm file names are present in the input folder.
print(list_dcm_files(folder=params["input_folder"]))

In [0]:
# Read the *.dcm files under the input folder with Spark's binaryFile source
# and show the resulting DataFrame.
df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.dcm")
    .load(params["input_folder"])
)
display(df)

In [0]:
# Materialise every row of `df` on the driver as a list of Row objects.
# NOTE(review): each row carries a whole file in its "content" column, and the
# cells visible here only ever use cdf[0]; consider limiting this for large folders.
cdf = df.collect()

In [0]:
def read_dcm_dataset_from_byte_content(byte_content):
    """Parse raw DICOM file bytes into a dataset via `pydicom.dcmread`.

    The bytes are wrapped in an in-memory stream, so nothing touches disk.
    """
    stream = BytesIO(byte_content)
    dataset = pydicom.dcmread(stream)
    return dataset

In [0]:
def write_dcm_dataset_to_byte_content(dcm_dataset):
    """Serialise a DICOM dataset to bytes entirely in memory.

    Parameters
    ----------
    dcm_dataset : the dataset to encode (as accepted by `pydicom.dcmwrite`).

    Returns
    -------
    bytes : the encoded DICOM file content.
    """
    # create a buffer
    with BytesIO() as buffer:
        # wrap the buffer in the file-like type pydicom writes to
        memory_dataset = DicomFileLike(buffer)
        # Write the dataset that was passed in. This used to reference the
        # notebook-global `ds`, which ignored the argument: every call encoded
        # the same dataset (or raised NameError when `ds` was not defined).
        pydicom.dcmwrite(memory_dataset, dcm_dataset)
        # to read from the object, you have to rewind it
        memory_dataset.seek(0)
        # read the contents as bytes
        return memory_dataset.read()

In [0]:
def redact_pixels_in_byte_content(byte_content):
    """Redact the default region of a DICOM file given and returned as bytes.

    The bytes are parsed, the default `redact_pixels` region (rows 0:80,
    columns 300:1000) is zeroed, and the dataset is re-serialised.

    Parameters
    ----------
    byte_content : raw content of a DICOM file.

    Returns
    -------
    bytes : the DICOM file content with the region blacked out.
    """
    ds = read_dcm_dataset_from_byte_content(byte_content)

    # Raw array bytes can only be stored under an uncompressed transfer
    # syntax, so compressed datasets are decompressed before editing.
    file_meta = getattr(ds, "file_meta", None)
    transfer_syntax = getattr(file_meta, "TransferSyntaxUID", None)
    if getattr(transfer_syntax, "is_compressed", False):
        ds.decompress()

    # Reuse the shared region logic instead of a second hard-coded slice.
    ds = redact_pixels(ds)

    # Editing `pixel_array` does not change what gets written: the writer
    # serialises the PixelData element, so store the edited array back into it.
    ds.PixelData = ds.pixel_array.tobytes()
    # The array keeps colour samples on the last axis, so the bytes written
    # above are colour-by-pixel; keep the header consistent with that.
    if int(getattr(ds, "SamplesPerPixel", 1) or 1) > 1:
        ds.PlanarConfiguration = 0
    return write_dcm_dataset_to_byte_content(ds)

In [0]:
# Parse the first collected file into a dataset for interactive inspection.
ds = read_dcm_dataset_from_byte_content(byte_content=cdf[0]["content"])

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType

# Wrap the bytes-in/bytes-out redaction as a Spark UDF returning binary data.
redact_pixels_udf = udf(f=redact_pixels_in_byte_content, returnType=BinaryType())

# Replace each file's "content" with its redacted version, then show the result.
redacted_content = redact_pixels_udf(df["content"])
df = df.withColumn("content", redacted_content)
display(df)

In [0]:
import os
from pyspark.sql.functions import col
from pathlib import Path

def write_binary_file(row):
    """Write one row's "content" bytes into the configured output folder.

    The destination keeps the file name taken from the row's "path" and is
    placed under `params["output_folder"]`; an existing file is overwritten.
    """
    file_name = os.path.basename(row['path'])
    destination = os.path.join(params["output_folder"], file_name)
    with open(destination, 'wb') as out_file:
        out_file.write(row['content'])

# Make sure the output folder (and any missing parents) exists before writing.
outp = Path(params["output_folder"])
outp.mkdir(exist_ok=True, parents=True)

# Write every row out through the RDD API.
# rdd is not supported in serverless or shared compute
df.rdd.foreach(write_binary_file)