In [0]:
# Notebook parameters: widget name -> value (placeholders until read back below)
params = dict(input_dcm_folder="", output_dcm_folder="")

# Register one text widget per parameter name, passing "" for both remaining arguments
for name in params:
    dbutils.widgets.text(name, "", "")

# Read each widget's current value back into params and echo it
for name in list(params):
    value = dbutils.widgets.get(name)
    params[name] = value
    print(name, ":", value)

In [0]:
import pydicom
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import time
import copy
import os
from io import BytesIO
from pydicom.filebase import DicomFileLike

In [0]:
def list_dcm_files(folder):
    """Return the names of the entries in ``folder`` whose name ends with ".dcm".

    Only bare names are returned (as produced by ``os.listdir``), not full
    paths, and the suffix match is case-sensitive.
    """
    dcm_names = []
    for entry in os.listdir(folder):
        if entry.endswith(".dcm"):
            dcm_names.append(entry)
    return dcm_names

In [0]:
def redact_pixels(dcm_obj, xcoords=(0,80), ycoords=(300,1000)):
    """Zero out a rectangular region of a dataset's pixel data, in place.

    Args:
        dcm_obj: object exposing a ``pixel_array`` attribute (e.g. a pydicom
            dataset). It is modified in place.
        xcoords: ``(start, stop)`` slice bounds applied to the FIRST axis of
            ``pixel_array``.
        ycoords: ``(start, stop)`` slice bounds applied to the SECOND axis of
            ``pixel_array``.

    Returns:
        The same ``dcm_obj`` that was passed in.
    """
    first_start, first_stop = xcoords
    second_start, second_stop = ycoords

    pixels = dcm_obj.pixel_array
    pixels[first_start:first_stop, second_start:second_stop] = 0

    # pixel_array is only a decoded view of the data: edits to it are not
    # written out unless they are copied back into PixelData.
    # NOTE(review): if the source used a compressed transfer syntax, the
    # transfer syntax must also be switched to an uncompressed one - confirm
    # against the actual input files.
    dcm_obj.PixelData = pixels.tobytes()
    return dcm_obj

In [0]:
# Sanity check: show the ".dcm" file names found directly in the input folder.
print(list_dcm_files(params["input_dcm_folder"]))

In [0]:
# Load every file matching *.dcm under the input folder with Spark's binaryFile
# source. Later cells read the "content" column (file bytes) and the "path"
# column (file location) of this DataFrame.
df = spark.read.format("binaryFile").option("pathGlobFilter", "*.dcm").load(params["input_dcm_folder"])
display(df)

In [0]:
# Pull all rows of df to the driver as a list of rows.
# NOTE(review): only cdf[0] is used in the visible cells, and every row carries
# a "content" payload - confirm that collecting everything is intended.
cdf = df.collect()

In [0]:
def read_dcm_dataset_from_byte_content(byte_content):
    """Parse in-memory DICOM file bytes with ``pydicom.dcmread``.

    Args:
        byte_content: the raw bytes of one DICOM file.

    Returns:
        Whatever ``pydicom.dcmread`` returns for a stream over those bytes.
    """
    stream = BytesIO(byte_content)
    dataset = pydicom.dcmread(stream)
    return dataset

In [0]:
def write_dcm_dataset_to_byte_content(dcm_dataset):
    """Serialize a pydicom dataset to DICOM file bytes, entirely in memory.

    Args:
        dcm_dataset: the dataset to serialize; it is handed to
            ``pydicom.dcmwrite`` unchanged.

    Returns:
        bytes: everything ``pydicom.dcmwrite`` wrote to the in-memory buffer.
    """
    with BytesIO() as buffer:
        # Wrap the buffer in pydicom's file-like adapter so dcmwrite can target it.
        memory_dataset = DicomFileLike(buffer)
        # Write the dataset that was passed in. This previously referenced a
        # global named `ds`, so the argument was ignored and the result
        # depended on whatever `ds` happened to be in the calling environment.
        pydicom.dcmwrite(memory_dataset, dcm_dataset)
        # Rewind before reading, otherwise read() starts at the end of the data.
        memory_dataset.seek(0)
        return memory_dataset.read()

In [0]:
def redact_pixels_in_byte_content(byte_content):
    """Zero out a fixed pixel region of a DICOM file given as raw bytes.

    The region is ``pixel_array[0:80, 300:1000]`` (first axis 0-79, second
    axis 300-999).

    Args:
        byte_content: the raw bytes of one DICOM file.

    Returns:
        The re-serialized file bytes produced by
        ``write_dcm_dataset_to_byte_content`` after the region has been zeroed.
    """
    ds = read_dcm_dataset_from_byte_content(byte_content)
    pixels = ds.pixel_array
    pixels[0:80, 300:1000] = 0
    # pixel_array is only a decoded view of the data: the edit must be copied
    # back into PixelData, otherwise the written file keeps the original pixels.
    # NOTE(review): for inputs with a compressed transfer syntax, the transfer
    # syntax must also be switched to an uncompressed one - confirm against
    # the actual input files.
    ds.PixelData = pixels.tobytes()
    return write_dcm_dataset_to_byte_content(ds)

In [0]:
# Parse the first collected file's bytes into a dataset, bound to the
# notebook-global name `ds`.
ds = read_dcm_dataset_from_byte_content(cdf[0]["content"])

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType

# Wrap the byte-level redaction as a Spark UDF returning binary, and replace
# the "content" column with its output.
# NOTE(review): `df` is rebound here, so re-running this cell applies the UDF
# to the already-transformed DataFrame - confirm that is acceptable.
redact_pixels_udf = udf(redact_pixels_in_byte_content, BinaryType())
df = df.withColumn("content", redact_pixels_udf(df["content"]))
display(df)

In [0]:
import os
from pyspark.sql.functions import col

def write_binary_file(row, output_folder="/Volumes/1_inland/sectra/vone"):
    """Write one row's ``content`` bytes to a file named after the row's ``path``.

    Args:
        row: mapping-like object with a ``'path'`` entry (only its base name
            is used) and a ``'content'`` entry holding the bytes to write.
        output_folder: directory the file is written into. Defaults to the
            location that was previously hard-coded, so existing one-argument
            callers behave as before.

    Returns:
        None. The only effect is creating or overwriting the target file.
    """
    new_path = os.path.join(output_folder, os.path.basename(row['path']))
    with open(new_path, 'wb') as f:
        f.write(row['content'])

# Run write_binary_file once for every row of df.
# NOTE(review): params["output_dcm_folder"] is read from the widgets but is not
# passed here, so the destination is whatever write_binary_file itself uses -
# confirm which output location is intended.
df.rdd.foreach(write_binary_file)