In [0]:
# Notebook parameters
#
# Every parameter is exposed as a Databricks text widget and read back as a
# string into `params`; they all start out empty.
# e.g. "input_folder": "/Volumes/1_inland/sectra/vone/example_ultrasound/000003/000002/"

params = dict.fromkeys(
    ("input_folder", "output_folder", "vx", "vy", "hx", "hy"),
    "",
)

# Register one text widget per parameter before reading any of them.
for widget_name in params:
    dbutils.widgets.text(widget_name, "", "")

# Pull the current widget values into `params` and echo them.
for widget_name in params:
    params[widget_name] = dbutils.widgets.get(widget_name)
    print(widget_name, ":", params[widget_name])

In [0]:
# Convert the four rectangle bounds from their widget strings to floats.
# float() raises ValueError if a widget was left empty or is not numeric.
vx, vy, hx, hy = (
    float(params[bound_name]) for bound_name in ("vx", "vy", "hx", "hy")
)

print(vx)

In [0]:
import pydicom
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import time
import copy
import os
from io import BytesIO
from pydicom.filebase import DicomFileLike

In [0]:
def list_dcm_files(folder):
    """Return the names of the entries directly inside ``folder`` ending in ".dcm".

    Only the entry names are returned (not full paths), in the order
    ``os.listdir`` reports them. The suffix match is case-sensitive and
    sub-folders are not descended into.
    """
    return [entry for entry in os.listdir(folder) if entry.endswith(".dcm")]

In [0]:
# Load every *.dcm file found (recursively) under the input folder as one row
# per file using Spark's binaryFile source; the raw bytes are in `content`.
df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.dcm")
    .option("recursiveFileLookup", "true")
    .load(params["input_folder"])
)
print(df.count())
# Preview file metadata only. The `content` column holds the raw, not yet
# redacted DICOM payloads, which should not be rendered into notebook output.
display(df.drop("content"))

In [0]:
import pandas as pd
from pathlib import Path

def _row_axis(ds, pixel_array):
    """Return the index of the row (vertical) axis of ``pixel_array``.

    pydicom documents ``pixel_array`` as (rows, cols), (rows, cols, samples),
    (frames, rows, cols) or (frames, rows, cols, samples). The row axis is
    therefore 0 for single-frame data and 1 when a leading frame axis exists.
    """
    samples = int(getattr(ds, "SamplesPerPixel", 1) or 1)
    trailing_axes = 1 if samples > 1 else 0
    axis = pixel_array.ndim - 2 - trailing_axes
    if axis not in (0, 1):
        raise ValueError(
            f"Unsupported pixel array shape {pixel_array.shape} "
            f"for SamplesPerPixel={samples}"
        )
    return axis


def _fraction_bounds(size, start, stop, label):
    """Scale the fractional bounds ``start``/``stop`` to indices on an axis of length ``size``."""
    if not (0 <= start <= 1 and 0 <= stop <= 1):
        raise ValueError(
            f"{label} bounds must be fractions in [0, 1], got {start} and {stop}"
        )
    return int(size * start), int(size * stop)


def _output_path_for(path, input_folder, output_folder):
    """Map an input file path to its location under ``output_folder``."""
    relative = path.replace(input_folder, "").replace("dbfs:", "")
    # A leading "/" would make os.path.join discard output_folder entirely
    # (this happens when input_folder is given without a trailing slash).
    return os.path.join(output_folder, relative.lstrip("/"))


def _redact_one(path, content):
    """Redact one DICOM file given as bytes, write it out and return the new path."""
    ds = pydicom.dcmread(BytesIO(content), force=True)
    ds.decompress()

    # Work on a copy so the array can be modified freely.
    pixel_array = ds.pixel_array.copy()

    row_axis = _row_axis(ds, pixel_array)
    top, bottom = _fraction_bounds(
        pixel_array.shape[row_axis], vx, vy, "vertical (vx, vy)"
    )
    left, right = _fraction_bounds(
        pixel_array.shape[row_axis + 1], hx, hy, "horizontal (hx, hy)"
    )

    # Blank the rectangle in every frame by filling it with the array maximum.
    region = (slice(None),) * row_axis + (slice(top, bottom), slice(left, right))
    pixel_array[region] = np.max(pixel_array)

    ds.PixelData = pixel_array.tobytes()
    ds.file_meta.TransferSyntaxUID = pydicom.uid.ExplicitVRLittleEndian

    # Mirror the input folder structure below the output folder.
    new_path = _output_path_for(
        path, params["input_folder"], params["output_folder"]
    )
    Path(new_path).parent.mkdir(parents=True, exist_ok=True)

    with open(new_path, "wb") as f:
        ds.save_as(f)
    return new_path


def redact_dicom_batch(pdf_iter):
    """Redact a rectangle in each DICOM file of each batch and save the result.

    Intended for ``DataFrame.mapInPandas``. Reads the notebook globals ``vx``,
    ``vy``, ``hx``, ``hy`` (rectangle bounds as fractions of the image height
    and width) and ``params`` (input/output folders).

    Args:
        pdf_iter: iterable of pandas DataFrames with a ``path`` column and a
            ``content`` column holding the file bytes.

    Yields:
        One pandas DataFrame per input batch with columns ``input_path`` and
        ``output_path``. ``output_path`` is ``None`` for any file that could
        not be processed; the reason is printed instead of being raised so
        that one bad file does not abort the whole job.
    """
    for pdf in pdf_iter:
        redacted_rows = []
        for path, content in zip(pdf["path"], pdf["content"]):
            try:
                new_path = _redact_one(path, content)
            except Exception as exc:
                # Best effort: report the failure and keep going.
                print(f"Failed to redact {path}: {exc!r}")
                new_path = None
            redacted_rows.append((path, new_path))

        yield pd.DataFrame(redacted_rows, columns=["input_path", "output_path"])

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, BinaryType

# Define output schema: source path + path of the redacted copy on disk.
# Both are strings; redact_dicom_batch yields paths (or None on failure),
# not DICOM bytes, so output_path must not be declared as BinaryType.
output_schema = StructType([
    StructField("input_path", StringType(), True),
    StructField("output_path", StringType(), True)
])

# Run mapInPandas redaction
redacted_df = df.mapInPandas(redact_dicom_batch, schema=output_schema)
# collect() triggers the job (files are written as a side effect) and
# brings the (input_path, output_path) pairs back to the driver.
results = redacted_df.collect()




In [0]:
# Preview at most the first ten result rows (slicing clamps to the list length).
results[:10]