## Initialization

In [None]:
%pip install dicomsdl

In [2]:
%matplotlib inline
import sys
import os
import glob
import dicomsdl
import numpy as np
import pandas as pd
from tqdm import tqdm
import cv2
from PIL import Image
from joblib import Parallel, delayed

sys.path.append("/kaggle/input/solution")
from config import *
from utils import *

In [3]:
# Number of worker processes passed as n_jobs to joblib.Parallel in the final cell.
NUM_OF_CORES = 56
# Target size of the saved PNGs, stored as (height, width); the resize call in
# load_image_dicomsdl swaps the two entries because PIL expects (width, height).
cfg.img_size = (1024, 512)

In [6]:
# data locations
cfg.data_dir = "/kaggle/input/rsna-breast-cancer-detection"
cfg.root_dir = "/kaggle/working/output/"  # output directory for the preprocessed PNGs
cfg.test_img_dir = cfg.data_dir + "/train_images/"  # directory of the DICOM images to be preprocessed
os.makedirs(cfg.root_dir, exist_ok=True)

# df
df_test = pd.read_csv(f'{cfg.data_dir}/train.csv')
df_test["prediction_id"] = df_test.patient_id.apply(str) + "_" + df_test.laterality
# Count the DICOM files in the directory that is actually preprocessed
# (cfg.test_img_dir) rather than a separately hard-coded folder, so the
# printed number always describes the images this notebook works on.
test_images = glob.glob(cfg.test_img_dir + "*/*.dcm")
print("Number of images :", len(test_images))

# process df
# Encode each distinct view label as its rank in sorted order.
view_to_idx = {view: idx for idx, view in enumerate(sorted(df_test['view'].unique()))}
df_test['view'] = df_test['view'].map(view_to_idx)
df_test['site_id'] -= 1  # shift site ids down by one
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated in pandas 2.x and
# leaves df_test unchanged once Copy-on-Write is enabled.
df_test['age'] = df_test['age'].fillna(df_test['age'].mean())
df_test["age"] = df_test['age'] / 100
df_test = df_test.reset_index(drop=True)

Number of images : 54706


In [7]:
def _apply_voi_window(dataset, pixels):
    """Return ``pixels`` windowed with the VOI settings stored in ``dataset``.

    Reads WindowCenter, WindowWidth, BitsStored and VOILUTFunction from the
    dataset. When VOILUTFunction equals "SIGMOID" the sigmoid mapping is used;
    any other value (including a missing one) falls back to the linear window.

    The computation runs on a float32 copy, so the caller's array is left
    untouched and the mapped values are neither truncated to integers nor
    wrapped by an integer pixel dtype.

    Any exception caused by missing or unusable tag values propagates to the
    caller, which treats it as "no windowing available".
    """
    # Load only the variables we need
    center = dataset["WindowCenter"]
    width = dataset["WindowWidth"]
    bits_stored = dataset["BitsStored"]
    voi_lut_function = dataset["VOILUTFunction"]

    # The tags may hold several values; only the first window is used.
    if isinstance(center, list):
        center = center[0]
    if isinstance(width, list):
        width = width[0]

    # Output range of the windowing: [0, 2**bits_stored - 1]
    y_min = 0
    y_max = float(2**bits_stored - 1)
    y_range = y_max

    img = pixels.astype(np.float32)

    if voi_lut_function == "SIGMOID":
        return y_range / (1 + np.exp(-4 * (img - center) / width)) + y_min

    # Linear window. The formula assumes width >= 1; that is not checked here
    # (the original author notes the widths in this dataset are always >= 750).
    center -= 0.5
    width -= 1

    below = img <= (center - width / 2)
    above = img > (center + width / 2)
    between = np.logical_and(~below, ~above)

    # The three masks are disjoint, so the "between" pixels still hold their
    # raw values when they are rescaled.
    img[below] = y_min
    img[above] = y_max
    if between.any():
        img[between] = ((img[between] - center) / width + 0.5) * y_range + y_min
    return img


def _largest_component_bbox(mask):
    """Return ``(x1, y1, x2, y2)`` of the largest 8-connected foreground blob.

    ``mask`` is a uint8 array whose non-zero pixels are foreground. Row 0 of
    the OpenCV stats table is the background and is skipped. Returns ``None``
    when the mask contains no foreground component at all.
    """
    stats = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)[2]
    if len(stats) < 2:
        return None
    largest = stats[1:, cv2.CC_STAT_AREA].argmax() + 1
    x1, y1, w, h = stats[largest][:4]
    return x1, y1, x1 + w, y1 + h


def load_image_dicomsdl(i):
    """Preprocess the DICOM image of row ``i`` of ``df_test`` and save it as a PNG.

    Steps: read the pixel data with dicomsdl, apply VOI windowing (best
    effort), scale to [0, 1], invert MONOCHROME1 images, convert to uint8,
    crop to the largest bright connected component, resize to ``cfg.img_size``
    and write ``<patient_id>_<image_id>.png`` into ``cfg.root_dir``.

    Parameters
    ----------
    i : int
        Positional row index into ``df_test`` (used with ``.iloc``).

    Returns
    -------
    None
        The result is the PNG file written to disk.
    """
    import warnings

    # Convert only the two scalars needed instead of stringifying the whole
    # column on every call.
    patient_id = str(df_test.patient_id.iloc[i])
    image_id = str(df_test.image_id.iloc[i])

    dcm_file = cfg.test_img_dir + patient_id + "/" + image_id + '.dcm'
    dataset = dicomsdl.open(dcm_file)
    raw = dataset.pixelData()

    try:
        img = _apply_voi_window(dataset, raw)
    except Exception as exc:
        # Best effort: windowing is optional, so fall back to the raw pixel
        # values (never modified by the helper) but report the failure
        # instead of hiding it.
        warnings.warn(f"VOI windowing skipped for {dcm_file}: {exc!r}")
        img = raw.astype(np.float32)

    # Scale to [0, 1]; a constant image would divide by zero, so map it to 0.
    lo = img.min()
    hi = img.max()
    if hi > lo:
        img = (img - lo) / (hi - lo)
    else:
        img = np.zeros_like(img)

    # MONOCHROME1 stores inverted intensities; flip them.
    if dataset["PhotometricInterpretation"] == "MONOCHROME1":
        img = 1 - img

    img = (img * 255).astype(np.uint8)

    Y = img
    norm = np.empty_like(Y, dtype=np.uint8)
    dicomsdl.util.convert_to_uint8(Y, norm, Y.min(), Y.max())

    # Ignore a thin frame around the image when looking for the foreground.
    border = 5
    threshold = 10
    trimmed = norm[border:-border, border:-border]
    bbox = _largest_component_bbox((trimmed > threshold).astype(np.uint8))

    if bbox is None:
        # No pixel above the threshold: keep the whole image rather than fail.
        crop = Y
    else:
        x1, y1, x2, y2 = bbox
        # The box is expressed in coordinates of the trimmed array; shift it
        # back by the border so it lines up with the untrimmed image Y.
        crop = Y[y1 + border: y2 + border, x1 + border: x2 + border]

    out = Image.fromarray(crop)
    # cfg.img_size is (height, width) while PIL expects (width, height).
    out = out.resize([cfg.img_size[1], cfg.img_size[0]], resample=Image.Resampling.LANCZOS)
    out.save(cfg.root_dir + patient_id + "_" + image_id + ".png", "PNG")

In [None]:
# Preprocess every row of df_test in parallel; each call writes one PNG into
# cfg.root_dir. The trailing semicolon keeps the notebook from echoing the
# returned list, which holds one None per image.
Parallel(n_jobs=NUM_OF_CORES)(
    delayed(load_image_dicomsdl)(row_idx)
    for row_idx in tqdm(range(len(df_test)))
);