#This code snippet preprocesses the RSNA dataset into a dataset that is ready for input into the model.

#Pseudocode:

1. Read in the "train.csv" file, which contains metadata for all images, and load it into a pandas DataFrame.
    Each row has data on a patient image.
    Patients can have a CC and MLO image per side. 
    Goal: create an object for 1 breast imaging study, left and right are kept separate. 
2. Create a label column, which is 0 for non-cancer and 1 for malignant (in the RSNA metadata the "cancer" column already holds this value, so it is copied directly).
3. Group the metadata rows by patient ID and laterality.
4. For every grouping in groups:

    Set variable patientID to the patient ID
    Set variable laterality to the breast side
    Set variable label to the mapped label.
    Extract the path to the CC image.
    Extract the path to the MLO image.

    (to do- write a function that takes in a path to an image and returns the pixel data)
    (to do- write a function that takes in raw pixel data and outputs a grayscaled, rescaled image)
    
    Set variable CC_image to hold processed pixel data for Cranio-Caudal image.
    Set variable MLO_image to hold processed pixel data for the Mediolateral Oblique (MLO) image.

    Package the CC and MLO images into a tensor. It should have shape (512, 512, 2).

    Generate a path to a tensor file and save it to tensor_file variable. 
    Save the tensor into the numpy file, .npy 

    Create a breast object that holds properties: patientID, laterality, label, tensor_file.

    Append the breast object to a breast objects list.

    End For Loop
   
   Write the list of breast objects to a CSV file.
    


In [1]:
import os
import pydicom
import numpy as np
import pandas as pd
import re
import cv2
import json
from pydicom.pixel_data_handlers.util import apply_voi_lut
from pydicom.pixel_data_handlers.util import apply_modality_lut, apply_voi_lut

In [2]:
# Root folder of the RSNA download; the per-image metadata CSV lives directly inside it.
data_directory = "../data/rsna-breast-cancer-detection/"
metadata_file = data_directory + "train.csv"

In [3]:
#1. Import the metadata: one row per image (patient_id, image_id, laterality, view, cancer, ...).
metadata = pd.read_csv(metadata_file)

In [4]:
metadata

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,0,C,49,False
54702,1,9989,63473691,L,MLO,60.0,0,0,0,,0,C,216,False
54703,1,9989,1078943060,L,CC,60.0,0,0,0,,0,C,216,False
54704,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,0,C,216,True


In [5]:

#2. Use the "cancer" column directly as the training label (no remapping is applied).
metadata["label"] = metadata["cancer"]

In [6]:
metadata["label"]

0        0
1        0
2        0
3        0
4        0
        ..
54701    0
54702    0
54703    0
54704    0
54705    0
Name: label, Length: 54706, dtype: int64

In [7]:
metadata

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,label
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False,0
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False,0
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False,0
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False,0
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54701,1,9973,1729524723,R,MLO,43.0,0,0,0,1.0,0,C,49,False,0
54702,1,9989,63473691,L,MLO,60.0,0,0,0,,0,C,216,False,0
54703,1,9989,1078943060,L,CC,60.0,0,0,0,,0,C,216,False,0
54704,1,9989,398038886,R,MLO,60.0,0,0,0,0.0,0,C,216,True,0


In [8]:
#3. Group the metadata by patient ID and laterality, so that each group
#   holds every image row for one breast (left or right) of one patient.
grouped = metadata.groupby(["patient_id", "laterality"])



In [9]:
grouped.groups.keys()

dict_keys([(5, 'L'), (5, 'R'), (25, 'L'), (25, 'R'), (28, 'L'), (28, 'R'), (30, 'L'), (30, 'R'), (33, 'L'), (33, 'R'), (36, 'L'), (36, 'R'), (45, 'L'), (45, 'R'), (49, 'L'), (49, 'R'), (65, 'L'), (65, 'R'), (71, 'L'), (71, 'R'), (72, 'L'), (72, 'R'), (79, 'L'), (79, 'R'), (87, 'L'), (87, 'R'), (90, 'L'), (90, 'R'), (105, 'L'), (105, 'R'), (106, 'L'), (106, 'R'), (111, 'L'), (111, 'R'), (115, 'L'), (115, 'R'), (122, 'L'), (122, 'R'), (126, 'L'), (126, 'R'), (127, 'L'), (127, 'R'), (128, 'L'), (128, 'R'), (129, 'L'), (129, 'R'), (142, 'L'), (142, 'R'), (152, 'L'), (152, 'R'), (158, 'L'), (158, 'R'), (162, 'L'), (162, 'R'), (177, 'L'), (177, 'R'), (188, 'L'), (188, 'R'), (194, 'L'), (194, 'R'), (204, 'L'), (204, 'R'), (209, 'L'), (209, 'R'), (211, 'L'), (211, 'R'), (233, 'L'), (233, 'R'), (236, 'L'), (236, 'R'), (243, 'L'), (243, 'R'), (247, 'L'), (247, 'R'), (259, 'L'), (259, 'R'), (260, 'L'), (260, 'R'), (262, 'L'), (262, 'R'), (263, 'L'), (263, 'R'), (272, 'L'), (272, 'R'), (278, 'L'),

In [10]:
def build_path_to_images(patient_id, image_id):
    """Return the relative path of one training DICOM file.

    The layout is ``train_images/<patient_id>/<image_id>.dcm`` under the
    RSNA data folder; both identifiers are converted with ``str``.
    """
    return f"../data/rsna-breast-cancer-detection/train_images/{patient_id!s}/{image_id!s}.dcm"

In [11]:
build_path_to_images(5, 640805896)

'../data/rsna-breast-cancer-detection/train_images/5/640805896.dcm'

In [12]:
def get_dicom_path(directory_path):
    """Return the path of the first ``.dcm`` file found under a directory.

    The directory tree is walked recursively and the extension check is
    case-insensitive. Sub-directories and file names are visited in sorted
    order so that the same file is returned on every run, regardless of
    the order in which the filesystem lists entries.

    Args:
        directory_path: Root directory to search.

    Returns:
        The full path of the first DICOM file found, or ``None`` when the
        tree contains no ``.dcm`` file (or the directory does not exist).
    """
    for dirpath, dirnames, filenames in os.walk(directory_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirnames.sort()
        for filename in sorted(filenames):
            if filename.lower().endswith(".dcm"):
                return os.path.join(dirpath, filename)
    return None


In [13]:
def min_max_normalize(values):
    """Rescale an array linearly so its minimum maps to 0 and its maximum to 1.

    The input is converted to float32 first. A constant array (minimum
    equal to maximum) yields an all-zero float32 array of the same shape
    instead of dividing by zero.
    """
    data = values.astype(np.float32)
    lower, upper = np.min(data), np.max(data)
    if lower != upper:
        return (data - lower) / (upper - lower)
    # Flat input: there is no range to stretch.
    return np.zeros_like(data, dtype=np.float32)


In [14]:
def to_uint8(a):
    """Convert an image to uint8 (0-255) with robust contrast stretching.

    Intensities are clipped to the 0.5th-99.5th percentile range before
    scaling, so a few extreme pixels do not compress the useful range.
    If the percentile range is degenerate, the full min-max range is used
    instead.

    Args:
        a: Array-like of numeric pixel values, any shape.

    Returns:
        A uint8 array with the same shape as ``a``. Empty input and
        constant input (no intensity range to stretch) both return
        all zeros.
    """
    a = np.asarray(a, dtype=np.float32)
    if a.size == 0:
        # Percentiles and min/max are undefined for an empty array.
        return np.zeros(a.shape, dtype=np.uint8)

    lo, hi = np.percentile(a, [0.5, 99.5])
    if hi <= lo:
        lo, hi = float(a.min()), float(a.max())
    if hi <= lo:
        # Constant image: scaling would compute 0/0, so return black.
        return np.zeros(a.shape, dtype=np.uint8)

    a = np.clip(a, lo, hi)
    return ((a - lo) / (hi - lo) * 255).astype(np.uint8)

In [15]:
#Establish patch size
#establish the number of rows and columns you have in patches.
#num_rows_patches = image_height// patch size
#num_columns_patches = image_width// patch size

#create a phantom image patch_black, in the form of a nested boolean array of size num_rows_patches x num_columns_patches

#for i in num_rows_patches
    #for j in num_columns_patches
        #obtain the pixels in that patch, y1 is i * patch size, y2 is (i+1) * patch size
        #x1 is j * patch size, x2 is (j + 1) * patch size
        #all pixels in the patch, pixel_patch, are given by image[y1:y2, x1:x2]
        #create a mask array, for each pixel, if it is between the lower and upper threshold, mark it as true (1). Otherwise false (0)
        #Take the average of the mask array. 
        #Since all values of the mask array are between 0 and 1, the average represents the fraction of the pixels that are black
        #If the fraction of pixels that are black is above threshold, save that pixel patch in the patch_black as True.

#Determine Which columns have all black patches
#Create a mask of columns "keep_pixel_columns" which determines which columns will be kept, where all patches are initialized to true.
#iterate over each column, and if that column is determined to be all black, set that column to false.

In [16]:
def crop_image(image, patch_size=40,
               black_low: int = 0, black_high=300,
               patch_black_ratio=0.75):
    """Remove vertical strips of a 2-D image that are entirely "black".

    The image is tiled into ``patch_size`` x ``patch_size`` patches (edge
    patches may be smaller). A patch counts as black when at least
    ``patch_black_ratio`` of its pixels lie in ``[black_low, black_high]``.
    A strip of columns one patch wide is dropped only when every patch in
    that strip is black.

    Returns the image restricted to the surviving columns. When every
    column would be dropped, the original image is returned unchanged.
    """
    n_rows, n_cols = image.shape
    # Ceiling division: a partial patch at the edge still counts as a patch.
    tiles_down = (n_rows + patch_size - 1) // patch_size
    tiles_across = (n_cols + patch_size - 1) // patch_size

    # Per-pixel "is black" mask, computed once for the whole image.
    in_range = (image >= black_low) & (image <= black_high)

    tile_is_black = np.zeros((tiles_down, tiles_across), dtype=bool)
    for r, c in np.ndindex(tiles_down, tiles_across):
        top, left = r * patch_size, c * patch_size
        # Slices past the image border are clamped by NumPy.
        tile = in_range[top:top + patch_size, left:left + patch_size]
        tile_is_black[r, c] = tile.mean() >= patch_black_ratio

    # A strip is discarded only if all of its patches are black.
    strip_is_black = tile_is_black.all(axis=0)

    # Expand the per-strip flags to per-pixel-column flags, trimmed to width.
    keep_cols = ~np.repeat(strip_is_black, patch_size)[:n_cols]

    # Safety clause: never return an image with zero columns.
    if keep_cols.any():
        return image[:, keep_cols]
    return image

In [17]:
def get_pixel_data(path):
    """Load one DICOM file and return a cropped, normalised 512x512 image.

    Processing steps:
      1. Read the pixel array and apply the VOI LUT / windowing.
      2. Invert MONOCHROME1 images so that low values are dark. Without
         this the background is bright and the black-column crop below
         cannot detect it.
      3. Resize to 512x512, drop column strips that are entirely black
         (see ``crop_image``), and resize back to 512x512.
      4. Min-max normalise with ``min_max_normalize``.

    Args:
        path: Path to a ``.dcm`` file.

    Returns:
        The result of ``min_max_normalize``: a float32 array scaled to
        [0, 1], 512x512 for a single-frame grayscale image.
    """
    dicom_file = pydicom.dcmread(path)
    pixels = dicom_file.pixel_array
    pixels = apply_voi_lut(pixels, dicom_file)

    # MONOCHROME1 means the minimum value is displayed as white.
    if getattr(dicom_file, "PhotometricInterpretation", None) == "MONOCHROME1":
        pixels = pixels.max() - pixels

    pixels = cv2.resize(pixels, (512, 512))
    # NOTE(review): black_high=100 is compared against VOI-LUT output
    # values, whose scale depends on the file's bit depth - confirm it
    # suits all scanners.
    pixels = crop_image(pixels, patch_size=32,
                        black_low=0, black_high=100,
                        patch_black_ratio=0.6)
    # Cropping changes the width, so resize again to restore 512x512.
    pixels = cv2.resize(pixels, (512, 512))
    pixels = min_max_normalize(pixels)

    return pixels

# Sequential preprocessing: for every (patient, laterality) group, build a
# two-channel CC/MLO tensor, save it as .npy and record it in a metadata list.
breast_objects = []

for (patient_id, laterality), group in grouped:
    cc_rows  = group[group["view"] == "CC"]
    mlo_rows = group[group["view"] == "MLO"]

    if cc_rows.empty or mlo_rows.empty:
        # skip incomplete sides.
        continue

    # If a side has several images of the same view, only the first is used.
    cc_row = cc_rows.iloc[0]
    mlo_row = mlo_rows.iloc[0]

    cc_path = build_path_to_images(cc_row["patient_id"], cc_row["image_id"])
    mlo_path = build_path_to_images(mlo_row["patient_id"], mlo_row["image_id"])
    # The label is read from the first row of the group.
    label   = int(group.iloc[0]["label"])

    cc_image = get_pixel_data(cc_path)
    mlo_image = get_pixel_data(mlo_path)

    # stack along last axis to get the input tensor to shape (H, W, 2)
    tensor = np.stack([cc_image, mlo_image], axis=-1)

    patient_directory = f"../data/Processed Data RSNA/{patient_id}"
    # makedirs also creates the "Processed Data RSNA" parent folder when it
    # does not exist yet (os.mkdir would raise FileNotFoundError), and
    # exist_ok covers the second laterality of the same patient.
    os.makedirs(patient_directory, exist_ok=True)

    tensor_path = f"{patient_directory}/{patient_id}_{laterality}.npy"
    np.save(tensor_path, tensor)

    #Can still use dictionary object.
    breast_object = {
        "patient_id": patient_id,
        "laterality": laterality,
        "tensor_path": tensor_path,
        "label": label
    }
    #Append to total list of breast objects.
    breast_objects.append(breast_object)

#Convert the dictionary list to a pandas dataframe, and use pandas' to_csv function to save it as a csv file.
pd.DataFrame(breast_objects).to_csv("../data/meta_data_RSNA.csv")


In [18]:
from concurrent.futures import ThreadPoolExecutor

def process_patient(patient_id, laterality, group):
    """Build and save the CC/MLO tensor for one breast of one patient.

    ``group`` holds the metadata rows for this patient and laterality.
    The first CC row and the first MLO row are used; both images are
    loaded with ``get_pixel_data``, stacked on the last axis and saved
    as a float16 ``.npy`` file in a per-patient folder.

    Returns a dict with ``patient_id``, ``laterality``, ``tensor_path``
    and ``label``, or ``None`` when either view is missing.
    """
    rows_by_view = {view: group[group["view"] == view] for view in ("CC", "MLO")}
    if any(rows.empty for rows in rows_by_view.values()):
        return None  # incomplete side: one of the two views is absent

    first_rows = (rows_by_view["CC"].iloc[0], rows_by_view["MLO"].iloc[0])
    image_paths = [
        build_path_to_images(row["patient_id"], row["image_id"])
        for row in first_rows
    ]
    # The label is read from the first row of the group.
    label = int(group.iloc[0]["label"])

    # Channel 0 is the CC view, channel 1 the MLO view -> shape (H, W, 2).
    channels = [get_pixel_data(image_path) for image_path in image_paths]
    stacked = np.stack(channels, axis=-1)

    out_dir = f"../data/Processed Data RSNA/{patient_id}"
    os.makedirs(out_dir, exist_ok=True)

    out_file = f"{out_dir}/{patient_id}_{laterality}.npy"
    # Saved as float16.
    np.save(out_file, stacked.astype(np.float16))

    return {
        "patient_id": patient_id,
        "laterality": laterality,
        "tensor_path": out_file,
        "label": label,
    }

# ----------------------------
# Main loop with threading
# ----------------------------
def build_dataset(grouped, max_workers=8, out_csv="../data/meta_data_RSNA.csv"):
    """Run ``process_patient`` over all groups in a thread pool and save the index.

    ``grouped`` yields ``((patient_id, laterality), group)`` pairs. Every
    group is submitted to the pool first; results are then collected in
    submission order and groups that returned ``None`` are dropped.

    The surviving records are written to ``out_csv`` (without the index
    column) and also returned as a DataFrame.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [
            pool.submit(process_patient, pid, side, rows)
            for (pid, side), rows in grouped
        ]
        # .result() re-raises any exception raised inside a worker.
        outcomes = [job.result() for job in pending]

    records = [record for record in outcomes if record is not None]

    summary = pd.DataFrame(records)
    summary.to_csv(out_csv, index=False)

    return summary

In [19]:
# Run the threaded preprocessing over every (patient_id, laterality) group with 16 workers;
# `meta` is the DataFrame that build_dataset also writes to its default CSV path.
meta = build_dataset(grouped, max_workers=16)