#This code snippet preprocesses the CBIS-DDSM dataset into a dataset that is ready for input into the model.

#Pseudocode:

1. Read in the "all_data.csv" file which contains metadata of all images and put it in a pandas DF.
    Each row has data on a patient image.
    Patients can have a CC and MLO image per side. 
    Goal: create an object for 1 breast imaging study, left and right are kept separate. 
2. Map the pathology column to a labels column, which will be 0 for benign or benign without callback, and 1 for malignant.
3. Group the metadata rows by patient ID and laterality.
4. For every grouping in groups:
    Set variable patientID to the patient ID
    Set variable laterality to the breast side
    Set variable label to the mapped label.
    Extract the path to the CC image.
    Extract the path to the MLO image.

    (to do- write a function that takes in a path to an image and returns the pixel data)
    (to do- write a function that takes in raw pixel data and outputs a grayscaled, rescaled image)
    
    Set variable CC_image to hold processed pixel data for Cranio-Caudal image.
    Set variable MLO_image to hold processed pixel data for Mediolateral Oblique image.

    Package CC and MLO image into a tensor. Should have a shape (400,400,2)

    Generate a path to a tensor file and save it to tensor_file variable. 
    Save the tensor into the numpy file, .npy 

    Create a breast object that holds properties: patientID, laterality, label, tensor_file.

    Append the breast object to a breast objects list.

    End For Loop
   
   Write the list of breast objects to a JSON file.
    


In [26]:
import os
import pydicom
import numpy as np
import pandas as pd
import re
import cv2
import json
from pydicom.pixel_data_handlers.util import apply_voi_lut
from pydicom.pixel_data_handlers.util import apply_modality_lut, apply_voi_lut

In [2]:
# Root folder of the CBIS-DDSM image series.
# NOTE(review): get_dicom_path hard-codes this same string instead of reading this variable.
data_directory = "../data/Data/manifest/CBIS-DDSM/"
# CSV holding the metadata rows for all images.
metadata_file = "../data/Data/manifest/all_data.csv"

In [3]:
#1. Import the metadata
# Load the whole metadata CSV into a pandas DataFrame.
metadata = pd.read_csv(metadata_file)

In [4]:
metadata

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00038,2,LEFT,CC,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...
1,P_00038,2,LEFT,MLO,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_MLO/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....
2,P_00038,2,RIGHT,CC,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....
3,P_00038,2,RIGHT,CC,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....
4,P_00038,2,RIGHT,MLO,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3563,P_02033,2,RIGHT,MLO,1,mass,IRREGULAR,ILL_DEFINED,3,MALIGNANT,4,Mass-Training_P_02033_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_02033_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_02033_RIGHT_MLO_1/1.3.6.1.4.1....
3564,P_02079,2,RIGHT,CC,1,mass,ROUND,SPICULATED,3,MALIGNANT,5,Mass-Training_P_02079_RIGHT_CC/1.3.6.1.4.1.959...,Mass-Training_P_02079_RIGHT_CC_1/1.3.6.1.4.1.9...,Mass-Training_P_02079_RIGHT_CC_1/1.3.6.1.4.1.9...
3565,P_02079,2,RIGHT,MLO,1,mass,ROUND,SPICULATED,3,MALIGNANT,5,Mass-Training_P_02079_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_02079_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_02079_RIGHT_MLO_1/1.3.6.1.4.1....
3566,P_02092,2,LEFT,CC,1,mass,IRREGULAR,SPICULATED,3,MALIGNANT,2,Mass-Training_P_02092_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_02092_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_02092_LEFT_CC_1/1.3.6.1.4.1.95...


In [5]:
#2. Derive a binary "label" column from the pathology column.
# Both benign variants collapse to 0; malignant becomes 1.
label_mapping = dict(
    [
        ("BENIGN", 0),
        ("BENIGN_WITHOUT_CALLBACK", 0),
        ("MALIGNANT", 1),
    ]
)

# Pathology values that are not keys of label_mapping end up as NaN.
metadata["label"] = metadata.loc[:, "pathology"].map(label_mapping)

In [6]:
metadata["label"]

0       0
1       0
2       0
3       0
4       0
       ..
3563    1
3564    1
3565    1
3566    1
3567    1
Name: label, Length: 3568, dtype: int64

In [7]:
#Truncate each image path to its series folder: keep everything up to and
#including the last "CC/" or "MLO/" (the ".*" is greedy) and drop the rest.
#Example result: "Calc-Test_P_00038_LEFT_CC/".
metadata["truncated_path"] = metadata["image file path"].str.extract(r"(.*(?:CC|MLO)/)")

In [8]:
metadata

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path,label,truncated_path
0,P_00038,2,LEFT,CC,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...,0,Calc-Test_P_00038_LEFT_CC/
1,P_00038,2,LEFT,MLO,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_MLO/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....,0,Calc-Test_P_00038_LEFT_MLO/
2,P_00038,2,RIGHT,CC,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....,0,Calc-Test_P_00038_RIGHT_CC/
3,P_00038,2,RIGHT,CC,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....,0,Calc-Test_P_00038_RIGHT_CC/
4,P_00038,2,RIGHT,MLO,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...,0,Calc-Test_P_00038_RIGHT_MLO/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3563,P_02033,2,RIGHT,MLO,1,mass,IRREGULAR,ILL_DEFINED,3,MALIGNANT,4,Mass-Training_P_02033_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_02033_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_02033_RIGHT_MLO_1/1.3.6.1.4.1....,1,Mass-Training_P_02033_RIGHT_MLO/
3564,P_02079,2,RIGHT,CC,1,mass,ROUND,SPICULATED,3,MALIGNANT,5,Mass-Training_P_02079_RIGHT_CC/1.3.6.1.4.1.959...,Mass-Training_P_02079_RIGHT_CC_1/1.3.6.1.4.1.9...,Mass-Training_P_02079_RIGHT_CC_1/1.3.6.1.4.1.9...,1,Mass-Training_P_02079_RIGHT_CC/
3565,P_02079,2,RIGHT,MLO,1,mass,ROUND,SPICULATED,3,MALIGNANT,5,Mass-Training_P_02079_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_02079_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_02079_RIGHT_MLO_1/1.3.6.1.4.1....,1,Mass-Training_P_02079_RIGHT_MLO/
3566,P_02092,2,LEFT,CC,1,mass,IRREGULAR,SPICULATED,3,MALIGNANT,2,Mass-Training_P_02092_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_02092_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_02092_LEFT_CC_1/1.3.6.1.4.1.95...,1,Mass-Training_P_02092_LEFT_CC/


In [9]:
#3. Group the metadata by same patient IDs and laterality
# Each group collects every metadata row that shares one (patient_id, breast side) pair.
grouped = metadata.groupby(["patient_id", "left or right breast"])

In [10]:
grouped.groups.keys()

dict_keys([('P_00001', 'LEFT'), ('P_00004', 'LEFT'), ('P_00004', 'RIGHT'), ('P_00005', 'RIGHT'), ('P_00007', 'LEFT'), ('P_00008', 'LEFT'), ('P_00008', 'RIGHT'), ('P_00009', 'RIGHT'), ('P_00010', 'LEFT'), ('P_00011', 'LEFT'), ('P_00012', 'LEFT'), ('P_00013', 'RIGHT'), ('P_00014', 'LEFT'), ('P_00015', 'LEFT'), ('P_00016', 'LEFT'), ('P_00017', 'LEFT'), ('P_00018', 'RIGHT'), ('P_00019', 'RIGHT'), ('P_00020', 'LEFT'), ('P_00021', 'LEFT'), ('P_00021', 'RIGHT'), ('P_00022', 'LEFT'), ('P_00023', 'RIGHT'), ('P_00024', 'LEFT'), ('P_00026', 'LEFT'), ('P_00027', 'RIGHT'), ('P_00028', 'LEFT'), ('P_00029', 'LEFT'), ('P_00030', 'LEFT'), ('P_00030', 'RIGHT'), ('P_00031', 'LEFT'), ('P_00032', 'RIGHT'), ('P_00034', 'RIGHT'), ('P_00037', 'RIGHT'), ('P_00038', 'LEFT'), ('P_00038', 'RIGHT'), ('P_00039', 'RIGHT'), ('P_00041', 'LEFT'), ('P_00043', 'LEFT'), ('P_00044', 'RIGHT'), ('P_00045', 'LEFT'), ('P_00046', 'RIGHT'), ('P_00047', 'LEFT'), ('P_00048', 'RIGHT'), ('P_00049', 'RIGHT'), ('P_00051', 'LEFT'), ('P

In [11]:
def get_dicom_path(directory_path, base_directory="../data/Data/manifest/CBIS-DDSM/"):
    """Return the paths of all DICOM files found under a series folder.

    Args:
        directory_path: Folder to search, relative to ``base_directory``
            (for example a value from the ``truncated_path`` column).
        base_directory: Root folder that ``directory_path`` is joined to.
            Defaults to the CBIS-DDSM manifest folder used by this notebook.

    Returns:
        A sorted list with one path per file whose name ends in ``.dcm``
        (case-insensitive), searched recursively. The list is empty when the
        folder does not exist or holds no such file.
    """
    search_root = os.path.join(base_directory, directory_path)
    dicom_paths = []
    for dirpath, _, filenames in os.walk(search_root):
        dicom_paths.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if filename.lower().endswith(".dcm")
        )
    # os.walk yields entries in arbitrary order; sort so callers that take
    # element [0] get the same file on every run and every machine.
    dicom_paths.sort()
    return dicom_paths

In [12]:

# Look up the DICOM file(s) for the first metadata row.
# NOTE: despite the singular name, get_dicom_path returns a list of paths.
dicom_path = get_dicom_path(metadata.iloc[0]["truncated_path"])


In [25]:
metadata.iloc[1]["truncated_path"]

'Calc-Test_P_00038_RIGHT_MLO/'

In [14]:
def min_max_normalize(values):
    """Linearly rescale ``values`` so the minimum maps to 0 and the maximum to 1.

    Args:
        values: Array-like of numbers (for example image pixel data).

    Returns:
        A NumPy array with the same shape as ``values``. Integer and boolean
        input yields float64; floating-point input keeps its dtype. When every
        element is equal the result is all zeros rather than NaN.

    Raises:
        ValueError: If ``values`` is empty (raised by ``np.min``).
    """
    values = np.asarray(values)
    if values.dtype.kind in "iub":
        # Work in float so ``max - min`` cannot wrap around on narrow integer
        # types (e.g. int8 or int16 spanning their full range).
        values = values.astype(np.float64)

    min_val = np.min(values)
    value_range = np.max(values) - min_val
    if value_range == 0:
        # Constant input: avoid 0/0. Dividing by 1 leaves the zeros intact.
        value_range = 1
    return (values - min_val) / value_range

In [21]:
def get_pixel_data(path, size=(512, 512)):
    """Load a DICOM image, apply its VOI LUT / windowing, and resize it.

    The result is not "raw" pixel data: the values go through pydicom's
    ``apply_voi_lut`` and are then resampled by ``cv2.resize``. No intensity
    normalization is applied here; call ``min_max_normalize`` on the result
    if values in [0, 1] are needed.

    Args:
        path: Path to a DICOM file.
        size: Target size handed to ``cv2.resize`` as ``(width, height)``.
            Defaults to ``(512, 512)``, the size this notebook has always used.

    Returns:
        The resized pixel array. For the sample file inspected in this
        notebook the shape is ``(512, 512)``.
    """
    dicom_file = pydicom.dcmread(path)
    pixels = dicom_file.pixel_array
    pixels = apply_voi_lut(pixels, dicom_file)
    # cv2.resize requires a tuple for dsize; accept lists as well.
    return cv2.resize(pixels, tuple(size))


# Smoke test: process the first DICOM path found by the get_dicom_path call above.
pixel_data = get_pixel_data(dicom_path[0])
    

In [22]:
print(pixel_data.shape)

(512, 512)


# Build one record per (patient, breast side), save its CC + MLO images as a
# single .npy file, and write all records to ../data/meta_data.json.
output_dir = "../data/Processed Data"
# np.save does not create missing folders; without this every save would fail.
os.makedirs(output_dir, exist_ok=True)


def _first_view_path(group, view):
    """Return the first ``truncated_path`` of ``view`` in ``group``, or None if absent."""
    paths = group.loc[group["image view"] == view, "truncated_path"]
    return paths.iloc[0] if len(paths) else None


breast_objects = []
skipped = []  # (patient_id, laterality, reason) for every group left out
for (patient_id, laterality), group in grouped:
    # When a view has several rows (multiple abnormalities), take the first.
    cc_path = _first_view_path(group, "CC")
    mlo_path = _first_view_path(group, "MLO")
    if cc_path is None or mlo_path is None:
        skipped.append((patient_id, laterality, "missing CC or MLO view"))
        continue

    try:
        # Breast-level label: malignant if ANY abnormality row is malignant.
        # Taking only the first row could label such a breast as benign.
        label = int(group["label"].max())

        cc_image = get_pixel_data(get_dicom_path(cc_path)[0])
        mlo_image = get_pixel_data(get_dicom_path(mlo_path)[0])

        # Stacked along a new first axis: index 0 is CC, index 1 is MLO.
        # NOTE(review): the pseudocode at the top of the notebook asks for a
        # channels-last (400, 400, 2) tensor; the layout is kept as it was so
        # existing consumers of these files keep working.
        np.save(
            f"{output_dir}/{patient_id}_{laterality}.npy",
            np.stack([cc_image, mlo_image]),
        )
    except Exception as e:
        # Best effort: one unreadable study must not abort the export, but
        # the failure is recorded instead of being silently dropped.
        skipped.append((patient_id, laterality, f"{type(e).__name__}: {e}"))
        continue

    breast_objects.append(
        {
            "patient_id": patient_id,
            "laterality": laterality,
            "cc_path": cc_path,
            "mlo_path": mlo_path,
            "label": label,
        }
    )

with open("../data/meta_data.json", "w") as f:
    json.dump(breast_objects, f)

print(f"Saved {len(breast_objects)} entries; skipped {len(skipped)} groups.")
for patient_id, laterality, reason in skipped:
    print(f"  skipped {patient_id} {laterality}: {reason}")


In [23]:
import os
import json

# Export every (patient, breast side) group as a stacked CC + MLO .npy file,
# routed into a train or test folder, with one metadata JSON per split.

# Create output dirs
train_dir = "../data/Processed Data/train"
test_dir = "../data/Processed Data/test"
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

train_objects = []
test_objects = []
skipped = []  # (patient_id, laterality, reason) for every group left out


def _first_view_path(group, view):
    """Return the first ``truncated_path`` of ``view`` in ``group``, or None if absent."""
    paths = group.loc[group["image view"] == view, "truncated_path"]
    return paths.iloc[0] if len(paths) else None


for (patient_id, laterality), group in grouped:
    # When a view has several rows (multiple abnormalities), take the first.
    cc_path = _first_view_path(group, "CC")
    mlo_path = _first_view_path(group, "MLO")
    if cc_path is None or mlo_path is None:
        skipped.append((patient_id, laterality, "missing CC or MLO view"))
        continue

    try:
        # Breast-level label: malignant if ANY abnormality row is malignant.
        # Taking only the first row could label such a breast as benign.
        label = int(group["label"].max())

        # Decide split dir: a group goes to test when either series folder
        # name contains "test"; everything else (including names with
        # neither "test" nor "train") goes to train.
        is_test = "test" in cc_path.lower() or "test" in mlo_path.lower()
        split_dir = "test" if is_test else "train"
        save_dir = test_dir if is_test else train_dir
        meta_list = test_objects if is_test else train_objects

        # Load images
        cc_image = get_pixel_data(get_dicom_path(cc_path)[0])
        mlo_image = get_pixel_data(get_dicom_path(mlo_path)[0])

        # Save processed .npy file; index 0 is CC, index 1 is MLO.
        save_path = os.path.join(save_dir, f"{patient_id}_{laterality}.npy")
        np.save(save_path, np.stack([cc_image, mlo_image]))
    except Exception as e:
        # Best effort: one unreadable study must not abort the export, but
        # the failure is recorded instead of being silently dropped.
        skipped.append((patient_id, laterality, f"{type(e).__name__}: {e}"))
        continue

    # Add to the appropriate metadata list
    meta_list.append(
        {
            "patient_id": patient_id,
            "laterality": laterality,
            "processed_path": save_path,
            "label": label,
            "split": split_dir,
        }
    )

# Save separate metadata files
with open("../data/meta_train.json", "w") as f:
    json.dump(train_objects, f)

with open("../data/meta_test.json", "w") as f:
    json.dump(test_objects, f)

print(f"Saved {len(train_objects)} train entries and {len(test_objects)} test entries.")
if skipped:
    print(f"Skipped {len(skipped)} groups:")
    for patient_id, laterality, reason in skipped:
        print(f"  {patient_id} {laterality}: {reason}")


Saved 1054 train entries and 270 test entries.
