----
# Convert Resized Mammograms to Sparse Matrix Representations and Organize Them Into Folders

In [1]:
# imports
import pandas as pd
import pydicom

import numpy as np
from scipy import sparse

import os
from skimage.transform import resize


In [7]:
# Load the patient table in which each row links a patient to a mammogram image id
df = pd.read_csv(filepath_or_buffer="../../data/train.csv")


In [4]:
def image_single_to_three_channel(arr):
    """Replicate a single-channel image across three channels.

    Args:
        arr (array-like): 2-D image (nested list or numpy array)

    Returns:
        numpy.ndarray: Float array of shape (rows, cols, 3) in which every
        channel holds a copy of the input image
    """
    # Read the image dimensions once
    rows, cols = np.array(arr).shape[0], np.array(arr).shape[1]

    # Zero-filled output buffer with three channels
    stacked = np.zeros((rows, cols, 3))

    # Copy the same pixel values into each of the three channels
    for channel in range(3):
        stacked[:, :, channel] = arr

    return stacked


def read_xray(path, channels):
    """Read the x-ray stored at the given path

    Args:
        path (string): Path to the .dcm x-ray file
        channels (int): Number of channels; when 3, the single channel image
            is replicated into three channels

    Returns:
        numpy.ndarray: Pixel array divided by 255, with the requested number
        of channels
    """
    # Read .dcm image using pydicom library
    # (dcmread replaces read_file, which is deprecated and removed in pydicom 3.0)
    dicom = pydicom.dcmread(path)

    # Get numpy array representation
    image = dicom.pixel_array

    # Check if channels need to be added
    if channels == 3:
        image = image_single_to_three_channel(image)

    # Scale pixel values by 255
    # NOTE(review): this only maps into [0, 1] for 8-bit pixel data; DICOM
    # images are often stored with a larger bit depth - confirm the intended range
    return image / 255


def create_directory(path):
    """Creates a directory on given path if it does not exist yet

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        path (string): Directory path

    Raises:
        FileExistsError: If the path exists but is not a directory
    """
    # makedirs with exist_ok avoids the check-then-create race of
    # os.path.exists + os.mkdir and also builds missing parent folders
    os.makedirs(path, exist_ok=True)


----
Store sparse matrices with an image size of `1000` px and `1000` records each

In [5]:
def save_processed_images(
    df, original_path, target_path, target_folder, IMG_PX_SIZE, channels=1
):
    """Extracts patient and image ids from given dataframe, finds the x-ray and
    stores sparse matrix representation to save storage room

    Each image is written to
    ``<target_path>/<target_folder>/<patient id>/<target_folder>_<patient id>_<row label>.npz``.
    Images whose output file already exists are skipped.

    Args:
        df (DataFrame): Pandas dataframe; the patient id is read from the
            column at position 1 and the image id from the column at position 2
        original_path (string): Source path where x-rays stored
        target_path (string): Base path to store all processed x-rays
        target_folder (string): Folder name to stored processed x-rays
        IMG_PX_SIZE (int): Image dimension to convert
        channels (int, optional): Image will get converted to 3 channels
            when set to 3. Defaults to 1.
    """
    # Normalize the base path so it works with or without a trailing slash
    base_dir = os.path.normpath(target_path)
    target_dir = os.path.join(base_dir, target_folder)

    # Create base path folder
    create_directory(base_dir)
    # Create target folder
    create_directory(target_dir)

    # Loop through rows in dataframe. The row label is only used in the file
    # name; the ids are taken positionally, so the dataframe index does not
    # have to be a fresh 0..n-1 range.
    for index, pat_id, image_id in zip(df.index, df.iloc[:, 1], df.iloc[:, 2]):
        # Create path to store sparse matrix
        patient_dir = os.path.join(target_dir, str(pat_id))
        out_file = os.path.join(
            patient_dir, f"{target_folder}_{pat_id}_{index}.npz"
        )

        # Skip images that were already processed, before doing any
        # expensive loading or resizing
        if os.path.isfile(out_file):
            continue

        # Build path to find dicom mammogram
        o_path = os.path.join(original_path, str(pat_id), f"{image_id}.dcm")

        # Load dicom image
        loaded_image = read_xray(o_path, channels)

        # Change image dimensions
        reshaped_image = resize(
            loaded_image, (IMG_PX_SIZE, IMG_PX_SIZE), anti_aliasing=True
        )

        # Convert numpy array of image to sparse matrix
        # NOTE(review): csr_matrix is 2-D, so a 3 channel image
        # (channels=3) presumably cannot be stored this way - confirm
        sparse_matrix = sparse.csr_matrix(np.array(reshaped_image))

        # Create a directory to store sparse matrix
        create_directory(patient_dir)

        # Save sparse matrix
        sparse.save_npz(out_file, sparse_matrix)


In [8]:
# Rows labelled as cancer, reduced to the two id columns; reset_index()
# prepends the previous row labels as the first column
cancer_only = df[df["cancer"] == 1][["patient_id", "image_id"]].reset_index()

# Rows labelled as no cancer, prepared the same way
no_cancer_only = df[df["cancer"] == 0][["patient_id", "image_id"]].reset_index()


In [64]:
# Write the 1000x1000 sparse matrices of the cancer images
target_path = "../../data/breast_imaging/"
target_folder = "cancer"
original_path = "../../data/train_images/"

save_processed_images(
    df=cancer_only,
    original_path=original_path,
    target_path=target_path,
    target_folder=target_folder,
    IMG_PX_SIZE=1000,
)


In [5]:
# Write the 1000x1000 sparse matrices of the no cancer images
target_path = "../../data/breast_imaging/"
target_folder = "no_cancer"
original_path = "../../data/train_images/"

save_processed_images(
    df=no_cancer_only,
    original_path=original_path,
    target_path=target_path,
    target_folder=target_folder,
    IMG_PX_SIZE=1000,
)


----
Store sparse matrices with an image size of `340` px and `all` records each

In [94]:
# Write the 340x340 sparse matrices of the cancer images
target_path = "../../data/breast_imaging_340px/"
target_folder = "cancer"
original_path = "../../data/train_images/"

save_processed_images(
    df=cancer_only,
    original_path=original_path,
    target_path=target_path,
    target_folder=target_folder,
    IMG_PX_SIZE=340,
)


In [100]:
# Load the no_dubs table and keep its no cancer rows
temp_df = pd.read_csv('./no_dubs.csv')

# Prepare the frame the same way as the other cancer / no cancer splits:
# keep only the id columns and reset the index, so that the patient id and
# image id sit at column positions 1 and 2 and the row labels run from 0,
# which is what save_processed_images reads positionally.
# NOTE(review): assumes no_dubs.csv has patient_id and image_id columns like
# the other csv files used here - confirm
no_cancer_only = temp_df[temp_df['cancer'] == 0][["patient_id", "image_id"]]
no_cancer_only = no_cancer_only.reset_index()

In [109]:
# Write the 340x340 sparse matrices of the no cancer images
target_path = "../../data/breast_imaging_340px/"
target_folder = "no_cancer"
original_path = "../../data/train_images/"

save_processed_images(
    df=no_cancer_only,
    original_path=original_path,
    target_path=target_path,
    target_folder=target_folder,
    IMG_PX_SIZE=340,
)


----
Under-sampled dataset

In [3]:
# Load the under-sampled training table and preview its first rows
under_df = pd.read_csv(filepath_or_buffer='../../data/under_sampling_train.csv')
under_df.head()

Unnamed: 0,patient_id,image_id,laterality,view,age,cancer
0,31046,147041592,0,0,58.0,0
1,10445,1559303163,0,1,57.0,0
2,36254,1559338389,1,1,70.0,0
3,27903,1559292995,0,1,47.0,0
4,43563,212890197,1,0,53.0,0


In [10]:
# Show the (rows, columns) dimensions of the under-sampled dataframe
under_df.shape

(2312, 6)

In [9]:
# Cancer rows of the under-sampled table, reduced to the two id columns;
# reset_index() prepends the previous row labels as the first column
cancer_only = under_df[under_df["cancer"] == 1][
    ["patient_id", "image_id"]
].reset_index()

# No cancer rows of the under-sampled table, prepared the same way
no_cancer_only = under_df[under_df["cancer"] == 0][
    ["patient_id", "image_id"]
].reset_index()


In [12]:
# Write the 2500x2500 sparse matrices of the cancer images
target_path = "../../data/breast_imaging_2500px/"
target_folder = "cancer"
original_path = "../../data/train_images/"

save_processed_images(
    df=cancer_only,
    original_path=original_path,
    target_path=target_path,
    target_folder=target_folder,
    IMG_PX_SIZE=2500,
)


In [13]:
# Write the 2500x2500 sparse matrices of the no cancer images
target_path = "../../data/breast_imaging_2500px/"
target_folder = "no_cancer"
original_path = "../../data/train_images/"

save_processed_images(
    df=no_cancer_only,
    original_path=original_path,
    target_path=target_path,
    target_folder=target_folder,
    IMG_PX_SIZE=2500,
)
