# **RSNA Breast Cancer Detector.**

## ML:

**Librerías**

In [9]:
#Librerias
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import numpy as np
import pandas as pd
import tensorflow as tf
import pydicom
from pathlib import Path

# Constants
# Relative path of the folder that is searched recursively for the *.dcm training images.
IMGFOLDER_PATH = "../data/raw/train_images"


In [2]:
def ReadData():
    """Load the cleaned dataset from ``../data/processed/clean_data.csv``.

    Prints a progress banner, then the frame's shape and a five-row preview.
    Any exception raised while reading or reporting is caught and printed
    instead of propagating.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when loading failed.
    """
    print("Reading Dataset...\n==========\n")

    try:
        frame = pd.read_csv("../data/processed/clean_data.csv")
        report = f"Dataset dimensions: {frame.shape}\n\nData Preview:\n\n{frame.head(5)}\n\n · Successfully data readed"
        print(report)
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"...Failed data readed. Error: {err}")
        return None

    return frame

# Load the cleaned metadata table; ReadData returns None if the read failed.
data = ReadData()

Reading Dataset...

Dataset dimensions: (54668, 11)

Data Preview:

   patient_id    image_id   age  cancer  biopsy  invasive  BIRADS  implant  \
0       10006   462822612  61.0       0       0         0     4.0        0   
1       10006  1459541791  61.0       0       0         0     4.0        0   
2       10006  1864590858  61.0       0       0         0     4.0        0   
3       10006  1874946579  61.0       0       0         0     4.0        0   
4       10011   220375232  55.0       0       0         0     0.0        0   

   density  laterality  view  
0      2.0         0.0   0.0  
1      2.0         0.0   1.0  
2      2.0         1.0   1.0  
3      2.0         1.0   0.0  
4      1.0         0.0   0.0  

 · Successfully data readed


In [3]:
def GetPaths(root, dataset):
    """Map every DICOM file found under ``root`` to its ``cancer`` label.

    For each ``*.dcm`` file, the name of its parent folder is matched against
    the ``patient_id`` column and the file stem against the ``image_id``
    column. Both columns are compared as strings.

    Args:
        root: Folder (str or Path) searched recursively for ``*.dcm`` files.
        dataset: DataFrame with ``patient_id``, ``image_id`` and ``cancer``
            columns.

    Returns:
        dict mapping each matched ``Path`` to its integer label, in discovery
        order. Files without a matching row are reported and skipped. An empty
        dict is returned when ``dataset`` cannot be indexed (for example when
        it is None or lacks one of the required columns).
    """
    root = Path(root)  # str -> Path
    list_path = list(root.rglob("*.dcm"))

    # Index the labels once by (patient_id, image_id), both as strings, so that
    # each file is a dictionary lookup instead of a scan over the whole table.
    try:
        label_by_key = {}
        rows = zip(
            dataset["patient_id"].astype(str),
            dataset["image_id"].astype(str),
            dataset["cancer"],
        )
        for patient_id, img_id, target in rows:
            # setdefault keeps the first row when a key appears more than once.
            label_by_key.setdefault((patient_id, img_id), target)
    except Exception as e:
        print(f"Error: {e} while indexing the dataset.")
        return {}

    map_label = {}

    for p in list_path:
        # Parent folder name -> patient id, file stem -> image id.
        key = (p.parent.name, p.stem)

        if key not in label_by_key:
            print("No coincidences in dataset.")
            continue

        try:
            map_label[p] = int(label_by_key[key])
        except Exception as e:
            # e.g. a missing/NaN label that cannot be converted to int.
            print(f"Error: {e} in row {p}.")

    return map_label



# Map each .dcm file under IMGFOLDER_PATH that matches a row of `data` to its cancer label.
mappingIMG = GetPaths(IMGFOLDER_PATH, data)

In [20]:
# Helper that normalizes the pixel intensities of an image (it does not change its size).

def NormalizeIMG(img):
    """Scale an image by its maximum value and expand 2-D input to 3 channels.

    Args:
        img: NumPy array holding the image pixels.

    Returns:
        The image cast to float32 and, when its maximum is positive, divided
        by that maximum. A 2-D (grayscale) input is replicated into three
        identical channels along a new last axis.
    """
    # The peak is taken from the input as given, before the float32 cast.
    peak = np.max(img)
    scaled = img.astype(np.float32)

    if peak > 0:
        scaled = scaled / peak

    # Grayscale (height x width only): repeat the plane to get an RGB-like array.
    if scaled.ndim == 2:
        scaled = np.repeat(scaled[..., np.newaxis], 3, axis=-1)

    return scaled

In [21]:
# Load one DICOM image from disk and return it as a tensor.
def ImgLoader(path):
    """Read a DICOM file, normalize it and resize it to 224x224.

    Args:
        path: Object whose ``.numpy()`` returns the file path as UTF-8 bytes
            (this is how ImgToTensor passes it through ``tf.py_function``).

    Returns:
        The image as a tensor resized to 224x224, or ``None`` (after printing
        the error) when the path cannot be decoded.
    """
    # The path arrives as a TensorFlow string, so decode it to a plain str first.
    try:
        file_name = path.numpy().decode("utf-8")
    except Exception as err:
        print(f"Error: {err}")
        return None

    # Read the DICOM file and take its pixel data as float32.
    # NOTE(review): PhotometricInterpretation is not inspected here; confirm
    # whether any files need their intensities inverted before normalizing.
    pixels = pydicom.dcmread(file_name).pixel_array.astype(np.float32)

    # Normalize the intensities, then resize to 224x224.
    scaled = NormalizeIMG(pixels)
    resized = tf.image.resize(scaled, [224, 224])

    # Hand the result back as a tensor.
    return tf.convert_to_tensor(resized)


In [22]:
# Dataset map function: turn a (path, label) pair into an (image, label) pair.
def ImgToTensor(path, label):
    """Load the image at ``path`` through ImgLoader and pair it with ``label``.

    Args:
        path: Image path element of the dataset.
        label: Label element of the dataset; returned unchanged.

    Returns:
        Tuple ``(image, label)`` where ``image`` is the float32 output of
        ImgLoader with its shape declared as (224, 224, 3).
    """
    image = tf.py_function(func=ImgLoader, inp=[path], Tout=tf.float32)
    # Declare the static shape explicitly: 224x224 with 3 channels.
    image.set_shape((224, 224, 3))
    return image, label

In [26]:
def TensorMaker(datalabels, batch_size=32):
    """Build a batched ``tf.data.Dataset`` of (image, label) pairs.

    Args:
        datalabels: Mapping of image path -> integer label.
        batch_size: Number of samples per batch. Defaults to 32, the value
            that was previously hard-coded.

    Returns:
        A ``tf.data.Dataset`` whose elements are produced by mapping
        ImgToTensor over the (path, label) pairs, then batched and prefetched.
    """
    # Split the mapping into parallel lists; paths are stored as plain strings.
    path_list = [str(p) for p in datalabels]  # Path -> str
    label_list = list(datalabels.values())

    # The dtype is given explicitly so that an empty mapping still yields a
    # string tensor instead of the float32 default inferred for an empty list.
    tensor_paths = tf.constant(path_list, dtype=tf.string)
    tensor_labels = tf.constant(label_list, dtype=tf.int32)

    # Build the dataset from the tensors and load each image in parallel.
    data_tensor = tf.data.Dataset.from_tensor_slices((tensor_paths, tensor_labels))
    data_tensor = data_tensor.map(ImgToTensor, num_parallel_calls=tf.data.AUTOTUNE)

    # Batch and prefetch.
    data_tensor = data_tensor.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return data_tensor

# Build the batched image/label dataset from the path -> label mapping.
dataTensor = TensorMaker(mappingIMG)

In [27]:
# Sanity check: take one batch and print the image and label batch shapes.

for img, label in dataTensor.take(1):
    print(f"Image shape: {img.shape}")
    print(f"Label batch shape: {label.shape}")

Image shape: (32, 224, 224, 3)
Label batch shape: (32,)
