In [1]:
# Instalando dependencias
!pip install tensorflow pandas scikit-learn ydata-profiling matplotlib numpy seaborn opencv-python pillow pydicom scikit-image lime tf-keras-vis xlrd googletrans nest_asyncio torch torchvision numpy




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Importando dependencias
# Calculos
import numpy as np
import pandas as pd

# Visualizacion
import matplotlib.pyplot as plt
import seaborn as sns

# EDA
from ydata_profiling import ProfileReport

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Deep Learning
from tensorflow import keras
from tensorflow.keras import layers, models, applications
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import tensorflow as tf
import torch


# Manejo de imagenes
from PIL import Image
import cv2
import pydicom
from skimage import io, transform

# Explicabilidad
import lime
from lime import lime_image

# Utilidades
from pathlib import Path
from googletrans import Translator
import os
import asyncio
import nest_asyncio


nest_asyncio.apply()
translator = Translator()

def count_df_duplicates(df):
    duplicate_counts = []
    
    for col in df.columns:
        num_duplicates = df[col].duplicated().sum()
        duplicate_counts.append({"Column": col, "Duplicate Count": num_duplicates})
    
    # Create a summary DataFrame
    duplicate_summary = pd.DataFrame(duplicate_counts)
    
    # Sort by highest number of duplicates if you want
    return duplicate_summary.sort_values(by="Duplicate Count", ascending=False).reset_index(drop=True)

In [3]:
MASS_CASE_DESCRIPTION_TRAIN_SET = '../data/CBIS-DDSM/mass_case_description_train_set.csv'
MASS_CASE_DESCRIPTION_TEST_SET  = '../data/CBIS-DDSM/mass_case_description_test_set.csv'
CALC_CASE_DESCRIPTION_TRAIN_SET = '../data/CBIS-DDSM/calc_case_description_train_set.csv'
CALC_CASE_DESCRIPTION_TEST_SET  = '../data/CBIS-DDSM/calc_case_description_test_set.csv'
METADATA_CSV                    = '../data/CBIS-DDSM/metadata.csv'

mass_train_df = pd.read_csv(MASS_CASE_DESCRIPTION_TRAIN_SET)
mass_test_df = pd.read_csv(MASS_CASE_DESCRIPTION_TEST_SET)
calc_train_df = pd.read_csv(CALC_CASE_DESCRIPTION_TRAIN_SET)
calc_test_df = pd.read_csv(CALC_CASE_DESCRIPTION_TEST_SET)
metadata_df = pd.read_csv(METADATA_CSV)

In [4]:
print(len(mass_train_df) + len(mass_test_df) + len(calc_train_df) + len(calc_test_df))

3568


In [5]:
metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6776 entries, 0 to 6775
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Series UID            6776 non-null   object 
 1   Collection            6776 non-null   object 
 2   3rd Party Analysis    0 non-null      float64
 3   Data Description URI  6776 non-null   object 
 4   Subject ID            6776 non-null   object 
 5   Study UID             6776 non-null   object 
 6   Study Description     0 non-null      float64
 7   Study Date            6776 non-null   object 
 8   Series Description    6776 non-null   object 
 9   Manufacturer          0 non-null      float64
 10  Modality              6776 non-null   object 
 11  SOP Class Name        6776 non-null   object 
 12  SOP Class UID         6776 non-null   object 
 13  Number of Images      6776 non-null   int64  
 14  File Size             6776 non-null   int64  
 15  File Size float      

In [6]:
metadata_df.head()

Unnamed: 0,Series UID,Collection,3rd Party Analysis,Data Description URI,Subject ID,Study UID,Study Description,Study Date,Series Description,Manufacturer,Modality,SOP Class Name,SOP Class UID,Number of Images,File Size,File Size float,File Location,Download Timestamp
0,1.3.6.1.4.1.9590.100.1.2.419081637812053404913...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC_1,1.3.6.1.4.1.9590.100.1.2.161465562211359959230...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14,06 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_CC_1\1.3.6....,2025-05-01T13:51:10.401
1,1.3.6.1.4.1.9590.100.1.2.174390361112646747718...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.384159464510350889125...,,08-29-2017,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,28,97 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_MLO\1.3.6.1...,2025-05-01T13:51:15.119
2,1.3.6.1.4.1.9590.100.1.2.374115997511889073021...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.859354343102033567126...,,08-29-2017,full mammogram images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,1,27,84 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_CC\1.3.6.1....,2025-05-01T13:51:16.796
3,1.3.6.1.4.1.9590.100.1.2.244876997513875090239...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_RIGHT_CC_1,1.3.6.1.4.1.9590.100.1.2.200764632211227648028...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,13,41 MB,.\CBIS-DDSM\Calc-Test_P_00038_RIGHT_CC_1\1.3.6...,2025-05-01T13:51:19.973
4,1.3.6.1.4.1.9590.100.1.2.188613955710170417803...,CBIS-DDSM,,https://doi.org/10.7937/K9/TCIA.2016.7O02S9CY,Calc-Test_P_00038_LEFT_MLO_1,1.3.6.1.4.1.9590.100.1.2.291121996131431385353...,,08-29-2017,ROI mask images,,MG,Secondary Capture Image Storage,1.2.840.10008.5.1.4.1.1.7,2,14,62 MB,.\CBIS-DDSM\Calc-Test_P_00038_LEFT_MLO_1\1.3.6...,2025-05-01T13:51:21.81


In [7]:
count_df_duplicates(metadata_df)

Unnamed: 0,Column,Duplicate Count
0,Collection,6775
1,3rd Party Analysis,6775
2,SOP Class Name,6775
3,Data Description URI,6775
4,Manufacturer,6775
5,Study Description,6775
6,Modality,6775
7,SOP Class UID,6775
8,Number of Images,6774
9,Series Description,6773


> Agregare una columna de ID en la metadata y en el resto de datasets para poder identificar a que elemento pertenece cada File Location. Este ID estara conformado por **'Subject ID'**/**'Study UID'**/**'Series UID'**. Esto se hara en la etapa de preprocesamiento mas adelante

In [8]:
mass_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1318 entries, 0 to 1317
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   patient_id               1318 non-null   object
 1   breast_density           1318 non-null   int64 
 2   left or right breast     1318 non-null   object
 3   image view               1318 non-null   object
 4   abnormality id           1318 non-null   int64 
 5   abnormality type         1318 non-null   object
 6   mass shape               1314 non-null   object
 7   mass margins             1275 non-null   object
 8   assessment               1318 non-null   int64 
 9   pathology                1318 non-null   object
 10  subtlety                 1318 non-null   int64 
 11  image file path          1318 non-null   object
 12  cropped image file path  1318 non-null   object
 13  ROI mask file path       1318 non-null   object
dtypes: int64(4), object(10)
memory usage: 14

In [9]:
mass_train_df.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00001,3,LEFT,CC,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
1,P_00001,3,LEFT,MLO,1,mass,IRREGULAR-ARCHITECTURAL_DISTORTION,SPICULATED,4,MALIGNANT,4,Mass-Training_P_00001_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00001_LEFT_MLO_1/1.3.6.1.4.1.9...
2,P_00004,3,LEFT,CC,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_CC/1.3.6.1.4.1.9590...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...,Mass-Training_P_00004_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00004,3,LEFT,MLO,1,mass,ARCHITECTURAL_DISTORTION,ILL_DEFINED,4,BENIGN,3,Mass-Training_P_00004_LEFT_MLO/1.3.6.1.4.1.959...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...,Mass-Training_P_00004_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00004,3,RIGHT,MLO,1,mass,OVAL,CIRCUMSCRIBED,4,BENIGN,5,Mass-Training_P_00004_RIGHT_MLO/1.3.6.1.4.1.95...,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....,Mass-Training_P_00004_RIGHT_MLO_1/1.3.6.1.4.1....


In [10]:
count_df_duplicates(mass_train_df)

Unnamed: 0,Column,Duplicate Count
0,abnormality type,1317
1,left or right breast,1316
2,image view,1316
3,pathology,1315
4,breast_density,1314
5,abnormality id,1312
6,subtlety,1312
7,assessment,1312
8,mass margins,1302
9,mass shape,1299


In [11]:
mass_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   patient_id               378 non-null    object
 1   breast_density           378 non-null    int64 
 2   left or right breast     378 non-null    object
 3   image view               378 non-null    object
 4   abnormality id           378 non-null    int64 
 5   abnormality type         378 non-null    object
 6   mass shape               378 non-null    object
 7   mass margins             361 non-null    object
 8   assessment               378 non-null    int64 
 9   pathology                378 non-null    object
 10  subtlety                 378 non-null    int64 
 11  image file path          378 non-null    object
 12  cropped image file path  378 non-null    object
 13  ROI mask file path       378 non-null    object
dtypes: int64(4), object(10)
memory usage: 41.5

In [12]:
mass_test_df.head()

Unnamed: 0,patient_id,breast_density,left or right breast,image view,abnormality id,abnormality type,mass shape,mass margins,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00016,4,LEFT,CC,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00016_LEFT_CC_1/1.3.6.1.4.1.9590.1...
1,P_00016,4,LEFT,MLO,1,mass,IRREGULAR,SPICULATED,5,MALIGNANT,5,Mass-Test_P_00016_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00016_LEFT_MLO_1/1.3.6.1.4.1.9590....
2,P_00017,2,LEFT,CC,1,mass,ROUND,CIRCUMSCRIBED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_CC/1.3.6.1.4.1.9590.100...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Mass-Test_P_00017_LEFT_CC_1/1.3.6.1.4.1.9590.1...
3,P_00017,2,LEFT,MLO,1,mass,ROUND,ILL_DEFINED,4,MALIGNANT,4,Mass-Test_P_00017_LEFT_MLO/1.3.6.1.4.1.9590.10...,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....,Mass-Test_P_00017_LEFT_MLO_1/1.3.6.1.4.1.9590....
4,P_00032,3,RIGHT,CC,1,mass,ROUND,OBSCURED,0,BENIGN,2,Mass-Test_P_00032_RIGHT_CC/1.3.6.1.4.1.9590.10...,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....,Mass-Test_P_00032_RIGHT_CC_1/1.3.6.1.4.1.9590....


In [13]:
count_df_duplicates(mass_test_df)

Unnamed: 0,Column,Duplicate Count
0,abnormality type,377
1,left or right breast,376
2,image view,376
3,pathology,375
4,abnormality id,374
5,breast_density,374
6,subtlety,373
7,assessment,372
8,mass shape,365
9,mass margins,361


In [14]:
calc_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1546 entries, 0 to 1545
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   patient_id               1546 non-null   object
 1   breast density           1546 non-null   int64 
 2   left or right breast     1546 non-null   object
 3   image view               1546 non-null   object
 4   abnormality id           1546 non-null   int64 
 5   abnormality type         1546 non-null   object
 6   calc type                1526 non-null   object
 7   calc distribution        1170 non-null   object
 8   assessment               1546 non-null   int64 
 9   pathology                1546 non-null   object
 10  subtlety                 1546 non-null   int64 
 11  image file path          1546 non-null   object
 12  cropped image file path  1546 non-null   object
 13  ROI mask file path       1546 non-null   object
dtypes: int64(4), object(10)
memory usage: 16

In [15]:
calc_train_df.head()

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00005,3,RIGHT,CC,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,Calc-Training_P_00005_RIGHT_CC/1.3.6.1.4.1.959...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...
1,P_00005,3,RIGHT,MLO,1,calcification,AMORPHOUS,CLUSTERED,3,MALIGNANT,3,Calc-Training_P_00005_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....
2,P_00007,4,LEFT,CC,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,Calc-Training_P_00007_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...
3,P_00007,4,LEFT,MLO,1,calcification,PLEOMORPHIC,LINEAR,4,BENIGN,4,Calc-Training_P_00007_LEFT_MLO/1.3.6.1.4.1.959...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...
4,P_00008,1,LEFT,CC,1,calcification,,REGIONAL,2,BENIGN_WITHOUT_CALLBACK,3,Calc-Training_P_00008_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...


In [16]:
print(calc_test_df.iloc[305]['image file path'])
print(calc_test_df.iloc[305]['cropped image file path'])
print(calc_test_df.iloc[305]['ROI mask file path'])

Calc-Test_P_02153_RIGHT_MLO/1.3.6.1.4.1.9590.100.1.2.261937969412080566713189946172564256965/1.3.6.1.4.1.9590.100.1.2.362749348011703146040770935232709266621/000000.dcm
Calc-Test_P_02153_RIGHT_MLO_1/1.3.6.1.4.1.9590.100.1.2.339911717712629760423912567650017052589/1.3.6.1.4.1.9590.100.1.2.288064409613511175800756051722103040699/000001.dcm

Calc-Test_P_02153_RIGHT_MLO_1/1.3.6.1.4.1.9590.100.1.2.339911717712629760423912567650017052589/1.3.6.1.4.1.9590.100.1.2.288064409613511175800756051722103040699/000000.dcm


In [24]:
calc_test_df.head(60)

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path
0,P_00038,2,LEFT,CC,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...
1,P_00038,2,LEFT,MLO,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_MLO/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....
2,P_00038,2,RIGHT,CC,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....
3,P_00038,2,RIGHT,CC,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....
4,P_00038,2,RIGHT,MLO,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...
5,P_00038,2,RIGHT,MLO,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_MLO_2/1.3.6.1.4.1.9590...,Calc-Test_P_00038_RIGHT_MLO_2/1.3.6.1.4.1.9590...
6,P_00041,1,LEFT,CC,2,calcification,LUCENT_CENTER,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00041_LEFT_CC/1.3.6.1.4.1.9590.100...,Calc-Test_P_00041_LEFT_CC_2/1.3.6.1.4.1.9590.1...,Calc-Test_P_00041_LEFT_CC_2/1.3.6.1.4.1.9590.1...
7,P_00041,1,LEFT,MLO,2,calcification,LUCENT_CENTER,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00041_LEFT_MLO/1.3.6.1.4.1.9590.10...,Calc-Test_P_00041_LEFT_MLO_2/1.3.6.1.4.1.9590....,Calc-Test_P_00041_LEFT_MLO_2/1.3.6.1.4.1.9590....
8,P_00077,2,LEFT,CC,1,calcification,ROUND_AND_REGULAR,,2,BENIGN_WITHOUT_CALLBACK,3,Calc-Test_P_00077_LEFT_CC/1.3.6.1.4.1.9590.100...,Calc-Test_P_00077_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Calc-Test_P_00077_LEFT_CC_1/1.3.6.1.4.1.9590.1...
9,P_00077,2,LEFT,MLO,1,calcification,ROUND_AND_REGULAR,,2,BENIGN_WITHOUT_CALLBACK,3,Calc-Test_P_00077_LEFT_MLO/1.3.6.1.4.1.9590.10...,Calc-Test_P_00077_LEFT_MLO_1/1.3.6.1.4.1.9590....,Calc-Test_P_00077_LEFT_MLO_1/1.3.6.1.4.1.9590....


In [None]:
count_df_duplicates(calc_train_df)

In [None]:
calc_test_df.info()

In [None]:
calc_test_df.head(60)

In [17]:
count_df_duplicates(calc_test_df)

Unnamed: 0,Column,Duplicate Count
0,abnormality type,325
1,left or right breast,324
2,image view,324
3,pathology,323
4,abnormality id,321
5,breast density,321
6,subtlety,321
7,assessment,321
8,calc distribution,318
9,calc type,305


In [18]:
mass_train_df = mass_train_df.astype({col: "string" for col in mass_train_df.select_dtypes(include="object").columns})
mass_test_df  = mass_test_df.astype({col: "string" for col in mass_test_df.select_dtypes(include="object").columns})
calc_train_df = calc_train_df.astype({col: "string" for col in calc_train_df.select_dtypes(include="object").columns})
calc_test_df  = calc_test_df.astype({col: "string" for col in calc_test_df.select_dtypes(include="object").columns})

In [19]:
mass_train_df.drop(['image file path', 'cropped image file path', 'ROI mask file path'], axis=1).profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|█████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 261.90it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [20]:
mass_test_df.drop(['image file path', 'cropped image file path', 'ROI mask file path'], axis=1).profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|█████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 211.55it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [21]:
calc_train_df.drop(['image file path', 'cropped image file path', 'ROI mask file path'], axis=1).profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|█████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 175.90it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [22]:
calc_test_df.drop(['image file path', 'cropped image file path', 'ROI mask file path'], axis=1).profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|█████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 167.87it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [23]:
mass_train_df.describe()

Unnamed: 0,breast_density,abnormality id,assessment,subtlety
count,1318.0,1318.0,1318.0,1318.0
mean,2.203338,1.116085,3.504552,3.965857
std,0.873774,0.467013,1.414609,1.102032
min,1.0,1.0,0.0,0.0
25%,2.0,1.0,3.0,3.0
50%,2.0,1.0,4.0,4.0
75%,3.0,1.0,4.0,5.0
max,4.0,6.0,5.0,5.0


# Preparacion de datos para entrenamiento

Acá ahora quiero combinar los datos de los 4 datasets para obtener 2 datasets resultantes como train y test, combinando ambos datasets de Mass y Calc. Esto para tener un único conjunto de Train y uno de Test para que el modelo sea capaz de detectar patrones para ambas categorías.

Hay una discrepancia entre los nombres de los archivos que se descargan en el dataset y en las que se encuentran en los csv de etiquetas. Luego de confirmar con un programa para visualizar los archivos DICOM, se pudieron observar los siguientes patrones:

- **image file path**: Aparece con nombre de archivo 000000.dcm, pero todas se llama 1-1.dcm
- **cropped image file path**: Aparece con nombre de archivo 000000.dcm, pero todas se llaman 1-1.dcm y se encuentran en las carpetas con el sufijo _1 (ejemplo: CC_1, MLO_1, etc.)
- **ROI mak file path**: Aparece con nombre de archivo 000000.dcm, pero todas se llaman 1-2.dcm y se encuentra en las carpetas con el sufijo _1.

El archivo con la imagen DICOM completa, se encuentra en una carpeta como unico archivo, la imagen recortada y el ROI se encuentran en la misma carpeta _1 para cada caso, enumeradas entre 1 (cropped) y 2 (ROI).

Como parte del procesamiento **se agregaran nuevas columnas con la ubicacion real de los archivos**

In [25]:
dataframes = [mass_train_df, mass_test_df, calc_train_df, calc_test_df]

for df in dataframes:
    if 'item_id' not in df:
        df['item_id'] = df['image file path'].apply(
            lambda path: '/'.join(str(path).split('/')[:-1])
        )

In [26]:
# Agregando columnda de tipo en cada dataset
if not 'type' in mass_train_df.columns: mass_train_df['type'] = 'mass'
if not 'type' in mass_test_df.columns: mass_test_df['type'] = 'mass'
if not 'type' in calc_train_df.columns: calc_train_df['type'] = 'calc'
if not 'type' in calc_test_df.columns: calc_test_df['type'] = 'calc'

In [27]:
calc_test_df.head()

Unnamed: 0,patient_id,breast density,left or right breast,image view,abnormality id,abnormality type,calc type,calc distribution,assessment,pathology,subtlety,image file path,cropped image file path,ROI mask file path,item_id,type
0,P_00038,2,LEFT,CC,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_LEFT_CC_1/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100...,calc
1,P_00038,2,LEFT,MLO,1,calcification,PUNCTATE-PLEOMORPHIC,CLUSTERED,4,BENIGN,2,Calc-Test_P_00038_LEFT_MLO/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_LEFT_MLO_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_LEFT_MLO/1.3.6.1.4.1.9590.10...,calc
2,P_00038,2,RIGHT,CC,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_1/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,calc
3,P_00038,2,RIGHT,CC,2,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC_2/1.3.6.1.4.1.9590....,Calc-Test_P_00038_RIGHT_CC/1.3.6.1.4.1.9590.10...,calc
4,P_00038,2,RIGHT,MLO,1,calcification,VASCULAR,,2,BENIGN_WITHOUT_CALLBACK,5,Calc-Test_P_00038_RIGHT_MLO/1.3.6.1.4.1.9590.1...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Calc-Test_P_00038_RIGHT_MLO_1/1.3.6.1.4.1.9590...,Calc-Test_P_00038_RIGHT_MLO/1.3.6.1.4.1.9590.1...,calc


In [31]:
# Combinando ambos dataets de train y ambos de tests
train_df = pd.concat([mass_train_df, calc_train_df], ignore_index=True)
test_df = pd.concat([mass_test_df, calc_test_df], ignore_index=True)

# Validacion

Verificamos nuestros dataframes resultantes para asegurarnos que tengan la informacion esperada (combinacion de mass y calc dfs)

In [32]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704 entries, 0 to 703
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   patient_id               704 non-null    string 
 1   breast_density           378 non-null    float64
 2   left or right breast     704 non-null    string 
 3   image view               704 non-null    string 
 4   abnormality id           704 non-null    int64  
 5   abnormality type         704 non-null    string 
 6   mass shape               378 non-null    string 
 7   mass margins             361 non-null    string 
 8   assessment               704 non-null    int64  
 9   pathology                704 non-null    string 
 10  subtlety                 704 non-null    int64  
 11  image file path          704 non-null    string 
 12  cropped image file path  704 non-null    string 
 13  ROI mask file path       704 non-null    string 
 14  item_id                  7

In [33]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864 entries, 0 to 2863
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   patient_id               2864 non-null   string 
 1   breast_density           1318 non-null   float64
 2   left or right breast     2864 non-null   string 
 3   image view               2864 non-null   string 
 4   abnormality id           2864 non-null   int64  
 5   abnormality type         2864 non-null   string 
 6   mass shape               1314 non-null   string 
 7   mass margins             1275 non-null   string 
 8   assessment               2864 non-null   int64  
 9   pathology                2864 non-null   string 
 10  subtlety                 2864 non-null   int64  
 11  image file path          2864 non-null   string 
 12  cropped image file path  2864 non-null   string 
 13  ROI mask file path       2864 non-null   string 
 14  item_id                 

# Dataset class

Dado que estamos usando PyTorch y tenemos nuestra data separada en los CSV (tenemos las rutas de la imagenes y las etiquetas en el mismo CSV y no distribuida en carpetas), debemos crear una clase Dataset que herede de PyTroch Dataset y sobreescribiendo los siguientes métodos:

- __init__: Constructor. Recibe el dataframe, el transform y el tamaño de imagen (ésta dependerá del tamaño del input del modelo a usar).
- __len__: Para determinar cuántos elementos hay en el Dataset.
- **read_dicom**: Para abrir los archivos DICOM de CBIS-DDSM. Recibe la ruta del archivo.
- __getitem__ : Para obtener el item o muestra según su índice. Recibe el índice del item a retornar. 

#### Nota: El siguiente Dataset class se crea en función de la arquitectura [ResNet](https://keras.io/api/applications/resnet/) (de ahi que el image size sea de 224 por defecto)

In [34]:
# class CBISDDSMDataset(Dataset):
#     # Directorio donde se encuentra la data dentro del proyecto
#     DATA_PATH = os.path.abspath("../data/CBIS-DDSM/CBIS-DDSM/")
    
#     def __init__(self, df, transform=None, image_size=224):
#         self.df = df
#         self.transform = transform
#         self.image_size = image_size

#     def __len__(self):
#         return len(self.df)

#     def read_dicom(self, path):
#         dicom = pydicom.dcmread(self.DATA_PATH + path)
#         img = dicom.pixel_array.astype(np.float32)
#         img = (img - np.min(img)) / (np.max(img) - np.min(img) + 1e-5)  # Normalizar 0-1
#         img = (img * 255).astype(np.uint8)
#         return Image.fromarray(img)

#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]

#         img_path = self.read_dicom(row["cropped image file path"])
#         roi_img = self.read_dicom(row["ROI mask file path"])
#         full_img = self.read_dicom(row["image file path"])

#         # Redimensionamos todo a image_size x image_size
#         crop_img = crop_img.resize((self.image_size, self.image_size))
#         roi_img = roi_img.resize((self.image_size, self.image_size))
#         full_img = full_img.resize((self.image_size, self.image_size))

#         # Convertir a np arrays (1 canal) y apilar como RGB sintético
#         crop_np = np.array(crop_img)[..., None]
#         roi_np = np.array(roi_img)[..., None]
#         full_np = np.array(full_img)[..., None]

#         triplet = np.concatenate([crop_np, roi_np, full_np], axis=2)  # shape (H, W, 3)
#         image = Image.fromarray(triplet)

#         if self.transform:
#             image = self.transform(image)
#         else:
#             image = transforms.ToTensor()(image)

#         label = torch.tensor(row["label"], dtype=torch.float32)  # para BCEWithLogitsLoss
#         return image, label

In [35]:
# train_dataset = CBISDDSMDataset(train_df)
# test_dataset = CBISDDSMDataset(test_df)

In [36]:
# train_dataset.__len__()

2864

In [37]:
# test_dataset.__len__()

704

In [38]:
# # DataLoaders
# train_loader = DataLoader(
#     train_dataset,
#     batch_size=16,
#     shuffle=True,
#     num_workers=0
# )

# test_loader = DataLoader(
#     test_dataset,
#     batch_size=16,
#     shuffle=False,
#     num_workers=0
# )

In [39]:
# images, labels = next(iter(train_loader))
# print(images.shape)  # → [16, 3, 224, 224]
# print(labels)        # → [0., 1., 0., ..., 1.]

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\TFM\\breast_cancer_detection\\data\\CBIS-DDSM\\CBIS-DDSMCalc-Training_P_00824_LEFT_MLO_1/1.3.6.1.4.1.9590.100.1.2.233619215512707504408972816820117436754/1.3.6.1.4.1.9590.100.1.2.277121121613936339706146451553961237258/000001.dcm\n'

In [None]:
# file_path = 'D:/TFM/breast_cancer_detection/data/CBIS-DDSM/CBIS-DDSM/Mass-Training_P_01144_LEFT_CC_1/1.3.6.1.4.1.9590.100.1.2.359573892811582761021245518332666829900/1.3.6.1.4.1.9590.100.1.2.101979994513111795025203250011046951602/000000.dcm'

# # Load the DICOM file
# dicom_data = pydicom.dcmread(file_path)

# # Access the pixel data
# image = dicom_data.pixel_array

# plt.imshow(image, cmap="gray")
# plt.title("DICOM Image")
# plt.axis("off")
# plt.show()

### Nota:

Los paths en el CSV no coinciden con los de la descarga del NBIA Data Retriever. Hay que agregar una fase mas de preprocesamiento para mapear el que esta en el CSV con el que esta en el Metadata