# Conjunto de Dados 4: *ChestXray-NIH*
***
> Disponível em: <https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/36938765345>. Acesso em 19 fev. 2021.

### Importação dos pacotes necessários

In [38]:
from keras.preprocessing.image import ImageDataGenerator
from sklearn.utils import class_weight
import numpy as np
import urllib.request
# Fetch the project's helper module from GitHub and save it next to the notebook
# so that the names below can be imported from it.
# NOTE(review): this imports code downloaded from the `main` branch at run time;
# consider pinning the URL to a commit hash so every run uses the same helper code.
path = 'https://raw.githubusercontent.com/Alyssonmach/cnn-lung-diseases/main/assets/cxr8_dados.py'
file_path = 'cxr8_dados.py'
urllib.request.urlretrieve(path, file_path)
from cxr8_dados import data_download, organize_csv, download_images, train_validation_test_split 
import glob
import time

import warnings
warnings.filterwarnings("ignore")

### Pré-processamento dos dados

In [2]:
# downloading the dataframe: calls the project helper with the URL of the CSV
# and the local file name 'dataframe-info.csv'
# NOTE(review): presumably the helper saves the URL contents under that name — confirm in cxr8_dados.py
data_download('https://raw.githubusercontent.com/Alyssonmach/cnn-lung-diseases/main/assets/dataframe-info.csv', 'dataframe-info.csv')

In [20]:
# directories that hold the unpacked images (file01, file02 and file03)
IMAGE_DIR = ['../0-datasets/nih/arquivos-descompactados/file{:02d}/images/'.format(part_number)
             for part_number in (1, 2, 3)]

In [4]:
# getting the organized dataframe from the CSV file
# NOTE(review): the second returned item is unpacked into `normal` and `anormal`;
# their exact meaning is defined by organize_csv in cxr8_dados.py — confirm there
dataframe, (normal, anormal) = organize_csv('dataframe-info.csv')

In [5]:
# showing how many rows are available for each value of 'finding_labels'
dataframe['finding_labels'].value_counts()

No Finding            60361
Infiltration           9547
Atelectasis            4215
Effusion               3955
Nodule                 2705
Pneumothorax           2194
Consolidation          1310
Pleural_Thickening     1126
Emphysema               892
Fibrosis                727
Edema                   628
Pneumonia               322
Name: finding_labels, dtype: int64

In [6]:
# previewing the first rows of the dataframe and its (rows, columns) shape
print(dataframe.head())
print('dataframe shape:', dataframe.shape)

        Image Index finding_labels  labels
0  00011895_001.png     No Finding       0
1  00015384_004.png     No Finding       0
2  00028131_011.png     No Finding       0
3  00028792_001.png   Infiltration       1
4  00010716_000.png     No Finding       0
dataframe shape: (87982, 3)


In [7]:
# splitting the dataset into training, validation and test dataframes
# (the split itself is done by the project helper in cxr8_dados.py)
train_df, validation_df, test_df = train_validation_test_split(dataframe)

In [8]:
# previewing the first rows and the shape of the training data
print(train_df.head())
print('train_df shape:', train_df.shape)

            Image Index finding_labels  labels
55527  00012919_007.png     No Finding       0
74217  00027706_024.png     No Finding       0
52448  00008850_010.png  Consolidation       1
77009  00022899_003.png    Atelectasis       1
25183  00007009_006.png     No Finding       0
train_df shape: (82772, 3)


In [9]:
# previewing the first rows and the shape of the validation data
print(validation_df.head())
print('validation_df shape:', validation_df.shape)

            Image Index finding_labels  labels
17330  00007233_003.png     No Finding       0
29556  00013531_017.png     No Finding       0
54182  00006793_000.png     No Finding       0
83937  00005673_000.png     No Finding       0
47689  00027492_002.png     No Finding       0
validation_df shape: (1690, 3)


In [10]:
# previewing the first rows and the shape of the test data
print(test_df.head())
# the label names the dataframe actually being printed (it used to say 'validation_df')
print('test_df shape:', test_df.shape)

            Image Index finding_labels  labels
8926   00014125_022.png     No Finding       0
53413  00011950_004.png     No Finding       0
25438  00016836_000.png     No Finding       0
54890  00021866_001.png     No Finding       0
38688  00029821_017.png     No Finding       0
validation_df shape: (3520, 3)


### Geradores de dados para a rede com o TensorFlow

In [21]:
# function that collects the paths of all the image files
def find_files(image_dirs = ('../0-datasets/nih/arquivos-descompactados/file01/images/',
                             '../0-datasets/nih/arquivos-descompactados/file02/images/',
                             '../0-datasets/nih/arquivos-descompactados/file03/images/')):
  """List the PNG files found in each image directory.

  Parameters
  ----------
  image_dirs : sequence of str, optional
      Directories to scan. Defaults to the three unpacked image folders used
      by this notebook, so `find_files()` still returns three lists.

  Returns
  -------
  tuple of list of str
      One sorted list of file paths per entry of `image_dirs`, in the same
      order. A directory that does not exist yields an empty list.
  """
  import os  # local import: `os` is not among the notebook's top-level imports

  # Match the extension case-insensitively. The previous patterns mixed '*.PNG'
  # and '*.png', which only worked on case-insensitive filesystems.
  pattern = '*.[pP][nN][gG]'

  return tuple(sorted(glob.glob(os.path.join(directory, pattern)))
               for directory in image_dirs)

# one list of image paths per image folder (file01, file02, file03)
file1, file2, file3 = find_files()

# function that swaps an image name stored in the dataframe for its file path
def replace_file(dataframe, new_file, image):
  """Replace `image` in the 'Image Index' column with `new_file`, in place.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame with an 'Image Index' column; it is modified in place.
  new_file : str
      Value written to the matching rows (the path of the image file).
  image : str
      Current 'Image Index' value to look for.

  Returns
  -------
  None

  Notes
  -----
  Every row whose 'Image Index' equals `image` is updated, and the call is a
  no-op when nothing matches. The previous version raised unless exactly one
  row matched.
  """
  # A single .loc assignment writes to the frame itself. The earlier chained
  # form (dataframe[col][index] = value) may assign to a temporary copy.
  matches = dataframe['Image Index'] == image
  dataframe.loc[matches, 'Image Index'] = new_file

  return None

In [23]:
# organizing the csv data: swap every image name for the path of its file on disk

def attach_image_paths(dataframe, *file_lists):
  """Replace the names in 'Image Index' with the matching file paths, in place.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame whose 'Image Index' column holds image file names.
  *file_lists : list of str
      Lists of image file paths to look the names up in.

  Notes
  -----
  Names are matched against the file-name part of each path. Rows without a
  matching file keep their current value, so running this again on a frame
  that already holds paths changes nothing.
  """
  import os  # local import: `os` is not among the notebook's top-level imports

  # One dictionary lookup per image instead of scanning every path for every
  # image, which was quadratic in the number of files.
  path_by_name = {os.path.basename(file_): file_
                  for file_list in file_lists
                  for file_ in file_list}

  names = dataframe['Image Index']
  dataframe['Image Index'] = names.map(path_by_name).fillna(names)


for split_df in (train_df, validation_df, test_df):
  attach_image_paths(split_df, file1, file2, file3)

In [40]:
# making the classes in the 'labels' column categorical (1 -> 'abnormal', 0 -> 'normal')
label_names = {1: 'abnormal', 0: 'normal'}

for split_df in (train_df, validation_df, test_df):
  # `replace` returns a new column with the text labels. Writing strings into
  # the integer column through .loc relies on silent upcasting, which recent
  # pandas versions deprecate. Values that are already text are left as they
  # are, so the cell can be run again.
  split_df['labels'] = split_df['labels'].replace(label_names)

In [44]:
# showing the new format of the dataframes (first rows of the training data)
train_df.head()

Unnamed: 0,Image Index,finding_labels,labels
55527,../0-datasets/nih/arquivos-descompactados/file...,No Finding,normal
74217,../0-datasets/nih/arquivos-descompactados/file...,No Finding,normal
52448,../0-datasets/nih/arquivos-descompactados/file...,Consolidation,abnormal
77009,../0-datasets/nih/arquivos-descompactados/file...,Atelectasis,abnormal
25183,../0-datasets/nih/arquivos-descompactados/file...,No Finding,normal


In [45]:
# saving each manipulated dataframe to its own csv file
for csv_name, split_frame in (('train_df.csv', train_df),
                              ('validation_df.csv', validation_df),
                              ('test_df.csv', test_df)):
  split_frame.to_csv(csv_name)

In [34]:
# showing the number of examples per class in the training data, then the total
print(train_df['finding_labels'].value_counts())
print('\nQuantidade de exemplos disponíveis: {} imagens'.format(len(train_df)))

No Finding            56787
Infiltration           8982
Atelectasis            3974
Effusion               3746
Nodule                 2533
Pneumothorax           2058
Consolidation          1235
Pleural_Thickening     1044
Emphysema               841
Fibrosis                684
Edema                   587
Pneumonia               301
Name: finding_labels, dtype: int64

Quantidade de exemplos disponíveis: 82772 imagens


In [33]:
# showing the number of examples per class in the validation data, then the total
print(validation_df['finding_labels'].value_counts())
print('\nQuantidade de exemplos disponíveis: {} imagens'.format(len(validation_df)))

No Finding            1159
Infiltration           182
Atelectasis             85
Effusion                61
Nodule                  55
Pneumothorax            39
Consolidation           32
Pleural_Thickening      26
Edema                   20
Emphysema               17
Fibrosis                10
Pneumonia                4
Name: finding_labels, dtype: int64

Quantidade de exemplos disponíveis: 1690 imagens


In [31]:
# showing the number of examples per class in the test data, then the total
print(test_df['finding_labels'].value_counts())
print('\nQuantidade de exemplos disponíveis: {} imagens'.format(len(test_df)))

No Finding            2415
Infiltration           383
Atelectasis            156
Effusion               148
Nodule                 117
Pneumothorax            97
Pleural_Thickening      56
Consolidation           43
Emphysema               34
Fibrosis                33
Edema                   21
Pneumonia               17
Name: finding_labels, dtype: int64

Quantidade de exemplos disponíveis: 3520 imagens


In [39]:
# building a dictionary of class weights to balance the classes

# Import the function under its own name: this cell binds `class_weight` to the
# resulting dictionary, which hides the `class_weight` module imported at the top
# of the notebook and made the cell fail when it was run a second time.
from sklearn.utils.class_weight import compute_class_weight

# Keyword arguments are required here; current scikit-learn releases reject the
# positional form.
class_weights = compute_class_weight(class_weight = 'balanced',
                                     classes = np.unique(train_df['finding_labels']),
                                     y = train_df['finding_labels'])

# np.unique returns the class names sorted, so position i in `class_weights`
# belongs to the i-th class name in sorted order; one entry is created per class
# instead of hard-coding twelve indices.
class_weight = dict(enumerate(class_weights))

### Testando com o gerador de dados do Keras

In [28]:
# normalizing the training images and applying data augmentation
image_generator = ImageDataGenerator(rescale = 1./255., rotation_range = 10, zoom_range = 0.2)

# normalizing the validation and test images only: evaluation data must not be
# augmented, so these splits no longer go through `image_generator`
test_datagen = ImageDataGenerator(rescale = 1./255.)

# options shared by the three generators ('rgb' is also the Keras default)
flow_options = dict(x_col = 'Image Index',
                    y_col = 'finding_labels',
                    seed = 42,
                    class_mode = 'categorical',
                    color_mode = 'rgb',
                    target_size = (256, 256))

# creating the training image generator
train_generator = image_generator.flow_from_dataframe(dataframe = train_df,
                                                      batch_size = 256,
                                                      shuffle = True,
                                                      **flow_options)

# creating the validation image generator
valid_generator = test_datagen.flow_from_dataframe(dataframe = validation_df,
                                                   batch_size = 128,
                                                   shuffle = True,
                                                   **flow_options)

# creating the test image generator; shuffling is disabled so that predictions
# come out in the same order as `test_generator.classes` / `test_df`
test_generator = test_datagen.flow_from_dataframe(dataframe = test_df,
                                                  batch_size = 128,
                                                  shuffle = False,
                                                  **flow_options)

Found 82772 validated image filenames belonging to 12 classes.
Found 1690 validated image filenames belonging to 12 classes.
Found 3520 validated image filenames belonging to 12 classes.


In [108]:
# inspecting the class-name -> index mapping found by the Keras data generator
train_generator.class_indices

{'Atelectasis': 0,
 'Consolidation': 1,
 'Edema': 2,
 'Effusion': 3,
 'Emphysema': 4,
 'Fibrosis': 5,
 'Infiltration': 6,
 'No Finding': 7,
 'Nodule': 8,
 'Pleural_Thickening': 9,
 'Pneumonia': 10,
 'Pneumothorax': 11}