# Conjunto de Dados 4: *ChestXray-NIH*
***
> Disponível em: <https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/36938765345>. Acesso em 19 fev. 2021.

### Importação dos pacotes necessários

In [2]:
import glob
import os
import time
import urllib.request
import warnings

from keras.preprocessing.image import ImageDataGenerator

# fetch the project's helper module so that it can be imported right below
path = 'https://raw.githubusercontent.com/Alyssonmach/cnn-lung-diseases/main/assets/cxr8_dados.py'
file_path = 'cxr8_dados.py'
urllib.request.urlretrieve(path, file_path)
from cxr8_dados import data_download, organize_csv, download_images, train_validation_test_split 

warnings.filterwarnings("ignore")

### Baixando os dados

In [3]:
# fetch the dataset metadata CSV through the cxr8_dados helper
# (presumably stored under the local name given as second argument — confirm in cxr8_dados.py)
data_download('https://raw.githubusercontent.com/Alyssonmach/cnn-lung-diseases/main/assets/dataframe-info.csv', 'dataframe-info.csv')

In [None]:
# download the images that will be used (cxr8_dados helper; the destination is not visible in this notebook)
download_images()

In [5]:
# directories (on the mounted Google Drive) that hold the image shards
IMAGE_DIR = ['/content/drive/MyDrive/images-nih1',
             '/content/drive/MyDrive/images-nih2',
             '/content/drive/MyDrive/images-nih3',
             '/content/drive/MyDrive/images-nih4']

In [6]:
# build the organized dataframe; the helper also returns a (normal, anormal) pair
dataframe, (normal, anormal) = organize_csv('/content/dataframe-info.csv')

In [9]:
# number of samples available in each class
dataframe['finding_labels'].value_counts()

No Finding            60361
Infiltration           9547
Atelectasis            4215
Effusion               3955
Nodule                 2705
Pneumothorax           2194
Consolidation          1310
Pleural_Thickening     1126
Emphysema               892
Fibrosis                727
Edema                   628
Pneumonia               322
Name: finding_labels, dtype: int64

In [17]:
# preview the first rows of the dataframe and its dimensions
print(dataframe.head())
print('dataframe shape:', dataframe.shape)

        Image Index  finding_labels  labels
0  00011895_001.png               0       0
1  00015384_004.png               0       0
2  00028131_011.png               0       0
3  00028792_001.png               1       1
4  00010716_000.png               0       0
dataframe shape: (87982, 3)


In [36]:
# split the dataset into training, validation and test partitions
train_df, validation_df, test_df = train_validation_test_split(dataframe)

In [37]:
# inspect the training partition
print(train_df.head())
print('train_df shape:', train_df.shape)

            Image Index  finding_labels  labels
87745  00000322_011.png               6       1
82970  00006022_004.png               0       0
51704  00019766_008.png               0       0
38340  00017206_000.png               0       0
71153  00026810_041.png               1       1
train_df shape: (81074, 3)


In [38]:
# inspect the validation partition
print(validation_df.head())
print('validation_df shape:', validation_df.shape)

            Image Index  finding_labels  labels
5623   00011667_001.png               0       0
63643  00009988_001.png               0       0
67737  00002664_000.png               0       0
76165  00011606_006.png               3       1
55740  00016719_016.png               6       1
validation_df shape: (4268, 3)


In [39]:
# inspect the test partition
print(test_df.head())
# the label names the frame actually being printed (it used to say validation_df)
print('test_df shape:', test_df.shape)

            Image Index  finding_labels  labels
42532  00019214_000.png               0       0
5432   00011827_004.png               0       0
30045  00012485_001.png               0       0
10278  00025340_002.png               0       0
40900  00023741_000.png               0       0
validation_df shape: (2640, 3)


### Geradores de Dados para a rede pelo TensorFlow

In [44]:
# function that lists (it does not download) the paths of all images on disk
def find_files(image_dirs = ('/content/drive/MyDrive/images-nih1',
                             '/content/drive/MyDrive/images-nih2',
                             '/content/drive/MyDrive/images-nih3',
                             '/content/drive/MyDrive/images-nih4'),
               pause = 10):
  """Collect the .png paths stored under `<dir>/images/` for every directory.

  Args:
    image_dirs: directories to scan; defaults to the four Drive shards.
    pause: seconds to wait between two consecutive directory scans
      (presumably to give the mounted Drive time to respond — confirm).

  Returns:
    A tuple with one list of paths per directory, in the order of
    `image_dirs`. A directory without matches yields an empty list.
  """

  file_lists = []
  for position, directory in enumerate(image_dirs):
    # wait only between scans, never before the first one
    if position > 0 and pause > 0:
      time.sleep(pause)
    file_lists.append(glob.glob(directory + '/images/*.png'))

  return tuple(file_lists)

# list the image paths once; one list per image directory
file1, file2, file3, file4 = find_files()

# function that writes the on-disk path of an image into the dataframe built from the csv file
def replace_file(dataframe, new_file, image):
  """Replace `image` by `new_file` in the 'Image Index' column, in place.

  Args:
    dataframe: frame with an 'Image Index' column; it is modified in place.
    new_file: value written over the matching entries (the full image path).
    image: value currently stored in 'Image Index' that should be replaced.

  Returns:
    None. Rows that do not match are left untouched; when nothing matches
    the call is a no-op.
  """

  # a single .loc assignment replaces the chained `df[col][idx] = ...` form,
  # which may write to a temporary copy instead of the dataframe itself
  matches = dataframe['Image Index'] == image
  dataframe.loc[matches, 'Image Index'] = new_file

  return None

In [48]:
# organizing the csv data: swap every bare image name for its full path on disk

def build_path_lookup(*file_lists):
  """Map each file's base name to its full path.

  When the same base name shows up more than once, the first occurrence
  (in argument order) is kept.
  """
  lookup = {}
  for file_list in file_lists:
    for file_ in file_list:
      lookup.setdefault(os.path.basename(file_), file_)
  return lookup

def resolve_image_paths(dataframe, lookup):
  """Return a copy of `dataframe` with 'Image Index' holding full paths.

  Names that are absent from `lookup` are kept unchanged, so running the
  cell again on already resolved frames changes nothing.
  """
  resolved = dataframe['Image Index'].map(lambda image: lookup.get(image, image))
  return dataframe.assign(**{'Image Index': resolved})

# one dictionary lookup per row instead of scanning every file list for every image
path_lookup = build_path_lookup(file1, file2, file3, file4)

train_df = resolve_image_paths(train_df, path_lookup)
validation_df = resolve_image_paths(validation_df, path_lookup)
test_df = resolve_image_paths(test_df, path_lookup)



In [79]:
# persist the manipulated dataframes as csv files in the working directory
# NOTE(review): to_csv is called with its defaults, so the index is presumably
# written as an extra first column — confirm before reloading these files
train_df.to_csv('train_df.csv')
validation_df.to_csv('validation_df.csv')
test_df.to_csv('test_df.csv')

In [107]:
# per-class sample counts keyed by class index; the values add up to 81074,
# the number of rows reported for train_df
class_counts = {0: 3902, 1: 1224, 2: 578, 3: 3645, 4: 818, 5: 671,
                6: 8770, 7: 55622, 8: 2497, 9: 1028, 10: 295, 11: 2024}

# dictionary with the class-balancing weights: total / (n_classes * count).
# Raw counts would give the largest weight to the most frequent class, which
# is the opposite of balancing; inverse frequency makes rare classes weigh more.
total_images = sum(class_counts.values())
class_weights = {label: total_images / (len(class_counts) * count)
                 for label, count in class_counts.items()}

> **Opção 1**

In [80]:
# settings shared by the three generators
flow_settings = dict(x_col = 'Image Index',
                     y_col = 'finding_labels',
                     seed = 42,
                     class_mode = 'categorical',
                     color_mode = 'rgb',
                     target_size = (256, 256))

# training images: per-sample normalization plus data augmentation
image_generator = ImageDataGenerator(samplewise_center = True, samplewise_std_normalization = True,
                                     rotation_range = 10, zoom_range = 0.2)

# validation and test images: same normalization, no augmentation
test_datagen = ImageDataGenerator(samplewise_center = True, samplewise_std_normalization = True)

# training generator (shuffled)
train_generator = image_generator.flow_from_dataframe(dataframe = train_df,
                                                      batch_size = 256,
                                                      shuffle = True,
                                                      **flow_settings)

# validation generator: built from the non-augmenting generator so that the
# validation metrics are measured on unmodified images
valid_generator = test_datagen.flow_from_dataframe(dataframe = validation_df,
                                                   batch_size = 128,
                                                   shuffle = False,
                                                   **flow_settings)

# test generator: not shuffled, so predictions stay aligned with the
# generator's label order
test_generator = test_datagen.flow_from_dataframe(dataframe = test_df,
                                                  batch_size = 128,
                                                  shuffle = False,
                                                  **flow_settings)

Found 81074 validated image filenames belonging to 12 classes.
Found 4268 validated image filenames belonging to 12 classes.
Found 2640 validated image filenames belonging to 12 classes.


> **Opção 2**

In [81]:
# settings shared by the three generators (color_mode is now stated for all
# of them; 'rgb' was previously explicit only for the training generator)
flow_settings = dict(x_col = 'Image Index',
                     y_col = 'finding_labels',
                     seed = 42,
                     class_mode = 'categorical',
                     color_mode = 'rgb',
                     target_size = (256, 256))

# training images: rescale pixel values to [0, 1] plus data augmentation
image_generator = ImageDataGenerator(rescale = 1./255., rotation_range = 10, zoom_range = 0.2)

# validation and test images: same rescaling, no augmentation
test_datagen = ImageDataGenerator(rescale = 1./255.)

# training generator (shuffled)
train_generator = image_generator.flow_from_dataframe(dataframe = train_df,
                                                      batch_size = 256,
                                                      shuffle = True,
                                                      **flow_settings)

# validation generator: built from the non-augmenting generator so that the
# validation metrics are measured on unmodified images
valid_generator = test_datagen.flow_from_dataframe(dataframe = validation_df,
                                                   batch_size = 128,
                                                   shuffle = False,
                                                   **flow_settings)

# test generator: not shuffled, so predictions stay aligned with the
# generator's label order
test_generator = test_datagen.flow_from_dataframe(dataframe = test_df,
                                                  batch_size = 128,
                                                  shuffle = False,
                                                  **flow_settings)

Found 81074 validated image filenames belonging to 12 classes.
Found 4268 validated image filenames belonging to 12 classes.
Found 2640 validated image filenames belonging to 12 classes.


In [108]:
# inspect the class-name -> index mapping found by the training generator
train_generator.class_indices

{'Atelectasis': 0,
 'Consolidation': 1,
 'Edema': 2,
 'Effusion': 3,
 'Emphysema': 4,
 'Fibrosis': 5,
 'Infiltration': 6,
 'No Finding': 7,
 'Nodule': 8,
 'Pleural_Thickening': 9,
 'Pneumonia': 10,
 'Pneumothorax': 11}