# Conjunto de Dados 4: *ChestXray-NIH*
***
> Disponível em: <https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/36938765345>. Acesso em 19 fev. 2021.

### Importação dos pacotes necessários

In [None]:
# Keras class used further down to build the image generators.
from keras.preprocessing.image import ImageDataGenerator
import urllib.request
# Download the two helper modules into the current working directory so that the
# imports right below can resolve them. These two calls must run before those imports.
# NOTE(review): the scripts are fetched from the `main` branch of remote repositories
# and executed on import; consider pinning a commit hash so the code cannot change
# underneath the notebook.
urllib.request.urlretrieve('https://raw.githubusercontent.com/Alyssonmach/cnn-lung-diseases/main/cxr8_dados.py', 'cxr8_dados.py')
urllib.request.urlretrieve('https://raw.githubusercontent.com/Alyssonmach/histogram-equalization/main/histogram_equalization.py', 'histogram_equalization.py')
from histogram_equalization import histogram_equalization
from cxr8_dados import data_download, organize_csv, download_images, train_validation_test_split 

import warnings
# Suppress every warning emitted from this point on.
warnings.filterwarnings("ignore")

### Baixando os dados

In [None]:
# Download the dataframe CSV from the project repository.
# NOTE(review): presumably the second argument is the local file name; the CSV is
# later read from /content/dataframe-info.csv — confirm against cxr8_dados.
data_download('https://raw.githubusercontent.com/Alyssonmach/cnn-lung-diseases/main/dataframe-info.csv', 'dataframe-info.csv')

In [None]:
# Download the images used by this notebook (helper imported from cxr8_dados).
# NOTE(review): presumably this fetches the images_01..images_12 .tar.gz archives
# that the following cell extracts — confirm against cxr8_dados.
download_images()

In [None]:
# descompactando os arquivos e removendo os arquivos compactados
!tar -xvzf images_01.tar.gz -C /content/
!rm /content/images_01.tar.gz 
!tar -xvzf images_02.tar.gz -C /content/
!rm /content/images_02.tar.gz
!tar -xvzf images_03.tar.gz -C /content/
!rm /content/images_03.tar.gz
!tar -xvzf images_04.tar.gz -C /content/
!rm /content/images_04.tar.gz
!tar -xvzf images_05.tar.gz -C /content/
!rm /content/images_05.tar.gz
!tar -xvzf images_06.tar.gz -C /content/
!rm /content/images_06.tar.gz
!tar -xvzf images_07.tar.gz -C /content/
!rm /content/images_07.tar.gz
!tar -xvzf images_08.tar.gz -C /content/
!rm /content/images_08.tar.gz
!tar -xvzf images_09.tar.gz -C /content/
!rm /content/images_09.tar.gz
!tar -xvzf images_10.tar.gz -C /content/
!rm /content/images_10.tar.gz
!tar -xvzf images_11.tar.gz -C /content/
!rm /content/images_11.tar.gz
!tar -xvzf images_12.tar.gz -C /content/
!rm /content/images_12.tar.gz

In [None]:
# Directory that holds the images; passed as `directory` to the generators below.
IMAGE_DIR = "/content/images/"

In [None]:
# Build the organized dataframe from the downloaded CSV
# (organize_csv is a helper imported from cxr8_dados).
dataframe = organize_csv('/content/dataframe-info.csv')

In [None]:
# Preview the first rows of the organized dataframe, then report its shape.
print(dataframe.head())
print(f'dataframe shape: {dataframe.shape}')

In [None]:
# Partition the dataset into training, validation and test dataframes
# (how the rows are divided is decided inside train_validation_test_split).
train_df, validation_df, test_df = train_validation_test_split(dataframe)

In [None]:
# Preview the first rows of the training split, then report its shape.
print(train_df.head())
print(f'train_df shape: {train_df.shape}')

In [None]:
# Preview the first rows of the validation split, then report its shape.
print(validation_df.head())
print(f'validation_df shape: {validation_df.shape}')

In [None]:
# Preview the first rows of the test split, then report its shape.
# The label names test_df, which is the dataframe actually being measured.
print(test_df.head())
print('test_df shape:', test_df.shape)

### Geradores de Dados para a rede pelo TensorFlow

> **Opção 1**

In [None]:
# Option 1: per-sample centering and std-normalization.

# Keyword arguments shared by the three generators: image files are looked up by the
# 'Image Index' column inside IMAGE_DIR, targets are taken from the 'labels' column
# (class_mode = 'raw'), and images are loaded as 256x256 RGB.
flow_options = dict(directory = IMAGE_DIR,
                    x_col = 'Image Index',
                    y_col = 'labels',
                    seed = 42,
                    class_mode = 'raw',
                    color_mode = 'rgb',
                    target_size = (256, 256))

# Training pipeline: normalization plus random rotation/zoom augmentation.
image_generator = ImageDataGenerator(samplewise_center = True, samplewise_std_normalization = True,
                                     rotation_range = 20, zoom_range = 0.2)

# Evaluation pipeline: the same normalization, without any augmentation.
test_datagen = ImageDataGenerator(samplewise_center = True, samplewise_std_normalization = True)

# Training generator: augmented and shuffled.
train_generator = image_generator.flow_from_dataframe(dataframe = train_df,
                                                      batch_size = 256,
                                                      shuffle = True,
                                                      **flow_options)

# Validation generator: built from the non-augmenting pipeline so validation metrics
# are measured on unmodified images; kept in dataframe order.
valid_generator = test_datagen.flow_from_dataframe(dataframe = validation_df,
                                                   batch_size = 128,
                                                   shuffle = False,
                                                   **flow_options)

# Test generator: not shuffled, so predictions line up row by row with test_df.
test_generator = test_datagen.flow_from_dataframe(dataframe = test_df,
                                                  batch_size = 128,
                                                  shuffle = False,
                                                  **flow_options)

> **Opção 2**

In [None]:
# Option 2: histogram_equalization as the preprocessing function plus 1/255 rescaling.

# Keyword arguments shared by the three generators: image files are looked up by the
# 'Image Index' column inside IMAGE_DIR, targets are taken from the 'labels' column
# (class_mode = 'raw'), and images are loaded as 256x256 RGB.
flow_options = dict(directory = IMAGE_DIR,
                    x_col = 'Image Index',
                    y_col = 'labels',
                    seed = 42,
                    class_mode = 'raw',
                    color_mode = 'rgb',
                    target_size = (256, 256))

# Training pipeline: preprocessing/rescaling plus random rotation/zoom augmentation.
image_generator = ImageDataGenerator(preprocessing_function = histogram_equalization,
                                     rescale = 1./255., rotation_range = 20, zoom_range = 0.2)

# Evaluation pipeline: the same preprocessing/rescaling, without any augmentation.
test_datagen = ImageDataGenerator(preprocessing_function = histogram_equalization,
                                  rescale = 1./255.)

# Training generator: augmented and shuffled.
train_generator = image_generator.flow_from_dataframe(dataframe = train_df,
                                                      batch_size = 256,
                                                      shuffle = True,
                                                      **flow_options)

# Validation generator: built from the non-augmenting pipeline so validation metrics
# are measured on unmodified images; kept in dataframe order.
valid_generator = test_datagen.flow_from_dataframe(dataframe = validation_df,
                                                   batch_size = 128,
                                                   shuffle = False,
                                                   **flow_options)

# Test generator: not shuffled, so predictions line up row by row with test_df.
test_generator = test_datagen.flow_from_dataframe(dataframe = test_df,
                                                  batch_size = 128,
                                                  shuffle = False,
                                                  **flow_options)

> **Opção 3**

In [None]:
# Option 3: plain 1/255 rescaling.

# Keyword arguments shared by the three generators: image files are looked up by the
# 'Image Index' column inside IMAGE_DIR, targets are taken from the 'labels' column
# (class_mode = 'raw'), and images are loaded as 256x256 RGB.
flow_options = dict(directory = IMAGE_DIR,
                    x_col = 'Image Index',
                    y_col = 'labels',
                    seed = 42,
                    class_mode = 'raw',
                    color_mode = 'rgb',
                    target_size = (256, 256))

# Training pipeline: rescaling plus random rotation/zoom augmentation.
image_generator = ImageDataGenerator(rescale = 1./255., rotation_range = 20, zoom_range = 0.2)

# Evaluation pipeline: the same rescaling, without any augmentation.
test_datagen = ImageDataGenerator(rescale = 1./255.)

# Training generator: augmented and shuffled.
train_generator = image_generator.flow_from_dataframe(dataframe = train_df,
                                                      batch_size = 256,
                                                      shuffle = True,
                                                      **flow_options)

# Validation generator: built from the non-augmenting pipeline so validation metrics
# are measured on unmodified images; kept in dataframe order.
valid_generator = test_datagen.flow_from_dataframe(dataframe = validation_df,
                                                   batch_size = 128,
                                                   shuffle = False,
                                                   **flow_options)

# Test generator: not shuffled, so predictions line up row by row with test_df.
test_generator = test_datagen.flow_from_dataframe(dataframe = test_df,
                                                  batch_size = 128,
                                                  shuffle = False,
                                                  **flow_options)