# Multiclassification with pre-established folders (flow from directory method) 

I will try to build a Convolutional Neural Network using the `flow_from_directory` method for three classes:
    - Normal
    - Not-normal/Not-pneumonia
    - Pneumonia

## Imports

### Libraries

In [1]:
# General imports
import numpy as np
import random
import pandas as pd

# System and file management
import os
import zipfile
from glob import glob

# Visualization Tools
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
from skimage.io import imread

# Pandas defaults: show up to 500 columns / rows when displaying DataFrames
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# jupyter: make the notebook container use 100% of the available width.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML('<style>.container { width:100% !important; }</style>'))

In [None]:
# File operations (standard library)
import shutil

# DICOM
import pydicom
from pydicom.filereader import dcmread

# TensorFlow / Keras
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Convolution2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Functions 

In [4]:
def subfolder_creator(path):
    """
    This function makes sure a folder exists at `path`, creating it when it
    is missing. Missing parent folders are created as well.
    If something already exists at `path`, nothing is done.
    Inputs:
        - path: path of the folder that should exist.
    Outputs:
        - No outputs returned.
    """
    if not os.path.exists(path):
        # exist_ok=True keeps this from failing if the folder appears between
        # the check above and the creation below.
        os.makedirs(path, exist_ok=True)

        
        
def dcm_to_img(origin_fpath, destination_fpath, PNG=False):
    """
    This function extracts the image information from DICOM (.dcm) files
    and converts it to either .png or .jpg files, saving it in the desired folder.
    Files whose name does not end in '.dcm' (case-insensitive) are skipped.

    NOTE(review): the conversion is written with `cv2.imwrite`, but `cv2` is not
    imported in the import cells of this notebook - confirm it is imported
    before calling this function.

    Inputs:
        - origin_fpath: path where the .dcm files are located.
        - destination_fpath: desired output path of the converted images.
        - PNG: default False. Wether you want the output files in .png (set as True) or .jpg format.
    Outputs:
        - Converted files, either to *.png or *.jpg. Nothing is returned.
    """
    extension = '.png' if PNG else '.jpg'

    converted = 0
    for file_name in os.listdir(origin_fpath):
        # Only swap the real extension: a plain str.replace would also touch
        # a '.dcm' appearing in the middle of the file name.
        base_name, file_ext = os.path.splitext(file_name)
        if file_ext.lower() != '.dcm':
            continue  # not a DICOM file (e.g. hidden/system files): skip it

        ds = pydicom.dcmread(os.path.join(origin_fpath, file_name))
        cv2.imwrite(os.path.join(destination_fpath, base_name + extension),
                    ds.pixel_array)

        converted += 1
        if converted % 1000 == 0:
            print('{} images converted'.format(converted)) # Counter for every 1000 images converted


            
def move_files(origin_fpath, destination_fpath, file_list):
    """
    This function checks that both directories exist, then moves the files whose
    names are contained in the list from the origin to the destination directory.
    If either path is not a directory, a message is printed and nothing is moved.
    Inputs:
        - origin_fpath: path of origin of the files.
        - destination_fpath: desired destination path.
        - file_list: the list of names of the files that are to be moved.
    Outputs:
        - No outputs returned.
    """
    # Guard clause: both ends must be existing directories
    if not (os.path.isdir(origin_fpath) and os.path.isdir(destination_fpath)):
        print("origin_fpath & destination_fpath should be Directories")
        return

    # The names in file_list are relative to the origin directory
    for file_name in file_list:
        shutil.move(os.path.join(origin_fpath, file_name), destination_fpath)

### Paths 

In [5]:
# Root of the data tree and its first-level folders
PATH = 'data/'
CSV_PATH, DICOM_PATH, JPG_PATH, DESTINATION_PATH, MODELS_PATH = (
    os.path.join(PATH, folder)
    for folder in (
        'csv',              # csv datasets
        'pool',             # all of the dicom files
        'pool_jpg',         # all the converted jpg files
        'sorted_balanced',  # train, validation and test subsets
        'model',
    )
)

# One folder per subset inside the destination folder
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(DESTINATION_PATH, subset)
    for subset in ('train', 'validation', 'test')
)

# One folder per class inside each subset
TRAIN_NORMAL_PATH, TRAIN_NNNP_PATH, TRAIN_PNEUMONIA_PATH = (
    os.path.join(TRAIN_PATH, label) for label in ('normal', 'nnnp', 'pneumonia')
)
VAL_NORMAL_PATH, VAL_NNNP_PATH, VAL_PNEUMONIA_PATH = (
    os.path.join(VAL_PATH, label) for label in ('normal', 'nnnp', 'pneumonia')
)
TEST_NORMAL_PATH, TEST_NNNP_PATH, TEST_PNEUMONIA_PATH = (
    os.path.join(TEST_PATH, label) for label in ('normal', 'nnnp', 'pneumonia')
)

### CSV

In [34]:
# Load the 'balanced_cxr_information.csv' table from the csv folder
balanced_csv_file = os.path.join(CSV_PATH, 'balanced_cxr_information.csv')
balanced = pd.read_csv(balanced_csv_file)

## Model Building 

### Preprocessing images 

In [None]:
# Names of the columns which contain the output: type_0, type_1, type_2
columns = ['type_{}'.format(idx) for idx in range(3)]

# Batch size, random seed and image size used for this notebook
BATCH_SIZE, SEED = 32, 42
IMAGE_SIZE = (512,) * 2

In [None]:
# One independent generator per subset; each one only rescales pixel values
# by 1/255 (no augmentation is configured).
train_datagen, val_datagen, test_datagen = (
    ImageDataGenerator(rescale=1./255) for _ in range(3)
)

In [None]:
# Settings shared by the three directory iterators.
# - target_size / batch_size come from the IMAGE_SIZE / BATCH_SIZE constants
#   declared for this notebook instead of repeating literal values.
# - class_mode='categorical' yields one-hot labels, which is what a 3-unit
#   output layer trained with categorical_crossentropy needs ('binary' only
#   suits two-class problems).
flow_kwargs = dict(
        target_size=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='categorical')

# Training images: shuffled (the default), with a fixed seed for reproducibility
train_generator = train_datagen.flow_from_directory(
        TRAIN_PATH,  # This is the source directory for training images
        seed=SEED,
        **flow_kwargs)

# Validation images: read in a fixed order (shuffle=False) so that predictions
# line up with val_generator.classes / val_generator.filenames
val_generator = val_datagen.flow_from_directory(
        VAL_PATH,  # This is the source directory for validation images
        shuffle=False,
        **flow_kwargs)

# Test images: fixed order as well, so that predictions line up with
# test_generator.filenames
test_generator = test_datagen.flow_from_directory(
        TEST_PATH,  # This is the source directory for test images
        shuffle=False,
        **flow_kwargs)

### Network creation 

In [None]:
# Our input feature map is 512x512x3: 512x512 for the image pixels, and 3 for
# the three color channels: R, G, and B
img_input = layers.Input(shape=(512, 512, 3))

# Three convolutional stages extracting 16, 32 and 64 filters of size 3x3.
# In every stage the convolution is followed by a max-pooling layer with a
# 2x2 window and by a dropout layer (rate 0.25) to try to avoid overfitting.
x = img_input
for n_filters in (16, 32, 64):
    x = layers.Conv2D(n_filters, 3, activation='relu')(x)
    x = layers.MaxPooling2D(2)(x)
    x = layers.Dropout(0.25)(x)

In [None]:
# Flatten feature map to a 1-dim tensor so we can add fully connected layers
x = layers.Flatten()(x)

# Create a fully connected layer with ReLU activation and 512 hidden units
x = layers.Dense(512, activation='relu')(x)

# Create output layer with one node per class (3) and softmax activation.
# Softmax turns the three scores into a probability distribution over
# mutually exclusive classes, which is what the categorical_crossentropy loss
# expects; independent sigmoid outputs do not sum to 1.
output = layers.Dense(3, activation='softmax')(x)

# Create model:
# input = input feature map
# output = input feature map + stacked convolution/maxpooling layers + fully
# connected layer + softmax output layer
model = Model(img_input, output)

#### Summary

In [None]:
# Print the layer-by-layer summary of the model built above
model.summary()

#### Compilation 

In [None]:
# Use the RMSprop class imported from tensorflow.keras.optimizers: the bare
# `tensorflow` module name is never imported in this notebook, so referencing
# it would raise a NameError.
# `learning_rate` replaces the deprecated `lr` argument name.
# NOTE(review): recent Keras optimizers no longer accept `decay`; if this
# raises on the installed version, drop it or use a learning-rate schedule.
model.compile(optimizer=RMSprop(learning_rate=0.0001, decay=1e-6),
              loss='categorical_crossentropy',
              metrics=['accuracy', 'AUC'])

### Fitting

#### Setting constants: 

In [None]:
# Training: number of whole batches per epoch, but never fewer than one step
# (floor division alone gives 0 when there are fewer images than batch_size).
STEP_SIZE_TRAIN = max(1, train_generator.n // train_generator.batch_size)

# Validation / test: round up so the last, partial batch is included and
# every image is evaluated / predicted exactly once.
STEP_SIZE_VAL = (val_generator.n + val_generator.batch_size - 1) // val_generator.batch_size
STEP_SIZE_TEST = (test_generator.n + test_generator.batch_size - 1) // test_generator.batch_size

#### Fitting 

In [None]:
# Model.fit accepts generators directly; fit_generator is deprecated.
history = model.fit(train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=val_generator,
                    validation_steps=STEP_SIZE_VAL,
                    epochs=1,
                    verbose=1)

### Saving 

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("3_class_flow_dir_1.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("3_class_flow_dir_1.h5")
print("Saved model to disk")

## Predict the output

In [None]:
# Restart the iterator at its first batch before predicting
test_generator.reset()

# Model.predict accepts generators directly; predict_generator is deprecated.
# len(test_generator) is the number of batches needed to go through every
# test image once (last partial batch included), so there is one prediction
# per entry of test_generator.filenames.
# NOTE: rows only line up with test_generator.filenames if the generator was
# created with shuffle=False.
pred = model.predict(test_generator,
                     steps=len(test_generator),
                     verbose=1)

In [None]:
# Each image belongs to exactly one class, so flag only the most likely class
# of every row (one-hot of the arg-max). A fixed 0.5 threshold could flag no
# class at all, or several classes, for the same image.
pred_bool = np.eye(pred.shape[1], dtype=bool)[np.argmax(pred, axis=1)]

In [None]:
predictions = pred_bool.astype(int)

# Every prediction row needs its file name; fail early with a clear message
# if the number of predictions does not match the number of test files.
filenames = test_generator.filenames
if len(filenames) != len(predictions):
    raise ValueError(
        "Got {} predictions for {} test files; predict over every batch "
        "of test_generator".format(len(predictions), len(filenames)))

# columns should be the same order of y_col
results = pd.DataFrame(predictions, columns=columns)
results["Filenames"] = filenames
ordered_cols = ["Filenames"] + columns
results = results[ordered_cols]  # File name first, then the output columns
results.to_csv("results.csv", index=False)

## Metrics

In [None]:
print(history.history.keys())

# Depending on the Keras version the accuracy metric is logged as
# 'accuracy' / 'val_accuracy' or as 'acc' / 'val_acc'; use whichever key
# this run produced instead of failing with a KeyError.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'

# One figure per metric: accuracy first, then loss. The second curve of each
# figure comes from the validation data passed to fit, hence its legend label.
for metric_key, metric_label in ((acc_key, 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title('model ' + metric_label)
    plt.ylabel(metric_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# NOTE(review): Sequence is not used in this cell; the import is kept in case
# other cells rely on it.
from tensorflow.python.keras.utils.data_utils import Sequence

# Confusion Matrix and Classification Report

# Read the validation folder with a dedicated, unshuffled iterator so that the
# i-th prediction corresponds to the i-th entry of eval_generator.classes.
eval_generator = val_datagen.flow_from_directory(
        VAL_PATH,
        target_size=val_generator.target_size,
        batch_size=val_generator.batch_size,
        class_mode=val_generator.class_mode,
        shuffle=False)

# len(eval_generator) batches cover every validation image exactly once, so
# there are as many predictions as true labels.
Y_pred = model.predict(eval_generator, steps=len(eval_generator))
y_pred = np.argmax(Y_pred, axis=1)
y_true = eval_generator.classes

# Class indices are assigned by the generator (see class_indices), not by the
# order we would like to display, so derive the names from that mapping.
class_indices = eval_generator.class_indices
display_names = {'normal': 'Normal', 'nnnp': 'NNNP', 'pneumonia': 'Pneumonia'}
target_names = [display_names.get(folder, folder)
                for folder in sorted(class_indices, key=class_indices.get)]
labels = list(range(len(target_names)))

print('Confusion Matrix')
print(confusion_matrix(y_true, y_pred, labels=labels))
print('Classification Report')
print(classification_report(y_true, y_pred, labels=labels, target_names=target_names))