# Experimentos de classificação do _dataset_ das faixas usando VGG16

In [1]:
!rm -rf /kaggle/working/*
!export TF_USE_LEGACY_KERAS=True

!pip install split-folders

Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl.metadata (6.2 kB)
Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1


In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score

from typing import List

2024-05-25 02:07:43.252309: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-25 02:07:43.252402: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-25 02:07:43.383135: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import os
import splitfolders
import shutil

In [4]:
def recursive_rmdir(folder):
    for item in os.listdir(folder):
        item_path = os.path.join(folder, item)
        
        if os.path.isdir(item_path):
            recursive_rmdir(item_path)
            os.rmdir(item_path)
        else:
            os.remove(item_path)
            

def organize_dataset(source_folder, destination_folder):    
    if os.path.exists(destination_folder):
        recursive_rmdir(destination_folder)
    
    # Discover all classes
    classes = set()
    for subfolder in os.listdir(source_folder):
        current_folder = os.path.join(source_folder, subfolder)
        discovered_classes = os.listdir(current_folder)
        classes = classes.union(discovered_classes)
    classes = list(classes)
    classes.sort()

    # Create a folder for each class
    if not os.path.exists(destination_folder):
        os.mkdir(destination_folder)
    
    for _class in classes:
        class_directory = os.path.join(destination_folder, _class)
        os.mkdir(class_directory)
    
    for subfolder in os.listdir(source_folder):
        current_path = os.path.join(source_folder, subfolder)
        for class_folder in os.listdir(current_path):
            current_path = os.path.join(source_folder, subfolder, class_folder)
            for file in os.listdir(current_path):
                
                file_source_path = os.path.join(
                    source_folder,
                    subfolder,
                    class_folder,
                    file
                )
                
                file_destination_path = os.path.join(
                    destination_folder,
                    class_folder,
                    file
                )
                
                shutil.copy2(
                    file_source_path,
                    file_destination_path,
                )

    return classes

In [5]:
def train_test_validation_split(
    source: str,
    destination: str,
    seed: int | None = None
):
    if seed is None:
        seed = np.random.randint(999999)
    print(f"Dataset's split seed is {seed}")

    if not os.path.isdir(destination):
        splitfolders.ratio(source, output=destination,
            seed=seed, ratio=(.6, .2, .2), move=False)

In [6]:
import matplotlib.pyplot as plt

def plot_model_history(history):
    fig, [ax1, ax2] = plt.subplots(1, 2, figsize=(6, 4.4))

    ax1.plot(history.history['accuracy'])
    ax1.plot(history.history['val_accuracy'])
    ax1.set_title('Model accuracy')
    ax1.set_ylabel('accuracy')
    ax1.set_xlabel('epoch')
    ax1.legend(['train', 'test'], loc='upper left')

    ax2.plot(history.history['loss'])
    ax2.plot(history.history['val_loss'])
    ax2.set_title('Model loss')
    ax2.set_ylabel('loss')
    ax2.set_xlabel('loss')
    ax2.legend(['train', 'test'], loc='upper left')

    fig.show()

In [7]:
def build_model(
        model_name: str,
        loss: tf.keras.losses.Loss,
        optimizer: tf.keras.optimizers.legacy.Optimizer,
        metrics: List[str],
        use_transfer_learning=False
    ):
    model = tf.keras.models.Sequential(name=model_name)
    model.add(tf.keras.layers.Rescaling(1./255))

    if use_transfer_learning:
        vgg_16 = tf.keras.applications.VGG16(
            include_top=False,
            input_shape=(224, 224, 3),
            classes=4,
            weights="imagenet",
            pooling="max",
        )

        for layer in vgg_16.layers:
            layer.freeze = True

        model.add(vgg_16)
        model.add(tf.keras.layers.Flatten())
        model.add(tf.keras.layers.Dense(4096, activation="relu"))
        model.add(tf.keras.layers.Dense(4096, activation="relu"))
        model.add(tf.keras.layers.Dense(4, activation="softmax"))

    else:
        vgg_16 = tf.keras.applications.VGG16(
            include_top=True,
            input_shape=(224, 224, 3),
            classes=4,
            weights=None,
            classifier_activation="softmax"
        )

        model.add(vgg_16)

    model.compile(
        loss=loss,
        optimizer=optimizer,
        metrics=metrics,
    )
    model.build((None, 224, 224, 3))

    return model

In [8]:
def train_model(
        model: tf.keras.Model,
        train_dataset,
        validation_dataset,
    ):
    hist = model.fit(
        train_dataset,
        validation_data=validation_dataset,
        epochs=9999,
        callbacks=[
            tf.keras.callbacks.ModelCheckpoint(
                model.name + '.keras',
                monitor='val_accuracy',
                verbose=1,
                save_best_only=True,
                save_weights_only=False,
                mode='auto',
                save_freq="epoch",
            ),
            tf.keras.callbacks.EarlyStopping(
                monitor='val_accuracy',
                min_delta=0,
                patience=20,
                verbose=1,
                mode='auto',
            ),
        ],
    )

    return hist

In [9]:
def get_true_and_predicted_labels(model, dataset):
    y_true = []
    y_pred = []
    
    for X_batch, y_batch in dataset:
        y_batch_pred = np.argmax(model.predict(X_batch, verbose=0), axis=1)
        
        y_true.append(y_batch)
        y_pred.append(y_batch_pred)
    
    y_true = [y_batch.numpy() for y_batch in y_true]
    
    return np.concatenate(y_true), np.concatenate(y_pred)

In [10]:
classes = organize_dataset(
    os.path.join("/", "kaggle", "input", "chest-xray-pneumoniacovid19tuberculosis"),
    os.path.join("/", "kaggle", "working", "temp"),
)
train_test_validation_split(
    os.path.join("/", "kaggle", "working", "temp"),
    os.path.join("/", "kaggle", "working", "dataset"),
    seed=892471
)

print("Found classes:", classes)

Dataset's split seed is 892471


Copying files: 7132 files [00:02, 2724.35 files/s]

Found classes: ['COVID19', 'NORMAL', 'PNEUMONIA', 'TURBERCULOSIS']





In [11]:
print("\n[ TRAIN DATASET ]")
train_dataset = train_set = tf.keras.utils.image_dataset_from_directory(
    os.path.join("/", "kaggle", "working", "dataset", "train"),
    labels='inferred',
    label_mode='int',
    color_mode='rgb',
    batch_size=32,
    image_size=(224, 224),
    shuffle=True,
    interpolation='bilinear',
)

print("\n[ VALIDATION DATASET ]")
validation_dataset = tf.keras.utils.image_dataset_from_directory(
    os.path.join("/", "kaggle", "working", "dataset", "val"),
    labels='inferred',
    label_mode='int',
    color_mode='rgb',
    batch_size=32,
    image_size=(224, 224),
    shuffle=True,
    interpolation='bilinear',
)

print("\n[ TEST DATASET ]")
test_dataset = tf.keras.utils.image_dataset_from_directory(
    os.path.join("/", "kaggle", "working", "dataset", "test"),
    labels='inferred',
    label_mode='int',
    color_mode='rgb',
    batch_size=32,
    image_size=(224, 224),
    shuffle=True,
    interpolation='bilinear',
)


[ TRAIN DATASET ]
Found 4277 files belonging to 4 classes.

[ VALIDATION DATASET ]
Found 1425 files belonging to 4 classes.

[ TEST DATASET ]
Found 1430 files belonging to 4 classes.


In [12]:
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.00000025, name="optimizer")
loss = tf.keras.losses.SparseCategoricalCrossentropy(name="loss")

ImportError: `keras.optimizers.legacy` is not supported in Keras 3. When using `tf.keras`, to continue using a `tf.keras.optimizers.legacy` optimizer, you can install the `tf_keras` package (Keras 2) and set the environment variable `TF_USE_LEGACY_KERAS=True` to configure TensorFlow to use `tf_keras` when accessing `tf.keras`.

In [None]:
model_name = "model"

model = build_model(
    model_name,
    loss,
    optimizer,
    ["accuracy"],
    use_transfer_learning=True,
)

model_history = train_model(model, train_dataset, validation_dataset)

# Save history
dataframe = pd.DataFrame(model_history.history)
dataframe.to_csv("history.csv", index_label="model_name", header=True, index=True)

# Testando o melhor modelo

In [None]:
best_model = tf.keras.saving.load_model("model.keras")
y_true, y_pred = get_true_and_predicted_labels(model, test_dataset)

model_accuracy = accuracy_score(y_true, y_pred)

# Saving confusion matrix
ConfusionMatrixDisplay.from_predictions(
    y_true, y_pred,
    display_labels=classes
)
plt.savefig("confusion_matrix.png")

print(classification_report(y_true, y_pred, target_names=classes))

In [None]:
!zip -r results.zip model.keras confusion_matrix.png history.csv