In [None]:
import os
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, Adamax
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, GlobalAveragePooling2D, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.applications import EfficientNetB3

from tensorflow.keras.metrics import F1Score

import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!unzip /content/drive/MyDrive/lung_colon_image_set.zip

In [None]:
# loading the dataset
def loading_data_to_df(data_dir):
    """Index an image directory laid out as ``data_dir/<class_folder>/<image>``.

    Parameters
    ----------
    data_dir : str
        Directory whose immediate sub-directories are the class folders.

    Returns
    -------
    pandas.DataFrame
        One row per file with two columns: 'filepaths' (path built with
        ``os.path.join``) and 'labels' (name of the containing class folder).
        Empty, but with both columns, when no files are found.
    """
    filepaths = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and any seeded split done on it later) reproducible.
    for fold in sorted(os.listdir(data_dir)):
        foldpath = os.path.join(data_dir, fold)

        # Stray top-level files (README, .DS_Store, ...) are not class folders;
        # calling os.listdir on them would raise NotADirectoryError.
        if not os.path.isdir(foldpath):
            continue

        for file in sorted(os.listdir(foldpath)):
            fpath = os.path.join(foldpath, file)

            # Only regular files can be images; skip nested directories.
            if not os.path.isfile(fpath):
                continue

            filepaths.append(fpath)
            labels.append(fold)

    # Combine data paths and labels into one DataFrame
    df = pd.DataFrame({'filepaths': filepaths, 'labels': labels})

    return df


# map the short folder codes to their descriptive class names
def change_label_names(df, column_name):
    """Rewrite the class codes in ``df[column_name]`` to readable names, in place.

    Values that are not one of the five known codes are left untouched.
    The frame is modified directly; nothing is returned.
    """
    readable_names = {
        'lung_aca': 'Lung_adenocarcinoma',
        'lung_n': 'Lung_benign_tissue',
        'lung_scc': 'Lung squamous_cell_carcinoma',
        'colon_aca': 'Colon Adenocarcinoma',
        'colon_n': 'Colon Benign Tissue',
    }

    renamed = df[column_name].replace(readable_names)
    df[column_name] = renamed

In [None]:
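# One-time step, kept commented out: index the extracted image folders and cache the result as LC25000.csv on Drive.
# lung_data_dir = '/content/lung_image_sets'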
# colon_data_dir = '/content/colon_image_sets'
# df1 = loading_data_to_df(lung_data_dir)
# df2 = loading_data_to_df(colon_data_dir)
# change_label_names(df1, 'labels')
# change_label_names(df2, 'labels')
# final_df = pd.concat([df1, df2], ignore_index=True, axis=0)
# final_df.to_csv("/content/drive/My Drive/LC25000.csv", index=False)

In [None]:
# Load the file-path/label index from Drive; later cells read its 'filepaths' and 'labels' columns.
final_df = pd.read_csv("/content/drive/My Drive/LC25000.csv")

In [None]:
# Show the loaded index as a quick visual check.
final_df

In [None]:
# Number of images per class label in the full index
data_balance = final_df['labels'].value_counts()


def custom_autopct(pct):
    """Format a pie-wedge label as percentage plus the absolute image count.

    ``pct`` is the wedge's share in percent; the count is recovered from it
    using the total of ``data_balance``.
    """
    count = int(round(pct * data_balance.sum() / 100.0))
    return "{:.1f}%\n({:d})".format(pct, count)


# pie chart for data balance
fig, ax = plt.subplots()
ax.pie(data_balance, labels=data_balance.index, autopct=custom_autopct)
ax.set_title("Training data balance")
ax.axis("equal")
plt.show()

In [None]:
# First cut: 80 % of the rows for training, 20 % held out.
train_df, ts_df = train_test_split(
    final_df,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Second cut: the held-out rows are divided evenly into validation and test.
valid_df, test_df = train_test_split(
    ts_df,
    train_size=0.5,
    shuffle=True,
    random_state=42,
)

In [None]:
batch_size = 32
img_size = (224, 224)

# Both generators only scale pixel values by 1/255; no augmentation is applied.
tr_gen = ImageDataGenerator(rescale=1. / 255)
ts_gen = ImageDataGenerator(rescale=1. / 255)

# Arguments shared by all three flows; only the frame and the shuffle flag differ.
flow_kwargs = dict(
    x_col='filepaths',
    y_col='labels',
    target_size=img_size,
    class_mode='categorical',
    color_mode='rgb',
    batch_size=batch_size,
)

train_gen = tr_gen.flow_from_dataframe(train_df, shuffle=True, **flow_kwargs)

valid_gen = ts_gen.flow_from_dataframe(valid_df, shuffle=True, **flow_kwargs)

# The test flow is not shuffled so predictions stay aligned with test_gen.classes.
test_gen = ts_gen.flow_from_dataframe(test_df, shuffle=False, **flow_kwargs)

In [None]:
# g_dict = train_gen.class_indices      # defines dictionary {'class': index}
# classes = list(g_dict.keys())       # defines list of dictionary's keys (classes), classes names : string
# images, labels = next(train_gen)      # get a batch size samples from the generator

# # plotting the batch-size samples
# plt.figure(figsize= (12, 12))

# for i in range(batch_size):
#     plt.subplot(4, 4, i + 1)
#     image = images[i]
#     plt.imshow(image)
#     index = np.argmax(labels[i])
#     class_name = classes[index]   # get class of image
#     plt.title(class_name, color= 'black', fontsize= 16)
#     plt.axis('off')
# plt.tight_layout()
# plt.show()

In [None]:
def conv_block(filters, act='relu'):
    """Return a Sequential sub-model: two 3x3 'same'-padded convolutions with
    ``filters`` channels and activation ``act``, then batch normalisation and
    max pooling (default pool settings).
    """
    return Sequential([
        Conv2D(filters, 3, activation=act, padding='same'),
        Conv2D(filters, 3, activation=act, padding='same'),
        BatchNormalization(),
        MaxPooling2D(),
    ])

def dense_block(units, dropout_rate, act='relu'):
    """Return a Sequential sub-model: a Dense layer with ``units`` outputs and
    activation ``act``, then batch normalisation and dropout at ``dropout_rate``.
    """
    return Sequential([
        Dense(units, activation=act),
        BatchNormalization(),
        Dropout(dropout_rate),
    ])

In [None]:
# Model input geometry: height, width, colour channels
img_size = (224, 224)
channels = 3
img_shape = (*img_size, channels)

# Output width of the classifier = number of classes found by the training generator
class_counts = len(train_gen.class_indices)

In [None]:
cnn_model = Sequential()

# Stem: a single 16-filter convolution on the raw input
cnn_model.add(Conv2D(16, (3, 3), padding="same", activation="relu", input_shape=img_shape))
cnn_model.add(BatchNormalization())
cnn_model.add(MaxPooling2D())

# Feature extractor: four double-convolution blocks of increasing width
for n_filters in (32, 96, 192, 384):
    cnn_model.add(conv_block(n_filters))

cnn_model.add(Flatten())

# Classifier head: three dense blocks, narrowing, with decreasing dropout
for n_units, drop_rate in ((100, 0.5), (64, 0.3), (32, 0.2)):
    cnn_model.add(dense_block(n_units, drop_rate))

# One softmax output per class
cnn_model.add(Dense(class_counts, activation="softmax"))

In [None]:
from sklearn.metrics import f1_score

def f1_score_(y_true, y_pred):
    """Weighted F1 score between one-hot targets and predicted class scores.

    Both arguments must expose ``.numpy()`` (i.e. be eager tensors) and are
    reduced to class indices with an arg-max over axis 1 before scoring.
    The score covers only the arrays passed in this call.
    """
    true_idx = y_true.numpy().argmax(axis=1)
    pred_idx = y_pred.numpy().argmax(axis=1)
    return f1_score(true_idx, pred_idx, average='weighted')

In [None]:
cnn_optimizer = Adamax(learning_rate=0.001)

cnn_model.compile(
    optimizer=cnn_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy', f1_score_],
    run_eagerly=True,  # f1_score_ calls .numpy() on its inputs, which needs eager tensors
)

cnn_model.summary()

In [None]:
epochs = 10

# Train the custom CNN; the returned object's `.history` holds the per-epoch metrics.
history = cnn_model.fit(
    train_gen,
    epochs=epochs,
    verbose=1,
    validation_data=valid_gen,
    shuffle=False,
)

In [None]:
def model_performance(history, Epochs):
    """Plot loss and accuracy/F1 learning curves of one training run.

    Parameters
    ----------
    history : keras History object or dict
        Either the object returned by ``model.fit`` (its ``.history`` dict is
        used) or a plain dict mapping metric name to a per-epoch list. The
        keys 'accuracy', 'loss', 'val_accuracy', 'val_loss', 'f1_score_' and
        'val_f1_score_' must be present.
    Epochs : any
        Unused. Kept so existing calls keep working; the x-axis is derived
        from the number of recorded epochs instead.

    Raises
    ------
    KeyError
        If one of the required metric keys is missing.
    """
    # model.fit returns a History object, which is not subscriptable; the
    # per-epoch values live in its `.history` dict. Plain dicts pass through.
    metrics = getattr(history, 'history', history)

    tr_acc = metrics['accuracy']
    tr_loss = metrics['loss']
    val_acc = metrics['val_accuracy']
    val_loss = metrics['val_loss']
    tr_f1 = metrics['f1_score_']
    val_f1 = metrics['val_f1_score_']

    # 1-based epoch numbers for the x-axis
    epoch_axis = list(range(1, len(tr_acc) + 1))

    # Plot training history
    plt.figure(figsize=(20, 8))

    # Left panel: loss
    plt.subplot(1, 2, 1)
    plt.plot(epoch_axis, tr_loss, 'r', label='Training loss')
    plt.plot(epoch_axis, val_loss, 'g', label='Validation loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    # Right panel: accuracy and F1 share one axis
    plt.subplot(1, 2, 2)
    plt.plot(epoch_axis, tr_acc, 'r', label='Training Accuracy')
    plt.plot(epoch_axis, val_acc, 'g', label='Validation Accuracy')
    plt.plot(epoch_axis, tr_f1, label='Training F1 score')
    plt.plot(epoch_axis, val_f1, label='Validation F1 score')
    plt.title('Training and Validation Accuracy/F1 scores')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy / F1 score')
    plt.legend()

    # Must be called: the bare attribute `plt.tight_layout` does nothing.
    plt.tight_layout()
    plt.show()

In [None]:
# model.fit returns a History object; the per-epoch metric lists that
# model_performance indexes by key are in its `.history` dict.
model_performance(history.history, epochs)

In [None]:
# Evaluate the model
def model_evaluation(model):
    """Evaluate ``model`` on the notebook's train, validation and test
    generators and print the first two returned values (loss, accuracy) for
    each split.

    Reads the module-level ``train_gen``, ``valid_gen`` and ``test_gen``.
    """
    # All three evaluations run before anything is printed.
    results = [
        ("Train", model.evaluate(train_gen, verbose=1)),
        ("Validation", model.evaluate(valid_gen, verbose=1)),
        ("Test", model.evaluate(test_gen, verbose=1)),
    ]

    for position, (split_name, score) in enumerate(results):
        if position > 0:
            print('-' * 20)  # separator between splits
        print(split_name + " Loss: ", score[0])
        print(split_name + " Accuracy: ", score[1])


# Get Predictions
def get_pred(model, test_gen):
    """Return the predicted class index for every sample ``model.predict``
    yields on ``test_gen`` (arg-max over axis 1 of the prediction scores).
    """
    return np.argmax(model.predict(test_gen), axis=1)


# Confusion Matrix
def plot_confusion_matrix(test_gen, y_pred):
    """Draw the confusion matrix of ``y_pred`` against ``test_gen.classes``.

    Tick labels are the keys of ``test_gen.class_indices``; each cell shows
    its count, written in white when the count exceeds half the largest cell
    and in black otherwise.
    """
    class_names = list(test_gen.class_indices)

    # Rows: true classes, columns: predicted classes
    matrix = confusion_matrix(test_gen.classes, y_pred)

    plt.figure(figsize=(10, 10))
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()

    positions = np.arange(len(class_names))
    plt.xticks(positions, class_names, rotation=45)
    plt.yticks(positions, class_names)

    # Switch text colour on dark cells so the numbers stay readable
    half_max = matrix.max() / 2.
    for (row, col), count in np.ndenumerate(matrix):
        text_color = 'white' if count > half_max else 'black'
        plt.text(col, row, count, horizontalalignment='center', color=text_color)

    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    plt.show()


In [None]:
# Print loss and accuracy of the custom CNN on the train, validation and test generators.
model_evaluation(cnn_model)

In [None]:
# Predicted class indices of the custom CNN on the test generator
# (test_gen was created with shuffle=False).
y_pred = get_pred(cnn_model, test_gen)

# Compare the predictions with test_gen.classes.
plot_confusion_matrix(test_gen, y_pred)

In [None]:
class_counts = len(list(train_gen.class_indices.keys()))
img_size = (224, 224)
channels = 3
img_shape = (img_size[0], img_size[1], channels)

# get the pre-trained model (EfficientNetB3), without its ImageNet classifier head
base_model = EfficientNetB3(weights='imagenet', include_top=False, input_shape = img_shape, pooling= None)

# Transfer learning by feature extraction: the backbone is frozen and only the
# custom head below is trained (this is not fine-tuning).
base_model.trainable = False

# The data generators rescale pixels to [0, 1], but Keras EfficientNet models
# contain their own preprocessing and expect raw pixel values in [0, 255].
# Undo the generator's 1/255 scaling so the backbone sees the range it was
# trained on.
inputs = keras.Input(shape=img_shape)
x = keras.layers.Rescaling(255.0)(inputs)

# training=False keeps the frozen backbone (incl. its BatchNormalization
# layers) in inference mode while the head is trained.
x = base_model(x, training=False)

# Custom classification head on top of the backbone features
x = GlobalAveragePooling2D()(x)
x = BatchNormalization()(x)
x = dense_block(128, 0.5)(x)
x = dense_block(32, 0.2)(x)
predictions = Dense(class_counts, activation = "softmax")(x)    # output layer with softmax activation

# the model
EfficientNetB3_model = Model(inputs = inputs, outputs = predictions)

In [None]:
effnet_optimizer = Adamax(learning_rate=0.0001)

EfficientNetB3_model.compile(
    optimizer=effnet_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy', f1_score_],
    run_eagerly=True,  # f1_score_ calls .numpy() on its inputs, which needs eager tensors
)

#EfficientNetB3_model.summary()

In [None]:
epochs = 10

# Train the EfficientNetB3-based model on the same generators as the custom CNN.
EfficientNetB3_history = EfficientNetB3_model.fit(
    train_gen,
    epochs=epochs,
    verbose=1,
    validation_data=valid_gen,
    shuffle=False,
)

In [None]:
# Hand-entered per-epoch metrics (10 values per key), using the same keys that
# model_performance reads.
# NOTE(review): these numbers are hard-coded, not produced by any cell shown here,
# and `new_history` is not referenced by the cells below -- confirm where they come
# from and whether this cell is still needed.
new_history = {
    "accuracy": [0.6579, 0.6582, 0.8915, 0.9327, 0.9633, 0.9728, 0.9836, 0.9863, 0.9866, 0.9904],
    "loss": [0.7873, 0.5866, 0.2897, 0.1963, 0.1315, 0.1053, 0.0764, 0.0598, 0.0568, 0.0427],
    "val_accuracy": [0.3707, 0.4607, 0.6820, 0.9773, 0.9820, 0.9835, 0.9878, 0.9876, 0.9920, 0.9920],
    "val_loss": [1.2038, 1.01, 0.7959, 0.6882, 0.155, 0.125, 0.105, 0.095, 0.093, 0.091],
    "f1_score_": [0.6529, 0.6482, 0.8815, 0.9307, 0.9603, 0.9708, 0.9806, 0.9803, 0.9806, 0.9894],
    "val_f1_score_": [0.3307, 0.4407, 0.6620, 0.9703, 0.9800, 0.9805, 0.9808, 0.9806, 0.9900,  0.9900]
}

In [None]:
# model.fit returns a History object; the per-epoch metric lists that
# model_performance indexes by key are in its `.history` dict.
model_performance(EfficientNetB3_history.history, epochs)

In [None]:
# Print loss and accuracy of the EfficientNetB3-based model on the train, validation and test generators.
model_evaluation(EfficientNetB3_model)

In [None]:
# Predicted class indices of the EfficientNetB3-based model on the test generator
# (this rebinds `y_pred`, replacing the custom CNN's predictions).
y_pred = get_pred(EfficientNetB3_model, test_gen)

# Compare the predictions with test_gen.classes.
plot_confusion_matrix(test_gen, y_pred)