## Libraries

In [7]:
import os

import pandas as pd
import numpy as np

from glob import glob
from PIL import Image

import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
rcParams['figure.figsize'] = 15, 5

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder


import tensorflow as tf
from keras.utils import to_categorical # convert to one-hot-encoding

import warnings
warnings.filterwarnings('ignore')

import cv2

from utils import *

## Data Download

In [2]:
# Load data
project_path = 'C:\\Users\\Martin\\OneDrive - NOVAIMS\\Documents\\Universidade\\3º Ano\\Deep Learning\\Project'

# As we know, there are 2 folders with images, one for training and one for testing, and there is also the metadata of all those pictures in a single excel file.
# Let's load the csv file and see what it looks like.

metadata = pd.read_csv(os.path.join(project_path, 'HAM10000_metadata.csv'))

# Now to get all the images from the train folder
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(project_path, '*', '*.jpg'))}

## Train, Test Slip

In [3]:
# Now we can add a new column to the dataframe with the path to the image
metadata['path'] = metadata['image_id'].map(imageid_path_dict.get)

# We can also add a column with the actual image
metadata['image'] = metadata['path'].map(lambda x: cv2.imread(x))

# Let's now separate the dataset into train and test based on whether train or test is in the path
train = metadata[metadata['path'].str.contains("train")]
test = metadata[metadata['path'].str.contains("test")]

## EDA

### Check for duplicates

In [4]:
train.lesion_id.duplicated().sum()

1513

Let's drop them

In [5]:
train.drop_duplicates(subset='lesion_id', keep='first', inplace=True)

## One-Hot Encoding

Using categorical data is not the best approach, we will use `LabelEncoder` to transform the target column into *numerical values*, which we will then use the function `to_categorical` to transform into *one-hot encoding*.


In [8]:
label_encoder = LabelEncoder()
train['CancerType'] = label_encoder.fit_transform(train['dx'])
test['CancerType'] = label_encoder.fit_transform(test['dx'])

In [9]:
#quick check of the values in the training set
train.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,image,CancerType
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,C:\Users\Martin\OneDrive - NOVAIMS\Documents\U...,,2
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,C:\Users\Martin\OneDrive - NOVAIMS\Documents\U...,,2
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,C:\Users\Martin\OneDrive - NOVAIMS\Documents\U...,,2
6,HAM_0002761,ISIC_0029176,bkl,histo,60.0,male,face,C:\Users\Martin\OneDrive - NOVAIMS\Documents\U...,,2
8,HAM_0005132,ISIC_0025837,bkl,histo,70.0,female,back,C:\Users\Martin\OneDrive - NOVAIMS\Documents\U...,,2


In [10]:
# train['dx'].value_counts()
# make sense to leave this here since marcel did the transformation  two cells ago from "dx" to "CancerType"??

dx
nv       4253
bkl       603
mel       534
bcc       280
akiec     189
vasc       80
df         59
Name: count, dtype: int64

In [11]:
# checking for unbalances
train['CancerType'].value_counts()

CancerType
5    4253
2     603
4     534
1     280
0     189
6      80
3      59
Name: count, dtype: int64

### Balancing the Dataset
As we saw before, the dataset is not balanced, so we should balance it before training the model.
Some of the approaches we can use are:
- Undersampling
- Oversampling
- SMOTE
- Class Weights
- Ensemble Methods
- Data Augmentation

We should extract the `target` column from both train and test and remove it from their dataframes

In [12]:
y_train = train[['image_id', 'CancerType']]
y_train.set_index('image_id', inplace=True)
y_train = to_categorical(y_train, num_classes = 7)

In [13]:
y_test = test[['image_id', 'CancerType']]
y_test.set_index('image_id', inplace=True)
y_test = to_categorical(y_test, num_classes = 7)

In [14]:
X = train.drop(['dx', 'CancerType'], axis=1)
X.set_index('image_id', inplace=True)

In [15]:
X_test = test.drop(['dx', 'CancerType'], axis=1)
X_test.set_index('image_id', inplace=True)

In [16]:
y_integers = np.argmax(y_train, axis=1)
class_weights = compute_class_weight(class_weight = 'balanced',
                                                  classes = np.unique(y_integers),
                                                  y = y_integers)
d_class_weights = dict(enumerate(class_weights))


NameError: name 'compute_class_weight' is not defined

In [None]:
d_class_weights

In [None]:
df_train.head()

In [None]:
X.head()

In [None]:
y_train

In [None]:
# Convert the images to numpy arrays
X_image = np.asarray(X['image'].tolist())
X_test_image = np.asarray(X_test['image'].tolist())

In [None]:
X_image.shape

In [None]:
X_image

## 2. Verify data quality

In [None]:
def get_image_dimensions(image_list):
    """
    This function prints the largest and smallest dimensions of the images in the list
    Args:
        image_list: list of images
    """
    
    # List for storing image dimensions
    largest_width, largest_height = 0, 0
    smallest_width, smallest_height = float('inf'), float('inf')
    
    for image in image_list:
        # Get the width and height of the image
        height, width, _ = image.shape
    
        # Update largest and smallest dimensions if necessary
        largest_width = max(largest_width, width)
        largest_height = max(largest_height, height)
        smallest_width = min(smallest_width, width)
        smallest_height = min(smallest_height, height)
        
    print("Largest Image : {}x{}".format(largest_width, largest_height))
    print("Smallest Image : {}x{}".format(smallest_width, smallest_height))

In [None]:
get_image_dimensions(X_image)

All images have the same size, so we don't need to resize them.

## 3. Image Enhancement

In [None]:
def apply_contrast_enhancement(images_data, size = (299, 224), alpha = 1.3, beta = 0.5, display = False):
    """
    This function applies contrast enhancement to the images in a dataset.
    Args:
        images_data: list of images
        size: size to which the images should be resized
        alpha: contrast control (1.0 means no change)
        beta: brightness control (0 means no change)
        display: whether to display the images before and after the contrast enhancement
    Returns:
        images_data_processed: numpy array of processed images
    """

    # Lists to store the processed images
    images_data_processed = []

    # Apply contrast enhancement to each image
    for img in images_data:
        # Resize it
        img = cv2.resize(img, size)

        # Apply contrast enhancement
        enhanced_img = cv2.convertScaleAbs(img, alpha=alpha, beta=beta)

        # Append the processed image to the list
        images_data_processed.append(enhanced_img)

    # Convert the processed lists to numpy arrays
    images_data_processed = np.array(images_data_processed)

    # Display first 6 images and compare them with the original images
    if display:
        fig, ax = plt.subplots(1, 6, figsize=(15, 15))
        for i in range(6):
            ax[i].imshow(images_data[i])
            ax[i].set_title("Original Image")
        plt.show()

        fig, ax = plt.subplots(1, 6, figsize=(15, 15))
        for i in range(6):
            ax[i].imshow(images_data_processed[i])
            ax[i].set_title("Contrast Enhanced Image")
        plt.show()
    return images_data_processed


In [None]:
X_processed = apply_contrast_enhancement(X_image,alpha = 1.15, beta=4, display=True)

In [None]:
X_test_processed = apply_contrast_enhancement(X_test_image,alpha = 1.15, beta=4, display=True)

## 4. Image Data Augmentation

In [None]:
from keras.preprocessing.image import ImageDataGenerator

IDG = ImageDataGenerator(rescale= 1./255,
                             rotation_range=35,  # randomly rotate images in the range (degrees, 0 to 180)
                             zoom_range = 0.12, # Randomly zoom image
                             width_shift_range=0.2,  # randomly shift images horizontally (fraction of total width)
                             height_shift_range=0.15,  # randomly shift images vertically (fraction of total height)
                             horizontal_flip=True,  # randomly flip images
                             vertical_flip=True # randomly flip images
                                )   

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(X_processed, y_train, test_size=0.2, stratify=y_train, shuffle=True)


In [None]:
IDG.fit(X_train)

In [None]:
X_processed.shape

In [None]:
X_image.shape

In [None]:
from keras.models import Sequential
import tensorflow
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications import (
    ResNet50,
    InceptionV3,
    DenseNet121,
    VGG16,
    Xception)


base_model= tensorflow.keras.applications.VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 299, 3),
    pooling=None,
)

for layer in base_model.layers:
    layer.trainable = False


model=Sequential()
model.add(base_model)
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dense(728, activation='relu'))
model.add(Dropout(0.1))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(728, activation='relu'))
model.add(Dense(7, activation='softmax'))


early_stop = EarlyStopping(monitor='val_loss',patience=3)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(IDG.flow(X_train, Y_train, batch_size=128), epochs=7 , validation_data=(X_val, Y_val), callbacks=[early_stop] )

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(IDG.flow(X_processed, y_train, batch_size=32), epochs=5, callbacks=[early_stop], class_weight=d_class_weights, validation_split=0.2)

In [None]:
history.history['val_accuracy'][-1]

In [None]:
# Let's now split the train dataset into train and validation
#X_train, X_val, y_train_2, y_val = train_test_split(df_train, y_train, test_size=0.2, random_state=42, stratify=y_train, shuffle=True)

In [None]:
#y_train_2.value_counts()

# 1. Class Weights

In [None]:
# We can see that the dataset is not balanced at all, so we should use class weights to compensate for that
from sklearn.utils import class_weight

#class_weights = class_weight.compute_class_weight('balanced',
#                                                 np.unique(y_train_2),
 #                                                y_train_2)

#class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
